diff --git a/sys/kern/kern_cons.c b/sys/kern/kern_cons.c index 24952561449b..a8f6b689bff7 100644 --- a/sys/kern/kern_cons.c +++ b/sys/kern/kern_cons.c @@ -1,773 +1,773 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1988 University of Utah. * Copyright (c) 1991 The Regents of the University of California. * Copyright (c) 1999 Michael Smith * Copyright (c) 2005 Pawel Jakub Dawidek * * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)cons.c 7.2 (Berkeley) 5/9/91 */ #include #include "opt_ddb.h" #include "opt_syscons.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_TTYCONS, "tty console", "tty console handling"); struct cn_device { STAILQ_ENTRY(cn_device) cnd_next; struct consdev *cnd_cn; }; #define CNDEVPATHMAX 32 #define CNDEVTAB_SIZE 4 static struct cn_device cn_devtab[CNDEVTAB_SIZE]; static STAILQ_HEAD(, cn_device) cn_devlist = STAILQ_HEAD_INITIALIZER(cn_devlist); int cons_avail_mask = 0; /* Bit mask. Each registered low level console * which is currently unavailable for inpit * (i.e., if it is in graphics mode) will have * this bit cleared. */ static int cn_mute; SYSCTL_INT(_kern, OID_AUTO, consmute, CTLFLAG_RW, &cn_mute, 0, "State of the console muting"); static char *consbuf; /* buffer used by `consmsgbuf' */ static struct callout conscallout; /* callout for outputting to constty */ struct msgbuf consmsgbuf; /* message buffer for console tty */ static bool console_pausing; /* pause after each line during probe */ static const char console_pausestr[] = ""; struct tty *constty; /* pointer to console "window" tty */ static struct mtx constty_mtx; /* Mutex for constty assignment. */ MTX_SYSINIT(constty_mtx, &constty_mtx, "constty_mtx", MTX_DEF); static struct mtx cnputs_mtx; /* Mutex for cnputs(). 
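 * Declared below as a spin mutex excluded from WITNESS
 * (MTX_SPIN | MTX_NOWITNESS) so that whole strings can be
 * serialized onto the console even from low-level contexts
 * where sleepable locks are unusable.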
*/ MTX_SYSINIT(cnputs_mtx, &cnputs_mtx, "cnputs_mtx", MTX_SPIN | MTX_NOWITNESS); static void constty_timeout(void *arg); static struct consdev cons_consdev; DATA_SET(cons_set, cons_consdev); SET_DECLARE(cons_set, struct consdev); /* * Stub for configurations that don't actually have a keyboard driver. Inclusion * of kbd.c is contingent on any number of keyboard/console drivers being * present in the kernel; rather than trying to catch them all, we'll just * maintain this weak kbdinit that will be overridden by the strong version in * kbd.c if it's present. */ __weak_symbol void kbdinit(void) { } void cninit(void) { struct consdev *best_cn, *cn, **list; TSENTER(); /* * Check if we should mute the console (for security reasons perhaps) * It can be changes dynamically using sysctl kern.consmute * once we are up and going. * */ cn_mute = ((boothowto & (RB_MUTE |RB_SINGLE |RB_VERBOSE |RB_ASKNAME)) == RB_MUTE); /* * Bring up the kbd layer just in time for cnprobe. Console drivers * have a dependency on kbd being ready, so this fits nicely between the * machdep callers of cninit() and MI probing/initialization of consoles * here. */ kbdinit(); /* * Find the first console with the highest priority. */ best_cn = NULL; SET_FOREACH(list, cons_set) { cn = *list; cnremove(cn); /* Skip cons_consdev. */ if (cn->cn_ops == NULL) continue; cn->cn_ops->cn_probe(cn); if (cn->cn_pri == CN_DEAD) continue; if (best_cn == NULL || cn->cn_pri > best_cn->cn_pri) best_cn = cn; if (boothowto & RB_MULTIPLE) { /* * Initialize console, and attach to it. */ cn->cn_ops->cn_init(cn); cnadd(cn); } } if (best_cn == NULL) return; if ((boothowto & RB_MULTIPLE) == 0) { best_cn->cn_ops->cn_init(best_cn); cnadd(best_cn); } if (boothowto & RB_PAUSE) console_pausing = true; /* * Make the best console the preferred console. */ cnselect(best_cn); #ifdef EARLY_PRINTF /* * Release early console. */ early_putc = NULL; #endif TSEXIT(); } void cninit_finish(void) { console_pausing = false; } /* add a new physical console to back the virtual console */ int cnadd(struct consdev *cn) { struct cn_device *cnd; int i; STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) if (cnd->cnd_cn == cn) return (0); for (i = 0; i < CNDEVTAB_SIZE; i++) { cnd = &cn_devtab[i]; if (cnd->cnd_cn == NULL) break; } if (cnd->cnd_cn != NULL) return (ENOMEM); cnd->cnd_cn = cn; if (cn->cn_name[0] == '\0') { /* XXX: it is unclear if/where this print might output */ printf("WARNING: console at %p has no name\n", cn); } STAILQ_INSERT_TAIL(&cn_devlist, cnd, cnd_next); if (STAILQ_FIRST(&cn_devlist) == cnd) ttyconsdev_select(cnd->cnd_cn->cn_name); /* Add device to the active mask. */ cnavailable(cn, (cn->cn_flags & CN_FLAG_NOAVAIL) == 0); return (0); } void cnremove(struct consdev *cn) { struct cn_device *cnd; int i; STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) { if (cnd->cnd_cn != cn) continue; if (STAILQ_FIRST(&cn_devlist) == cnd) ttyconsdev_select(NULL); STAILQ_REMOVE(&cn_devlist, cnd, cn_device, cnd_next); cnd->cnd_cn = NULL; /* Remove this device from available mask. */ for (i = 0; i < CNDEVTAB_SIZE; i++) if (cnd == &cn_devtab[i]) { cons_avail_mask &= ~(1 << i); break; } #if 0 /* * XXX * syscons gets really confused if console resources are * freed after the system has initialized. 
*/ if (cn->cn_term != NULL) cn->cn_ops->cn_term(cn); #endif return; } } void cnselect(struct consdev *cn) { struct cn_device *cnd; STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) { if (cnd->cnd_cn != cn) continue; if (cnd == STAILQ_FIRST(&cn_devlist)) return; STAILQ_REMOVE(&cn_devlist, cnd, cn_device, cnd_next); STAILQ_INSERT_HEAD(&cn_devlist, cnd, cnd_next); ttyconsdev_select(cnd->cnd_cn->cn_name); return; } } void cnavailable(struct consdev *cn, int available) { int i; for (i = 0; i < CNDEVTAB_SIZE; i++) { if (cn_devtab[i].cnd_cn == cn) break; } if (available) { if (i < CNDEVTAB_SIZE) cons_avail_mask |= (1 << i); cn->cn_flags &= ~CN_FLAG_NOAVAIL; } else { if (i < CNDEVTAB_SIZE) cons_avail_mask &= ~(1 << i); cn->cn_flags |= CN_FLAG_NOAVAIL; } } int cnunavailable(void) { return (cons_avail_mask == 0); } /* * sysctl_kern_console() provides output parseable in conscontrol(1). */ static int sysctl_kern_console(SYSCTL_HANDLER_ARGS) { struct cn_device *cnd; struct consdev *cp, **list; char *p; bool delete; int error; struct sbuf *sb; sb = sbuf_new(NULL, NULL, CNDEVPATHMAX * 2, SBUF_AUTOEXTEND | SBUF_INCLUDENUL); if (sb == NULL) return (ENOMEM); sbuf_clear(sb); STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) sbuf_printf(sb, "%s,", cnd->cnd_cn->cn_name); - sbuf_printf(sb, "/"); + sbuf_putc(sb, '/'); SET_FOREACH(list, cons_set) { cp = *list; if (cp->cn_name[0] != '\0') sbuf_printf(sb, "%s,", cp->cn_name); } sbuf_finish(sb); error = sysctl_handle_string(oidp, sbuf_data(sb), sbuf_len(sb), req); if (error == 0 && req->newptr != NULL) { p = sbuf_data(sb); error = ENXIO; delete = false; if (*p == '-') { delete = true; p++; } SET_FOREACH(list, cons_set) { cp = *list; if (strcmp(p, cp->cn_name) != 0) continue; if (delete) { cnremove(cp); error = 0; } else { error = cnadd(cp); if (error == 0) cnselect(cp); } break; } } sbuf_delete(sb); return (error); } SYSCTL_PROC(_kern, OID_AUTO, console, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 0, 0, sysctl_kern_console, "A", "Console device control"); void cngrab(void) { struct cn_device *cnd; struct consdev *cn; STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) { cn = cnd->cnd_cn; if (!kdb_active || !(cn->cn_flags & CN_FLAG_NODEBUG)) cn->cn_ops->cn_grab(cn); } } void cnungrab(void) { struct cn_device *cnd; struct consdev *cn; STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) { cn = cnd->cnd_cn; if (!kdb_active || !(cn->cn_flags & CN_FLAG_NODEBUG)) cn->cn_ops->cn_ungrab(cn); } } void cnresume(void) { struct cn_device *cnd; struct consdev *cn; STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) { cn = cnd->cnd_cn; if (cn->cn_ops->cn_resume != NULL) cn->cn_ops->cn_resume(cn); } } /* * Low level console routines. 
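 *
 * cngetc() spins until a character arrives and maps CR to NL, and
 * cngets() grabs the console for the duration of the read, so a rough
 * illustration of a caller (the prompt string here is made up) is:
 *
 *	char buf[64];
 *
 *	cnputs("passphrase: ");
 *	cngets(buf, sizeof(buf), GETS_ECHOPASS);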
*/ int cngetc(void) { int c; if (cn_mute) return (-1); while ((c = cncheckc()) == -1) cpu_spinwait(); if (c == '\r') c = '\n'; /* console input is always ICRNL */ return (c); } int cncheckc(void) { struct cn_device *cnd; struct consdev *cn; int c; if (cn_mute) return (-1); STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) { cn = cnd->cnd_cn; if (!kdb_active || !(cn->cn_flags & CN_FLAG_NODEBUG)) { c = cn->cn_ops->cn_getc(cn); if (c != -1) return (c); } } return (-1); } void cngets(char *cp, size_t size, int visible) { char *lp, *end; int c; cngrab(); lp = cp; end = cp + size - 1; for (;;) { c = cngetc() & 0177; switch (c) { case '\n': case '\r': cnputc(c); *lp = '\0'; cnungrab(); return; case '\b': case '\177': if (lp > cp) { if (visible) cnputs("\b \b"); lp--; } continue; case '\0': continue; default: if (lp < end) { switch (visible) { case GETS_NOECHO: break; case GETS_ECHOPASS: cnputc('*'); break; default: cnputc(c); break; } *lp++ = c; } } } } void cnputc(int c) { struct cn_device *cnd; struct consdev *cn; const char *cp; #ifdef EARLY_PRINTF if (early_putc != NULL) { if (c == '\n') early_putc('\r'); early_putc(c); return; } #endif if (cn_mute || c == '\0') return; STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) { cn = cnd->cnd_cn; if (!kdb_active || !(cn->cn_flags & CN_FLAG_NODEBUG)) { if (c == '\n') cn->cn_ops->cn_putc(cn, '\r'); cn->cn_ops->cn_putc(cn, c); } } if (console_pausing && c == '\n' && !kdb_active) { for (cp = console_pausestr; *cp != '\0'; cp++) cnputc(*cp); cngrab(); if (cngetc() == '.') console_pausing = false; cnungrab(); cnputc('\r'); for (cp = console_pausestr; *cp != '\0'; cp++) cnputc(' '); cnputc('\r'); } } void cnputsn(const char *p, size_t n) { size_t i; bool unlock_reqd = false; if (mtx_initialized(&cnputs_mtx)) { /* * NOTE: Debug prints and/or witness printouts in * console driver clients can cause the "cnputs_mtx" * mutex to recurse. Simply return if that happens. */ if (mtx_owned(&cnputs_mtx)) return; mtx_lock_spin(&cnputs_mtx); unlock_reqd = true; } for (i = 0; i < n; i++) cnputc(p[i]); if (unlock_reqd) mtx_unlock_spin(&cnputs_mtx); } void cnputs(const char *p) { cnputsn(p, strlen(p)); } static unsigned int consmsgbuf_size = 65536; SYSCTL_UINT(_kern, OID_AUTO, consmsgbuf_size, CTLFLAG_RWTUN, &consmsgbuf_size, 0, "Console tty buffer size"); /* * Redirect console output to a tty. */ int constty_set(struct tty *tp) { int size = consmsgbuf_size; void *buf = NULL; tty_assert_locked(tp); if (constty == tp) return (0); if (constty != NULL) return (EBUSY); if (consbuf == NULL) { tty_unlock(tp); buf = malloc(size, M_TTYCONS, M_WAITOK); tty_lock(tp); } mtx_lock(&constty_mtx); if (constty != NULL) { mtx_unlock(&constty_mtx); free(buf, M_TTYCONS); return (EBUSY); } if (consbuf == NULL) { consbuf = buf; msgbuf_init(&consmsgbuf, buf, size); } else free(buf, M_TTYCONS); constty = tp; mtx_unlock(&constty_mtx); callout_init_mtx(&conscallout, tty_getlock(tp), 0); constty_timeout(tp); return (0); } /* * Disable console redirection to a tty. */ int constty_clear(struct tty *tp) { int c; tty_assert_locked(tp); if (constty != tp) return (ENXIO); callout_stop(&conscallout); mtx_lock(&constty_mtx); constty = NULL; mtx_unlock(&constty_mtx); while ((c = msgbuf_getchar(&consmsgbuf)) != -1) cnputc(c); /* We never free consbuf because it can still be in use. */ return (0); } /* Times per second to check for pending console tty messages. 
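 * At the default of 15, the constty_timeout() callout reschedules
 * itself roughly every 66ms (SBT_1S / constty_wakeups_per_second).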
*/ static int constty_wakeups_per_second = 15; SYSCTL_INT(_kern, OID_AUTO, constty_wakeups_per_second, CTLFLAG_RW, &constty_wakeups_per_second, 0, "Times per second to check for pending console tty messages"); static void constty_timeout(void *arg) { struct tty *tp = arg; int c; tty_assert_locked(tp); while ((c = msgbuf_getchar(&consmsgbuf)) != -1) { if (tty_putchar(tp, c) < 0) { constty_clear(tp); return; } } callout_reset_sbt(&conscallout, SBT_1S / constty_wakeups_per_second, 0, constty_timeout, tp, C_PREL(1)); } /* * Sysbeep(), if we have hardware for it */ #ifdef HAS_TIMER_SPKR static bool beeping; static struct callout beeping_timer; static void sysbeepstop(void *chan) { timer_spkr_release(); beeping = false; } int sysbeep(int pitch, sbintime_t duration) { if (timer_spkr_acquire()) { if (!beeping) { /* Something else owns it. */ return (EBUSY); } } timer_spkr_setfreq(pitch); if (!beeping) { beeping = true; callout_reset_sbt(&beeping_timer, duration, 0, sysbeepstop, NULL, C_PREL(5)); } return (0); } static void sysbeep_init(void *unused) { callout_init(&beeping_timer, 1); } SYSINIT(sysbeep, SI_SUB_SOFTINTR, SI_ORDER_ANY, sysbeep_init, NULL); #else /* * No hardware, no sound */ int sysbeep(int pitch __unused, sbintime_t duration __unused) { return (ENODEV); } #endif /* * Temporary support for sc(4) to vt(4) transition. */ static unsigned vty_prefer; static char vty_name[16]; SYSCTL_STRING(_kern, OID_AUTO, vty, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, vty_name, 0, "Console vty driver"); int vty_enabled(unsigned vty) { static unsigned vty_selected = 0; if (vty_selected == 0) { TUNABLE_STR_FETCH("kern.vty", vty_name, sizeof(vty_name)); do { #if defined(DEV_SC) if (strcmp(vty_name, "sc") == 0) { vty_selected = VTY_SC; break; } #endif #if defined(DEV_VT) if (strcmp(vty_name, "vt") == 0) { vty_selected = VTY_VT; break; } #endif if (vty_prefer != 0) { vty_selected = vty_prefer; break; } #if defined(DEV_VT) vty_selected = VTY_VT; #elif defined(DEV_SC) vty_selected = VTY_SC; #endif } while (0); if (vty_selected == VTY_VT) strcpy(vty_name, "vt"); else if (vty_selected == VTY_SC) strcpy(vty_name, "sc"); } return ((vty_selected & vty) != 0); } void vty_set_preferred(unsigned vty) { vty_prefer = vty; #if !defined(DEV_SC) vty_prefer &= ~VTY_SC; #endif #if !defined(DEV_VT) vty_prefer &= ~VTY_VT; #endif } diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c index f9445a481d92..112f9c7b0f33 100644 --- a/sys/kern/kern_exit.c +++ b/sys/kern/kern_exit.c @@ -1,1471 +1,1471 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_exit.c 8.7 (Berkeley) 2/12/94 */ #include #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for acct_process() function prototype */ #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #ifdef KDTRACE_HOOKS #include dtrace_execexit_func_t dtrace_fasttrap_exit; #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE1(proc, , , exit, "int"); static int kern_kill_on_dbg_exit = 1; SYSCTL_INT(_kern, OID_AUTO, kill_on_debugger_exit, CTLFLAG_RWTUN, &kern_kill_on_dbg_exit, 0, "Kill ptraced processes when debugger exits"); static bool kern_wait_dequeue_sigchld = 1; SYSCTL_BOOL(_kern, OID_AUTO, wait_dequeue_sigchld, CTLFLAG_RWTUN, &kern_wait_dequeue_sigchld, 0, "Dequeue SIGCHLD on wait(2) for live process"); struct proc * proc_realparent(struct proc *child) { struct proc *p, *parent; sx_assert(&proctree_lock, SX_LOCKED); if ((child->p_treeflag & P_TREE_ORPHANED) == 0) return (child->p_pptr->p_pid == child->p_oppid ? child->p_pptr : child->p_reaper); for (p = child; (p->p_treeflag & P_TREE_FIRST_ORPHAN) == 0;) { /* Cannot use LIST_PREV(), since the list head is not known. 
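		 * Instead, walk the le_prev back-pointers by hand: each
		 * one points at the previous orphan's le_next field (or at
		 * the list head's lh_first once the first orphan is
		 * reached), so __containerof() recovers the enclosing
		 * struct proc at every step.  The loop stops at the entry
		 * marked P_TREE_FIRST_ORPHAN, whose le_prev points into
		 * the real parent's p_orphans head.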
*/ p = __containerof(p->p_orphan.le_prev, struct proc, p_orphan.le_next); KASSERT((p->p_treeflag & P_TREE_ORPHANED) != 0, ("missing P_ORPHAN %p", p)); } parent = __containerof(p->p_orphan.le_prev, struct proc, p_orphans.lh_first); return (parent); } void reaper_abandon_children(struct proc *p, bool exiting) { struct proc *p1, *p2, *ptmp; sx_assert(&proctree_lock, SX_XLOCKED); KASSERT(p != initproc, ("reaper_abandon_children for initproc")); if ((p->p_treeflag & P_TREE_REAPER) == 0) return; p1 = p->p_reaper; LIST_FOREACH_SAFE(p2, &p->p_reaplist, p_reapsibling, ptmp) { LIST_REMOVE(p2, p_reapsibling); p2->p_reaper = p1; p2->p_reapsubtree = p->p_reapsubtree; LIST_INSERT_HEAD(&p1->p_reaplist, p2, p_reapsibling); if (exiting && p2->p_pptr == p) { PROC_LOCK(p2); proc_reparent(p2, p1, true); PROC_UNLOCK(p2); } } KASSERT(LIST_EMPTY(&p->p_reaplist), ("p_reaplist not empty")); p->p_treeflag &= ~P_TREE_REAPER; } static void reaper_clear(struct proc *p) { struct proc *p1; bool clear; sx_assert(&proctree_lock, SX_LOCKED); LIST_REMOVE(p, p_reapsibling); if (p->p_reapsubtree == 1) return; clear = true; LIST_FOREACH(p1, &p->p_reaper->p_reaplist, p_reapsibling) { if (p1->p_reapsubtree == p->p_reapsubtree) { clear = false; break; } } if (clear) proc_id_clear(PROC_ID_REAP, p->p_reapsubtree); } void proc_clear_orphan(struct proc *p) { struct proc *p1; sx_assert(&proctree_lock, SA_XLOCKED); if ((p->p_treeflag & P_TREE_ORPHANED) == 0) return; if ((p->p_treeflag & P_TREE_FIRST_ORPHAN) != 0) { p1 = LIST_NEXT(p, p_orphan); if (p1 != NULL) p1->p_treeflag |= P_TREE_FIRST_ORPHAN; p->p_treeflag &= ~P_TREE_FIRST_ORPHAN; } LIST_REMOVE(p, p_orphan); p->p_treeflag &= ~P_TREE_ORPHANED; } void exit_onexit(struct proc *p) { MPASS(p->p_numthreads == 1); umtx_thread_exit(FIRST_THREAD_IN_PROC(p)); } /* * exit -- death of process. */ int sys_exit(struct thread *td, struct exit_args *uap) { exit1(td, uap->rval, 0); __unreachable(); } void proc_set_p2_wexit(struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag2 |= P2_WEXIT; } /* * Exit: deallocate address space and other resources, change proc state to * zombie, and unlink proc from allproc and parent's lists. Save exit status * and rusage for wait(). Check for child processes and orphan them. */ void exit1(struct thread *td, int rval, int signo) { struct proc *p, *nq, *q, *t; struct thread *tdt; ksiginfo_t *ksi, *ksi1; int signal_parent; mtx_assert(&Giant, MA_NOTOWNED); KASSERT(rval == 0 || signo == 0, ("exit1 rv %d sig %d", rval, signo)); TSPROCEXIT(td->td_proc->p_pid); p = td->td_proc; /* * In case we're rebooting we just let init die in order to * work around an issues where pid 1 might get a fatal signal. * For instance, if network interface serving NFS root is * going down due to reboot, page-in requests for text are * failing. */ if (p == initproc && rebooting == 0) { printf("init died (signal %d, exit %d)\n", signo, rval); panic("Going nowhere without my init!"); } /* * Process deferred operations, designated with ASTF_KCLEAR. * For instance, we need to deref SU mp, since the thread does * not return to userspace, and wait for geom to stabilize. */ ast_kclear(td); /* * MUST abort all other threads before proceeding past here. */ PROC_LOCK(p); proc_set_p2_wexit(p); /* * First check if some other thread or external request got * here before us. If so, act appropriately: exit or suspend. * We must ensure that stop requests are handled before we set * P_WEXIT. */ thread_suspend_check(0); while (p->p_flag & P_HADTHREADS) { /* * Kill off the other threads. 
This requires * some co-operation from other parts of the kernel * so it may not be instantaneous. With this state set * any thread attempting to interruptibly * sleep will return immediately with EINTR or EWOULDBLOCK * which will hopefully force them to back out to userland * freeing resources as they go. Any thread attempting * to return to userland will thread_exit() from ast(). * thread_exit() will unsuspend us when the last of the * other threads exits. * If there is already a thread singler after resumption, * calling thread_single() will fail; in that case, we just * re-check all suspension request, the thread should * either be suspended there or exit. */ if (!thread_single(p, SINGLE_EXIT)) /* * All other activity in this process is now * stopped. Threading support has been turned * off. */ break; /* * Recheck for new stop or suspend requests which * might appear while process lock was dropped in * thread_single(). */ thread_suspend_check(0); } KASSERT(p->p_numthreads == 1, ("exit1: proc %p exiting with %d threads", p, p->p_numthreads)); racct_sub(p, RACCT_NTHR, 1); /* Let event handler change exit status */ p->p_xexit = rval; p->p_xsig = signo; /* * Ignore any pending request to stop due to a stop signal. * Once P_WEXIT is set, future requests will be ignored as * well. */ p->p_flag &= ~P_STOPPED_SIG; KASSERT(!P_SHOULDSTOP(p), ("exiting process is stopped")); /* Note that we are exiting. */ p->p_flag |= P_WEXIT; /* * Wait for any processes that have a hold on our vmspace to * release their reference. */ while (p->p_lock > 0) msleep(&p->p_lock, &p->p_mtx, PWAIT, "exithold", 0); PROC_UNLOCK(p); /* Drain the limit callout while we don't have the proc locked */ callout_drain(&p->p_limco); #ifdef AUDIT /* * The Sun BSM exit token contains two components: an exit status as * passed to exit(), and a return value to indicate what sort of exit * it was. The exit status is WEXITSTATUS(rv), but it's not clear * what the return value is. */ AUDIT_ARG_EXIT(rval, 0); AUDIT_SYSCALL_EXIT(0, td); #endif /* Are we a task leader with peers? */ if (p->p_peers != NULL && p == p->p_leader) { mtx_lock(&ppeers_lock); q = p->p_peers; while (q != NULL) { PROC_LOCK(q); kern_psignal(q, SIGKILL); PROC_UNLOCK(q); q = q->p_peers; } while (p->p_peers != NULL) msleep(p, &ppeers_lock, PWAIT, "exit1", 0); mtx_unlock(&ppeers_lock); } itimers_exit(p); /* * Check if any loadable modules need anything done at process exit. * E.g. SYSV IPC stuff. * Event handler could change exit status. * XXX what if one of these generates an error? */ EVENTHANDLER_DIRECT_INVOKE(process_exit, p); /* * If parent is waiting for us to exit or exec, * P_PPWAIT is set; we will wakeup the parent below. */ PROC_LOCK(p); stopprofclock(p); p->p_ptevents = 0; /* * Stop the real interval timer. If the handler is currently * executing, prevent it from rearming itself and let it finish. */ if (timevalisset(&p->p_realtimer.it_value) && callout_stop(&p->p_itcallout) == 0) { timevalclear(&p->p_realtimer.it_interval); PROC_UNLOCK(p); callout_drain(&p->p_itcallout); } else { PROC_UNLOCK(p); } if (p->p_sysent->sv_onexit != NULL) p->p_sysent->sv_onexit(p); seltdfini(td); /* * Reset any sigio structures pointing to us as a result of * F_SETOWN with our pid. The P_WEXIT flag interlocks with fsetown(). */ funsetownlst(&p->p_sigiolst); /* * Close open files and release open-file table. * This may block! */ pdescfree(td); fdescfree(td); /* * Remove ourself from our leader's peer list and wake our leader. 
*/ if (p->p_leader->p_peers != NULL) { mtx_lock(&ppeers_lock); if (p->p_leader->p_peers != NULL) { q = p->p_leader; while (q->p_peers != p) q = q->p_peers; q->p_peers = p->p_peers; wakeup(p->p_leader); } mtx_unlock(&ppeers_lock); } exec_free_abi_mappings(p); vmspace_exit(td); (void)acct_process(td); #ifdef KTRACE ktrprocexit(td); #endif /* * Release reference to text vnode etc */ if (p->p_textvp != NULL) { vrele(p->p_textvp); p->p_textvp = NULL; } if (p->p_textdvp != NULL) { vrele(p->p_textdvp); p->p_textdvp = NULL; } if (p->p_binname != NULL) { free(p->p_binname, M_PARGS); p->p_binname = NULL; } /* * Release our limits structure. */ lim_free(p->p_limit); p->p_limit = NULL; tidhash_remove(td); /* * Call machine-dependent code to release any * machine-dependent resources other than the address space. * The address space is released by "vmspace_exitfree(p)" in * vm_waitproc(). */ cpu_exit(td); WITNESS_WARN(WARN_PANIC, NULL, "process (pid %d) exiting", p->p_pid); /* * Remove from allproc. It still sits in the hash. */ sx_xlock(&allproc_lock); LIST_REMOVE(p, p_list); #ifdef DDB /* * Used by ddb's 'ps' command to find this process via the * pidhash. */ p->p_list.le_prev = NULL; #endif prison_proc_unlink(p->p_ucred->cr_prison, p); sx_xunlock(&allproc_lock); sx_xlock(&proctree_lock); if ((p->p_flag & (P_TRACED | P_PPWAIT | P_PPTRACE)) != 0) { PROC_LOCK(p); p->p_flag &= ~(P_TRACED | P_PPWAIT | P_PPTRACE); PROC_UNLOCK(p); } /* * killjobc() might drop and re-acquire proctree_lock to * revoke control tty if exiting process was a session leader. */ killjobc(); /* * Reparent all children processes: * - traced ones to the original parent (or init if we are that parent) * - the rest to init */ q = LIST_FIRST(&p->p_children); if (q != NULL) /* only need this if any child is S_ZOMB */ wakeup(q->p_reaper); for (; q != NULL; q = nq) { nq = LIST_NEXT(q, p_sibling); ksi = ksiginfo_alloc(M_WAITOK); PROC_LOCK(q); q->p_sigparent = SIGCHLD; if ((q->p_flag & P_TRACED) == 0) { proc_reparent(q, q->p_reaper, true); if (q->p_state == PRS_ZOMBIE) { /* * Inform reaper about the reparented * zombie, since wait(2) has something * new to report. Guarantee queueing * of the SIGCHLD signal, similar to * the _exit() behaviour, by providing * our ksiginfo. Ksi is freed by the * signal delivery. */ if (q->p_ksi == NULL) { ksi1 = NULL; } else { ksiginfo_copy(q->p_ksi, ksi); ksi->ksi_flags |= KSI_INS; ksi1 = ksi; ksi = NULL; } PROC_LOCK(q->p_reaper); pksignal(q->p_reaper, SIGCHLD, ksi1); PROC_UNLOCK(q->p_reaper); } else if (q->p_pdeathsig > 0) { /* * The child asked to received a signal * when we exit. */ kern_psignal(q, q->p_pdeathsig); } } else { /* * Traced processes are killed by default * since their existence means someone is * screwing up. */ t = proc_realparent(q); if (t == p) { proc_reparent(q, q->p_reaper, true); } else { PROC_LOCK(t); proc_reparent(q, t, true); PROC_UNLOCK(t); } /* * Since q was found on our children list, the * proc_reparent() call moved q to the orphan * list due to present P_TRACED flag. Clear * orphan link for q now while q is locked. 
*/ proc_clear_orphan(q); q->p_flag &= ~P_TRACED; q->p_flag2 &= ~P2_PTRACE_FSTP; q->p_ptevents = 0; p->p_xthread = NULL; FOREACH_THREAD_IN_PROC(q, tdt) { tdt->td_dbgflags &= ~(TDB_SUSPEND | TDB_XSIG | TDB_FSTP); tdt->td_xsig = 0; } if (kern_kill_on_dbg_exit) { q->p_flag &= ~P_STOPPED_TRACE; kern_psignal(q, SIGKILL); } else if ((q->p_flag & (P_STOPPED_TRACE | P_STOPPED_SIG)) != 0) { sigqueue_delete_proc(q, SIGTRAP); ptrace_unsuspend(q); } } PROC_UNLOCK(q); if (ksi != NULL) ksiginfo_free(ksi); } /* * Also get rid of our orphans. */ while ((q = LIST_FIRST(&p->p_orphans)) != NULL) { PROC_LOCK(q); KASSERT(q->p_oppid == p->p_pid, ("orphan %p of %p has unexpected oppid %d", q, p, q->p_oppid)); q->p_oppid = q->p_reaper->p_pid; /* * If we are the real parent of this process * but it has been reparented to a debugger, then * check if it asked for a signal when we exit. */ if (q->p_pdeathsig > 0) kern_psignal(q, q->p_pdeathsig); CTR2(KTR_PTRACE, "exit: pid %d, clearing orphan %d", p->p_pid, q->p_pid); proc_clear_orphan(q); PROC_UNLOCK(q); } #ifdef KDTRACE_HOOKS if (SDT_PROBES_ENABLED()) { int reason = CLD_EXITED; if (WCOREDUMP(signo)) reason = CLD_DUMPED; else if (WIFSIGNALED(signo)) reason = CLD_KILLED; SDT_PROBE1(proc, , , exit, reason); } #endif /* Save exit status. */ PROC_LOCK(p); p->p_xthread = td; if (p->p_sysent->sv_ontdexit != NULL) p->p_sysent->sv_ontdexit(td); #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the exit if it * has declared an interest. */ if (dtrace_fasttrap_exit) dtrace_fasttrap_exit(p); #endif /* * Notify interested parties of our demise. */ KNOTE_LOCKED(p->p_klist, NOTE_EXIT); /* * If this is a process with a descriptor, we may not need to deliver * a signal to the parent. proctree_lock is held over * procdesc_exit() to serialize concurrent calls to close() and * exit(). */ signal_parent = 0; if (p->p_procdesc == NULL || procdesc_exit(p)) { /* * Notify parent that we're gone. If parent has the * PS_NOCLDWAIT flag set, or if the handler is set to SIG_IGN, * notify process 1 instead (and hope it will handle this * situation). */ PROC_LOCK(p->p_pptr); mtx_lock(&p->p_pptr->p_sigacts->ps_mtx); if (p->p_pptr->p_sigacts->ps_flag & (PS_NOCLDWAIT | PS_CLDSIGIGN)) { struct proc *pp; mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); pp = p->p_pptr; PROC_UNLOCK(pp); proc_reparent(p, p->p_reaper, true); p->p_sigparent = SIGCHLD; PROC_LOCK(p->p_pptr); /* * Notify parent, so in case he was wait(2)ing or * executing waitpid(2) with our pid, he will * continue. */ wakeup(pp); } else mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); if (p->p_pptr == p->p_reaper || p->p_pptr == initproc) { signal_parent = 1; } else if (p->p_sigparent != 0) { if (p->p_sigparent == SIGCHLD) { signal_parent = 1; } else { /* LINUX thread */ signal_parent = 2; } } } else PROC_LOCK(p->p_pptr); sx_xunlock(&proctree_lock); if (signal_parent == 1) { childproc_exited(p); } else if (signal_parent == 2) { kern_psignal(p->p_pptr, p->p_sigparent); } /* Tell the prison that we are gone. */ prison_proc_free(p->p_ucred->cr_prison); /* * The state PRS_ZOMBIE prevents other processes from sending * signal to the process, to avoid memory leak, we free memory * for signal queue at the time when the state is set. */ sigqueue_flush(&p->p_sigqueue); sigqueue_flush(&td->td_sigqueue); /* * We have to wait until after acquiring all locks before * changing p_state. We need to avoid all possible context * switches (including ones from blocking on a mutex) while * marked as a zombie. 
We also have to set the zombie state * before we release the parent process' proc lock to avoid * a lost wakeup. So, we first call wakeup, then we grab the * sched lock, update the state, and release the parent process' * proc lock. */ wakeup(p->p_pptr); cv_broadcast(&p->p_pwait); sched_exit(p->p_pptr, td); PROC_SLOCK(p); p->p_state = PRS_ZOMBIE; PROC_UNLOCK(p->p_pptr); /* * Save our children's rusage information in our exit rusage. */ PROC_STATLOCK(p); ruadd(&p->p_ru, &p->p_rux, &p->p_stats->p_cru, &p->p_crux); PROC_STATUNLOCK(p); /* * Make sure the scheduler takes this thread out of its tables etc. * This will also release this thread's reference to the ucred. * Other thread parts to release include pcb bits and such. */ thread_exit(); } #ifndef _SYS_SYSPROTO_H_ struct abort2_args { char *why; int nargs; void **args; }; #endif int sys_abort2(struct thread *td, struct abort2_args *uap) { void *uargs[16]; void **uargsp; int error, nargs; nargs = uap->nargs; if (nargs < 0 || nargs > nitems(uargs)) nargs = -1; uargsp = NULL; if (nargs > 0) { if (uap->args != NULL) { error = copyin(uap->args, uargs, nargs * sizeof(void *)); if (error != 0) nargs = -1; else uargsp = uargs; } else nargs = -1; } return (kern_abort2(td, uap->why, nargs, uargsp)); } /* * kern_abort2() * Arguments: * why - user pointer to why * nargs - number of arguments copied or -1 if an error occurred in copying * args - pointer to an array of pointers in kernel format */ int kern_abort2(struct thread *td, const char *why, int nargs, void **uargs) { struct proc *p = td->td_proc; struct sbuf *sb; int error, i, sig; /* * Do it right now so we can log either proper call of abort2(), or * note, that invalid argument was passed. 512 is big enough to * handle 16 arguments' descriptions with additional comments. */ sb = sbuf_new(NULL, NULL, 512, SBUF_FIXEDLEN); sbuf_clear(sb); sbuf_printf(sb, "%s(pid %d uid %d) aborted: ", p->p_comm, p->p_pid, td->td_ucred->cr_uid); /* * Since we can't return from abort2(), send SIGKILL in cases, where * abort2() was called improperly */ sig = SIGKILL; /* Prevent from DoSes from user-space. */ if (nargs == -1) goto out; KASSERT(nargs >= 0 && nargs <= 16, ("called with too many args (%d)", nargs)); /* * Limit size of 'reason' string to 128. Will fit even when * maximal number of arguments was chosen to be logged. */ if (why != NULL) { error = sbuf_copyin(sb, why, 128); if (error < 0) goto out; } else { - sbuf_printf(sb, "(null)"); + sbuf_cat(sb, "(null)"); } if (nargs > 0) { - sbuf_printf(sb, "("); + sbuf_putc(sb, '('); for (i = 0;i < nargs; i++) sbuf_printf(sb, "%s%p", i == 0 ? "" : ", ", uargs[i]); - sbuf_printf(sb, ")"); + sbuf_putc(sb, ')'); } /* * Final stage: arguments were proper, string has been * successfully copied from userspace, and copying pointers * from user-space succeed. */ sig = SIGABRT; out: if (sig == SIGKILL) { sbuf_trim(sb); - sbuf_printf(sb, " (Reason text inaccessible)"); + sbuf_cat(sb, " (Reason text inaccessible)"); } sbuf_cat(sb, "\n"); sbuf_finish(sb); log(LOG_INFO, "%s", sbuf_data(sb)); sbuf_delete(sb); PROC_LOCK(p); sigexit(td, sig); /* NOTREACHED */ } #ifdef COMPAT_43 /* * The dirty work is handled by kern_wait(). */ int owait(struct thread *td, struct owait_args *uap __unused) { int error, status; error = kern_wait(td, WAIT_ANY, &status, 0, NULL); if (error == 0) td->td_retval[1] = status; return (error); } #endif /* COMPAT_43 */ /* * The dirty work is handled by kern_wait(). 
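 * kern_wait() translates the classic pid argument into the (idtype, id)
 * pair expected by kern_wait6(): WAIT_ANY becomes P_ALL, a negative pid
 * becomes P_PGID with id = -pid, and a positive pid becomes P_PID;
 * WAIT_MYPGRP is resolved inside kern_wait6() itself.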
*/ int sys_wait4(struct thread *td, struct wait4_args *uap) { struct rusage ru, *rup; int error, status; if (uap->rusage != NULL) rup = &ru; else rup = NULL; error = kern_wait(td, uap->pid, &status, uap->options, rup); if (uap->status != NULL && error == 0 && td->td_retval[0] != 0) error = copyout(&status, uap->status, sizeof(status)); if (uap->rusage != NULL && error == 0 && td->td_retval[0] != 0) error = copyout(&ru, uap->rusage, sizeof(struct rusage)); return (error); } int sys_wait6(struct thread *td, struct wait6_args *uap) { struct __wrusage wru, *wrup; siginfo_t si, *sip; idtype_t idtype; id_t id; int error, status; idtype = uap->idtype; id = uap->id; if (uap->wrusage != NULL) wrup = &wru; else wrup = NULL; if (uap->info != NULL) { sip = &si; bzero(sip, sizeof(*sip)); } else sip = NULL; /* * We expect all callers of wait6() to know about WEXITED and * WTRAPPED. */ error = kern_wait6(td, idtype, id, &status, uap->options, wrup, sip); if (uap->status != NULL && error == 0 && td->td_retval[0] != 0) error = copyout(&status, uap->status, sizeof(status)); if (uap->wrusage != NULL && error == 0 && td->td_retval[0] != 0) error = copyout(&wru, uap->wrusage, sizeof(wru)); if (uap->info != NULL && error == 0) error = copyout(&si, uap->info, sizeof(si)); return (error); } /* * Reap the remains of a zombie process and optionally return status and * rusage. Asserts and will release both the proctree_lock and the process * lock as part of its work. */ void proc_reap(struct thread *td, struct proc *p, int *status, int options) { struct proc *q, *t; sx_assert(&proctree_lock, SA_XLOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p->p_state == PRS_ZOMBIE, ("proc_reap: !PRS_ZOMBIE")); mtx_spin_wait_unlocked(&p->p_slock); q = td->td_proc; if (status) *status = KW_EXITCODE(p->p_xexit, p->p_xsig); if (options & WNOWAIT) { /* * Only poll, returning the status. Caller does not wish to * release the proc struct just yet. */ PROC_UNLOCK(p); sx_xunlock(&proctree_lock); return; } PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); /* * If we got the child via a ptrace 'attach', we need to give it back * to the old parent. */ if (p->p_oppid != p->p_pptr->p_pid) { PROC_UNLOCK(p); t = proc_realparent(p); PROC_LOCK(t); PROC_LOCK(p); CTR2(KTR_PTRACE, "wait: traced child %d moved back to parent %d", p->p_pid, t->p_pid); proc_reparent(p, t, false); PROC_UNLOCK(p); pksignal(t, SIGCHLD, p->p_ksi); wakeup(t); cv_broadcast(&p->p_pwait); PROC_UNLOCK(t); sx_xunlock(&proctree_lock); return; } PROC_UNLOCK(p); /* * Remove other references to this process to ensure we have an * exclusive reference. */ sx_xlock(PIDHASHLOCK(p->p_pid)); LIST_REMOVE(p, p_hash); sx_xunlock(PIDHASHLOCK(p->p_pid)); LIST_REMOVE(p, p_sibling); reaper_abandon_children(p, true); reaper_clear(p); PROC_LOCK(p); proc_clear_orphan(p); PROC_UNLOCK(p); leavepgrp(p); if (p->p_procdesc != NULL) procdesc_reap(p); sx_xunlock(&proctree_lock); proc_id_clear(PROC_ID_PID, p->p_pid); PROC_LOCK(p); knlist_detach(p->p_klist); p->p_klist = NULL; PROC_UNLOCK(p); /* * Removal from allproc list and process group list paired with * PROC_LOCK which was executed during that time should guarantee * nothing can reach this process anymore. As such further locking * is unnecessary. */ p->p_xexit = p->p_xsig = 0; /* XXX: why? */ PROC_LOCK(q); ruadd(&q->p_stats->p_cru, &q->p_crux, &p->p_ru, &p->p_rux); PROC_UNLOCK(q); /* * Decrement the count of procs running with this uid. 
*/ (void)chgproccnt(p->p_ucred->cr_ruidinfo, -1, 0); /* * Destroy resource accounting information associated with the process. */ #ifdef RACCT if (racct_enable) { PROC_LOCK(p); racct_sub(p, RACCT_NPROC, 1); PROC_UNLOCK(p); } #endif racct_proc_exit(p); /* * Free credentials, arguments, and sigacts. */ proc_unset_cred(p); pargs_drop(p->p_args); p->p_args = NULL; sigacts_free(p->p_sigacts); p->p_sigacts = NULL; /* * Do any thread-system specific cleanups. */ thread_wait(p); /* * Give vm and machine-dependent layer a chance to free anything that * cpu_exit couldn't release while still running in process context. */ vm_waitproc(p); #ifdef MAC mac_proc_destroy(p); #endif KASSERT(FIRST_THREAD_IN_PROC(p), ("proc_reap: no residual thread!")); uma_zfree(proc_zone, p); atomic_add_int(&nprocs, -1); } static int proc_to_reap(struct thread *td, struct proc *p, idtype_t idtype, id_t id, int *status, int options, struct __wrusage *wrusage, siginfo_t *siginfo, int check_only) { struct rusage *rup; sx_assert(&proctree_lock, SA_XLOCKED); PROC_LOCK(p); switch (idtype) { case P_ALL: if (p->p_procdesc == NULL || (p->p_pptr == td->td_proc && (p->p_flag & P_TRACED) != 0)) { break; } PROC_UNLOCK(p); return (0); case P_PID: if (p->p_pid != (pid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_PGID: if (p->p_pgid != (pid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_SID: if (p->p_session->s_sid != (pid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_UID: if (p->p_ucred->cr_uid != (uid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_GID: if (p->p_ucred->cr_gid != (gid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_JAILID: if (p->p_ucred->cr_prison->pr_id != (int)id) { PROC_UNLOCK(p); return (0); } break; /* * It seems that the thread structures get zeroed out * at process exit. This makes it impossible to * support P_SETID, P_CID or P_CPUID. */ default: PROC_UNLOCK(p); return (0); } if (p_canwait(td, p)) { PROC_UNLOCK(p); return (0); } if (((options & WEXITED) == 0) && (p->p_state == PRS_ZOMBIE)) { PROC_UNLOCK(p); return (0); } /* * This special case handles a kthread spawned by linux_clone * (see linux_misc.c). The linux_wait4 and linux_waitpid * functions need to be able to distinguish between waiting * on a process and waiting on a thread. It is a thread if * p_sigparent is not SIGCHLD, and the WLINUXCLONE option * signifies we want to wait for threads and not processes. */ if ((p->p_sigparent != SIGCHLD) ^ ((options & WLINUXCLONE) != 0)) { PROC_UNLOCK(p); return (0); } if (siginfo != NULL) { bzero(siginfo, sizeof(*siginfo)); siginfo->si_errno = 0; /* * SUSv4 requires that the si_signo value is always * SIGCHLD. Obey it despite the rfork(2) interface * allows to request other signal for child exit * notification. */ siginfo->si_signo = SIGCHLD; /* * This is still a rough estimate. We will fix the * cases TRAPPED, STOPPED, and CONTINUED later. */ if (WCOREDUMP(p->p_xsig)) { siginfo->si_code = CLD_DUMPED; siginfo->si_status = WTERMSIG(p->p_xsig); } else if (WIFSIGNALED(p->p_xsig)) { siginfo->si_code = CLD_KILLED; siginfo->si_status = WTERMSIG(p->p_xsig); } else { siginfo->si_code = CLD_EXITED; siginfo->si_status = p->p_xexit; } siginfo->si_pid = p->p_pid; siginfo->si_uid = p->p_ucred->cr_uid; /* * The si_addr field would be useful additional * detail, but apparently the PC value may be lost * when we reach this point. bzero() above sets * siginfo->si_addr to NULL. */ } /* * There should be no reason to limit resources usage info to * exited processes only. 
A snapshot about any resources used * by a stopped process may be exactly what is needed. */ if (wrusage != NULL) { rup = &wrusage->wru_self; *rup = p->p_ru; PROC_STATLOCK(p); calcru(p, &rup->ru_utime, &rup->ru_stime); PROC_STATUNLOCK(p); rup = &wrusage->wru_children; *rup = p->p_stats->p_cru; calccru(p, &rup->ru_utime, &rup->ru_stime); } if (p->p_state == PRS_ZOMBIE && !check_only) { proc_reap(td, p, status, options); return (-1); } return (1); } int kern_wait(struct thread *td, pid_t pid, int *status, int options, struct rusage *rusage) { struct __wrusage wru, *wrup; idtype_t idtype; id_t id; int ret; /* * Translate the special pid values into the (idtype, pid) * pair for kern_wait6. The WAIT_MYPGRP case is handled by * kern_wait6() on its own. */ if (pid == WAIT_ANY) { idtype = P_ALL; id = 0; } else if (pid < 0) { idtype = P_PGID; id = (id_t)-pid; } else { idtype = P_PID; id = (id_t)pid; } if (rusage != NULL) wrup = &wru; else wrup = NULL; /* * For backward compatibility we implicitly add flags WEXITED * and WTRAPPED here. */ options |= WEXITED | WTRAPPED; ret = kern_wait6(td, idtype, id, status, options, wrup, NULL); if (rusage != NULL) *rusage = wru.wru_self; return (ret); } static void report_alive_proc(struct thread *td, struct proc *p, siginfo_t *siginfo, int *status, int options, int si_code) { bool cont; PROC_LOCK_ASSERT(p, MA_OWNED); sx_assert(&proctree_lock, SA_XLOCKED); MPASS(si_code == CLD_TRAPPED || si_code == CLD_STOPPED || si_code == CLD_CONTINUED); cont = si_code == CLD_CONTINUED; if ((options & WNOWAIT) == 0) { if (cont) p->p_flag &= ~P_CONTINUED; else p->p_flag |= P_WAITED; if (kern_wait_dequeue_sigchld && (td->td_proc->p_sysent->sv_flags & SV_SIG_WAITNDQ) == 0) { PROC_LOCK(td->td_proc); sigqueue_take(p->p_ksi); PROC_UNLOCK(td->td_proc); } } sx_xunlock(&proctree_lock); if (siginfo != NULL) { siginfo->si_code = si_code; siginfo->si_status = cont ? SIGCONT : p->p_xsig; } if (status != NULL) *status = cont ? SIGCONT : W_STOPCODE(p->p_xsig); td->td_retval[0] = p->p_pid; PROC_UNLOCK(p); } int kern_wait6(struct thread *td, idtype_t idtype, id_t id, int *status, int options, struct __wrusage *wrusage, siginfo_t *siginfo) { struct proc *p, *q; pid_t pid; int error, nfound, ret; bool report; AUDIT_ARG_VALUE((int)idtype); /* XXX - This is likely wrong! */ AUDIT_ARG_PID((pid_t)id); /* XXX - This may be wrong! */ AUDIT_ARG_VALUE(options); q = td->td_proc; if ((pid_t)id == WAIT_MYPGRP && (idtype == P_PID || idtype == P_PGID)) { PROC_LOCK(q); id = (id_t)q->p_pgid; PROC_UNLOCK(q); idtype = P_PGID; } /* If we don't know the option, just return. */ if ((options & ~(WUNTRACED | WNOHANG | WCONTINUED | WNOWAIT | WEXITED | WTRAPPED | WLINUXCLONE)) != 0) return (EINVAL); if ((options & (WEXITED | WUNTRACED | WCONTINUED | WTRAPPED)) == 0) { /* * We will be unable to find any matching processes, * because there are no known events to look for. * Prefer to return error instead of blocking * indefinitely. 
*/ return (EINVAL); } loop: if (q->p_flag & P_STATCHILD) { PROC_LOCK(q); q->p_flag &= ~P_STATCHILD; PROC_UNLOCK(q); } sx_xlock(&proctree_lock); loop_locked: nfound = 0; LIST_FOREACH(p, &q->p_children, p_sibling) { pid = p->p_pid; ret = proc_to_reap(td, p, idtype, id, status, options, wrusage, siginfo, 0); if (ret == 0) continue; else if (ret != 1) { td->td_retval[0] = pid; return (0); } nfound++; PROC_LOCK_ASSERT(p, MA_OWNED); if ((options & WTRAPPED) != 0 && (p->p_flag & P_TRACED) != 0) { PROC_SLOCK(p); report = ((p->p_flag & (P_STOPPED_TRACE | P_STOPPED_SIG)) && p->p_suspcount == p->p_numthreads && (p->p_flag & P_WAITED) == 0); PROC_SUNLOCK(p); if (report) { CTR4(KTR_PTRACE, "wait: returning trapped pid %d status %#x " "(xstat %d) xthread %d", p->p_pid, W_STOPCODE(p->p_xsig), p->p_xsig, p->p_xthread != NULL ? p->p_xthread->td_tid : -1); report_alive_proc(td, p, siginfo, status, options, CLD_TRAPPED); return (0); } } if ((options & WUNTRACED) != 0 && (p->p_flag & P_STOPPED_SIG) != 0) { PROC_SLOCK(p); report = (p->p_suspcount == p->p_numthreads && ((p->p_flag & P_WAITED) == 0)); PROC_SUNLOCK(p); if (report) { report_alive_proc(td, p, siginfo, status, options, CLD_STOPPED); return (0); } } if ((options & WCONTINUED) != 0 && (p->p_flag & P_CONTINUED) != 0) { report_alive_proc(td, p, siginfo, status, options, CLD_CONTINUED); return (0); } PROC_UNLOCK(p); } /* * Look in the orphans list too, to allow the parent to * collect it's child exit status even if child is being * debugged. * * Debugger detaches from the parent upon successful * switch-over from parent to child. At this point due to * re-parenting the parent loses the child to debugger and a * wait4(2) call would report that it has no children to wait * for. By maintaining a list of orphans we allow the parent * to successfully wait until the child becomes a zombie. */ if (nfound == 0) { LIST_FOREACH(p, &q->p_orphans, p_orphan) { ret = proc_to_reap(td, p, idtype, id, NULL, options, NULL, NULL, 1); if (ret != 0) { KASSERT(ret != -1, ("reaped an orphan (pid %d)", (int)td->td_retval[0])); PROC_UNLOCK(p); nfound++; break; } } } if (nfound == 0) { sx_xunlock(&proctree_lock); return (ECHILD); } if (options & WNOHANG) { sx_xunlock(&proctree_lock); td->td_retval[0] = 0; return (0); } PROC_LOCK(q); if (q->p_flag & P_STATCHILD) { q->p_flag &= ~P_STATCHILD; PROC_UNLOCK(q); goto loop_locked; } sx_xunlock(&proctree_lock); error = msleep(q, &q->p_mtx, PWAIT | PCATCH | PDROP, "wait", 0); if (error) return (error); goto loop; } void proc_add_orphan(struct proc *child, struct proc *parent) { sx_assert(&proctree_lock, SX_XLOCKED); KASSERT((child->p_flag & P_TRACED) != 0, ("proc_add_orphan: not traced")); if (LIST_EMPTY(&parent->p_orphans)) { child->p_treeflag |= P_TREE_FIRST_ORPHAN; LIST_INSERT_HEAD(&parent->p_orphans, child, p_orphan); } else { LIST_INSERT_AFTER(LIST_FIRST(&parent->p_orphans), child, p_orphan); } child->p_treeflag |= P_TREE_ORPHANED; } /* * Make process 'parent' the new parent of process 'child'. * Must be called with an exclusive hold of proctree lock. 
*/ void proc_reparent(struct proc *child, struct proc *parent, bool set_oppid) { sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(child, MA_OWNED); if (child->p_pptr == parent) return; PROC_LOCK(child->p_pptr); sigqueue_take(child->p_ksi); PROC_UNLOCK(child->p_pptr); LIST_REMOVE(child, p_sibling); LIST_INSERT_HEAD(&parent->p_children, child, p_sibling); proc_clear_orphan(child); if ((child->p_flag & P_TRACED) != 0) { proc_add_orphan(child, child->p_pptr); } child->p_pptr = parent; if (set_oppid) child->p_oppid = parent->p_pid; } diff --git a/sys/kern/kern_fail.c b/sys/kern/kern_fail.c index f60500b22ef4..883b664aef0d 100644 --- a/sys/kern/kern_fail.c +++ b/sys/kern/kern_fail.c @@ -1,1145 +1,1145 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2009 Isilon Inc http://www.isilon.com/ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /** * @file * * fail(9) Facility. * * @ingroup failpoint_private */ /** * @defgroup failpoint fail(9) Facility * * Failpoints allow for injecting fake errors into running code on the fly, * without modifying code or recompiling with flags. Failpoints are always * present, and are very efficient when disabled. Failpoints are described * in man fail(9). */ /** * @defgroup failpoint_private Private fail(9) Implementation functions * * Private implementations for the actual failpoint code. * * @ingroup failpoint */ /** * @addtogroup failpoint_private * @{ */ #include #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef ILOG_DEFINE_FOR_FILE ILOG_DEFINE_FOR_FILE(L_ISI_FAIL_POINT, L_ILOG, fail_point); #endif static MALLOC_DEFINE(M_FAIL_POINT, "Fail Points", "fail points system"); #define fp_free(ptr) free(ptr, M_FAIL_POINT) #define fp_malloc(size, flags) malloc((size), M_FAIL_POINT, (flags)) #define fs_free(ptr) fp_free(ptr) #define fs_malloc() fp_malloc(sizeof(struct fail_point_setting), \ M_WAITOK | M_ZERO) /** * These define the wchans that are used for sleeping, pausing respectively. * They are chosen arbitrarily but need to be distinct to the failpoint and * the sleep/pause distinction. 
*/ #define FP_SLEEP_CHANNEL(fp) (void*)(fp) #define FP_PAUSE_CHANNEL(fp) __DEVOLATILE(void*, &fp->fp_setting) /** * Don't allow more than this many entries in a fail point set by sysctl. * The 99.99...% case is to have 1 entry. I can't imagine having this many * entries, so it should not limit us. Saves on re-mallocs while holding * a non-sleepable lock. */ #define FP_MAX_ENTRY_COUNT 20 /* Used to drain sbufs to the sysctl output */ int fail_sysctl_drain_func(void *, const char *, int); /* Head of tailq of struct fail_point_entry */ TAILQ_HEAD(fail_point_entry_queue, fail_point_entry); /** * fp entries garbage list; outstanding entries are cleaned up in the * garbage collector */ STAILQ_HEAD(fail_point_setting_garbage, fail_point_setting); static struct fail_point_setting_garbage fp_setting_garbage = STAILQ_HEAD_INITIALIZER(fp_setting_garbage); static struct mtx mtx_garbage_list; MTX_SYSINIT(mtx_garbage_list, &mtx_garbage_list, "fail point garbage mtx", MTX_SPIN); static struct sx sx_fp_set; SX_SYSINIT(sx_fp_set, &sx_fp_set, "fail point set sx"); /** * Failpoint types. * Don't change these without changing fail_type_strings in fail.c. * @ingroup failpoint_private */ enum fail_point_t { FAIL_POINT_OFF, /**< don't fail */ FAIL_POINT_PANIC, /**< panic */ FAIL_POINT_RETURN, /**< return an errorcode */ FAIL_POINT_BREAK, /**< break into the debugger */ FAIL_POINT_PRINT, /**< print a message */ FAIL_POINT_SLEEP, /**< sleep for some msecs */ FAIL_POINT_PAUSE, /**< sleep until failpoint is set to off */ FAIL_POINT_YIELD, /**< yield the cpu */ FAIL_POINT_DELAY, /**< busy wait the cpu */ FAIL_POINT_NUMTYPES, FAIL_POINT_INVALID = -1 }; static struct { const char *name; int nmlen; } fail_type_strings[] = { #define FP_TYPE_NM_LEN(s) { s, sizeof(s) - 1 } [FAIL_POINT_OFF] = FP_TYPE_NM_LEN("off"), [FAIL_POINT_PANIC] = FP_TYPE_NM_LEN("panic"), [FAIL_POINT_RETURN] = FP_TYPE_NM_LEN("return"), [FAIL_POINT_BREAK] = FP_TYPE_NM_LEN("break"), [FAIL_POINT_PRINT] = FP_TYPE_NM_LEN("print"), [FAIL_POINT_SLEEP] = FP_TYPE_NM_LEN("sleep"), [FAIL_POINT_PAUSE] = FP_TYPE_NM_LEN("pause"), [FAIL_POINT_YIELD] = FP_TYPE_NM_LEN("yield"), [FAIL_POINT_DELAY] = FP_TYPE_NM_LEN("delay"), }; #define FE_COUNT_UNTRACKED (INT_MIN) /** * Internal structure tracking a single term of a complete failpoint. * @ingroup failpoint_private */ struct fail_point_entry { volatile bool fe_stale; enum fail_point_t fe_type; /**< type of entry */ int fe_arg; /**< argument to type (e.g. return value) */ int fe_prob; /**< likelihood of firing in millionths */ int32_t fe_count; /**< number of times to fire, -1 means infinite */ pid_t fe_pid; /**< only fail for this process */ struct fail_point *fe_parent; /**< backpointer to fp */ TAILQ_ENTRY(fail_point_entry) fe_entries; /**< next entry ptr */ }; struct fail_point_setting { STAILQ_ENTRY(fail_point_setting) fs_garbage_link; struct fail_point_entry_queue fp_entry_queue; struct fail_point * fs_parent; struct mtx feq_mtx; /* Gives fail_point_pause something to do. 
*/ }; /** * Defines stating the equivalent of probablilty one (100%) */ enum { PROB_MAX = 1000000, /* probability between zero and this number */ PROB_DIGITS = 6 /* number of zero's in above number */ }; /* Get a ref on an fp's fp_setting */ static inline struct fail_point_setting *fail_point_setting_get_ref( struct fail_point *fp); /* Release a ref on an fp_setting */ static inline void fail_point_setting_release_ref(struct fail_point *fp); /* Allocate and initialize a struct fail_point_setting */ static struct fail_point_setting *fail_point_setting_new(struct fail_point *); /* Free a struct fail_point_setting */ static void fail_point_setting_destroy(struct fail_point_setting *fp_setting); /* Allocate and initialize a struct fail_point_entry */ static struct fail_point_entry *fail_point_entry_new(struct fail_point_setting *); /* Free a struct fail_point_entry */ static void fail_point_entry_destroy(struct fail_point_entry *fp_entry); /* Append fp setting to garbage list */ static inline void fail_point_setting_garbage_append( struct fail_point_setting *fp_setting); /* Swap fp's setting with fp_setting_new */ static inline struct fail_point_setting * fail_point_swap_settings(struct fail_point *fp, struct fail_point_setting *fp_setting_new); /* Free up any zero-ref setting in the garbage queue */ static void fail_point_garbage_collect(void); /* If this fail point's setting are empty, then swap it out to NULL. */ static inline void fail_point_eval_swap_out(struct fail_point *fp, struct fail_point_setting *fp_setting); bool fail_point_is_off(struct fail_point *fp) { bool return_val; struct fail_point_setting *fp_setting; struct fail_point_entry *ent; return_val = true; fp_setting = fail_point_setting_get_ref(fp); if (fp_setting != NULL) { TAILQ_FOREACH(ent, &fp_setting->fp_entry_queue, fe_entries) { if (!ent->fe_stale) { return_val = false; break; } } } fail_point_setting_release_ref(fp); return (return_val); } /* Allocate and initialize a struct fail_point_setting */ static struct fail_point_setting * fail_point_setting_new(struct fail_point *fp) { struct fail_point_setting *fs_new; fs_new = fs_malloc(); fs_new->fs_parent = fp; TAILQ_INIT(&fs_new->fp_entry_queue); mtx_init(&fs_new->feq_mtx, "fail point entries", NULL, MTX_SPIN); fail_point_setting_garbage_append(fs_new); return (fs_new); } /* Free a struct fail_point_setting */ static void fail_point_setting_destroy(struct fail_point_setting *fp_setting) { struct fail_point_entry *ent; while (!TAILQ_EMPTY(&fp_setting->fp_entry_queue)) { ent = TAILQ_FIRST(&fp_setting->fp_entry_queue); TAILQ_REMOVE(&fp_setting->fp_entry_queue, ent, fe_entries); fail_point_entry_destroy(ent); } fs_free(fp_setting); } /* Allocate and initialize a struct fail_point_entry */ static struct fail_point_entry * fail_point_entry_new(struct fail_point_setting *fp_setting) { struct fail_point_entry *fp_entry; fp_entry = fp_malloc(sizeof(struct fail_point_entry), M_WAITOK | M_ZERO); fp_entry->fe_parent = fp_setting->fs_parent; fp_entry->fe_prob = PROB_MAX; fp_entry->fe_pid = NO_PID; fp_entry->fe_count = FE_COUNT_UNTRACKED; TAILQ_INSERT_TAIL(&fp_setting->fp_entry_queue, fp_entry, fe_entries); return (fp_entry); } /* Free a struct fail_point_entry */ static void fail_point_entry_destroy(struct fail_point_entry *fp_entry) { fp_free(fp_entry); } /* Get a ref on an fp's fp_setting */ static inline struct fail_point_setting * fail_point_setting_get_ref(struct fail_point *fp) { struct fail_point_setting *fp_setting; /* Invariant: if we have a ref, our pointer to fp_setting is 
safe */ atomic_add_acq_32(&fp->fp_ref_cnt, 1); fp_setting = fp->fp_setting; return (fp_setting); } /* Release a ref on an fp_setting */ static inline void fail_point_setting_release_ref(struct fail_point *fp) { KASSERT(&fp->fp_ref_cnt > 0, ("Attempting to deref w/no refs")); atomic_subtract_rel_32(&fp->fp_ref_cnt, 1); } /* Append fp entries to fp garbage list */ static inline void fail_point_setting_garbage_append(struct fail_point_setting *fp_setting) { mtx_lock_spin(&mtx_garbage_list); STAILQ_INSERT_TAIL(&fp_setting_garbage, fp_setting, fs_garbage_link); mtx_unlock_spin(&mtx_garbage_list); } /* Swap fp's entries with fp_setting_new */ static struct fail_point_setting * fail_point_swap_settings(struct fail_point *fp, struct fail_point_setting *fp_setting_new) { struct fail_point_setting *fp_setting_old; fp_setting_old = fp->fp_setting; fp->fp_setting = fp_setting_new; return (fp_setting_old); } static inline void fail_point_eval_swap_out(struct fail_point *fp, struct fail_point_setting *fp_setting) { /* We may have already been swapped out and replaced; ignore. */ if (fp->fp_setting == fp_setting) fail_point_swap_settings(fp, NULL); } /* Free up any zero-ref entries in the garbage queue */ static void fail_point_garbage_collect(void) { struct fail_point_setting *fs_current, *fs_next; struct fail_point_setting_garbage fp_ents_free_list; /** * We will transfer the entries to free to fp_ents_free_list while holding * the spin mutex, then free it after we drop the lock. This avoids * triggering witness due to sleepable mutexes in the memory * allocator. */ STAILQ_INIT(&fp_ents_free_list); mtx_lock_spin(&mtx_garbage_list); STAILQ_FOREACH_SAFE(fs_current, &fp_setting_garbage, fs_garbage_link, fs_next) { if (fs_current->fs_parent->fp_setting != fs_current && fs_current->fs_parent->fp_ref_cnt == 0) { STAILQ_REMOVE(&fp_setting_garbage, fs_current, fail_point_setting, fs_garbage_link); STAILQ_INSERT_HEAD(&fp_ents_free_list, fs_current, fs_garbage_link); } } mtx_unlock_spin(&mtx_garbage_list); STAILQ_FOREACH_SAFE(fs_current, &fp_ents_free_list, fs_garbage_link, fs_next) fail_point_setting_destroy(fs_current); } /* Drain out all refs from this fail point */ static inline void fail_point_drain(struct fail_point *fp, int expected_ref) { struct fail_point_setting *entries; entries = fail_point_swap_settings(fp, NULL); /** * We have unpaused all threads; so we will wait no longer * than the time taken for the longest remaining sleep, or * the length of time of a long-running code block. 
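fail_point_garbage_collect() above unlinks reclaimable settings onto a private list while holding the spin mutex and frees them only after dropping it, because the allocator may sleep. A rough userland sketch of that two-phase pattern, assuming a BSD-style <sys/queue.h> (as on FreeBSD) and with a pthread mutex standing in for the spin lock; the refs/retired fields are stand-ins for the fp_ref_cnt and "no longer the active fp_setting" checks:

/* cc gc_sketch.c -lpthread */
#include <pthread.h>
#include <stdlib.h>
#include <sys/queue.h>

struct setting {
	STAILQ_ENTRY(setting) link;
	int refs;	/* stand-in for fp_ref_cnt */
	int retired;	/* stand-in for "swapped out of fp->fp_setting" */
};

static STAILQ_HEAD(, setting) garbage = STAILQ_HEAD_INITIALIZER(garbage);
static pthread_mutex_t garbage_mtx = PTHREAD_MUTEX_INITIALIZER;

/* Unlink everything reclaimable under the lock, free it after unlocking. */
static void
collect(void)
{
	STAILQ_HEAD(, setting) free_list = STAILQ_HEAD_INITIALIZER(free_list);
	struct setting *s, *tmp;

	pthread_mutex_lock(&garbage_mtx);
	STAILQ_FOREACH_SAFE(s, &garbage, link, tmp) {
		if (s->retired && s->refs == 0) {
			STAILQ_REMOVE(&garbage, s, setting, link);
			STAILQ_INSERT_HEAD(&free_list, s, link);
		}
	}
	pthread_mutex_unlock(&garbage_mtx);

	STAILQ_FOREACH_SAFE(s, &free_list, link, tmp)
		free(s);	/* may block; the lock is no longer held */
}

int
main(void)
{
	struct setting *s = calloc(1, sizeof(*s));

	pthread_mutex_lock(&garbage_mtx);
	STAILQ_INSERT_TAIL(&garbage, s, link);
	pthread_mutex_unlock(&garbage_mtx);

	s->retired = 1;		/* pretend it was swapped out... */
	collect();		/* ...so the collector frees it */
	return (0);
}

Keeping the critical section down to list surgery is what avoids the witness complaint about sleepable allocations under a spin lock that the collector's comment alludes to.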
*/ while (fp->fp_ref_cnt > expected_ref) { wakeup(FP_PAUSE_CHANNEL(fp)); tsleep(&fp, PWAIT, "fail_point_drain", hz / 100); } if (fp->fp_callout) callout_drain(fp->fp_callout); fail_point_swap_settings(fp, entries); } static inline void fail_point_pause(struct fail_point *fp, enum fail_point_return_code *pret, struct mtx *mtx_sleep) { if (fp->fp_pre_sleep_fn) fp->fp_pre_sleep_fn(fp->fp_pre_sleep_arg); msleep_spin(FP_PAUSE_CHANNEL(fp), mtx_sleep, "failpt", 0); if (fp->fp_post_sleep_fn) fp->fp_post_sleep_fn(fp->fp_post_sleep_arg); } static inline void fail_point_sleep(struct fail_point *fp, int msecs, enum fail_point_return_code *pret) { int timo; /* Convert from millisecs to ticks, rounding up */ timo = howmany((int64_t)msecs * hz, 1000L); if (timo > 0) { if (!(fp->fp_flags & FAIL_POINT_USE_TIMEOUT_PATH)) { if (fp->fp_pre_sleep_fn) fp->fp_pre_sleep_fn(fp->fp_pre_sleep_arg); tsleep(FP_SLEEP_CHANNEL(fp), PWAIT, "failpt", timo); if (fp->fp_post_sleep_fn) fp->fp_post_sleep_fn(fp->fp_post_sleep_arg); } else { if (fp->fp_pre_sleep_fn) fp->fp_pre_sleep_fn(fp->fp_pre_sleep_arg); callout_reset(fp->fp_callout, timo, fp->fp_post_sleep_fn, fp->fp_post_sleep_arg); *pret = FAIL_POINT_RC_QUEUED; } } } static char *parse_fail_point(struct fail_point_setting *, char *); static char *parse_term(struct fail_point_setting *, char *); static char *parse_number(int *out_units, int *out_decimal, char *); static char *parse_type(struct fail_point_entry *, char *); /** * Initialize a fail_point. The name is formed in a printf-like fashion * from "fmt" and subsequent arguments. This function is generally used * for custom failpoints located at odd places in the sysctl tree, and is * not explicitly needed for standard in-line-declared failpoints. * * @ingroup failpoint */ void fail_point_init(struct fail_point *fp, const char *fmt, ...) { va_list ap; char *name; int n; fp->fp_setting = NULL; fp->fp_flags = 0; /* Figure out the size of the name. */ va_start(ap, fmt); n = vsnprintf(NULL, 0, fmt, ap); va_end(ap); /* Allocate the name and fill it in. */ name = fp_malloc(n + 1, M_WAITOK); if (name != NULL) { va_start(ap, fmt); vsnprintf(name, n + 1, fmt, ap); va_end(ap); } fp->fp_name = name; fp->fp_location = ""; fp->fp_flags |= FAIL_POINT_DYNAMIC_NAME; fp->fp_pre_sleep_fn = NULL; fp->fp_pre_sleep_arg = NULL; fp->fp_post_sleep_fn = NULL; fp->fp_post_sleep_arg = NULL; } void fail_point_alloc_callout(struct fail_point *fp) { /** * This assumes that calls to fail_point_use_timeout_path() * will not race. */ if (fp->fp_callout != NULL) return; fp->fp_callout = fp_malloc(sizeof(*fp->fp_callout), M_WAITOK); callout_init(fp->fp_callout, CALLOUT_MPSAFE); } /** * Free the resources held by a fail_point, and wake any paused threads. * Thou shalt not allow threads to hit this fail point after you enter this * function, nor shall you call this multiple times for a given fp. * @ingroup failpoint */ void fail_point_destroy(struct fail_point *fp) { fail_point_drain(fp, 0); if ((fp->fp_flags & FAIL_POINT_DYNAMIC_NAME) != 0) { fp_free(__DECONST(void *, fp->fp_name)); fp->fp_name = NULL; } fp->fp_flags = 0; if (fp->fp_callout) { fp_free(fp->fp_callout); fp->fp_callout = NULL; } sx_xlock(&sx_fp_set); fail_point_garbage_collect(); sx_xunlock(&sx_fp_set); } /** * This does the real work of evaluating a fail point. If the fail point tells * us to return a value, this function returns 1 and fills in 'return_value' * (return_value is allowed to be null). If the fail point tells us to panic, * we never return. 
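fail_point_init() above sizes the dynamic name with a NULL-buffer vsnprintf() pass and then formats into a freshly allocated buffer, and fail_point_sleep() converts milliseconds to ticks with howmany(), i.e. rounding up so a non-zero request never becomes a zero-tick sleep. A self-contained sketch of the two-pass formatting idiom, with plain malloc() standing in for fp_malloc(..., M_WAITOK):

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

/* Format into a buffer sized by a first, counting-only vsnprintf() pass. */
static char *
format_name(const char *fmt, ...)
{
	va_list ap;
	char *name;
	int n;

	va_start(ap, fmt);
	n = vsnprintf(NULL, 0, fmt, ap);	/* just count */
	va_end(ap);
	if (n < 0)
		return (NULL);

	name = malloc(n + 1);
	if (name == NULL)
		return (NULL);

	va_start(ap, fmt);
	vsnprintf(name, n + 1, fmt, ap);	/* now fill it in */
	va_end(ap);
	return (name);
}

int
main(void)
{
	char *name = format_name("%s_%d", "my_fail_point", 42);

	if (name != NULL)
		printf("%s\n", name);	/* my_fail_point_42 */
	free(name);
	return (0);
}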
Otherwise we just return 0 after doing some work, which * means "keep going". */ enum fail_point_return_code fail_point_eval_nontrivial(struct fail_point *fp, int *return_value) { bool execute = false; struct fail_point_entry *ent; struct fail_point_setting *fp_setting; enum fail_point_return_code ret; int cont; int count; int msecs; int usecs; ret = FAIL_POINT_RC_CONTINUE; cont = 0; /* don't continue by default */ fp_setting = fail_point_setting_get_ref(fp); if (fp_setting == NULL) goto abort; TAILQ_FOREACH(ent, &fp_setting->fp_entry_queue, fe_entries) { if (ent->fe_stale) continue; if (ent->fe_prob < PROB_MAX && ent->fe_prob < random() % PROB_MAX) continue; if (ent->fe_pid != NO_PID && ent->fe_pid != curproc->p_pid) continue; if (ent->fe_count != FE_COUNT_UNTRACKED) { count = ent->fe_count; while (count > 0) { if (atomic_cmpset_32(&ent->fe_count, count, count - 1)) { count--; execute = true; break; } count = ent->fe_count; } if (execute == false) /* We lost the race; consider the entry stale and bail now */ continue; if (count == 0) ent->fe_stale = true; } switch (ent->fe_type) { case FAIL_POINT_PANIC: panic("fail point %s panicking", fp->fp_name); /* NOTREACHED */ case FAIL_POINT_RETURN: if (return_value != NULL) *return_value = ent->fe_arg; ret = FAIL_POINT_RC_RETURN; break; case FAIL_POINT_BREAK: printf("fail point %s breaking to debugger\n", fp->fp_name); breakpoint(); break; case FAIL_POINT_PRINT: printf("fail point %s executing\n", fp->fp_name); cont = ent->fe_arg; break; case FAIL_POINT_SLEEP: msecs = ent->fe_arg; if (msecs) fail_point_sleep(fp, msecs, &ret); break; case FAIL_POINT_PAUSE: /** * Pausing is inherently strange with multiple * entries given our design. That is because some * entries could be unreachable, for instance in cases like: * pause->return. We can never reach the return entry. * The sysctl layer actually truncates all entries after * a pause for this reason. */ mtx_lock_spin(&fp_setting->feq_mtx); fail_point_pause(fp, &ret, &fp_setting->feq_mtx); mtx_unlock_spin(&fp_setting->feq_mtx); break; case FAIL_POINT_YIELD: kern_yield(PRI_UNCHANGED); break; case FAIL_POINT_DELAY: usecs = ent->fe_arg; DELAY(usecs); break; default: break; } if (cont == 0) break; } if (fail_point_is_off(fp)) fail_point_eval_swap_out(fp, fp_setting); abort: fail_point_setting_release_ref(fp); return (ret); } /** * Translate internal fail_point structure into human-readable text. 
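A tracked entry in fail_point_eval_nontrivial() above only fires for the thread that wins an atomic compare-and-swap on fe_count, and the winner that takes the count to zero marks the entry stale. A hedged userland sketch of that decrement-if-positive loop, using C11 <stdatomic.h> in place of the kernel's atomic_cmpset_32 (names are illustrative):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * Consume one "firing" from a shared countdown.  Returns true if this
 * caller won the race; *now_zero is set when it consumed the last one.
 */
static bool
consume_firing(atomic_int *count, bool *now_zero)
{
	int cur = atomic_load(count);

	while (cur > 0) {
		if (atomic_compare_exchange_weak(count, &cur, cur - 1)) {
			*now_zero = (cur - 1 == 0);
			return (true);
		}
		/* CAS failure reloaded 'cur'; retry while it stays positive. */
	}
	return (false);
}

int
main(void)
{
	atomic_int count = 2;
	bool zero;

	while (consume_firing(&count, &zero))
		printf("fired, now_zero=%d\n", zero);
	return (0);
}

Losing the race simply means another thread consumed that firing, which is why the kernel code treats a lost race like a stale entry and moves on.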
*/ static void fail_point_get(struct fail_point *fp, struct sbuf *sb, bool verbose) { struct fail_point_entry *ent; struct fail_point_setting *fp_setting; struct fail_point_entry *fp_entry_cpy; int cnt_sleeping; int idx; int printed_entry_count; cnt_sleeping = 0; idx = 0; printed_entry_count = 0; fp_entry_cpy = fp_malloc(sizeof(struct fail_point_entry) * (FP_MAX_ENTRY_COUNT + 1), M_WAITOK); fp_setting = fail_point_setting_get_ref(fp); if (fp_setting != NULL) { TAILQ_FOREACH(ent, &fp_setting->fp_entry_queue, fe_entries) { if (ent->fe_stale) continue; KASSERT(printed_entry_count < FP_MAX_ENTRY_COUNT, ("FP entry list larger than allowed")); fp_entry_cpy[printed_entry_count] = *ent; ++printed_entry_count; } } fail_point_setting_release_ref(fp); /* This is our equivalent of a NULL terminator */ fp_entry_cpy[printed_entry_count].fe_type = FAIL_POINT_INVALID; while (idx < printed_entry_count) { ent = &fp_entry_cpy[idx]; ++idx; if (ent->fe_prob < PROB_MAX) { int decimal = ent->fe_prob % (PROB_MAX / 100); int units = ent->fe_prob / (PROB_MAX / 100); sbuf_printf(sb, "%d", units); if (decimal) { int digits = PROB_DIGITS - 2; while (!(decimal % 10)) { digits--; decimal /= 10; } sbuf_printf(sb, ".%0*d", digits, decimal); } sbuf_printf(sb, "%%"); } if (ent->fe_count >= 0) sbuf_printf(sb, "%d*", ent->fe_count); sbuf_printf(sb, "%s", fail_type_strings[ent->fe_type].name); if (ent->fe_arg) sbuf_printf(sb, "(%d)", ent->fe_arg); if (ent->fe_pid != NO_PID) sbuf_printf(sb, "[pid %d]", ent->fe_pid); if (TAILQ_NEXT(ent, fe_entries)) - sbuf_printf(sb, "->"); + sbuf_cat(sb, "->"); } if (!printed_entry_count) - sbuf_printf(sb, "off"); + sbuf_cat(sb, "off"); fp_free(fp_entry_cpy); if (verbose) { #ifdef STACK /* Print number of sleeping threads. queue=0 is the argument * used by msleep when sending our threads to sleep. */ - sbuf_printf(sb, "\nsleeping_thread_stacks = {\n"); + sbuf_cat(sb, "\nsleeping_thread_stacks = {\n"); sleepq_sbuf_print_stacks(sb, FP_SLEEP_CHANNEL(fp), 0, &cnt_sleeping); - sbuf_printf(sb, "},\n"); + sbuf_cat(sb, "},\n"); #endif sbuf_printf(sb, "sleeping_thread_count = %d,\n", cnt_sleeping); #ifdef STACK - sbuf_printf(sb, "paused_thread_stacks = {\n"); + sbuf_cat(sb, "paused_thread_stacks = {\n"); sleepq_sbuf_print_stacks(sb, FP_PAUSE_CHANNEL(fp), 0, &cnt_sleeping); - sbuf_printf(sb, "},\n"); + sbuf_cat(sb, "},\n"); #endif sbuf_printf(sb, "paused_thread_count = %d\n", cnt_sleeping); } } /** * Set an internal fail_point structure from a human-readable failpoint string * in a lock-safe manner. */ static int fail_point_set(struct fail_point *fp, char *buf) { struct fail_point_entry *ent, *ent_next; struct fail_point_setting *entries; bool should_wake_paused; bool should_truncate; int error; error = 0; should_wake_paused = false; should_truncate = false; /* Parse new entries. */ /** * ref protects our new malloc'd stuff from being garbage collected * before we link it. */ fail_point_setting_get_ref(fp); entries = fail_point_setting_new(fp); if (parse_fail_point(entries, buf) == NULL) { STAILQ_REMOVE(&fp_setting_garbage, entries, fail_point_setting, fs_garbage_link); fail_point_setting_destroy(entries); error = EINVAL; goto end; } /** * Transfer the entries we are going to keep to a new list. * Get rid of useless zero probability entries, and entries with hit * count 0. * If 'off' is present, and it has no hit count set, then all entries * after it are discarded since they are unreachable. 
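fail_point_get() above prints fe_prob, stored in millionths, as a percentage with trailing zeros trimmed from the fractional part. A standalone sketch of just that formatting step:

#include <stdio.h>

#define PROB_MAX	1000000	/* probabilities are stored in millionths */
#define PROB_DIGITS	6

/* Print a probability in millionths as a percentage, e.g. 12500 -> 1.25% */
static void
print_prob(int prob)
{
	int units = prob / (PROB_MAX / 100);
	int decimal = prob % (PROB_MAX / 100);

	printf("%d", units);
	if (decimal) {
		int digits = PROB_DIGITS - 2;

		while (decimal % 10 == 0) {	/* trim trailing zeros */
			digits--;
			decimal /= 10;
		}
		printf(".%0*d", digits, decimal);
	}
	printf("%%\n");
}

int
main(void)
{
	print_prob(12500);	/* 1.25% */
	print_prob(500000);	/* 50% */
	print_prob(100);	/* 0.01% */
	return (0);
}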
*/ TAILQ_FOREACH_SAFE(ent, &entries->fp_entry_queue, fe_entries, ent_next) { if (ent->fe_prob == 0 || ent->fe_count == 0) { printf("Discarding entry which cannot execute %s\n", fail_type_strings[ent->fe_type].name); TAILQ_REMOVE(&entries->fp_entry_queue, ent, fe_entries); fp_free(ent); continue; } else if (should_truncate) { printf("Discarding unreachable entry %s\n", fail_type_strings[ent->fe_type].name); TAILQ_REMOVE(&entries->fp_entry_queue, ent, fe_entries); fp_free(ent); continue; } if (ent->fe_type == FAIL_POINT_OFF) { should_wake_paused = true; if (ent->fe_count == FE_COUNT_UNTRACKED) { should_truncate = true; TAILQ_REMOVE(&entries->fp_entry_queue, ent, fe_entries); fp_free(ent); } } else if (ent->fe_type == FAIL_POINT_PAUSE) { should_truncate = true; } else if (ent->fe_type == FAIL_POINT_SLEEP && (fp->fp_flags & FAIL_POINT_NONSLEEPABLE)) { /** * If this fail point is annotated as being in a * non-sleepable ctx, convert sleep to delay and * convert the msec argument to usecs. */ printf("Sleep call request on fail point in " "non-sleepable context; using delay instead " "of sleep\n"); ent->fe_type = FAIL_POINT_DELAY; ent->fe_arg *= 1000; } } if (TAILQ_EMPTY(&entries->fp_entry_queue)) { entries = fail_point_swap_settings(fp, NULL); if (entries != NULL) wakeup(FP_PAUSE_CHANNEL(fp)); } else { if (should_wake_paused) wakeup(FP_PAUSE_CHANNEL(fp)); fail_point_swap_settings(fp, entries); } end: #ifdef IWARNING if (error) IWARNING("Failed to set %s %s to %s", fp->fp_name, fp->fp_location, buf); else INOTICE("Set %s %s to %s", fp->fp_name, fp->fp_location, buf); #endif /* IWARNING */ fail_point_setting_release_ref(fp); return (error); } #define MAX_FAIL_POINT_BUF 1023 /** * Handle kernel failpoint set/get. */ int fail_point_sysctl(SYSCTL_HANDLER_ARGS) { struct fail_point *fp; char *buf; struct sbuf sb, *sb_check; int error; buf = NULL; error = 0; fp = arg1; sb_check = sbuf_new(&sb, NULL, 1024, SBUF_AUTOEXTEND); if (sb_check != &sb) return (ENOMEM); sbuf_set_drain(&sb, (sbuf_drain_func *)fail_sysctl_drain_func, req); /* Setting */ /** * Lock protects any new entries from being garbage collected before we * can link them to the fail point. */ sx_xlock(&sx_fp_set); if (req->newptr) { if (req->newlen > MAX_FAIL_POINT_BUF) { error = EINVAL; goto out; } buf = fp_malloc(req->newlen + 1, M_WAITOK); error = SYSCTL_IN(req, buf, req->newlen); if (error) goto out; buf[req->newlen] = '\0'; error = fail_point_set(fp, buf); } fail_point_garbage_collect(); sx_xunlock(&sx_fp_set); /* Retrieving. */ fail_point_get(fp, &sb, false); out: sbuf_finish(&sb); sbuf_delete(&sb); if (buf) fp_free(buf); return (error); } int fail_point_sysctl_status(SYSCTL_HANDLER_ARGS) { struct fail_point *fp; struct sbuf sb, *sb_check; fp = arg1; sb_check = sbuf_new(&sb, NULL, 1024, SBUF_AUTOEXTEND); if (sb_check != &sb) return (ENOMEM); sbuf_set_drain(&sb, (sbuf_drain_func *)fail_sysctl_drain_func, req); /* Retrieving. */ fail_point_get(fp, &sb, true); sbuf_finish(&sb); sbuf_delete(&sb); /** * Lock protects any new entries from being garbage collected before we * can link them to the fail point. */ sx_xlock(&sx_fp_set); fail_point_garbage_collect(); sx_xunlock(&sx_fp_set); return (0); } int fail_sysctl_drain_func(void *sysctl_args, const char *buf, int len) { struct sysctl_req *sa; int error; sa = sysctl_args; error = SYSCTL_OUT(sa, buf, len); if (error == ENOMEM) return (-1); else return (len); } /** * Internal helper function to translate a human-readable failpoint string * into a internally-parsable fail_point structure. 
*/ static char * parse_fail_point(struct fail_point_setting *ents, char *p) { /* :: * ( "->" )* */ uint8_t term_count; term_count = 1; p = parse_term(ents, p); if (p == NULL) return (NULL); while (*p != '\0') { term_count++; if (p[0] != '-' || p[1] != '>' || (p = parse_term(ents, p+2)) == NULL || term_count > FP_MAX_ENTRY_COUNT) return (NULL); } return (p); } /** * Internal helper function to parse an individual term from a failpoint. */ static char * parse_term(struct fail_point_setting *ents, char *p) { struct fail_point_entry *ent; ent = fail_point_entry_new(ents); /* * :: * ( ( "%") | ( "*" ) )* * * [ "(" ")" ] * [ "[pid " "]" ] */ /* ( ( "%") | ( "*" ) )* */ while (isdigit(*p) || *p == '.') { int units, decimal; p = parse_number(&units, &decimal, p); if (p == NULL) return (NULL); if (*p == '%') { if (units > 100) /* prevent overflow early */ units = 100; ent->fe_prob = units * (PROB_MAX / 100) + decimal; if (ent->fe_prob > PROB_MAX) ent->fe_prob = PROB_MAX; } else if (*p == '*') { if (!units || units < 0 || decimal) return (NULL); ent->fe_count = units; } else return (NULL); p++; } /* */ p = parse_type(ent, p); if (p == NULL) return (NULL); if (*p == '\0') return (p); /* [ "(" ")" ] */ if (*p != '(') return (p); p++; if (!isdigit(*p) && *p != '-') return (NULL); ent->fe_arg = strtol(p, &p, 0); if (*p++ != ')') return (NULL); /* [ "[pid " "]" ] */ #define PID_STRING "[pid " if (strncmp(p, PID_STRING, sizeof(PID_STRING) - 1) != 0) return (p); p += sizeof(PID_STRING) - 1; if (!isdigit(*p)) return (NULL); ent->fe_pid = strtol(p, &p, 0); if (*p++ != ']') return (NULL); return (p); } /** * Internal helper function to parse a numeric for a failpoint term. */ static char * parse_number(int *out_units, int *out_decimal, char *p) { char *old_p; /** * :: * [ "." ] | * "." */ /* whole part */ old_p = p; *out_units = strtol(p, &p, 10); if (p == old_p && *p != '.') return (NULL); /* fractional part */ *out_decimal = 0; if (*p == '.') { int digits = 0; p++; while (isdigit(*p)) { int digit = *p - '0'; if (digits < PROB_DIGITS - 2) *out_decimal = *out_decimal * 10 + digit; else if (digits == PROB_DIGITS - 2 && digit >= 5) (*out_decimal)++; digits++; p++; } if (!digits) /* need at least one digit after '.' */ return (NULL); while (digits++ < PROB_DIGITS - 2) /* add implicit zeros */ *out_decimal *= 10; } return (p); /* success */ } /** * Internal helper function to parse an individual type for a failpoint term. */ static char * parse_type(struct fail_point_entry *ent, char *beg) { enum fail_point_t type; int len; for (type = FAIL_POINT_OFF; type < FAIL_POINT_NUMTYPES; type++) { len = fail_type_strings[type].nmlen; if (strncmp(fail_type_strings[type].name, beg, len) == 0) { ent->fe_type = type; return (beg + len); } } return (NULL); } /* The fail point sysctl tree. 
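The parser above accepts terms of the form [<probability>%][<count>*]<type>[(<arg>)][[pid <pid>]], chained with "->"; strings along the lines of "2.5%return(5)" or "1%sleep(100)->off" would be typical inputs for the sysctls under the debug.fail_point tree (the sample strings are mine, not taken from this patch). The least obvious piece is how parse_number() folds the fractional percentage into at most PROB_DIGITS - 2 digits, rounding on the first dropped digit; a runnable mirror of that step:

#include <ctype.h>
#include <stdio.h>

#define PROB_DIGITS	6

/*
 * Fractional-part handling as in parse_number(): keep at most
 * PROB_DIGITS - 2 digits after the '.', round on the first dropped
 * digit, and pad with implicit zeros so ".5" and ".5000" agree.
 */
static const char *
parse_frac(const char *p, int *out_decimal)
{
	int digits = 0;

	*out_decimal = 0;
	if (*p != '.')
		return (p);
	p++;
	while (isdigit((unsigned char)*p)) {
		int digit = *p - '0';

		if (digits < PROB_DIGITS - 2)
			*out_decimal = *out_decimal * 10 + digit;
		else if (digits == PROB_DIGITS - 2 && digit >= 5)
			(*out_decimal)++;
		digits++;
		p++;
	}
	if (digits == 0)
		return (NULL);		/* need at least one digit after '.' */
	while (digits++ < PROB_DIGITS - 2)
		*out_decimal *= 10;	/* implicit zeros */
	return (p);
}

int
main(void)
{
	int d;

	parse_frac(".25", &d);
	printf("%d\n", d);	/* 2500: ".25" scaled to four digits */
	parse_frac(".00005", &d);
	printf("%d\n", d);	/* 1: rounds up in the last kept digit */
	return (0);
}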
*/ SYSCTL_NODE(_debug, OID_AUTO, fail_point, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "fail points"); /* Debugging/testing stuff for fail point */ static int sysctl_test_fail_point(SYSCTL_HANDLER_ARGS) { KFAIL_POINT_RETURN(DEBUG_FP, test_fail_point); return (0); } SYSCTL_OID(_debug_fail_point, OID_AUTO, test_trigger_fail_point, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_test_fail_point, "A", "Trigger test fail points"); diff --git a/sys/kern/kern_rctl.c b/sys/kern/kern_rctl.c index 8e65fabeddc9..f6f781ade697 100644 --- a/sys/kern/kern_rctl.c +++ b/sys/kern/kern_rctl.c @@ -1,2246 +1,2246 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2010 The FreeBSD Foundation * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifdef RCTL #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef RACCT #error "The RCTL option requires the RACCT option" #endif FEATURE(rctl, "Resource Limits"); #define HRF_DEFAULT 0 #define HRF_DONT_INHERIT 1 #define HRF_DONT_ACCUMULATE 2 #define RCTL_MAX_INBUFSIZE 4 * 1024 #define RCTL_MAX_OUTBUFSIZE 16 * 1024 * 1024 #define RCTL_LOG_BUFSIZE 128 #define RCTL_PCPU_SHIFT (10 * 1000000) static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE; static int rctl_log_rate_limit = 10; static int rctl_devctl_rate_limit = 10; /* * Values below are initialized in rctl_init(). 
*/ static int rctl_throttle_min = -1; static int rctl_throttle_max = -1; static int rctl_throttle_pct = -1; static int rctl_throttle_pct2 = -1; static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS); static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS); static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS); static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Resource Limits"); SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN, &rctl_maxbufsize, 0, "Maximum output buffer size"); SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW, &rctl_log_rate_limit, 0, "Maximum number of log messages per second"); SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RWTUN, &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second"); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_min, CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0, &rctl_throttle_min_sysctl, "IU", "Shortest throttling duration, in hz"); TUNABLE_INT("kern.racct.rctl.throttle_min", &rctl_throttle_min); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_max, CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0, &rctl_throttle_max_sysctl, "IU", "Longest throttling duration, in hz"); TUNABLE_INT("kern.racct.rctl.throttle_max", &rctl_throttle_max); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct, CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0, &rctl_throttle_pct_sysctl, "IU", "Throttling penalty for process consumption, in percent"); TUNABLE_INT("kern.racct.rctl.throttle_pct", &rctl_throttle_pct); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct2, CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0, &rctl_throttle_pct2_sysctl, "IU", "Throttling penalty for container consumption, in percent"); TUNABLE_INT("kern.racct.rctl.throttle_pct2", &rctl_throttle_pct2); /* * 'rctl_rule_link' connects a rule with every racct it's related to. * For example, rule 'user:X:openfiles:deny=N/process' is linked * with uidinfo for user X, and to each process of that user. 
*/ struct rctl_rule_link { LIST_ENTRY(rctl_rule_link) rrl_next; struct rctl_rule *rrl_rule; int rrl_exceeded; }; struct dict { const char *d_name; int d_value; }; static struct dict subjectnames[] = { { "process", RCTL_SUBJECT_TYPE_PROCESS }, { "user", RCTL_SUBJECT_TYPE_USER }, { "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS }, { "jail", RCTL_SUBJECT_TYPE_JAIL }, { NULL, -1 }}; static struct dict resourcenames[] = { { "cputime", RACCT_CPU }, { "datasize", RACCT_DATA }, { "stacksize", RACCT_STACK }, { "coredumpsize", RACCT_CORE }, { "memoryuse", RACCT_RSS }, { "memorylocked", RACCT_MEMLOCK }, { "maxproc", RACCT_NPROC }, { "openfiles", RACCT_NOFILE }, { "vmemoryuse", RACCT_VMEM }, { "pseudoterminals", RACCT_NPTS }, { "swapuse", RACCT_SWAP }, { "nthr", RACCT_NTHR }, { "msgqqueued", RACCT_MSGQQUEUED }, { "msgqsize", RACCT_MSGQSIZE }, { "nmsgq", RACCT_NMSGQ }, { "nsem", RACCT_NSEM }, { "nsemop", RACCT_NSEMOP }, { "nshm", RACCT_NSHM }, { "shmsize", RACCT_SHMSIZE }, { "wallclock", RACCT_WALLCLOCK }, { "pcpu", RACCT_PCTCPU }, { "readbps", RACCT_READBPS }, { "writebps", RACCT_WRITEBPS }, { "readiops", RACCT_READIOPS }, { "writeiops", RACCT_WRITEIOPS }, { NULL, -1 }}; static struct dict actionnames[] = { { "sighup", RCTL_ACTION_SIGHUP }, { "sigint", RCTL_ACTION_SIGINT }, { "sigquit", RCTL_ACTION_SIGQUIT }, { "sigill", RCTL_ACTION_SIGILL }, { "sigtrap", RCTL_ACTION_SIGTRAP }, { "sigabrt", RCTL_ACTION_SIGABRT }, { "sigemt", RCTL_ACTION_SIGEMT }, { "sigfpe", RCTL_ACTION_SIGFPE }, { "sigkill", RCTL_ACTION_SIGKILL }, { "sigbus", RCTL_ACTION_SIGBUS }, { "sigsegv", RCTL_ACTION_SIGSEGV }, { "sigsys", RCTL_ACTION_SIGSYS }, { "sigpipe", RCTL_ACTION_SIGPIPE }, { "sigalrm", RCTL_ACTION_SIGALRM }, { "sigterm", RCTL_ACTION_SIGTERM }, { "sigurg", RCTL_ACTION_SIGURG }, { "sigstop", RCTL_ACTION_SIGSTOP }, { "sigtstp", RCTL_ACTION_SIGTSTP }, { "sigchld", RCTL_ACTION_SIGCHLD }, { "sigttin", RCTL_ACTION_SIGTTIN }, { "sigttou", RCTL_ACTION_SIGTTOU }, { "sigio", RCTL_ACTION_SIGIO }, { "sigxcpu", RCTL_ACTION_SIGXCPU }, { "sigxfsz", RCTL_ACTION_SIGXFSZ }, { "sigvtalrm", RCTL_ACTION_SIGVTALRM }, { "sigprof", RCTL_ACTION_SIGPROF }, { "sigwinch", RCTL_ACTION_SIGWINCH }, { "siginfo", RCTL_ACTION_SIGINFO }, { "sigusr1", RCTL_ACTION_SIGUSR1 }, { "sigusr2", RCTL_ACTION_SIGUSR2 }, { "sigthr", RCTL_ACTION_SIGTHR }, { "deny", RCTL_ACTION_DENY }, { "log", RCTL_ACTION_LOG }, { "devctl", RCTL_ACTION_DEVCTL }, { "throttle", RCTL_ACTION_THROTTLE }, { NULL, -1 }}; static void rctl_init(void); SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL); static uma_zone_t rctl_rule_zone; static uma_zone_t rctl_rule_link_zone; static int rctl_rule_fully_specified(const struct rctl_rule *rule); static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule); static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits"); static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_min; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < 1 || val > rctl_throttle_max) return (EINVAL); RACCT_LOCK(); rctl_throttle_min = val; RACCT_UNLOCK(); return (0); } static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_max; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < rctl_throttle_min) return (EINVAL); RACCT_LOCK(); rctl_throttle_max = val; RACCT_UNLOCK(); return (0); } static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_pct; error = 
sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < 0) return (EINVAL); RACCT_LOCK(); rctl_throttle_pct = val; RACCT_UNLOCK(); return (0); } static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_pct2; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < 0) return (EINVAL); RACCT_LOCK(); rctl_throttle_pct2 = val; RACCT_UNLOCK(); return (0); } static const char * rctl_subject_type_name(int subject) { int i; for (i = 0; subjectnames[i].d_name != NULL; i++) { if (subjectnames[i].d_value == subject) return (subjectnames[i].d_name); } panic("rctl_subject_type_name: unknown subject type %d", subject); } static const char * rctl_action_name(int action) { int i; for (i = 0; actionnames[i].d_name != NULL; i++) { if (actionnames[i].d_value == action) return (actionnames[i].d_name); } panic("rctl_action_name: unknown action %d", action); } const char * rctl_resource_name(int resource) { int i; for (i = 0; resourcenames[i].d_name != NULL; i++) { if (resourcenames[i].d_value == resource) return (resourcenames[i].d_name); } panic("rctl_resource_name: unknown resource %d", resource); } static struct racct * rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule) { struct ucred *cred = p->p_ucred; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); switch (rule->rr_per) { case RCTL_SUBJECT_TYPE_PROCESS: return (p->p_racct); case RCTL_SUBJECT_TYPE_USER: return (cred->cr_ruidinfo->ui_racct); case RCTL_SUBJECT_TYPE_LOGINCLASS: return (cred->cr_loginclass->lc_racct); case RCTL_SUBJECT_TYPE_JAIL: return (cred->cr_prison->pr_prison_racct->prr_racct); default: panic("%s: unknown per %d", __func__, rule->rr_per); } } /* * Return the amount of resource that can be allocated by 'p' before * hitting 'rule'. */ static int64_t rctl_available_resource(const struct proc *p, const struct rctl_rule *rule) { const struct racct *racct; int64_t available; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); racct = rctl_proc_rule_to_racct(p, rule); available = rule->rr_amount - racct->r_resources[rule->rr_resource]; return (available); } /* * Called every second for proc, uidinfo, loginclass, and jail containers. * If the limit isn't exceeded, it decreases the usage amount to zero. * Otherwise, it decreases it by the value of the limit. This way * resource consumption exceeding the limit "carries over" to the next * period. */ void rctl_throttle_decay(struct racct *racct, int resource) { struct rctl_rule *rule; struct rctl_rule_link *link; int64_t minavailable; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); minavailable = INT64_MAX; LIST_FOREACH(link, &racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; if (rule->rr_action != RCTL_ACTION_THROTTLE) continue; if (rule->rr_amount < minavailable) minavailable = rule->rr_amount; } if (racct->r_resources[resource] < minavailable) { racct->r_resources[resource] = 0; } else { /* * Cap utilization counter at ten times the limit. Otherwise, * if we changed the rule lowering the allowed amount, it could * take unreasonably long time for the accumulated resource * usage to drop. */ if (racct->r_resources[resource] > minavailable * 10) racct->r_resources[resource] = minavailable * 10; racct->r_resources[resource] -= minavailable; } } /* * Special version of rctl_get_available() for the %CPU resource. * We slightly cheat here and return less than we normally would. 
*/ int64_t rctl_pcpu_available(const struct proc *p) { struct rctl_rule *rule; struct rctl_rule_link *link; int64_t available, minavailable, limit; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); minavailable = INT64_MAX; limit = 0; LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != RACCT_PCTCPU) continue; if (rule->rr_action != RCTL_ACTION_DENY) continue; available = rctl_available_resource(p, rule); if (available < minavailable) { minavailable = available; limit = rule->rr_amount; } } /* * Return slightly less than actual value of the available * %cpu resource. This makes %cpu throttling more aggressive * and lets us act sooner than the limits are already exceeded. */ if (limit != 0) { if (limit > 2 * RCTL_PCPU_SHIFT) minavailable -= RCTL_PCPU_SHIFT; else minavailable -= (limit / 2); } return (minavailable); } static uint64_t xadd(uint64_t a, uint64_t b) { uint64_t c; c = a + b; /* * Detect overflow. */ if (c < a || c < b) return (UINT64_MAX); return (c); } static uint64_t xmul(uint64_t a, uint64_t b) { if (b != 0 && a > UINT64_MAX / b) return (UINT64_MAX); return (a * b); } /* * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition * to what it keeps allocated now. Returns non-zero if the allocation should * be denied, 0 otherwise. */ int rctl_enforce(struct proc *p, int resource, uint64_t amount) { static struct timeval log_lasttime, devctl_lasttime; static int log_curtime = 0, devctl_curtime = 0; struct rctl_rule *rule; struct rctl_rule_link *link; struct sbuf sb; char *buf; int64_t available; uint64_t sleep_ms, sleep_ratio; int should_deny = 0; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * There may be more than one matching rule; go through all of them. * Denial should be done last, after logging and sending signals. */ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; available = rctl_available_resource(p, rule); if (available >= (int64_t)amount) { link->rrl_exceeded = 0; continue; } switch (rule->rr_action) { case RCTL_ACTION_DENY: should_deny = 1; continue; case RCTL_ACTION_LOG: /* * If rrl_exceeded != 0, it means we've already * logged a warning for this process. */ if (link->rrl_exceeded != 0) continue; /* * If the process state is not fully initialized yet, * we can't access most of the required fields, e.g. * p->p_comm. This happens when called from fork1(). * Ignore this rule for now; it will be processed just * after fork, when called from racct_proc_fork_done(). 
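The xadd()/xmul() helpers defined just above saturate at UINT64_MAX instead of wrapping, so an overflowing intermediate in the throttling arithmetic degrades into "sleep the maximum" rather than into a tiny bogus value. A standalone version for experimentation (for unsigned addition a single c < a test is enough to detect the wrap):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Saturating add: clamp to UINT64_MAX instead of wrapping. */
static uint64_t
xadd(uint64_t a, uint64_t b)
{
	uint64_t c = a + b;

	return (c < a ? UINT64_MAX : c);
}

/* Saturating multiply. */
static uint64_t
xmul(uint64_t a, uint64_t b)
{
	if (b != 0 && a > UINT64_MAX / b)
		return (UINT64_MAX);
	return (a * b);
}

int
main(void)
{
	printf("%" PRIu64 "\n", xadd(UINT64_MAX - 1, 5));	/* clamps */
	printf("%" PRIu64 "\n", xmul(UINT64_MAX / 2, 3));	/* clamps */
	printf("%" PRIu64 "\n", xmul(1000, 1000));		/* 1000000 */
	return (0);
}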
*/ if (p->p_state != PRS_NORMAL) continue; if (!ppsratecheck(&log_lasttime, &log_curtime, rctl_log_rate_limit)) continue; buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT); if (buf == NULL) { printf("rctl_enforce: out of memory\n"); continue; } sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN); rctl_rule_to_sbuf(&sb, rule); sbuf_finish(&sb); printf("rctl: rule \"%s\" matched by pid %d " "(%s), uid %d, jail %s\n", sbuf_data(&sb), p->p_pid, p->p_comm, p->p_ucred->cr_uid, p->p_ucred->cr_prison->pr_prison_racct->prr_name); sbuf_delete(&sb); free(buf, M_RCTL); link->rrl_exceeded = 1; continue; case RCTL_ACTION_DEVCTL: if (link->rrl_exceeded != 0) continue; if (p->p_state != PRS_NORMAL) continue; if (!ppsratecheck(&devctl_lasttime, &devctl_curtime, rctl_devctl_rate_limit)) continue; buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT); if (buf == NULL) { printf("rctl_enforce: out of memory\n"); continue; } sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN); - sbuf_printf(&sb, "rule="); + sbuf_cat(&sb, "rule="); rctl_rule_to_sbuf(&sb, rule); sbuf_printf(&sb, " pid=%d ruid=%d jail=%s", p->p_pid, p->p_ucred->cr_ruid, p->p_ucred->cr_prison->pr_prison_racct->prr_name); sbuf_finish(&sb); devctl_notify("RCTL", "rule", "matched", sbuf_data(&sb)); sbuf_delete(&sb); free(buf, M_RCTL); link->rrl_exceeded = 1; continue; case RCTL_ACTION_THROTTLE: if (p->p_state != PRS_NORMAL) continue; if (rule->rr_amount == 0) { racct_proc_throttle(p, rctl_throttle_max); continue; } /* * Make the process sleep for a fraction of second * proportional to the ratio of process' resource * utilization compared to the limit. The point is * to penalize resource hogs: processes that consume * more of the available resources sleep for longer. * * We're trying to defer division until the very end, * to minimize the rounding effects. The following * calculation could have been written in a clearer * way like this: * * sleep_ms = hz * p->p_racct->r_resources[resource] / * rule->rr_amount; * sleep_ms *= rctl_throttle_pct / 100; * if (sleep_ms < rctl_throttle_min) * sleep_ms = rctl_throttle_min; * */ sleep_ms = xmul(hz, p->p_racct->r_resources[resource]); sleep_ms = xmul(sleep_ms, rctl_throttle_pct) / 100; if (sleep_ms < rctl_throttle_min * rule->rr_amount) sleep_ms = rctl_throttle_min * rule->rr_amount; /* * Multiply that by the ratio of the resource * consumption for the container compared to the limit, * squared. In other words, a process in a container * that is two times over the limit will be throttled * four times as much for hitting the same rule. The * point is to penalize processes more if the container * itself (eg certain UID or jail) is above the limit. */ if (available < 0) sleep_ratio = -available / rule->rr_amount; else sleep_ratio = 0; sleep_ratio = xmul(sleep_ratio, sleep_ratio); sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100; sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio)); /* * Finally the division. 
*/ sleep_ms /= rule->rr_amount; if (sleep_ms > rctl_throttle_max) sleep_ms = rctl_throttle_max; #if 0 printf("%s: pid %d (%s), %jd of %jd, will sleep for %ju ms (ratio %ju, available %jd)\n", __func__, p->p_pid, p->p_comm, p->p_racct->r_resources[resource], rule->rr_amount, (uintmax_t)sleep_ms, (uintmax_t)sleep_ratio, (intmax_t)available); #endif KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n", __func__, (uintmax_t)sleep_ms, rctl_throttle_min)); racct_proc_throttle(p, sleep_ms); continue; default: if (link->rrl_exceeded != 0) continue; if (p->p_state != PRS_NORMAL) continue; KASSERT(rule->rr_action > 0 && rule->rr_action <= RCTL_ACTION_SIGNAL_MAX, ("rctl_enforce: unknown action %d", rule->rr_action)); /* * We're using the fact that RCTL_ACTION_SIG* values * are equal to their counterparts from sys/signal.h. */ kern_psignal(p, rule->rr_action); link->rrl_exceeded = 1; continue; } } if (should_deny) { /* * Return fake error code; the caller should change it * into one proper for the situation - EFSIZ, ENOMEM etc. */ return (EDOOFUS); } return (0); } uint64_t rctl_get_limit(struct proc *p, int resource) { struct rctl_rule *rule; struct rctl_rule_link *link; uint64_t amount = UINT64_MAX; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * There may be more than one matching rule; go through all of them. * Denial should be done last, after logging and sending signals. */ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; if (rule->rr_action != RCTL_ACTION_DENY) continue; if (rule->rr_amount < amount) amount = rule->rr_amount; } return (amount); } uint64_t rctl_get_available(struct proc *p, int resource) { struct rctl_rule *rule; struct rctl_rule_link *link; int64_t available, minavailable, allocated; minavailable = INT64_MAX; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * There may be more than one matching rule; go through all of them. * Denial should be done last, after logging and sending signals. */ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; if (rule->rr_action != RCTL_ACTION_DENY) continue; available = rctl_available_resource(p, rule); if (available < minavailable) minavailable = available; } /* * XXX: Think about this _hard_. 
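Putting the RCTL_ACTION_THROTTLE arithmetic above in one place: the penalty starts as hz * usage scaled by throttle_pct, is floored at throttle_min, inflated by the square of how far the whole container is over its limit (scaled by throttle_pct2), and only then divided by the limit and clamped to throttle_max. A hedged standalone sketch with plain arithmetic in place of the saturating helpers; the parameter names and sample numbers are mine:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Sketch of the throttle penalty from rctl_enforce(); result is in ticks. */
static uint64_t
throttle_ticks(uint64_t hz, uint64_t usage, uint64_t limit, int64_t available,
    uint64_t pct, uint64_t pct2, uint64_t min_ticks, uint64_t max_ticks)
{
	uint64_t sleep, ratio;

	/* Penalty proportional to the process' share of the limit. */
	sleep = hz * usage * pct / 100;
	if (sleep < min_ticks * limit)
		sleep = min_ticks * limit;

	/* Square of how far the whole container is over its limit. */
	ratio = (available < 0) ? (uint64_t)(-available) / limit : 0;
	ratio = ratio * ratio * pct2 / 100;
	sleep += sleep * ratio;

	sleep /= limit;			/* the deferred division */
	return (sleep > max_ticks ? max_ticks : sleep);
}

int
main(void)
{
	/* hz=1000, usage at twice the limit, container 50% over its limit. */
	printf("%" PRIu64 "\n",
	    throttle_ticks(1000, 200, 100, -50, 100, 100, 10, 10000));
	return (0);
}

With those numbers the container overshoot rounds to a ratio of 0 in integer math, so the example prints 2000: the per-process part of the penalty alone.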
*/ allocated = p->p_racct->r_resources[resource]; if (minavailable < INT64_MAX - allocated) minavailable += allocated; if (minavailable < 0) minavailable = 0; return (minavailable); } static int rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter) { ASSERT_RACCT_ENABLED(); if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) { if (rule->rr_subject_type != filter->rr_subject_type) return (0); switch (filter->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: if (filter->rr_subject.rs_proc != NULL && rule->rr_subject.rs_proc != filter->rr_subject.rs_proc) return (0); break; case RCTL_SUBJECT_TYPE_USER: if (filter->rr_subject.rs_uip != NULL && rule->rr_subject.rs_uip != filter->rr_subject.rs_uip) return (0); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (filter->rr_subject.rs_loginclass != NULL && rule->rr_subject.rs_loginclass != filter->rr_subject.rs_loginclass) return (0); break; case RCTL_SUBJECT_TYPE_JAIL: if (filter->rr_subject.rs_prison_racct != NULL && rule->rr_subject.rs_prison_racct != filter->rr_subject.rs_prison_racct) return (0); break; default: panic("rctl_rule_matches: unknown subject type %d", filter->rr_subject_type); } } if (filter->rr_resource != RACCT_UNDEFINED) { if (rule->rr_resource != filter->rr_resource) return (0); } if (filter->rr_action != RCTL_ACTION_UNDEFINED) { if (rule->rr_action != filter->rr_action) return (0); } if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) { if (rule->rr_amount != filter->rr_amount) return (0); } if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) { if (rule->rr_per != filter->rr_per) return (0); } return (1); } static int str2value(const char *str, int *value, struct dict *table) { int i; if (value == NULL) return (EINVAL); for (i = 0; table[i].d_name != NULL; i++) { if (strcasecmp(table[i].d_name, str) == 0) { *value = table[i].d_value; return (0); } } return (EINVAL); } static int str2id(const char *str, id_t *value) { char *end; if (str == NULL) return (EINVAL); *value = strtoul(str, &end, 10); if ((size_t)(end - str) != strlen(str)) return (EINVAL); return (0); } static int str2int64(const char *str, int64_t *value) { char *end; if (str == NULL) return (EINVAL); *value = strtoul(str, &end, 10); if ((size_t)(end - str) != strlen(str)) return (EINVAL); if (*value < 0) return (ERANGE); return (0); } /* * Connect the rule to the racct, increasing refcount for the rule. */ static void rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule) { struct rctl_rule_link *link; ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); rctl_rule_acquire(rule); link = uma_zalloc(rctl_rule_link_zone, M_WAITOK); link->rrl_rule = rule; link->rrl_exceeded = 0; RACCT_LOCK(); LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next); RACCT_UNLOCK(); } static int rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule) { struct rctl_rule_link *link; ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); RACCT_LOCK_ASSERT(); link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT); if (link == NULL) return (ENOMEM); rctl_rule_acquire(rule); link->rrl_rule = rule; link->rrl_exceeded = 0; LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next); return (0); } /* * Remove limits for a rules matching the filter and release * the refcounts for the rules, possibly freeing them. Returns * the number of limit structures removed. 
*/ static int rctl_racct_remove_rules(struct racct *racct, const struct rctl_rule *filter) { struct rctl_rule_link *link, *linktmp; int removed = 0; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) { if (!rctl_rule_matches(link->rrl_rule, filter)) continue; LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); removed++; } return (removed); } static void rctl_rule_acquire_subject(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: case RCTL_SUBJECT_TYPE_PROCESS: break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct != NULL) prison_racct_hold(rule->rr_subject.rs_prison_racct); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip != NULL) uihold(rule->rr_subject.rs_uip); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass != NULL) loginclass_hold(rule->rr_subject.rs_loginclass); break; default: panic("rctl_rule_acquire_subject: unknown subject type %d", rule->rr_subject_type); } } static void rctl_rule_release_subject(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: case RCTL_SUBJECT_TYPE_PROCESS: break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct != NULL) prison_racct_free(rule->rr_subject.rs_prison_racct); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip != NULL) uifree(rule->rr_subject.rs_uip); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass != NULL) loginclass_free(rule->rr_subject.rs_loginclass); break; default: panic("rctl_rule_release_subject: unknown subject type %d", rule->rr_subject_type); } } struct rctl_rule * rctl_rule_alloc(int flags) { struct rctl_rule *rule; ASSERT_RACCT_ENABLED(); rule = uma_zalloc(rctl_rule_zone, flags); if (rule == NULL) return (NULL); rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED; rule->rr_subject.rs_proc = NULL; rule->rr_subject.rs_uip = NULL; rule->rr_subject.rs_loginclass = NULL; rule->rr_subject.rs_prison_racct = NULL; rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED; rule->rr_resource = RACCT_UNDEFINED; rule->rr_action = RCTL_ACTION_UNDEFINED; rule->rr_amount = RCTL_AMOUNT_UNDEFINED; refcount_init(&rule->rr_refcount, 1); return (rule); } struct rctl_rule * rctl_rule_duplicate(const struct rctl_rule *rule, int flags) { struct rctl_rule *copy; ASSERT_RACCT_ENABLED(); copy = uma_zalloc(rctl_rule_zone, flags); if (copy == NULL) return (NULL); copy->rr_subject_type = rule->rr_subject_type; copy->rr_subject.rs_proc = rule->rr_subject.rs_proc; copy->rr_subject.rs_uip = rule->rr_subject.rs_uip; copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass; copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct; copy->rr_per = rule->rr_per; copy->rr_resource = rule->rr_resource; copy->rr_action = rule->rr_action; copy->rr_amount = rule->rr_amount; refcount_init(©->rr_refcount, 1); rctl_rule_acquire_subject(copy); return (copy); } void rctl_rule_acquire(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0")); refcount_acquire(&rule->rr_refcount); } static void rctl_rule_free(void *context, int pending) { struct rctl_rule *rule; rule = (struct rctl_rule *)context; ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0")); /* * We don't need locking here; rule is guaranteed to be inaccessible. 
*/ rctl_rule_release_subject(rule); uma_zfree(rctl_rule_zone, rule); } void rctl_rule_release(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0")); if (refcount_release(&rule->rr_refcount)) { /* * rctl_rule_release() is often called when iterating * over all the uidinfo structures in the system, * holding uihashtbl_lock. Since rctl_rule_free() * might end up calling uifree(), this would lead * to lock recursion. Use taskqueue to avoid this. */ TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule); taskqueue_enqueue(taskqueue_thread, &rule->rr_task); } } static int rctl_rule_fully_specified(const struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: return (0); case RCTL_SUBJECT_TYPE_PROCESS: if (rule->rr_subject.rs_proc == NULL) return (0); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip == NULL) return (0); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass == NULL) return (0); break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct == NULL) return (0); break; default: panic("rctl_rule_fully_specified: unknown subject type %d", rule->rr_subject_type); } if (rule->rr_resource == RACCT_UNDEFINED) return (0); if (rule->rr_action == RCTL_ACTION_UNDEFINED) return (0); if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED) return (0); if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED) return (0); return (1); } static int rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep) { struct rctl_rule *rule; char *subjectstr, *subject_idstr, *resourcestr, *actionstr, *amountstr, *perstr; id_t id; int error = 0; ASSERT_RACCT_ENABLED(); rule = rctl_rule_alloc(M_WAITOK); subjectstr = strsep(&rulestr, ":"); subject_idstr = strsep(&rulestr, ":"); resourcestr = strsep(&rulestr, ":"); actionstr = strsep(&rulestr, "=/"); amountstr = strsep(&rulestr, "/"); perstr = rulestr; if (subjectstr == NULL || subjectstr[0] == '\0') rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED; else { error = str2value(subjectstr, &rule->rr_subject_type, subjectnames); if (error != 0) goto out; } if (subject_idstr == NULL || subject_idstr[0] == '\0') { rule->rr_subject.rs_proc = NULL; rule->rr_subject.rs_uip = NULL; rule->rr_subject.rs_loginclass = NULL; rule->rr_subject.rs_prison_racct = NULL; } else { switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: error = EINVAL; goto out; case RCTL_SUBJECT_TYPE_PROCESS: error = str2id(subject_idstr, &id); if (error != 0) goto out; sx_assert(&allproc_lock, SA_LOCKED); rule->rr_subject.rs_proc = pfind(id); if (rule->rr_subject.rs_proc == NULL) { error = ESRCH; goto out; } PROC_UNLOCK(rule->rr_subject.rs_proc); break; case RCTL_SUBJECT_TYPE_USER: error = str2id(subject_idstr, &id); if (error != 0) goto out; rule->rr_subject.rs_uip = uifind(id); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: rule->rr_subject.rs_loginclass = loginclass_find(subject_idstr); if (rule->rr_subject.rs_loginclass == NULL) { error = ENAMETOOLONG; goto out; } break; case RCTL_SUBJECT_TYPE_JAIL: rule->rr_subject.rs_prison_racct = prison_racct_find(subject_idstr); if (rule->rr_subject.rs_prison_racct == NULL) { error = ENAMETOOLONG; goto out; } break; default: panic("rctl_string_to_rule: unknown subject type %d", rule->rr_subject_type); } } if (resourcestr == NULL || resourcestr[0] == '\0') rule->rr_resource = RACCT_UNDEFINED; else { error = str2value(resourcestr, &rule->rr_resource, resourcenames); if (error != 0) goto out; } if (actionstr == 
NULL || actionstr[0] == '\0') rule->rr_action = RCTL_ACTION_UNDEFINED; else { error = str2value(actionstr, &rule->rr_action, actionnames); if (error != 0) goto out; } if (amountstr == NULL || amountstr[0] == '\0') rule->rr_amount = RCTL_AMOUNT_UNDEFINED; else { error = str2int64(amountstr, &rule->rr_amount); if (error != 0) goto out; if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) { if (rule->rr_amount > INT64_MAX / 1000000) { error = ERANGE; goto out; } rule->rr_amount *= 1000000; } } if (perstr == NULL || perstr[0] == '\0') rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED; else { error = str2value(perstr, &rule->rr_per, subjectnames); if (error != 0) goto out; } out: if (error == 0) *rulep = rule; else rctl_rule_release(rule); return (error); } /* * Link a rule with all the subjects it applies to. */ int rctl_rule_add(struct rctl_rule *rule) { struct proc *p; struct ucred *cred; struct uidinfo *uip; struct prison *pr; struct prison_racct *prr; struct loginclass *lc; struct rctl_rule *rule2; int match; ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); /* * Some rules just don't make sense, like "deny" rule for an undeniable * resource. The exception are the RSS and %CPU resources - they are * not deniable in the racct sense, but the limit is enforced in * a different way. */ if (rule->rr_action == RCTL_ACTION_DENY && !RACCT_IS_DENIABLE(rule->rr_resource) && rule->rr_resource != RACCT_RSS && rule->rr_resource != RACCT_PCTCPU) { return (EOPNOTSUPP); } if (rule->rr_action == RCTL_ACTION_THROTTLE && !RACCT_IS_DECAYING(rule->rr_resource)) { return (EOPNOTSUPP); } if (rule->rr_action == RCTL_ACTION_THROTTLE && rule->rr_resource == RACCT_PCTCPU) { return (EOPNOTSUPP); } if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS && RACCT_IS_SLOPPY(rule->rr_resource)) { return (EOPNOTSUPP); } /* * Make sure there are no duplicated rules. Also, for the "deny" * rules, remove ones differing only by "amount". */ if (rule->rr_action == RCTL_ACTION_DENY) { rule2 = rctl_rule_duplicate(rule, M_WAITOK); rule2->rr_amount = RCTL_AMOUNT_UNDEFINED; rctl_rule_remove(rule2); rctl_rule_release(rule2); } else rctl_rule_remove(rule); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: p = rule->rr_subject.rs_proc; KASSERT(p != NULL, ("rctl_rule_add: NULL proc")); rctl_racct_add_rule(p->p_racct, rule); /* * In case of per-process rule, we don't have anything more * to do. */ return (0); case RCTL_SUBJECT_TYPE_USER: uip = rule->rr_subject.rs_uip; KASSERT(uip != NULL, ("rctl_rule_add: NULL uip")); rctl_racct_add_rule(uip->ui_racct, rule); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: lc = rule->rr_subject.rs_loginclass; KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass")); rctl_racct_add_rule(lc->lc_racct, rule); break; case RCTL_SUBJECT_TYPE_JAIL: prr = rule->rr_subject.rs_prison_racct; KASSERT(prr != NULL, ("rctl_rule_add: NULL pr")); rctl_racct_add_rule(prr->prr_racct, rule); break; default: panic("rctl_rule_add: unknown subject type %d", rule->rr_subject_type); } /* * Now go through all the processes and add the new rule to the ones * it applies to. 
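rctl_string_to_rule() above tokenizes the rule string in place with strsep(): ":" separates the first three fields, "=/" terminates the action, and "/" precedes the optional per-subject part, with empty fields meaning "undefined". A runnable sketch of just the splitting, reusing the 'user:X:openfiles:deny=N/process' shape quoted in the rctl_rule_link comment earlier in this file:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char rulestr[] = "user:1001:openfiles:deny=1024/process";
	char *p = rulestr;
	char *subject, *subject_id, *resource, *action, *amount, *per;

	subject = strsep(&p, ":");
	subject_id = strsep(&p, ":");
	resource = strsep(&p, ":");
	action = strsep(&p, "=/");	/* '=' or '/' both end the action */
	amount = strsep(&p, "/");
	per = p;			/* remainder; may be NULL */

	printf("subject=%s id=%s resource=%s action=%s amount=%s per=%s\n",
	    subject, subject_id, resource, action, amount,
	    per != NULL ? per : "(none)");
	return (0);
}

This prints subject=user id=1001 resource=openfiles action=deny amount=1024 per=process; the kernel then maps the words through the subjectnames/resourcenames/actionnames tables and scales amounts for RACCT_IS_IN_MILLIONS resources by 1000000.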
*/ sx_assert(&allproc_lock, SA_LOCKED); FOREACH_PROC_IN_SYSTEM(p) { cred = p->p_ucred; switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_USER: if (cred->cr_uidinfo == rule->rr_subject.rs_uip || cred->cr_ruidinfo == rule->rr_subject.rs_uip) break; continue; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (cred->cr_loginclass == rule->rr_subject.rs_loginclass) break; continue; case RCTL_SUBJECT_TYPE_JAIL: match = 0; for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) { if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) { match = 1; break; } } if (match) break; continue; default: panic("rctl_rule_add: unknown subject type %d", rule->rr_subject_type); } rctl_racct_add_rule(p->p_racct, rule); } return (0); } static void rctl_rule_pre_callback(void) { RACCT_LOCK(); } static void rctl_rule_post_callback(void) { RACCT_UNLOCK(); } static void rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3) { struct rctl_rule *filter = (struct rctl_rule *)arg2; int found = 0; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); found += rctl_racct_remove_rules(racct, filter); *((int *)arg3) += found; } /* * Remove all rules that match the filter. */ int rctl_rule_remove(struct rctl_rule *filter) { struct proc *p; int found = 0; ASSERT_RACCT_ENABLED(); if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS && filter->rr_subject.rs_proc != NULL) { p = filter->rr_subject.rs_proc; RACCT_LOCK(); found = rctl_racct_remove_rules(p->p_racct, filter); RACCT_UNLOCK(); if (found) return (0); return (ESRCH); } loginclass_racct_foreach(rctl_rule_remove_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, (void *)&found); ui_racct_foreach(rctl_rule_remove_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, (void *)&found); prison_racct_foreach(rctl_rule_remove_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, (void *)&found); sx_assert(&allproc_lock, SA_LOCKED); RACCT_LOCK(); FOREACH_PROC_IN_SYSTEM(p) { found += rctl_racct_remove_rules(p->p_racct, filter); } RACCT_UNLOCK(); if (found) return (0); return (ESRCH); } /* * Appends a rule to the sbuf. 
*/ static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule) { int64_t amount; ASSERT_RACCT_ENABLED(); sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type)); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: if (rule->rr_subject.rs_proc == NULL) - sbuf_printf(sb, ":"); + sbuf_putc(sb, ':'); else sbuf_printf(sb, "%d:", rule->rr_subject.rs_proc->p_pid); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip == NULL) - sbuf_printf(sb, ":"); + sbuf_putc(sb, ':'); else sbuf_printf(sb, "%d:", rule->rr_subject.rs_uip->ui_uid); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass == NULL) - sbuf_printf(sb, ":"); + sbuf_putc(sb, ':'); else sbuf_printf(sb, "%s:", rule->rr_subject.rs_loginclass->lc_name); break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct == NULL) - sbuf_printf(sb, ":"); + sbuf_putc(sb, ':'); else sbuf_printf(sb, "%s:", rule->rr_subject.rs_prison_racct->prr_name); break; default: panic("rctl_rule_to_sbuf: unknown subject type %d", rule->rr_subject_type); } amount = rule->rr_amount; if (amount != RCTL_AMOUNT_UNDEFINED && RACCT_IS_IN_MILLIONS(rule->rr_resource)) amount /= 1000000; sbuf_printf(sb, "%s:%s=%jd", rctl_resource_name(rule->rr_resource), rctl_action_name(rule->rr_action), amount); if (rule->rr_per != rule->rr_subject_type) sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per)); } /* * Routine used by RCTL syscalls to read in input string. */ static int rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen) { char *str; int error; ASSERT_RACCT_ENABLED(); if (inbuflen <= 0) return (EINVAL); if (inbuflen > RCTL_MAX_INBUFSIZE) return (E2BIG); str = malloc(inbuflen + 1, M_RCTL, M_WAITOK); error = copyinstr(inbufp, str, inbuflen, NULL); if (error != 0) { free(str, M_RCTL); return (error); } *inputstr = str; return (0); } /* * Routine used by RCTL syscalls to write out output string. 
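The sbuf changes in this hunk, sbuf_printf(sb, ":") becoming sbuf_putc(sb, ':') and literal strings elsewhere in the patch going through sbuf_cat(), skip the format-string machinery when nothing needs formatting, which is cheaper and removes any risk of a literal containing '%' being interpreted as a conversion. A small userland sketch, assuming FreeBSD's libsbuf counterpart of the kernel sbuf(9) API (compile with -lsbuf):

#include <sys/sbuf.h>
#include <stdio.h>

int
main(void)
{
	struct sbuf *sb = sbuf_new_auto();

	sbuf_cat(sb, "rule=");				/* literal: no format parsing */
	sbuf_printf(sb, "user:%d:openfiles", 1001);	/* formatting actually needed */
	sbuf_putc(sb, ',');				/* single separator character */
	sbuf_finish(sb);
	printf("%s\n", sbuf_data(sb));			/* rule=user:1001:openfiles, */
	sbuf_delete(sb);
	return (0);
}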
*/ static int rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen) { int error; ASSERT_RACCT_ENABLED(); if (outputsbuf == NULL) return (0); sbuf_finish(outputsbuf); if (outbuflen < sbuf_len(outputsbuf) + 1) { sbuf_delete(outputsbuf); return (ERANGE); } error = copyout(sbuf_data(outputsbuf), outbufp, sbuf_len(outputsbuf) + 1); sbuf_delete(outputsbuf); return (error); } static struct sbuf * rctl_racct_to_sbuf(struct racct *racct, int sloppy) { struct sbuf *sb; int64_t amount; int i; ASSERT_RACCT_ENABLED(); sb = sbuf_new_auto(); for (i = 0; i <= RACCT_MAX; i++) { if (sloppy == 0 && RACCT_IS_SLOPPY(i)) continue; RACCT_LOCK(); amount = racct->r_resources[i]; RACCT_UNLOCK(); if (RACCT_IS_IN_MILLIONS(i)) amount /= 1000000; sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount); } sbuf_setpos(sb, sbuf_len(sb) - 1); return (sb); } int sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap) { struct rctl_rule *filter; struct sbuf *outputsbuf = NULL; struct proc *p; struct uidinfo *uip; struct loginclass *lc; struct prison_racct *prr; char *inputstr; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_GET_RACCT); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } switch (filter->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: p = filter->rr_subject.rs_proc; if (p == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0); break; case RCTL_SUBJECT_TYPE_USER: uip = filter->rr_subject.rs_uip; if (uip == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: lc = filter->rr_subject.rs_loginclass; if (lc == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1); break; case RCTL_SUBJECT_TYPE_JAIL: prr = filter->rr_subject.rs_prison_racct; if (prr == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1); break; default: error = EINVAL; } out: rctl_rule_release(filter); sx_sunlock(&allproc_lock); if (error != 0) return (error); error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen); return (error); } static void rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3) { struct rctl_rule *filter = (struct rctl_rule *)arg2; struct rctl_rule_link *link; struct sbuf *sb = (struct sbuf *)arg3; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); LIST_FOREACH(link, &racct->r_rule_links, rrl_next) { if (!rctl_rule_matches(link->rrl_rule, filter)) continue; rctl_rule_to_sbuf(sb, link->rrl_rule); - sbuf_printf(sb, ","); + sbuf_putc(sb, ','); } } int sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap) { struct sbuf *sb; struct rctl_rule *filter; struct rctl_rule_link *link; struct proc *p; char *inputstr, *buf; size_t bufsize; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_GET_RULES); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } bufsize = uap->outbuflen; if (bufsize > rctl_maxbufsize) { sx_sunlock(&allproc_lock); return (E2BIG); } buf = 
malloc(bufsize, M_RCTL, M_WAITOK); sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN); KASSERT(sb != NULL, ("sbuf_new failed")); FOREACH_PROC_IN_SYSTEM(p) { RACCT_LOCK(); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { /* * Non-process rules will be added to the buffer later. * Adding them here would result in duplicated output. */ if (link->rrl_rule->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) continue; if (!rctl_rule_matches(link->rrl_rule, filter)) continue; rctl_rule_to_sbuf(sb, link->rrl_rule); - sbuf_printf(sb, ","); + sbuf_putc(sb, ','); } RACCT_UNLOCK(); } loginclass_racct_foreach(rctl_get_rules_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, sb); ui_racct_foreach(rctl_get_rules_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, sb); prison_racct_foreach(rctl_get_rules_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, sb); if (sbuf_error(sb) == ENOMEM) { error = ERANGE; goto out; } /* * Remove trailing ",". */ if (sbuf_len(sb) > 0) sbuf_setpos(sb, sbuf_len(sb) - 1); error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen); out: rctl_rule_release(filter); sx_sunlock(&allproc_lock); free(buf, M_RCTL); return (error); } int sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap) { struct sbuf *sb; struct rctl_rule *filter; struct rctl_rule_link *link; char *inputstr, *buf; size_t bufsize; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_GET_LIMITS); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (EINVAL); } if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (EOPNOTSUPP); } if (filter->rr_subject.rs_proc == NULL) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (EINVAL); } bufsize = uap->outbuflen; if (bufsize > rctl_maxbufsize) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (E2BIG); } buf = malloc(bufsize, M_RCTL, M_WAITOK); sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN); KASSERT(sb != NULL, ("sbuf_new failed")); RACCT_LOCK(); LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links, rrl_next) { rctl_rule_to_sbuf(sb, link->rrl_rule); - sbuf_printf(sb, ","); + sbuf_putc(sb, ','); } RACCT_UNLOCK(); if (sbuf_error(sb) == ENOMEM) { error = ERANGE; sbuf_delete(sb); goto out; } /* * Remove trailing ",". */ if (sbuf_len(sb) > 0) sbuf_setpos(sb, sbuf_len(sb) - 1); error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen); out: rctl_rule_release(filter); sx_sunlock(&allproc_lock); free(buf, M_RCTL); return (error); } int sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap) { struct rctl_rule *rule; char *inputstr; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_ADD_RULE); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &rule); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } /* * The 'per' part of a rule is optional. 
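 * When it is left out, it defaults to the subject type below; e.g. a rule
 * submitted as "user:1001:maxproc:deny=30" (hypothetical values) behaves
 * the same as "user:1001:maxproc:deny=30/user".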
*/ if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED && rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) rule->rr_per = rule->rr_subject_type; if (!rctl_rule_fully_specified(rule)) { error = EINVAL; goto out; } error = rctl_rule_add(rule); out: rctl_rule_release(rule); sx_sunlock(&allproc_lock); return (error); } int sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap) { struct rctl_rule *filter; char *inputstr; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_REMOVE_RULE); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } error = rctl_rule_remove(filter); rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (error); } /* * Update RCTL rule list after credential change. */ void rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred) { LIST_HEAD(, rctl_rule_link) newrules; struct rctl_rule_link *link, *newlink; struct uidinfo *newuip; struct loginclass *newlc; struct prison_racct *newprr; int rulecnt, i; if (!racct_enable) return; PROC_LOCK_ASSERT(p, MA_NOTOWNED); newuip = newcred->cr_ruidinfo; newlc = newcred->cr_loginclass; newprr = newcred->cr_prison->pr_prison_racct; LIST_INIT(&newrules); again: /* * First, count the rules that apply to the process with new * credentials. */ rulecnt = 0; RACCT_LOCK(); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) rulecnt++; } LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) rulecnt++; LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) rulecnt++; LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) rulecnt++; RACCT_UNLOCK(); /* * Create temporary list. We've dropped the rctl_lock in order * to use M_WAITOK. */ for (i = 0; i < rulecnt; i++) { newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK); newlink->rrl_rule = NULL; newlink->rrl_exceeded = 0; LIST_INSERT_HEAD(&newrules, newlink, rrl_next); } newlink = LIST_FIRST(&newrules); /* * Assign rules to the newly allocated list entries. */ RACCT_LOCK(); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } } LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } if (rulecnt == 0) { /* * Free the old rule list. 
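 * Reaching this point with rulecnt == 0 means every rule that applies under
 * the new credentials was copied into the preallocated "newrules" entries
 * and the rule set did not change while the lock was dropped, so the old
 * list can be freed and the new one swapped in while RACCT_LOCK is held.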
*/ while (!LIST_EMPTY(&p->p_racct->r_rule_links)) { link = LIST_FIRST(&p->p_racct->r_rule_links); LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); } /* * Replace lists and we're done. * * XXX: Is there any way to switch list heads instead * of iterating here? */ while (!LIST_EMPTY(&newrules)) { newlink = LIST_FIRST(&newrules); LIST_REMOVE(newlink, rrl_next); LIST_INSERT_HEAD(&p->p_racct->r_rule_links, newlink, rrl_next); } RACCT_UNLOCK(); return; } goaround: RACCT_UNLOCK(); /* * Rule list changed while we were not holding the rctl_lock. * Free the new list and try again. */ while (!LIST_EMPTY(&newrules)) { newlink = LIST_FIRST(&newrules); LIST_REMOVE(newlink, rrl_next); if (newlink->rrl_rule != NULL) rctl_rule_release(newlink->rrl_rule); uma_zfree(rctl_rule_link_zone, newlink); } goto again; } /* * Assign RCTL rules to the newly created process. */ int rctl_proc_fork(struct proc *parent, struct proc *child) { struct rctl_rule *rule; struct rctl_rule_link *link; int error; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent)); LIST_INIT(&child->p_racct->r_rule_links); /* * Go through limits applicable to the parent and assign them * to the child. Rules with 'process' subject have to be duplicated * in order to make their rr_subject point to the new process. */ LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) { if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) { rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT); if (rule == NULL) goto fail; KASSERT(rule->rr_subject.rs_proc == parent, ("rule->rr_subject.rs_proc != parent")); rule->rr_subject.rs_proc = child; error = rctl_racct_add_rule_locked(child->p_racct, rule); rctl_rule_release(rule); if (error != 0) goto fail; } else { error = rctl_racct_add_rule_locked(child->p_racct, link->rrl_rule); if (error != 0) goto fail; } } return (0); fail: while (!LIST_EMPTY(&child->p_racct->r_rule_links)) { link = LIST_FIRST(&child->p_racct->r_rule_links); LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); } return (EAGAIN); } /* * Release rules attached to the racct. */ void rctl_racct_release(struct racct *racct) { struct rctl_rule_link *link; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); while (!LIST_EMPTY(&racct->r_rule_links)) { link = LIST_FIRST(&racct->r_rule_links); LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); } } static void rctl_init(void) { if (!racct_enable) return; rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); rctl_rule_link_zone = uma_zcreate("rctl_rule_link", sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* * Set default values, making sure not to overwrite the ones * fetched from tunables. Most of those could be set at the * declaration, except for the rctl_throttle_max - we cannot * set it there due to hz not being compile time constant. 
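 * For example, with the common hz = 1000 and no tunables set (illustrative
 * numbers only): rctl_throttle_min becomes 1 tick, rctl_throttle_max
 * becomes 2 * hz = 2000 ticks (two seconds), and both throttle percentages
 * default to 100.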
*/ if (rctl_throttle_min < 1) rctl_throttle_min = 1; if (rctl_throttle_max < rctl_throttle_min) rctl_throttle_max = 2 * hz; if (rctl_throttle_pct < 0) rctl_throttle_pct = 100; if (rctl_throttle_pct2 < 0) rctl_throttle_pct2 = 100; } #else /* !RCTL */ #include #include int sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap) { return (ENOSYS); } int sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap) { return (ENOSYS); } int sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap) { return (ENOSYS); } int sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap) { return (ENOSYS); } int sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap) { return (ENOSYS); } #endif /* RCTL */ diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c index c847783cd3da..8726c35e15a5 100644 --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -1,4599 +1,4599 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94 */ #include #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ONSIG 32 /* NSIG for osig* syscalls. XXX. 
*/ SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE3(proc, , , signal__send, "struct thread *", "struct proc *", "int"); SDT_PROBE_DEFINE2(proc, , , signal__clear, "int", "ksiginfo_t *"); SDT_PROBE_DEFINE3(proc, , , signal__discard, "struct thread *", "struct proc *", "int"); static int coredump(struct thread *); static int killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi); static int issignal(struct thread *td); static void reschedule_signals(struct proc *p, sigset_t block, int flags); static int sigprop(int sig); static void tdsigwakeup(struct thread *, int, sig_t, int); static int sig_suspend_threads(struct thread *, struct proc *); static int filt_sigattach(struct knote *kn); static void filt_sigdetach(struct knote *kn); static int filt_signal(struct knote *kn, long hint); static struct thread *sigtd(struct proc *p, int sig, bool fast_sigblock); static void sigqueue_start(void); static void sigfastblock_setpend(struct thread *td, bool resched); static uma_zone_t ksiginfo_zone = NULL; struct filterops sig_filtops = { .f_isfd = 0, .f_attach = filt_sigattach, .f_detach = filt_sigdetach, .f_event = filt_signal, }; static int kern_logsigexit = 1; SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, &kern_logsigexit, 0, "Log processes quitting on abnormal signals to syslog(3)"); static int kern_forcesigexit = 1; SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW, &kern_forcesigexit, 0, "Force trap signal to be handled"); static SYSCTL_NODE(_kern, OID_AUTO, sigqueue, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "POSIX real time signal"); static int max_pending_per_proc = 128; SYSCTL_INT(_kern_sigqueue, OID_AUTO, max_pending_per_proc, CTLFLAG_RW, &max_pending_per_proc, 0, "Max pending signals per proc"); static int preallocate_siginfo = 1024; SYSCTL_INT(_kern_sigqueue, OID_AUTO, preallocate, CTLFLAG_RDTUN, &preallocate_siginfo, 0, "Preallocated signal memory size"); static int signal_overflow = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, overflow, CTLFLAG_RD, &signal_overflow, 0, "Number of signals overflew"); static int signal_alloc_fail = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, alloc_fail, CTLFLAG_RD, &signal_alloc_fail, 0, "signals failed to be allocated"); static int kern_lognosys = 0; SYSCTL_INT(_kern, OID_AUTO, lognosys, CTLFLAG_RWTUN, &kern_lognosys, 0, "Log invalid syscalls"); static int kern_signosys = 1; SYSCTL_INT(_kern, OID_AUTO, signosys, CTLFLAG_RWTUN, &kern_signosys, 0, "Send SIGSYS on return from invalid syscall"); __read_frequently bool sigfastblock_fetch_always = false; SYSCTL_BOOL(_kern, OID_AUTO, sigfastblock_fetch_always, CTLFLAG_RWTUN, &sigfastblock_fetch_always, 0, "Fetch sigfastblock word on each syscall entry for proper " "blocking semantic"); static bool kern_sig_discard_ign = true; SYSCTL_BOOL(_kern, OID_AUTO, sig_discard_ign, CTLFLAG_RWTUN, &kern_sig_discard_ign, 0, "Discard ignored signals on delivery, otherwise queue them to " "the target queue"); SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL); /* * Policy -- Can ucred cr1 send SIGIO to process cr2? * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG * in the right situations. 
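 *
 * Worked example with hypothetical credentials: if cr1->cr_uid == 1001 and
 * cr2->cr_ruid == 1001, the macro below is true and SIGIO may be delivered;
 * it is likewise true whenever cr1 is root (cr_uid == 0) or any of the
 * sender's real/effective uids matches one of the target's.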
*/ #define CANSIGIO(cr1, cr2) \ ((cr1)->cr_uid == 0 || \ (cr1)->cr_ruid == (cr2)->cr_ruid || \ (cr1)->cr_uid == (cr2)->cr_ruid || \ (cr1)->cr_ruid == (cr2)->cr_uid || \ (cr1)->cr_uid == (cr2)->cr_uid) static int sugid_coredump; SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RWTUN, &sugid_coredump, 0, "Allow setuid and setgid processes to dump core"); static int capmode_coredump; SYSCTL_INT(_kern, OID_AUTO, capmode_coredump, CTLFLAG_RWTUN, &capmode_coredump, 0, "Allow processes in capability mode to dump core"); static int do_coredump = 1; SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW, &do_coredump, 0, "Enable/Disable coredumps"); static int set_core_nodump_flag = 0; SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag, 0, "Enable setting the NODUMP flag on coredump files"); static int coredump_devctl = 0; SYSCTL_INT(_kern, OID_AUTO, coredump_devctl, CTLFLAG_RW, &coredump_devctl, 0, "Generate a devctl notification when processes coredump"); /* * Signal properties and actions. * The array below categorizes the signals and their default actions * according to the following properties: */ #define SIGPROP_KILL 0x01 /* terminates process by default */ #define SIGPROP_CORE 0x02 /* ditto and coredumps */ #define SIGPROP_STOP 0x04 /* suspend process */ #define SIGPROP_TTYSTOP 0x08 /* ditto, from tty */ #define SIGPROP_IGNORE 0x10 /* ignore by default */ #define SIGPROP_CONT 0x20 /* continue if suspended */ static const int sigproptbl[NSIG] = { [SIGHUP] = SIGPROP_KILL, [SIGINT] = SIGPROP_KILL, [SIGQUIT] = SIGPROP_KILL | SIGPROP_CORE, [SIGILL] = SIGPROP_KILL | SIGPROP_CORE, [SIGTRAP] = SIGPROP_KILL | SIGPROP_CORE, [SIGABRT] = SIGPROP_KILL | SIGPROP_CORE, [SIGEMT] = SIGPROP_KILL | SIGPROP_CORE, [SIGFPE] = SIGPROP_KILL | SIGPROP_CORE, [SIGKILL] = SIGPROP_KILL, [SIGBUS] = SIGPROP_KILL | SIGPROP_CORE, [SIGSEGV] = SIGPROP_KILL | SIGPROP_CORE, [SIGSYS] = SIGPROP_KILL | SIGPROP_CORE, [SIGPIPE] = SIGPROP_KILL, [SIGALRM] = SIGPROP_KILL, [SIGTERM] = SIGPROP_KILL, [SIGURG] = SIGPROP_IGNORE, [SIGSTOP] = SIGPROP_STOP, [SIGTSTP] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGCONT] = SIGPROP_IGNORE | SIGPROP_CONT, [SIGCHLD] = SIGPROP_IGNORE, [SIGTTIN] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGTTOU] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGIO] = SIGPROP_IGNORE, [SIGXCPU] = SIGPROP_KILL, [SIGXFSZ] = SIGPROP_KILL, [SIGVTALRM] = SIGPROP_KILL, [SIGPROF] = SIGPROP_KILL, [SIGWINCH] = SIGPROP_IGNORE, [SIGINFO] = SIGPROP_IGNORE, [SIGUSR1] = SIGPROP_KILL, [SIGUSR2] = SIGPROP_KILL, }; #define _SIG_FOREACH_ADVANCE(i, set) ({ \ int __found; \ for (;;) { \ if (__bits != 0) { \ int __sig = ffs(__bits); \ __bits &= ~(1u << (__sig - 1)); \ sig = __i * sizeof((set)->__bits[0]) * NBBY + __sig; \ __found = 1; \ break; \ } \ if (++__i == _SIG_WORDS) { \ __found = 0; \ break; \ } \ __bits = (set)->__bits[__i]; \ } \ __found != 0; \ }) #define SIG_FOREACH(i, set) \ for (int32_t __i = -1, __bits = 0; \ _SIG_FOREACH_ADVANCE(i, set); ) \ static sigset_t fastblock_mask; static void ast_sig(struct thread *td, int tda) { struct proc *p; int old_boundary, sig; bool resched_sigs; p = td->td_proc; #ifdef DIAGNOSTIC if (p->p_numthreads == 1 && (tda & (TDAI(TDA_SIG) | TDAI(TDA_AST))) == 0) { PROC_LOCK(p); thread_lock(td); /* * Note that TDA_SIG should be re-read from * td_ast, since signal might have been delivered * after we cleared td_flags above. This is one of * the reason for looping check for AST condition. * See comment in userret() about P_PPWAIT. 
*/ if ((p->p_flag & P_PPWAIT) == 0 && (td->td_pflags & TDP_SIGFASTBLOCK) == 0) { if (SIGPENDING(td) && ((tda | td->td_ast) & (TDAI(TDA_SIG) | TDAI(TDA_AST))) == 0) { thread_unlock(td); /* fix dumps */ panic( "failed2 to set signal flags for ast p %p " "td %p tda %#x td_ast %#x fl %#x", p, td, tda, td->td_ast, td->td_flags); } } thread_unlock(td); PROC_UNLOCK(p); } #endif /* * Check for signals. Unlocked reads of p_pendingcnt or * p_siglist might cause process-directed signal to be handled * later. */ if ((tda & TDAI(TDA_SIG)) != 0 || p->p_pendingcnt > 0 || !SIGISEMPTY(p->p_siglist)) { sigfastblock_fetch(td); PROC_LOCK(p); old_boundary = ~TDB_BOUNDARY | (td->td_dbgflags & TDB_BOUNDARY); td->td_dbgflags |= TDB_BOUNDARY; mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) { KASSERT(sig >= 0, ("sig %d", sig)); postsig(sig); } mtx_unlock(&p->p_sigacts->ps_mtx); td->td_dbgflags &= old_boundary; PROC_UNLOCK(p); resched_sigs = true; } else { resched_sigs = false; } /* * Handle deferred update of the fast sigblock value, after * the postsig() loop was performed. */ sigfastblock_setpend(td, resched_sigs); } static void ast_sigsuspend(struct thread *td, int tda __unused) { MPASS((td->td_pflags & TDP_OLDMASK) != 0); td->td_pflags &= ~TDP_OLDMASK; kern_sigprocmask(td, SIG_SETMASK, &td->td_oldsigmask, NULL, 0); } static void sigqueue_start(void) { ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_prealloc(ksiginfo_zone, preallocate_siginfo); p31b_setcfg(CTL_P1003_1B_REALTIME_SIGNALS, _POSIX_REALTIME_SIGNALS); p31b_setcfg(CTL_P1003_1B_RTSIG_MAX, SIGRTMAX - SIGRTMIN + 1); p31b_setcfg(CTL_P1003_1B_SIGQUEUE_MAX, max_pending_per_proc); SIGFILLSET(fastblock_mask); SIG_CANTMASK(fastblock_mask); ast_register(TDA_SIG, ASTR_UNCOND, 0, ast_sig); ast_register(TDA_SIGSUSPEND, ASTR_ASTF_REQUIRED | ASTR_TDP, TDP_OLDMASK, ast_sigsuspend); } ksiginfo_t * ksiginfo_alloc(int mwait) { MPASS(mwait == M_WAITOK || mwait == M_NOWAIT); if (ksiginfo_zone == NULL) return (NULL); return (uma_zalloc(ksiginfo_zone, mwait | M_ZERO)); } void ksiginfo_free(ksiginfo_t *ksi) { uma_zfree(ksiginfo_zone, ksi); } static __inline bool ksiginfo_tryfree(ksiginfo_t *ksi) { if ((ksi->ksi_flags & KSI_EXT) == 0) { uma_zfree(ksiginfo_zone, ksi); return (true); } return (false); } void sigqueue_init(sigqueue_t *list, struct proc *p) { SIGEMPTYSET(list->sq_signals); SIGEMPTYSET(list->sq_kill); SIGEMPTYSET(list->sq_ptrace); TAILQ_INIT(&list->sq_list); list->sq_proc = p; list->sq_flags = SQ_INIT; } /* * Get a signal's ksiginfo. 
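 * The queued siginfo for the signal, if any, is dequeued into *si and the
 * matching sq_kill/sq_ptrace bits are cleared.  A typical caller looks like
 * the one in kern_sigtimedwait() later in this file:
 *
 *	if (sigqueue_get(&td->td_sigqueue, sig, ksi) != 0 ||
 *	    sigqueue_get(&p->p_sigqueue, sig, ksi) != 0)
 *		(ksi now holds the delivered signal's information)
 *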
* Return: * 0 - signal not found * others - signal number */ static int sigqueue_get(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi, *next; int count = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (!SIGISMEMBER(sq->sq_signals, signo)) return (0); if (SIGISMEMBER(sq->sq_ptrace, signo)) { count++; SIGDELSET(sq->sq_ptrace, signo); si->ksi_flags |= KSI_PTRACE; } if (SIGISMEMBER(sq->sq_kill, signo)) { count++; if (count == 1) SIGDELSET(sq->sq_kill, signo); } TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (ksi->ksi_signo == signo) { if (count == 0) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; ksiginfo_copy(ksi, si); if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } if (++count > 1) break; } } if (count <= 1) SIGDELSET(sq->sq_signals, signo); si->ksi_signo = signo; return (signo); } void sigqueue_take(ksiginfo_t *ksi) { struct ksiginfo *kp; struct proc *p; sigqueue_t *sq; if (ksi == NULL || (sq = ksi->ksi_sigq) == NULL) return; p = sq->sq_proc; TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (!(ksi->ksi_flags & KSI_EXT) && p != NULL) p->p_pendingcnt--; for (kp = TAILQ_FIRST(&sq->sq_list); kp != NULL; kp = TAILQ_NEXT(kp, ksi_link)) { if (kp->ksi_signo == ksi->ksi_signo) break; } if (kp == NULL && !SIGISMEMBER(sq->sq_kill, ksi->ksi_signo) && !SIGISMEMBER(sq->sq_ptrace, ksi->ksi_signo)) SIGDELSET(sq->sq_signals, ksi->ksi_signo); } static int sigqueue_add(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi; int ret = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); /* * SIGKILL/SIGSTOP cannot be caught or masked, so take the fast path * for these signals. */ if (signo == SIGKILL || signo == SIGSTOP || si == NULL) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } /* directly insert the ksi, don't copy it */ if (si->ksi_flags & KSI_INS) { if (si->ksi_flags & KSI_HEAD) TAILQ_INSERT_HEAD(&sq->sq_list, si, ksi_link); else TAILQ_INSERT_TAIL(&sq->sq_list, si, ksi_link); si->ksi_sigq = sq; goto out_set_bit; } if (__predict_false(ksiginfo_zone == NULL)) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } if (p != NULL && p->p_pendingcnt >= max_pending_per_proc) { signal_overflow++; ret = EAGAIN; } else if ((ksi = ksiginfo_alloc(M_NOWAIT)) == NULL) { signal_alloc_fail++; ret = EAGAIN; } else { if (p != NULL) p->p_pendingcnt++; ksiginfo_copy(si, ksi); ksi->ksi_signo = signo; if (si->ksi_flags & KSI_HEAD) TAILQ_INSERT_HEAD(&sq->sq_list, ksi, ksi_link); else TAILQ_INSERT_TAIL(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = sq; } if (ret != 0) { if ((si->ksi_flags & KSI_PTRACE) != 0) { SIGADDSET(sq->sq_ptrace, signo); ret = 0; goto out_set_bit; } else if ((si->ksi_flags & KSI_TRAP) != 0 || (si->ksi_flags & KSI_SIGQ) == 0) { SIGADDSET(sq->sq_kill, signo); ret = 0; goto out_set_bit; } return (ret); } out_set_bit: SIGADDSET(sq->sq_signals, signo); return (ret); } void sigqueue_flush(sigqueue_t *sq) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (p != NULL) PROC_LOCK_ASSERT(p, MA_OWNED); while ((ksi = TAILQ_FIRST(&sq->sq_list)) != NULL) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } SIGEMPTYSET(sq->sq_signals); SIGEMPTYSET(sq->sq_kill); SIGEMPTYSET(sq->sq_ptrace); } static void sigqueue_move_set(sigqueue_t *src, sigqueue_t *dst, const sigset_t *set) { sigset_t tmp; struct proc *p1, *p2; 
ksiginfo_t *ksi, *next; KASSERT(src->sq_flags & SQ_INIT, ("src sigqueue not inited")); KASSERT(dst->sq_flags & SQ_INIT, ("dst sigqueue not inited")); p1 = src->sq_proc; p2 = dst->sq_proc; /* Move siginfo to target list */ TAILQ_FOREACH_SAFE(ksi, &src->sq_list, ksi_link, next) { if (SIGISMEMBER(*set, ksi->ksi_signo)) { TAILQ_REMOVE(&src->sq_list, ksi, ksi_link); if (p1 != NULL) p1->p_pendingcnt--; TAILQ_INSERT_TAIL(&dst->sq_list, ksi, ksi_link); ksi->ksi_sigq = dst; if (p2 != NULL) p2->p_pendingcnt++; } } /* Move pending bits to target list */ tmp = src->sq_kill; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_kill, tmp); SIGSETNAND(src->sq_kill, tmp); tmp = src->sq_ptrace; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_ptrace, tmp); SIGSETNAND(src->sq_ptrace, tmp); tmp = src->sq_signals; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_signals, tmp); SIGSETNAND(src->sq_signals, tmp); } #if 0 static void sigqueue_move(sigqueue_t *src, sigqueue_t *dst, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_move_set(src, dst, &set); } #endif static void sigqueue_delete_set(sigqueue_t *sq, const sigset_t *set) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi, *next; KASSERT(sq->sq_flags & SQ_INIT, ("src sigqueue not inited")); /* Remove siginfo queue */ TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (SIGISMEMBER(*set, ksi->ksi_signo)) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } } SIGSETNAND(sq->sq_kill, *set); SIGSETNAND(sq->sq_ptrace, *set); SIGSETNAND(sq->sq_signals, *set); } void sigqueue_delete(sigqueue_t *sq, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set(sq, &set); } /* Remove a set of signals for a process */ static void sigqueue_delete_set_proc(struct proc *p, const sigset_t *set) { sigqueue_t worklist; struct thread *td0; PROC_LOCK_ASSERT(p, MA_OWNED); sigqueue_init(&worklist, NULL); sigqueue_move_set(&p->p_sigqueue, &worklist, set); FOREACH_THREAD_IN_PROC(p, td0) sigqueue_move_set(&td0->td_sigqueue, &worklist, set); sigqueue_flush(&worklist); } void sigqueue_delete_proc(struct proc *p, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set_proc(p, &set); } static void sigqueue_delete_stopmask_proc(struct proc *p) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, SIGSTOP); SIGADDSET(set, SIGTSTP); SIGADDSET(set, SIGTTIN); SIGADDSET(set, SIGTTOU); sigqueue_delete_set_proc(p, &set); } /* * Determine signal that should be delivered to thread td, the current * thread, 0 if none. If there is a pending stop signal with default * action, the process stops in issignal(). */ int cursig(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_NOTOWNED); return (SIGPENDING(td) ? issignal(td) : 0); } /* * Arrange for ast() to handle unmasked pending signals on return to user * mode. This must be called whenever a signal is added to td_sigqueue or * unmasked in td_sigmask. */ void signotify(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); if (SIGPENDING(td)) ast_sched(td, TDA_SIG); } /* * Returns 1 (true) if altstack is configured for the thread, and the * passed stack bottom address falls into the altstack range. Handles * the 43 compat special case where the alt stack size is zero. 
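 *
 * Illustrative example with made-up addresses: with td_sigstk.ss_sp ==
 * 0x7fffde000000 and ss_size == 0x20000, any sp in the half-open range
 * [0x7fffde000000, 0x7fffde020000) makes sigonstack() return true, provided
 * TDP_ALTSTACK is set for the thread.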
*/ int sigonstack(size_t sp) { struct thread *td; td = curthread; if ((td->td_pflags & TDP_ALTSTACK) == 0) return (0); #if defined(COMPAT_43) if (SV_PROC_FLAG(td->td_proc, SV_AOUT) && td->td_sigstk.ss_size == 0) return ((td->td_sigstk.ss_flags & SS_ONSTACK) != 0); #endif return (sp >= (size_t)td->td_sigstk.ss_sp && sp < td->td_sigstk.ss_size + (size_t)td->td_sigstk.ss_sp); } static __inline int sigprop(int sig) { if (sig > 0 && sig < nitems(sigproptbl)) return (sigproptbl[sig]); return (0); } static bool sigact_flag_test(const struct sigaction *act, int flag) { /* * SA_SIGINFO is reset when signal disposition is set to * ignore or default. Other flags are kept according to user * settings. */ return ((act->sa_flags & flag) != 0 && (flag != SA_SIGINFO || ((__sighandler_t *)act->sa_sigaction != SIG_IGN && (__sighandler_t *)act->sa_sigaction != SIG_DFL))); } /* * kern_sigaction * sigaction * freebsd4_sigaction * osigaction */ int kern_sigaction(struct thread *td, int sig, const struct sigaction *act, struct sigaction *oact, int flags) { struct sigacts *ps; struct proc *p = td->td_proc; if (!_SIG_VALID(sig)) return (EINVAL); if (act != NULL && act->sa_handler != SIG_DFL && act->sa_handler != SIG_IGN && (act->sa_flags & ~(SA_ONSTACK | SA_RESTART | SA_RESETHAND | SA_NOCLDSTOP | SA_NODEFER | SA_NOCLDWAIT | SA_SIGINFO)) != 0) return (EINVAL); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if (oact) { memset(oact, 0, sizeof(*oact)); oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)]; if (SIGISMEMBER(ps->ps_sigonstack, sig)) oact->sa_flags |= SA_ONSTACK; if (!SIGISMEMBER(ps->ps_sigintr, sig)) oact->sa_flags |= SA_RESTART; if (SIGISMEMBER(ps->ps_sigreset, sig)) oact->sa_flags |= SA_RESETHAND; if (SIGISMEMBER(ps->ps_signodefer, sig)) oact->sa_flags |= SA_NODEFER; if (SIGISMEMBER(ps->ps_siginfo, sig)) { oact->sa_flags |= SA_SIGINFO; oact->sa_sigaction = (__siginfohandler_t *)ps->ps_sigact[_SIG_IDX(sig)]; } else oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)]; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDSTOP) oact->sa_flags |= SA_NOCLDSTOP; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDWAIT) oact->sa_flags |= SA_NOCLDWAIT; } if (act) { if ((sig == SIGKILL || sig == SIGSTOP) && act->sa_handler != SIG_DFL) { mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (EINVAL); } /* * Change setting atomically. */ ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask; SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]); if (sigact_flag_test(act, SA_SIGINFO)) { ps->ps_sigact[_SIG_IDX(sig)] = (__sighandler_t *)act->sa_sigaction; SIGADDSET(ps->ps_siginfo, sig); } else { ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler; SIGDELSET(ps->ps_siginfo, sig); } if (!sigact_flag_test(act, SA_RESTART)) SIGADDSET(ps->ps_sigintr, sig); else SIGDELSET(ps->ps_sigintr, sig); if (sigact_flag_test(act, SA_ONSTACK)) SIGADDSET(ps->ps_sigonstack, sig); else SIGDELSET(ps->ps_sigonstack, sig); if (sigact_flag_test(act, SA_RESETHAND)) SIGADDSET(ps->ps_sigreset, sig); else SIGDELSET(ps->ps_sigreset, sig); if (sigact_flag_test(act, SA_NODEFER)) SIGADDSET(ps->ps_signodefer, sig); else SIGDELSET(ps->ps_signodefer, sig); if (sig == SIGCHLD) { if (act->sa_flags & SA_NOCLDSTOP) ps->ps_flag |= PS_NOCLDSTOP; else ps->ps_flag &= ~PS_NOCLDSTOP; if (act->sa_flags & SA_NOCLDWAIT) { /* * Paranoia: since SA_NOCLDWAIT is implemented * by reparenting the dying child to PID 1 (and * trust it to reap the zombie), PID 1 itself * is forbidden to set SA_NOCLDWAIT. 
*/ if (p->p_pid == 1) ps->ps_flag &= ~PS_NOCLDWAIT; else ps->ps_flag |= PS_NOCLDWAIT; } else ps->ps_flag &= ~PS_NOCLDWAIT; if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_flag |= PS_CLDSIGIGN; else ps->ps_flag &= ~PS_CLDSIGIGN; } /* * Set bit in ps_sigignore for signals that are set to SIG_IGN, * and for signals set to SIG_DFL where the default is to * ignore. However, don't put SIGCONT in ps_sigignore, as we * have to restart the process. */ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || (sigprop(sig) & SIGPROP_IGNORE && ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) { /* never to be seen again */ sigqueue_delete_proc(p, sig); if (sig != SIGCONT) /* easier in psignal */ SIGADDSET(ps->ps_sigignore, sig); SIGDELSET(ps->ps_sigcatch, sig); } else { SIGDELSET(ps->ps_sigignore, sig); if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL) SIGDELSET(ps->ps_sigcatch, sig); else SIGADDSET(ps->ps_sigcatch, sig); } #ifdef COMPAT_FREEBSD4 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_FREEBSD4) == 0) SIGDELSET(ps->ps_freebsd4, sig); else SIGADDSET(ps->ps_freebsd4, sig); #endif #ifdef COMPAT_43 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_OSIGSET) == 0) SIGDELSET(ps->ps_osigset, sig); else SIGADDSET(ps->ps_osigset, sig); #endif } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int sys_sigaction(struct thread *td, struct sigaction_args *uap) { struct sigaction act, oact; struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, 0); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #ifdef COMPAT_FREEBSD4 #ifndef _SYS_SYSPROTO_H_ struct freebsd4_sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int freebsd4_sigaction(struct thread *td, struct freebsd4_sigaction_args *uap) { struct sigaction act, oact; struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, KSA_FREEBSD4); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #endif /* COMAPT_FREEBSD4 */ #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigaction_args { int signum; struct osigaction *nsa; struct osigaction *osa; }; #endif int osigaction(struct thread *td, struct osigaction_args *uap) { struct osigaction sa; struct sigaction nsa, osa; struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsa != NULL) ? &nsa : NULL; osap = (uap->osa != NULL) ? 
&osa : NULL; if (nsap) { error = copyin(uap->nsa, &sa, sizeof(sa)); if (error) return (error); nsap->sa_handler = sa.sa_handler; nsap->sa_flags = sa.sa_flags; OSIG2SIG(sa.sa_mask, nsap->sa_mask); } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { sa.sa_handler = osap->sa_handler; sa.sa_flags = osap->sa_flags; SIG2OSIG(osap->sa_mask, sa.sa_mask); error = copyout(&sa, uap->osa, sizeof(sa)); } return (error); } #if !defined(__i386__) /* Avoid replicating the same stub everywhere */ int osigreturn(struct thread *td, struct osigreturn_args *uap) { return (nosys(td, (struct nosys_args *)uap)); } #endif #endif /* COMPAT_43 */ /* * Initialize signal state for process 0; * set to ignore signals that are ignored by default. */ void siginit(struct proc *p) { int i; struct sigacts *ps; PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); for (i = 1; i <= NSIG; i++) { if (sigprop(i) & SIGPROP_IGNORE && i != SIGCONT) { SIGADDSET(ps->ps_sigignore, i); } } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); } /* * Reset specified signal to the default disposition. */ static void sigdflt(struct sigacts *ps, int sig) { mtx_assert(&ps->ps_mtx, MA_OWNED); SIGDELSET(ps->ps_sigcatch, sig); if ((sigprop(sig) & SIGPROP_IGNORE) != 0 && sig != SIGCONT) SIGADDSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; SIGDELSET(ps->ps_siginfo, sig); } /* * Reset signals for an exec of the specified process. */ void execsigs(struct proc *p) { struct sigacts *ps; struct thread *td; /* * Reset caught signals. Held signals remain held * through td_sigmask (unless they were caught, * and are now ignored by default). */ PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); sig_drop_caught(p); /* * Reset stack state to the user stack. * Clear set of signals caught on the signal stack. */ td = curthread; MPASS(td->td_proc == p); td->td_sigstk.ss_flags = SS_DISABLE; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_sp = 0; td->td_pflags &= ~TDP_ALTSTACK; /* * Reset no zombies if child dies flag as Solaris does. */ ps->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN); if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL; mtx_unlock(&ps->ps_mtx); } /* * kern_sigprocmask() * * Manipulate signal mask. */ int kern_sigprocmask(struct thread *td, int how, sigset_t *set, sigset_t *oset, int flags) { sigset_t new_block, oset1; struct proc *p; int error; p = td->td_proc; if ((flags & SIGPROCMASK_PROC_LOCKED) != 0) PROC_LOCK_ASSERT(p, MA_OWNED); else PROC_LOCK(p); mtx_assert(&p->p_sigacts->ps_mtx, (flags & SIGPROCMASK_PS_LOCKED) != 0 ? MA_OWNED : MA_NOTOWNED); if (oset != NULL) *oset = td->td_sigmask; error = 0; if (set != NULL) { switch (how) { case SIG_BLOCK: SIG_CANTMASK(*set); oset1 = td->td_sigmask; SIGSETOR(td->td_sigmask, *set); new_block = td->td_sigmask; SIGSETNAND(new_block, oset1); break; case SIG_UNBLOCK: SIGSETNAND(td->td_sigmask, *set); signotify(td); goto out; case SIG_SETMASK: SIG_CANTMASK(*set); oset1 = td->td_sigmask; if (flags & SIGPROCMASK_OLD) SIGSETLO(td->td_sigmask, *set); else td->td_sigmask = *set; new_block = td->td_sigmask; SIGSETNAND(new_block, oset1); signotify(td); break; default: error = EINVAL; goto out; } /* * The new_block set contains signals that were not previously * blocked, but are blocked now. * * In case we block any signal that was not previously blocked * for td, and process has the signal pending, try to schedule * signal delivery to some thread that does not block the * signal, possibly waking it up. 
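 *
 * Concrete (hypothetical) case: a multi-threaded process has SIGTERM
 * pending in p_sigqueue and this thread just added SIGTERM to its mask via
 * SIG_BLOCK; new_block then contains SIGTERM, and reschedule_signals()
 * below hands delivery to another thread that still has it unblocked.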
*/ if (p->p_numthreads != 1) reschedule_signals(p, new_block, flags); } out: if (!(flags & SIGPROCMASK_PROC_LOCKED)) PROC_UNLOCK(p); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigprocmask_args { int how; const sigset_t *set; sigset_t *oset; }; #endif int sys_sigprocmask(struct thread *td, struct sigprocmask_args *uap) { sigset_t set, oset; sigset_t *setp, *osetp; int error; setp = (uap->set != NULL) ? &set : NULL; osetp = (uap->oset != NULL) ? &oset : NULL; if (setp) { error = copyin(uap->set, setp, sizeof(set)); if (error) return (error); } error = kern_sigprocmask(td, uap->how, setp, osetp, 0); if (osetp && !error) { error = copyout(osetp, uap->oset, sizeof(oset)); } return (error); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigprocmask_args { int how; osigset_t mask; }; #endif int osigprocmask(struct thread *td, struct osigprocmask_args *uap) { sigset_t set, oset; int error; OSIG2SIG(uap->mask, set); error = kern_sigprocmask(td, uap->how, &set, &oset, 1); SIG2OSIG(oset, td->td_retval[0]); return (error); } #endif /* COMPAT_43 */ int sys_sigwait(struct thread *td, struct sigwait_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) { td->td_retval[0] = error; return (0); } error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) { /* * sigwait() function shall not return EINTR, but * the syscall does. Non-ancient libc provides the * wrapper which hides EINTR. Otherwise, EINTR return * is used by libthr to handle required cancellation * point in the sigwait(). */ if (error == EINTR && td->td_proc->p_osrel < P_OSREL_SIGWAIT) return (ERESTART); td->td_retval[0] = error; return (0); } error = copyout(&ksi.ksi_signo, uap->sig, sizeof(ksi.ksi_signo)); td->td_retval[0] = error; return (0); } int sys_sigtimedwait(struct thread *td, struct sigtimedwait_args *uap) { struct timespec ts; struct timespec *timeout; sigset_t set; ksiginfo_t ksi; int error; if (uap->timeout) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); timeout = &ts; } else timeout = NULL; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, timeout); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } int sys_sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } static void proc_td_siginfo_capture(struct thread *td, siginfo_t *si) { struct thread *thr; FOREACH_THREAD_IN_PROC(td->td_proc, thr) { if (thr == td) thr->td_si = *si; else thr->td_si.si_signo = 0; } } int kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi, struct timespec *timeout) { struct sigacts *ps; sigset_t saved_mask, new_block; struct proc *p; int error, sig, timevalid = 0; sbintime_t sbt, precision, tsbt; struct timespec ts; bool traced; p = td->td_proc; error = 0; traced = false; /* Ensure the sigfastblock value is up to date. 
*/ sigfastblock_fetch(td); if (timeout != NULL) { if (timeout->tv_nsec >= 0 && timeout->tv_nsec < 1000000000) { timevalid = 1; ts = *timeout; if (ts.tv_sec < INT32_MAX / 2) { tsbt = tstosbt(ts); precision = tsbt; precision >>= tc_precexp; if (TIMESEL(&sbt, tsbt)) sbt += tc_tick_sbt; sbt += tsbt; } else precision = sbt = 0; } } else precision = sbt = 0; ksiginfo_init(ksi); /* Some signals can not be waited for. */ SIG_CANTMASK(waitset); ps = p->p_sigacts; PROC_LOCK(p); saved_mask = td->td_sigmask; SIGSETNAND(td->td_sigmask, waitset); if ((p->p_sysent->sv_flags & SV_SIG_DISCIGN) != 0 || !kern_sig_discard_ign) { thread_lock(td); td->td_flags |= TDF_SIGWAIT; thread_unlock(td); } for (;;) { mtx_lock(&ps->ps_mtx); sig = cursig(td); mtx_unlock(&ps->ps_mtx); KASSERT(sig >= 0, ("sig %d", sig)); if (sig != 0 && SIGISMEMBER(waitset, sig)) { if (sigqueue_get(&td->td_sigqueue, sig, ksi) != 0 || sigqueue_get(&p->p_sigqueue, sig, ksi) != 0) { error = 0; break; } } if (error != 0) break; /* * POSIX says this must be checked after looking for pending * signals. */ if (timeout != NULL && !timevalid) { error = EINVAL; break; } if (traced) { error = EINTR; break; } error = msleep_sbt(&p->p_sigacts, &p->p_mtx, PPAUSE | PCATCH, "sigwait", sbt, precision, C_ABSOLUTE); /* The syscalls can not be restarted. */ if (error == ERESTART) error = EINTR; /* * If PTRACE_SCE or PTRACE_SCX were set after * userspace entered the syscall, return spurious * EINTR after wait was done. Only do this as last * resort after rechecking for possible queued signals * and expired timeouts. */ if (error == 0 && (p->p_ptevents & PTRACE_SYSCALL) != 0) traced = true; } thread_lock(td); td->td_flags &= ~TDF_SIGWAIT; thread_unlock(td); new_block = saved_mask; SIGSETNAND(new_block, td->td_sigmask); td->td_sigmask = saved_mask; /* * Fewer signals can be delivered to us, reschedule signal * notification. */ if (p->p_numthreads != 1) reschedule_signals(p, new_block, 0); if (error == 0) { SDT_PROBE2(proc, , , signal__clear, sig, ksi); if (ksi->ksi_code == SI_TIMER) itimer_accept(p, ksi->ksi_timerid, ksi); #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) { sig_t action; mtx_lock(&ps->ps_mtx); action = ps->ps_sigact[_SIG_IDX(sig)]; mtx_unlock(&ps->ps_mtx); ktrpsig(sig, action, &td->td_sigmask, ksi->ksi_code); } #endif if (sig == SIGKILL) { proc_td_siginfo_capture(td, &ksi->ksi_info); sigexit(td, sig); } } PROC_UNLOCK(p); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigpending_args { sigset_t *set; }; #endif int sys_sigpending(struct thread *td, struct sigpending_args *uap) { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); return (copyout(&pending, uap->set, sizeof(sigset_t))); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigpending_args { int dummy; }; #endif int osigpending(struct thread *td, struct osigpending_args *uap) { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); SIG2OSIG(pending, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) /* * Generalized interface signal handler, 4.3-compatible. 
*/ #ifndef _SYS_SYSPROTO_H_ struct osigvec_args { int signum; struct sigvec *nsv; struct sigvec *osv; }; #endif /* ARGSUSED */ int osigvec(struct thread *td, struct osigvec_args *uap) { struct sigvec vec; struct sigaction nsa, osa; struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsv != NULL) ? &nsa : NULL; osap = (uap->osv != NULL) ? &osa : NULL; if (nsap) { error = copyin(uap->nsv, &vec, sizeof(vec)); if (error) return (error); nsap->sa_handler = vec.sv_handler; OSIG2SIG(vec.sv_mask, nsap->sa_mask); nsap->sa_flags = vec.sv_flags; nsap->sa_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */ } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { vec.sv_handler = osap->sa_handler; SIG2OSIG(osap->sa_mask, vec.sv_mask); vec.sv_flags = osap->sa_flags; vec.sv_flags &= ~SA_NOCLDWAIT; vec.sv_flags ^= SA_RESTART; error = copyout(&vec, uap->osv, sizeof(vec)); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct osigblock_args { int mask; }; #endif int osigblock(struct thread *td, struct osigblock_args *uap) { sigset_t set, oset; OSIG2SIG(uap->mask, set); kern_sigprocmask(td, SIG_BLOCK, &set, &oset, 0); SIG2OSIG(oset, td->td_retval[0]); return (0); } #ifndef _SYS_SYSPROTO_H_ struct osigsetmask_args { int mask; }; #endif int osigsetmask(struct thread *td, struct osigsetmask_args *uap) { sigset_t set, oset; OSIG2SIG(uap->mask, set); kern_sigprocmask(td, SIG_SETMASK, &set, &oset, 0); SIG2OSIG(oset, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ /* * Suspend calling thread until signal, providing mask to be set in the * meantime. */ #ifndef _SYS_SYSPROTO_H_ struct sigsuspend_args { const sigset_t *sigmask; }; #endif /* ARGSUSED */ int sys_sigsuspend(struct thread *td, struct sigsuspend_args *uap) { sigset_t mask; int error; error = copyin(uap->sigmask, &mask, sizeof(mask)); if (error) return (error); return (kern_sigsuspend(td, mask)); } int kern_sigsuspend(struct thread *td, sigset_t mask) { struct proc *p = td->td_proc; int has_sig, sig; /* Ensure the sigfastblock value is up to date. */ sigfastblock_fetch(td); /* * When returning from sigsuspend, we want * the old mask to be restored after the * signal handler has finished. Thus, we * save it here and mark the sigacts structure * to indicate this. */ PROC_LOCK(p); kern_sigprocmask(td, SIG_SETMASK, &mask, &td->td_oldsigmask, SIGPROCMASK_PROC_LOCKED); td->td_pflags |= TDP_OLDMASK; ast_sched(td, TDA_SIGSUSPEND); /* * Process signals now. Otherwise, we can get spurious wakeup * due to signal entered process queue, but delivered to other * thread. But sigsuspend should return only on signal * delivery. */ (p->p_sysent->sv_set_syscall_retval)(td, EINTR); for (has_sig = 0; !has_sig;) { while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "pause", 0) == 0) /* void */; thread_suspend_check(0); mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) { KASSERT(sig >= 0, ("sig %d", sig)); has_sig += postsig(sig); } mtx_unlock(&p->p_sigacts->ps_mtx); /* * If PTRACE_SCE or PTRACE_SCX were set after * userspace entered the syscall, return spurious * EINTR. */ if ((p->p_ptevents & PTRACE_SYSCALL) != 0) has_sig += 1; } PROC_UNLOCK(p); td->td_errno = EINTR; td->td_pflags |= TDP_NERRNO; return (EJUSTRETURN); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ /* * Compatibility sigsuspend call for old binaries. Note nonstandard calling * convention: libc stub passes mask, not pointer, to save a copyin. 
*/ #ifndef _SYS_SYSPROTO_H_ struct osigsuspend_args { osigset_t mask; }; #endif /* ARGSUSED */ int osigsuspend(struct thread *td, struct osigsuspend_args *uap) { sigset_t mask; OSIG2SIG(uap->mask, mask); return (kern_sigsuspend(td, mask)); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct osigstack_args { struct sigstack *nss; struct sigstack *oss; }; #endif /* ARGSUSED */ int osigstack(struct thread *td, struct osigstack_args *uap) { struct sigstack nss, oss; int error = 0; if (uap->nss != NULL) { error = copyin(uap->nss, &nss, sizeof(nss)); if (error) return (error); } oss.ss_sp = td->td_sigstk.ss_sp; oss.ss_onstack = sigonstack(cpu_getstack(td)); if (uap->nss != NULL) { td->td_sigstk.ss_sp = nss.ss_sp; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_flags |= nss.ss_onstack & SS_ONSTACK; td->td_pflags |= TDP_ALTSTACK; } if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(oss)); return (error); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigaltstack_args { stack_t *ss; stack_t *oss; }; #endif /* ARGSUSED */ int sys_sigaltstack(struct thread *td, struct sigaltstack_args *uap) { stack_t ss, oss; int error; if (uap->ss != NULL) { error = copyin(uap->ss, &ss, sizeof(ss)); if (error) return (error); } error = kern_sigaltstack(td, (uap->ss != NULL) ? &ss : NULL, (uap->oss != NULL) ? &oss : NULL); if (error) return (error); if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(stack_t)); return (error); } int kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss) { struct proc *p = td->td_proc; int oonstack; oonstack = sigonstack(cpu_getstack(td)); if (oss != NULL) { *oss = td->td_sigstk; oss->ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; } if (ss != NULL) { if (oonstack) return (EPERM); if ((ss->ss_flags & ~SS_DISABLE) != 0) return (EINVAL); if (!(ss->ss_flags & SS_DISABLE)) { if (ss->ss_size < p->p_sysent->sv_minsigstksz) return (ENOMEM); td->td_sigstk = *ss; td->td_pflags |= TDP_ALTSTACK; } else { td->td_pflags &= ~TDP_ALTSTACK; } } return (0); } struct killpg1_ctx { struct thread *td; ksiginfo_t *ksi; int sig; bool sent; bool found; int ret; }; static void killpg1_sendsig_locked(struct proc *p, struct killpg1_ctx *arg) { int err; err = p_cansignal(arg->td, p, arg->sig); if (err == 0 && arg->sig != 0) pksignal(p, arg->sig, arg->ksi); if (err != ESRCH) arg->found = true; if (err == 0) arg->sent = true; else if (arg->ret == 0 && err != ESRCH && err != EPERM) arg->ret = err; } static void killpg1_sendsig(struct proc *p, bool notself, struct killpg1_ctx *arg) { if (p->p_pid <= 1 || (p->p_flag & P_SYSTEM) != 0 || (notself && p == arg->td->td_proc) || p->p_state == PRS_NEW) return; PROC_LOCK(p); killpg1_sendsig_locked(p, arg); PROC_UNLOCK(p); } static void kill_processes_prison_cb(struct proc *p, void *arg) { struct killpg1_ctx *ctx = arg; if (p->p_pid <= 1 || (p->p_flag & P_SYSTEM) != 0 || (p == ctx->td->td_proc) || p->p_state == PRS_NEW) return; killpg1_sendsig_locked(p, ctx); } /* * Common code for kill process group/broadcast kill. * td is the calling thread, as usual. 
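 *
 * For reference, kern_kill() below maps the usual kill(2) pid encodings
 * onto this helper (a sketch of the existing dispatch, not new behaviour):
 *
 *	pid > 0		signal that single process directly
 *	pid == -1	killpg1(td, signum, 0, 1, ksi)	broadcast
 *	pid == 0	killpg1(td, signum, 0, 0, ksi)	caller's process group
 *	pid < -1	killpg1(td, signum, -pid, 0, ksi)	process group -pid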
*/ static int killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi) { struct proc *p; struct pgrp *pgrp; struct killpg1_ctx arg; arg.td = td; arg.ksi = ksi; arg.sig = sig; arg.sent = false; arg.found = false; arg.ret = 0; if (all) { /* * broadcast */ prison_proc_iterate(td->td_ucred->cr_prison, kill_processes_prison_cb, &arg); } else { again: sx_slock(&proctree_lock); if (pgid == 0) { /* * zero pgid means send to my process group. */ pgrp = td->td_proc->p_pgrp; PGRP_LOCK(pgrp); } else { pgrp = pgfind(pgid); if (pgrp == NULL) { sx_sunlock(&proctree_lock); return (ESRCH); } } sx_sunlock(&proctree_lock); if (!sx_try_xlock(&pgrp->pg_killsx)) { PGRP_UNLOCK(pgrp); sx_xlock(&pgrp->pg_killsx); sx_xunlock(&pgrp->pg_killsx); goto again; } LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { killpg1_sendsig(p, false, &arg); } PGRP_UNLOCK(pgrp); sx_xunlock(&pgrp->pg_killsx); } MPASS(arg.ret != 0 || arg.found || !arg.sent); if (arg.ret == 0 && !arg.sent) arg.ret = arg.found ? EPERM : ESRCH; return (arg.ret); } #ifndef _SYS_SYSPROTO_H_ struct kill_args { int pid; int signum; }; #endif /* ARGSUSED */ int sys_kill(struct thread *td, struct kill_args *uap) { return (kern_kill(td, uap->pid, uap->signum)); } int kern_kill(struct thread *td, pid_t pid, int signum) { ksiginfo_t ksi; struct proc *p; int error; /* * A process in capability mode can send signals only to himself. * The main rationale behind this is that abort(3) is implemented as * kill(getpid(), SIGABRT). */ if (IN_CAPABILITY_MODE(td) && pid != td->td_proc->p_pid) return (ECAPMODE); AUDIT_ARG_SIGNUM(signum); AUDIT_ARG_PID(pid); if ((u_int)signum > _SIG_MAXSIG) return (EINVAL); ksiginfo_init(&ksi); ksi.ksi_signo = signum; ksi.ksi_code = SI_USER; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; if (pid > 0) { /* kill single process */ if ((p = pfind_any(pid)) == NULL) return (ESRCH); AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, signum); if (error == 0 && signum) pksignal(p, signum, &ksi); PROC_UNLOCK(p); return (error); } switch (pid) { case -1: /* broadcast signal */ return (killpg1(td, signum, 0, 1, &ksi)); case 0: /* signal own process group */ return (killpg1(td, signum, 0, 0, &ksi)); default: /* negative explicit process group */ return (killpg1(td, signum, -pid, 0, &ksi)); } /* NOTREACHED */ } int sys_pdkill(struct thread *td, struct pdkill_args *uap) { struct proc *p; int error; AUDIT_ARG_SIGNUM(uap->signum); AUDIT_ARG_FD(uap->fd); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); error = procdesc_find(td, uap->fd, &cap_pdkill_rights, &p); if (error) return (error); AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, uap->signum); if (error == 0 && uap->signum) kern_psignal(p, uap->signum); PROC_UNLOCK(p); return (error); } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct okillpg_args { int pgid; int signum; }; #endif /* ARGSUSED */ int okillpg(struct thread *td, struct okillpg_args *uap) { ksiginfo_t ksi; AUDIT_ARG_SIGNUM(uap->signum); AUDIT_ARG_PID(uap->pgid); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); ksiginfo_init(&ksi); ksi.ksi_signo = uap->signum; ksi.ksi_code = SI_USER; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; return (killpg1(td, uap->signum, uap->pgid, 0, &ksi)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigqueue_args { pid_t pid; int signum; /* union sigval */ void *value; }; #endif int sys_sigqueue(struct thread *td, struct sigqueue_args *uap) { union sigval sv; sv.sival_ptr = uap->value; return (kern_sigqueue(td, uap->pid, 
uap->signum, &sv)); } int kern_sigqueue(struct thread *td, pid_t pid, int signum, union sigval *value) { ksiginfo_t ksi; struct proc *p; int error; if ((u_int)signum > _SIG_MAXSIG) return (EINVAL); /* * Specification says sigqueue can only send signal to * single process. */ if (pid <= 0) return (EINVAL); if ((p = pfind_any(pid)) == NULL) return (ESRCH); error = p_cansignal(td, p, signum); if (error == 0 && signum != 0) { ksiginfo_init(&ksi); ksi.ksi_flags = KSI_SIGQ; ksi.ksi_signo = signum; ksi.ksi_code = SI_QUEUE; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; ksi.ksi_value = *value; error = pksignal(p, ksi.ksi_signo, &ksi); } PROC_UNLOCK(p); return (error); } /* * Send a signal to a process group. If checktty is 1, * limit to members which have a controlling terminal. */ void pgsignal(struct pgrp *pgrp, int sig, int checkctty, ksiginfo_t *ksi) { struct proc *p; if (pgrp) { PGRP_LOCK_ASSERT(pgrp, MA_OWNED); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && (checkctty == 0 || p->p_flag & P_CONTROLT)) pksignal(p, sig, ksi); PROC_UNLOCK(p); } } } /* * Recalculate the signal mask and reset the signal disposition after * usermode frame for delivery is formed. Should be called after * mach-specific routine, because sysent->sv_sendsig() needs correct * ps_siginfo and signal mask. */ static void postsig_done(int sig, struct thread *td, struct sigacts *ps) { sigset_t mask; mtx_assert(&ps->ps_mtx, MA_OWNED); td->td_ru.ru_nsignals++; mask = ps->ps_catchmask[_SIG_IDX(sig)]; if (!SIGISMEMBER(ps->ps_signodefer, sig)) SIGADDSET(mask, sig); kern_sigprocmask(td, SIG_BLOCK, &mask, NULL, SIGPROCMASK_PROC_LOCKED | SIGPROCMASK_PS_LOCKED); if (SIGISMEMBER(ps->ps_sigreset, sig)) sigdflt(ps, sig); } /* * Send a signal caused by a trap to the current thread. If it will be * caught immediately, deliver it with correct code. Otherwise, post it * normally. */ void trapsignal(struct thread *td, ksiginfo_t *ksi) { struct sigacts *ps; struct proc *p; sigset_t sigmask; int sig; p = td->td_proc; sig = ksi->ksi_signo; KASSERT(_SIG_VALID(sig), ("invalid signal")); sigfastblock_fetch(td); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); sigmask = td->td_sigmask; if (td->td_sigblock_val != 0) SIGSETOR(sigmask, fastblock_mask); if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) && !SIGISMEMBER(sigmask, sig)) { #ifdef KTRACE if (KTRPOINT(curthread, KTR_PSIG)) ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)], &td->td_sigmask, ksi->ksi_code); #endif (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], ksi, &td->td_sigmask); postsig_done(sig, td, ps); mtx_unlock(&ps->ps_mtx); } else { /* * Avoid a possible infinite loop if the thread * masking the signal or process is ignoring the * signal. */ if (kern_forcesigexit && (SIGISMEMBER(sigmask, sig) || ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN)) { SIGDELSET(td->td_sigmask, sig); SIGDELSET(ps->ps_sigcatch, sig); SIGDELSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; td->td_pflags &= ~TDP_SIGFASTBLOCK; td->td_sigblock_val = 0; } mtx_unlock(&ps->ps_mtx); p->p_sig = sig; /* XXX to verify code */ tdsendsignal(p, td, sig, ksi); } PROC_UNLOCK(p); } static struct thread * sigtd(struct proc *p, int sig, bool fast_sigblock) { struct thread *td, *signal_td; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS(!fast_sigblock || p == curproc); /* * Check if current thread can handle the signal without * switching context to another thread. 
*/ if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig) && (!fast_sigblock || curthread->td_sigblock_val == 0)) return (curthread); /* Find a non-stopped thread that does not mask the signal. */ signal_td = NULL; FOREACH_THREAD_IN_PROC(p, td) { if (!SIGISMEMBER(td->td_sigmask, sig) && (!fast_sigblock || td != curthread || td->td_sigblock_val == 0) && (td->td_flags & TDF_BOUNDARY) == 0) { signal_td = td; break; } } /* Select random (first) thread if no better match was found. */ if (signal_td == NULL) signal_td = FIRST_THREAD_IN_PROC(p); return (signal_td); } /* * Send the signal to the process. If the signal has an action, the action * is usually performed by the target process rather than the caller; we add * the signal to the set of pending signals for the process. * * Exceptions: * o When a stop signal is sent to a sleeping process that takes the * default action, the process is stopped without awakening it. * o SIGCONT restarts stopped processes (or puts them back to sleep) * regardless of the signal action (eg, blocked or ignored). * * Other ignored signals are discarded immediately. * * NB: This function may be entered from the debugger via the "kill" DDB * command. There is little that can be done to mitigate the possibly messy * side effects of this unwise possibility. */ void kern_psignal(struct proc *p, int sig) { ksiginfo_t ksi; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; (void) tdsendsignal(p, NULL, sig, &ksi); } int pksignal(struct proc *p, int sig, ksiginfo_t *ksi) { return (tdsendsignal(p, NULL, sig, ksi)); } /* Utility function for finding a thread to send signal event to. */ int sigev_findtd(struct proc *p, struct sigevent *sigev, struct thread **ttd) { struct thread *td; if (sigev->sigev_notify == SIGEV_THREAD_ID) { td = tdfind(sigev->sigev_notify_thread_id, p->p_pid); if (td == NULL) return (ESRCH); *ttd = td; } else { *ttd = NULL; PROC_LOCK(p); } return (0); } void tdsignal(struct thread *td, int sig) { ksiginfo_t ksi; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; (void) tdsendsignal(td->td_proc, td, sig, &ksi); } void tdksignal(struct thread *td, int sig, ksiginfo_t *ksi) { (void) tdsendsignal(td->td_proc, td, sig, ksi); } static int sig_sleepq_abort(struct thread *td, int intrval) { THREAD_LOCK_ASSERT(td, MA_OWNED); if (intrval == 0 && (td->td_flags & TDF_SIGWAIT) == 0) { thread_unlock(td); return (0); } return (sleepq_abort(td, intrval)); } int tdsendsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi) { sig_t action; sigqueue_t *sigqueue; int prop; struct sigacts *ps; int intrval; int ret = 0; int wakeup_swapper; MPASS(td == NULL || p == td->td_proc); PROC_LOCK_ASSERT(p, MA_OWNED); if (!_SIG_VALID(sig)) panic("%s(): invalid signal %d", __func__, sig); KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("%s: ksi on queue", __func__)); /* * IEEE Std 1003.1-2001: return success when killing a zombie. */ if (p->p_state == PRS_ZOMBIE) { if (ksi != NULL && (ksi->ksi_flags & KSI_INS) != 0) ksiginfo_tryfree(ksi); return (ret); } ps = p->p_sigacts; KNOTE_LOCKED(p->p_klist, NOTE_SIGNAL | sig); prop = sigprop(sig); if (td == NULL) { td = sigtd(p, sig, false); sigqueue = &p->p_sigqueue; } else sigqueue = &td->td_sigqueue; SDT_PROBE3(proc, , , signal__send, td, p, sig); /* * If the signal is being ignored, then we forget about it * immediately, except when the target process executes * sigwait(). (Note: we don't set SIGCONT in ps_sigignore, * and if it is set to SIG_IGN, action will be SIG_DFL here.) 
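 *
 * Illustrative userland consequence (a sketch, not part of this file;
 * assumes the usual <signal.h> interfaces):
 *
 *	signal(SIGUSR1, SIG_IGN);
 *	kill(getpid(), SIGUSR1);
 *
 * The kill() succeeds, but nothing is queued or delivered unless the
 * target is waiting in sigwait(2), which is the exception handled
 * below.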
*/ mtx_lock(&ps->ps_mtx); if (SIGISMEMBER(ps->ps_sigignore, sig)) { if (kern_sig_discard_ign && (p->p_sysent->sv_flags & SV_SIG_DISCIGN) == 0) { SDT_PROBE3(proc, , , signal__discard, td, p, sig); mtx_unlock(&ps->ps_mtx); if (ksi != NULL && (ksi->ksi_flags & KSI_INS) != 0) ksiginfo_tryfree(ksi); return (ret); } else { action = SIG_CATCH; intrval = 0; } } else { if (SIGISMEMBER(td->td_sigmask, sig)) action = SIG_HOLD; else if (SIGISMEMBER(ps->ps_sigcatch, sig)) action = SIG_CATCH; else action = SIG_DFL; if (SIGISMEMBER(ps->ps_sigintr, sig)) intrval = EINTR; else intrval = ERESTART; } mtx_unlock(&ps->ps_mtx); if (prop & SIGPROP_CONT) sigqueue_delete_stopmask_proc(p); else if (prop & SIGPROP_STOP) { /* * If sending a tty stop signal to a member of an orphaned * process group, discard the signal here if the action * is default; don't stop the process below if sleeping, * and don't clear any pending SIGCONT. */ if ((prop & SIGPROP_TTYSTOP) != 0 && (p->p_pgrp->pg_flags & PGRP_ORPHANED) != 0 && action == SIG_DFL) { if (ksi != NULL && (ksi->ksi_flags & KSI_INS) != 0) ksiginfo_tryfree(ksi); return (ret); } sigqueue_delete_proc(p, SIGCONT); if (p->p_flag & P_CONTINUED) { p->p_flag &= ~P_CONTINUED; PROC_LOCK(p->p_pptr); sigqueue_take(p->p_ksi); PROC_UNLOCK(p->p_pptr); } } ret = sigqueue_add(sigqueue, sig, ksi); if (ret != 0) return (ret); signotify(td); /* * Defer further processing for signals which are held, * except that stopped processes must be continued by SIGCONT. */ if (action == SIG_HOLD && !((prop & SIGPROP_CONT) && (p->p_flag & P_STOPPED_SIG))) return (ret); wakeup_swapper = 0; /* * Some signals have a process-wide effect and a per-thread * component. Most processing occurs when the process next * tries to cross the user boundary, however there are some * times when processing needs to be done immediately, such as * waking up threads so that they can cross the user boundary. * We try to do the per-process part here. */ if (P_SHOULDSTOP(p)) { KASSERT(!(p->p_flag & P_WEXIT), ("signal to stopped but exiting process")); if (sig == SIGKILL) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * SIGKILL sets process running. * It will die elsewhere. * All threads must be restarted. */ p->p_flag &= ~P_STOPPED_SIG; goto runfast; } if (prop & SIGPROP_CONT) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * If SIGCONT is default (or ignored), we continue the * process but don't leave the signal in sigqueue as * it has no further action. If SIGCONT is held, we * continue the process and leave the signal in * sigqueue. If the process catches SIGCONT, let it * handle the signal itself. If it isn't waiting on * an event, it goes back to run state. * Otherwise, process goes back to sleep state. */ p->p_flag &= ~P_STOPPED_SIG; PROC_SLOCK(p); if (p->p_numthreads == p->p_suspcount) { PROC_SUNLOCK(p); p->p_flag |= P_CONTINUED; p->p_xsig = SIGCONT; PROC_LOCK(p->p_pptr); childproc_continued(p); PROC_UNLOCK(p->p_pptr); PROC_SLOCK(p); } if (action == SIG_DFL) { thread_unsuspend(p); PROC_SUNLOCK(p); sigqueue_delete(sigqueue, sig); goto out_cont; } if (action == SIG_CATCH) { /* * The process wants to catch it so it needs * to run at least one thread, but which one? */ PROC_SUNLOCK(p); goto runfast; } /* * The signal is not ignored or caught. 
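 * By elimination this is the SIG_HOLD case: SIGCONT is blocked by the
 * selected thread, so continue the stopped process but leave the
 * signal queued for later delivery.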
*/ thread_unsuspend(p); PROC_SUNLOCK(p); goto out_cont; } if (prop & SIGPROP_STOP) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * Already stopped, don't need to stop again * (If we did the shell could get confused). * Just make sure the signal STOP bit set. */ p->p_flag |= P_STOPPED_SIG; sigqueue_delete(sigqueue, sig); goto out; } /* * All other kinds of signals: * If a thread is sleeping interruptibly, simulate a * wakeup so that when it is continued it will be made * runnable and can look at the signal. However, don't make * the PROCESS runnable, leave it stopped. * It may run a bit until it hits a thread_suspend_check(). */ PROC_SLOCK(p); thread_lock(td); if (TD_CAN_ABORT(td)) wakeup_swapper = sig_sleepq_abort(td, intrval); else thread_unlock(td); PROC_SUNLOCK(p); goto out; /* * Mutexes are short lived. Threads waiting on them will * hit thread_suspend_check() soon. */ } else if (p->p_state == PRS_NORMAL) { if (p->p_flag & P_TRACED || action == SIG_CATCH) { tdsigwakeup(td, sig, action, intrval); goto out; } MPASS(action == SIG_DFL); if (prop & SIGPROP_STOP) { if (p->p_flag & (P_PPWAIT|P_WEXIT)) goto out; p->p_flag |= P_STOPPED_SIG; p->p_xsig = sig; PROC_SLOCK(p); wakeup_swapper = sig_suspend_threads(td, p); if (p->p_numthreads == p->p_suspcount) { /* * only thread sending signal to another * process can reach here, if thread is sending * signal to its process, because thread does * not suspend itself here, p_numthreads * should never be equal to p_suspcount. */ thread_stopped(p); PROC_SUNLOCK(p); sigqueue_delete_proc(p, p->p_xsig); } else PROC_SUNLOCK(p); goto out; } } else { /* Not in "NORMAL" state. discard the signal. */ sigqueue_delete(sigqueue, sig); goto out; } /* * The process is not stopped so we need to apply the signal to all the * running threads. */ runfast: tdsigwakeup(td, sig, action, intrval); PROC_SLOCK(p); thread_unsuspend(p); PROC_SUNLOCK(p); out_cont: itimer_proc_continue(p); kqtimer_proc_continue(p); out: /* If we jump here, proc slock should not be owned. */ PROC_SLOCK_ASSERT(p, MA_NOTOWNED); if (wakeup_swapper) kick_proc0(); return (ret); } /* * The force of a signal has been directed against a single * thread. We need to see what we can do about knocking it * out of any sleep it may be in etc. */ static void tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval) { struct proc *p = td->td_proc; int prop, wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); prop = sigprop(sig); PROC_SLOCK(p); thread_lock(td); /* * Bring the priority of a thread up if we want it to get * killed in this lifetime. Be careful to avoid bumping the * priority of the idle thread, since we still allow to signal * kernel processes. */ if (action == SIG_DFL && (prop & SIGPROP_KILL) != 0 && td->td_priority > PUSER && !TD_IS_IDLETHREAD(td)) sched_prio(td, PUSER); if (TD_ON_SLEEPQ(td)) { /* * If thread is sleeping uninterruptibly * we can't interrupt the sleep... the signal will * be noticed when the process returns through * trap() or syscall(). */ if ((td->td_flags & TDF_SINTR) == 0) goto out; /* * If SIGCONT is default (or ignored) and process is * asleep, we are finished; the process should not * be awakened. */ if ((prop & SIGPROP_CONT) && action == SIG_DFL) { thread_unlock(td); PROC_SUNLOCK(p); sigqueue_delete(&p->p_sigqueue, sig); /* * It may be on either list in this state. * Remove from both for now. 
*/ sigqueue_delete(&td->td_sigqueue, sig); return; } /* * Don't awaken a sleeping thread for SIGSTOP if the * STOP signal is deferred. */ if ((prop & SIGPROP_STOP) != 0 && (td->td_flags & (TDF_SBDRY | TDF_SERESTART | TDF_SEINTR)) == TDF_SBDRY) goto out; /* * Give low priority threads a better chance to run. */ if (td->td_priority > PUSER && !TD_IS_IDLETHREAD(td)) sched_prio(td, PUSER); wakeup_swapper = sig_sleepq_abort(td, intrval); PROC_SUNLOCK(p); if (wakeup_swapper) kick_proc0(); return; } /* * Other states do nothing with the signal immediately, * other than kicking ourselves if we are running. * It will either never be noticed, or noticed very soon. */ #ifdef SMP if (TD_IS_RUNNING(td) && td != curthread) forward_signal(td); #endif out: PROC_SUNLOCK(p); thread_unlock(td); } static void ptrace_coredumpreq(struct thread *td, struct proc *p, struct thr_coredump_req *tcq) { void *rl_cookie; if (p->p_sysent->sv_coredump == NULL) { tcq->tc_error = ENOSYS; return; } rl_cookie = vn_rangelock_wlock(tcq->tc_vp, 0, OFF_MAX); tcq->tc_error = p->p_sysent->sv_coredump(td, tcq->tc_vp, tcq->tc_limit, tcq->tc_flags); vn_rangelock_unlock(tcq->tc_vp, rl_cookie); } static void ptrace_syscallreq(struct thread *td, struct proc *p, struct thr_syscall_req *tsr) { struct sysentvec *sv; struct sysent *se; register_t rv_saved[2]; int error, nerror; int sc; bool audited, sy_thr_static; sv = p->p_sysent; if (sv->sv_table == NULL || sv->sv_size < tsr->ts_sa.code) { tsr->ts_ret.sr_error = ENOSYS; return; } sc = tsr->ts_sa.code; if (sc == SYS_syscall || sc == SYS___syscall) { sc = tsr->ts_sa.args[0]; memmove(&tsr->ts_sa.args[0], &tsr->ts_sa.args[1], sizeof(register_t) * (tsr->ts_nargs - 1)); } tsr->ts_sa.callp = se = &sv->sv_table[sc]; VM_CNT_INC(v_syscall); td->td_pticks = 0; if (__predict_false(td->td_cowgen != atomic_load_int( &td->td_proc->p_cowgen))) thread_cow_update(td); #ifdef CAPABILITY_MODE if (IN_CAPABILITY_MODE(td) && (se->sy_flags & SYF_CAPENABLED) == 0) { tsr->ts_ret.sr_error = ECAPMODE; return; } #endif sy_thr_static = (se->sy_thrcnt & SY_THR_STATIC) != 0; audited = AUDIT_SYSCALL_ENTER(sc, td) != 0; if (!sy_thr_static) { error = syscall_thread_enter(td, &se); sy_thr_static = (se->sy_thrcnt & SY_THR_STATIC) != 0; if (error != 0) { tsr->ts_ret.sr_error = error; return; } } rv_saved[0] = td->td_retval[0]; rv_saved[1] = td->td_retval[1]; nerror = td->td_errno; td->td_retval[0] = 0; td->td_retval[1] = 0; #ifdef KDTRACE_HOOKS if (se->sy_entry != 0) (*systrace_probe_func)(&tsr->ts_sa, SYSTRACE_ENTRY, 0); #endif tsr->ts_ret.sr_error = se->sy_call(td, tsr->ts_sa.args); #ifdef KDTRACE_HOOKS if (se->sy_return != 0) (*systrace_probe_func)(&tsr->ts_sa, SYSTRACE_RETURN, tsr->ts_ret.sr_error != 0 ? 
-1 : td->td_retval[0]); #endif tsr->ts_ret.sr_retval[0] = td->td_retval[0]; tsr->ts_ret.sr_retval[1] = td->td_retval[1]; td->td_retval[0] = rv_saved[0]; td->td_retval[1] = rv_saved[1]; td->td_errno = nerror; if (audited) AUDIT_SYSCALL_EXIT(error, td); if (!sy_thr_static) syscall_thread_exit(td, se); } static void ptrace_remotereq(struct thread *td, int flag) { struct proc *p; MPASS(td == curthread); p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); if ((td->td_dbgflags & flag) == 0) return; KASSERT((p->p_flag & P_STOPPED_TRACE) != 0, ("not stopped")); KASSERT(td->td_remotereq != NULL, ("td_remotereq is NULL")); PROC_UNLOCK(p); switch (flag) { case TDB_COREDUMPREQ: ptrace_coredumpreq(td, p, td->td_remotereq); break; case TDB_SCREMOTEREQ: ptrace_syscallreq(td, p, td->td_remotereq); break; default: __unreachable(); } PROC_LOCK(p); MPASS((td->td_dbgflags & flag) != 0); td->td_dbgflags &= ~flag; td->td_remotereq = NULL; wakeup(p); } static int sig_suspend_threads(struct thread *td, struct proc *p) { struct thread *td2; int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); wakeup_swapper = 0; FOREACH_THREAD_IN_PROC(p, td2) { thread_lock(td2); ast_sched_locked(td2, TDA_SUSPEND); if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) && (td2->td_flags & TDF_SINTR)) { if (td2->td_flags & TDF_SBDRY) { /* * Once a thread is asleep with * TDF_SBDRY and without TDF_SERESTART * or TDF_SEINTR set, it should never * become suspended due to this check. */ KASSERT(!TD_IS_SUSPENDED(td2), ("thread with deferred stops suspended")); if (TD_SBDRY_INTR(td2)) { wakeup_swapper |= sleepq_abort(td2, TD_SBDRY_ERRNO(td2)); continue; } } else if (!TD_IS_SUSPENDED(td2)) thread_suspend_one(td2); } else if (!TD_IS_SUSPENDED(td2)) { #ifdef SMP if (TD_IS_RUNNING(td2) && td2 != td) forward_signal(td2); #endif } thread_unlock(td2); } return (wakeup_swapper); } /* * Stop the process for an event deemed interesting to the debugger. If si is * non-NULL, this is a signal exchange; the new signal requested by the * debugger will be returned for handling. If si is NULL, this is some other * type of interesting event. The debugger may request a signal be delivered in * that case as well, however it will be deferred until it can be handled. */ int ptracestop(struct thread *td, int sig, ksiginfo_t *si) { struct proc *p = td->td_proc; struct thread *td2; ksiginfo_t ksi; PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(!(p->p_flag & P_WEXIT), ("Stopping exiting process")); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Stopping for traced signal"); td->td_xsig = sig; if (si == NULL || (si->ksi_flags & KSI_PTRACE) == 0) { td->td_dbgflags |= TDB_XSIG; CTR4(KTR_PTRACE, "ptracestop: tid %d (pid %d) flags %#x sig %d", td->td_tid, p->p_pid, td->td_dbgflags, sig); PROC_SLOCK(p); while ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_XSIG)) { if (P_KILLED(p)) { /* * Ensure that, if we've been PT_KILLed, the * exit status reflects that. Another thread * may also be in ptracestop(), having just * received the SIGKILL, but this thread was * unsuspended first. */ td->td_dbgflags &= ~TDB_XSIG; td->td_xsig = SIGKILL; p->p_ptevents = 0; break; } if (p->p_flag & P_SINGLE_EXIT && !(td->td_dbgflags & TDB_EXIT)) { /* * Ignore ptrace stops except for thread exit * events when the process exits. */ td->td_dbgflags &= ~TDB_XSIG; PROC_SUNLOCK(p); return (0); } /* * Make wait(2) work. Ensure that right after the * attach, the thread which was decided to become the * leader of attach gets reported to the waiter. 
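 * (p_xthread records which thread's stop is being reported and p_xsig
 * carries the stop signal; the waiting debugger observes the stop once
 * the process has fully suspended.)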
* Otherwise, just avoid overwriting another thread's * assignment to p_xthread. If another thread has * already set p_xthread, the current thread will get * a chance to report itself upon the next iteration. */ if ((td->td_dbgflags & TDB_FSTP) != 0 || ((p->p_flag2 & P2_PTRACE_FSTP) == 0 && p->p_xthread == NULL)) { p->p_xsig = sig; p->p_xthread = td; /* * If we are on sleepqueue already, * let sleepqueue code decide if it * needs to go sleep after attach. */ if (td->td_wchan == NULL) td->td_dbgflags &= ~TDB_FSTP; p->p_flag2 &= ~P2_PTRACE_FSTP; p->p_flag |= P_STOPPED_SIG | P_STOPPED_TRACE; sig_suspend_threads(td, p); } if ((td->td_dbgflags & TDB_STOPATFORK) != 0) { td->td_dbgflags &= ~TDB_STOPATFORK; } stopme: td->td_dbgflags |= TDB_SSWITCH; thread_suspend_switch(td, p); td->td_dbgflags &= ~TDB_SSWITCH; if ((td->td_dbgflags & (TDB_COREDUMPREQ | TDB_SCREMOTEREQ)) != 0) { MPASS((td->td_dbgflags & (TDB_COREDUMPREQ | TDB_SCREMOTEREQ)) != (TDB_COREDUMPREQ | TDB_SCREMOTEREQ)); PROC_SUNLOCK(p); ptrace_remotereq(td, td->td_dbgflags & (TDB_COREDUMPREQ | TDB_SCREMOTEREQ)); PROC_SLOCK(p); goto stopme; } if (p->p_xthread == td) p->p_xthread = NULL; if (!(p->p_flag & P_TRACED)) break; if (td->td_dbgflags & TDB_SUSPEND) { if (p->p_flag & P_SINGLE_EXIT) break; goto stopme; } } PROC_SUNLOCK(p); } if (si != NULL && sig == td->td_xsig) { /* Parent wants us to take the original signal unchanged. */ si->ksi_flags |= KSI_HEAD; if (sigqueue_add(&td->td_sigqueue, sig, si) != 0) si->ksi_signo = 0; } else if (td->td_xsig != 0) { /* * If parent wants us to take a new signal, then it will leave * it in td->td_xsig; otherwise we just look for signals again. */ ksiginfo_init(&ksi); ksi.ksi_signo = td->td_xsig; ksi.ksi_flags |= KSI_PTRACE; td2 = sigtd(p, td->td_xsig, false); tdsendsignal(p, td2, td->td_xsig, &ksi); if (td != td2) return (0); } return (td->td_xsig); } static void reschedule_signals(struct proc *p, sigset_t block, int flags) { struct sigacts *ps; struct thread *td; int sig; bool fastblk, pslocked; PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; pslocked = (flags & SIGPROCMASK_PS_LOCKED) != 0; mtx_assert(&ps->ps_mtx, pslocked ? MA_OWNED : MA_NOTOWNED); if (SIGISEMPTY(p->p_siglist)) return; SIGSETAND(block, p->p_siglist); fastblk = (flags & SIGPROCMASK_FASTBLK) != 0; SIG_FOREACH(sig, &block) { td = sigtd(p, sig, fastblk); /* * If sigtd() selected us despite sigfastblock is * blocking, do not activate AST or wake us, to avoid * loop in AST handler. */ if (fastblk && td == curthread) continue; signotify(td); if (!pslocked) mtx_lock(&ps->ps_mtx); if (p->p_flag & P_TRACED || (SIGISMEMBER(ps->ps_sigcatch, sig) && !SIGISMEMBER(td->td_sigmask, sig))) { tdsigwakeup(td, sig, SIG_CATCH, (SIGISMEMBER(ps->ps_sigintr, sig) ? EINTR : ERESTART)); } if (!pslocked) mtx_unlock(&ps->ps_mtx); } } void tdsigcleanup(struct thread *td) { struct proc *p; sigset_t unblocked; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sigqueue_flush(&td->td_sigqueue); if (p->p_numthreads == 1) return; /* * Since we cannot handle signals, notify signal post code * about this by filling the sigmask. * * Also, if needed, wake up thread(s) that do not block the * same signals as the exiting thread, since the thread might * have been selected for delivery and woken up. 
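 *
 * Below, "unblocked" is the set of signals this thread did not block;
 * any pending signal in that set may have been targeted at us, so it
 * is handed back to reschedule_signals() for redelivery to the
 * remaining threads.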
*/ SIGFILLSET(unblocked); SIGSETNAND(unblocked, td->td_sigmask); SIGFILLSET(td->td_sigmask); reschedule_signals(p, unblocked, 0); } static int sigdeferstop_curr_flags(int cflags) { MPASS((cflags & (TDF_SEINTR | TDF_SERESTART)) == 0 || (cflags & TDF_SBDRY) != 0); return (cflags & (TDF_SBDRY | TDF_SEINTR | TDF_SERESTART)); } /* * Defer the delivery of SIGSTOP for the current thread, according to * the requested mode. Returns previous flags, which must be restored * by sigallowstop(). * * TDF_SBDRY, TDF_SEINTR, and TDF_SERESTART flags are only set and * cleared by the current thread, which allow the lock-less read-only * accesses below. */ int sigdeferstop_impl(int mode) { struct thread *td; int cflags, nflags; td = curthread; cflags = sigdeferstop_curr_flags(td->td_flags); switch (mode) { case SIGDEFERSTOP_NOP: nflags = cflags; break; case SIGDEFERSTOP_OFF: nflags = 0; break; case SIGDEFERSTOP_SILENT: nflags = (cflags | TDF_SBDRY) & ~(TDF_SEINTR | TDF_SERESTART); break; case SIGDEFERSTOP_EINTR: nflags = (cflags | TDF_SBDRY | TDF_SEINTR) & ~TDF_SERESTART; break; case SIGDEFERSTOP_ERESTART: nflags = (cflags | TDF_SBDRY | TDF_SERESTART) & ~TDF_SEINTR; break; default: panic("sigdeferstop: invalid mode %x", mode); break; } if (cflags == nflags) return (SIGDEFERSTOP_VAL_NCHG); thread_lock(td); td->td_flags = (td->td_flags & ~cflags) | nflags; thread_unlock(td); return (cflags); } /* * Restores the STOP handling mode, typically permitting the delivery * of SIGSTOP for the current thread. This does not immediately * suspend if a stop was posted. Instead, the thread will suspend * either via ast() or a subsequent interruptible sleep. */ void sigallowstop_impl(int prev) { struct thread *td; int cflags; KASSERT(prev != SIGDEFERSTOP_VAL_NCHG, ("failed sigallowstop")); KASSERT((prev & ~(TDF_SBDRY | TDF_SEINTR | TDF_SERESTART)) == 0, ("sigallowstop: incorrect previous mode %x", prev)); td = curthread; cflags = sigdeferstop_curr_flags(td->td_flags); if (cflags != prev) { thread_lock(td); td->td_flags = (td->td_flags & ~cflags) | prev; thread_unlock(td); } } enum sigstatus { SIGSTATUS_HANDLE, SIGSTATUS_HANDLED, SIGSTATUS_IGNORE, SIGSTATUS_SBDRY_STOP, }; /* * The thread has signal "sig" pending. Figure out what to do with it: * * _HANDLE -> the caller should handle the signal * _HANDLED -> handled internally, reload pending signal set * _IGNORE -> ignored, remove from the set of pending signals and try the * next pending signal * _SBDRY_STOP -> the signal should stop the thread but this is not * permitted in the current context */ static enum sigstatus sigprocess(struct thread *td, int sig) { struct proc *p; struct sigacts *ps; struct sigqueue *queue; ksiginfo_t ksi; int prop; KASSERT(_SIG_VALID(sig), ("%s: invalid signal %d", __func__, sig)); p = td->td_proc; ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); /* * We should allow pending but ignored signals below * if there is sigwait() active, or P_TRACED was * on when they were posted. */ if (SIGISMEMBER(ps->ps_sigignore, sig) && (p->p_flag & P_TRACED) == 0 && (td->td_flags & TDF_SIGWAIT) == 0) { return (SIGSTATUS_IGNORE); } /* * If the process is going to single-thread mode to prepare * for exit, there is no sense in delivering any signal * to usermode. Another important consequence is that * msleep(..., PCATCH, ...) now is only interruptible by a * suspend request. */ if ((p->p_flag2 & P2_WEXIT) != 0) return (SIGSTATUS_IGNORE); if ((p->p_flag & (P_TRACED | P_PPTRACE)) == P_TRACED) { /* * If traced, always stop. 
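 * The stop is reported through ptracestop(); the debugger may discard
 * the signal or substitute another one when it resumes us, e.g.
 * (illustrative) ptrace(PT_CONTINUE, pid, (caddr_t)1, 0) discards it,
 * while a nonzero data argument delivers that signal instead.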
* Remove old signal from queue before the stop. * XXX shrug off debugger, it causes siginfo to * be thrown away. */ queue = &td->td_sigqueue; ksiginfo_init(&ksi); if (sigqueue_get(queue, sig, &ksi) == 0) { queue = &p->p_sigqueue; sigqueue_get(queue, sig, &ksi); } td->td_si = ksi.ksi_info; mtx_unlock(&ps->ps_mtx); sig = ptracestop(td, sig, &ksi); mtx_lock(&ps->ps_mtx); td->td_si.si_signo = 0; /* * Keep looking if the debugger discarded or * replaced the signal. */ if (sig == 0) return (SIGSTATUS_HANDLED); /* * If the signal became masked, re-queue it. */ if (SIGISMEMBER(td->td_sigmask, sig)) { ksi.ksi_flags |= KSI_HEAD; sigqueue_add(&p->p_sigqueue, sig, &ksi); return (SIGSTATUS_HANDLED); } /* * If the traced bit got turned off, requeue the signal and * reload the set of pending signals. This ensures that p_sig* * and p_sigact are consistent. */ if ((p->p_flag & P_TRACED) == 0) { if ((ksi.ksi_flags & KSI_PTRACE) == 0) { ksi.ksi_flags |= KSI_HEAD; sigqueue_add(queue, sig, &ksi); } return (SIGSTATUS_HANDLED); } } /* * Decide whether the signal should be returned. * Return the signal's number, or fall through * to clear it from the pending mask. */ switch ((intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) { case (intptr_t)SIG_DFL: /* * Don't take default actions on system processes. */ if (p->p_pid <= 1) { #ifdef DIAGNOSTIC /* * Are you sure you want to ignore SIGSEGV * in init? XXX */ printf("Process (pid %lu) got signal %d\n", (u_long)p->p_pid, sig); #endif return (SIGSTATUS_IGNORE); } /* * If there is a pending stop signal to process with * default action, stop here, then clear the signal. * Traced or exiting processes should ignore stops. * Additionally, a member of an orphaned process group * should ignore tty stops. */ prop = sigprop(sig); if (prop & SIGPROP_STOP) { mtx_unlock(&ps->ps_mtx); if ((p->p_flag & (P_TRACED | P_WEXIT | P_SINGLE_EXIT)) != 0 || ((p->p_pgrp-> pg_flags & PGRP_ORPHANED) != 0 && (prop & SIGPROP_TTYSTOP) != 0)) { mtx_lock(&ps->ps_mtx); return (SIGSTATUS_IGNORE); } if (TD_SBDRY_INTR(td)) { KASSERT((td->td_flags & TDF_SBDRY) != 0, ("lost TDF_SBDRY")); mtx_lock(&ps->ps_mtx); return (SIGSTATUS_SBDRY_STOP); } WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Catching SIGSTOP"); sigqueue_delete(&td->td_sigqueue, sig); sigqueue_delete(&p->p_sigqueue, sig); p->p_flag |= P_STOPPED_SIG; p->p_xsig = sig; PROC_SLOCK(p); sig_suspend_threads(td, p); thread_suspend_switch(td, p); PROC_SUNLOCK(p); mtx_lock(&ps->ps_mtx); return (SIGSTATUS_HANDLED); } else if ((prop & SIGPROP_IGNORE) != 0 && (td->td_flags & TDF_SIGWAIT) == 0) { /* * Default action is to ignore; drop it if * not in kern_sigtimedwait(). */ return (SIGSTATUS_IGNORE); } else { return (SIGSTATUS_HANDLE); } case (intptr_t)SIG_IGN: if ((td->td_flags & TDF_SIGWAIT) == 0) return (SIGSTATUS_IGNORE); else return (SIGSTATUS_HANDLE); default: /* * This signal has an action, let postsig() process it. */ return (SIGSTATUS_HANDLE); } } /* * If the current process has received a signal (should be caught or cause * termination, should interrupt current syscall), return the signal number. * Stop signals with default action are processed immediately, then cleared; * they aren't returned. This is checked after each entry to the system for * a syscall or trap (though this can usually be done without calling * issignal by checking the pending signal masks in cursig.) 
The normal call * sequence is * * while (sig = cursig(curthread)) * postsig(sig); */ static int issignal(struct thread *td) { struct proc *p; sigset_t sigpending; int sig; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); for (;;) { sigpending = td->td_sigqueue.sq_signals; SIGSETOR(sigpending, p->p_sigqueue.sq_signals); SIGSETNAND(sigpending, td->td_sigmask); if ((p->p_flag & P_PPWAIT) != 0 || (td->td_flags & (TDF_SBDRY | TDF_SERESTART | TDF_SEINTR)) == TDF_SBDRY) SIG_STOPSIGMASK(sigpending); if (SIGISEMPTY(sigpending)) /* no signal to send */ return (0); /* * Do fast sigblock if requested by usermode. Since * we do know that there was a signal pending at this * point, set the FAST_SIGBLOCK_PEND as indicator for * usermode to perform a dummy call to * FAST_SIGBLOCK_UNBLOCK, which causes immediate * delivery of postponed pending signal. */ if ((td->td_pflags & TDP_SIGFASTBLOCK) != 0) { if (td->td_sigblock_val != 0) SIGSETNAND(sigpending, fastblock_mask); if (SIGISEMPTY(sigpending)) { td->td_pflags |= TDP_SIGFASTPENDING; return (0); } } if ((p->p_flag & (P_TRACED | P_PPTRACE)) == P_TRACED && (p->p_flag2 & P2_PTRACE_FSTP) != 0 && SIGISMEMBER(sigpending, SIGSTOP)) { /* * If debugger just attached, always consume * SIGSTOP from ptrace(PT_ATTACH) first, to * execute the debugger attach ritual in * order. */ td->td_dbgflags |= TDB_FSTP; SIGEMPTYSET(sigpending); SIGADDSET(sigpending, SIGSTOP); } SIG_FOREACH(sig, &sigpending) { switch (sigprocess(td, sig)) { case SIGSTATUS_HANDLE: return (sig); case SIGSTATUS_HANDLED: goto next; case SIGSTATUS_IGNORE: sigqueue_delete(&td->td_sigqueue, sig); sigqueue_delete(&p->p_sigqueue, sig); break; case SIGSTATUS_SBDRY_STOP: return (-1); } } next:; } } void thread_stopped(struct proc *p) { int n; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); n = p->p_suspcount; if (p == curproc) n++; if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) { PROC_SUNLOCK(p); p->p_flag &= ~P_WAITED; PROC_LOCK(p->p_pptr); childproc_stopped(p, (p->p_flag & P_TRACED) ? CLD_TRAPPED : CLD_STOPPED); PROC_UNLOCK(p->p_pptr); PROC_SLOCK(p); } } /* * Take the action for the specified signal * from the current set of pending signals. */ int postsig(int sig) { struct thread *td; struct proc *p; struct sigacts *ps; sig_t action; ksiginfo_t ksi; sigset_t returnmask; KASSERT(sig != 0, ("postsig")); td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); ksiginfo_init(&ksi); if (sigqueue_get(&td->td_sigqueue, sig, &ksi) == 0 && sigqueue_get(&p->p_sigqueue, sig, &ksi) == 0) return (0); ksi.ksi_signo = sig; if (ksi.ksi_code == SI_TIMER) itimer_accept(p, ksi.ksi_timerid, &ksi); action = ps->ps_sigact[_SIG_IDX(sig)]; #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ? &td->td_oldsigmask : &td->td_sigmask, ksi.ksi_code); #endif if (action == SIG_DFL) { /* * Default action, where the default is to kill * the process. (Other cases were ignored above.) */ mtx_unlock(&ps->ps_mtx); proc_td_siginfo_capture(td, &ksi.ksi_info); sigexit(td, sig); /* NOTREACHED */ } else { /* * If we get here, the signal must be caught. */ KASSERT(action != SIG_IGN, ("postsig action %p", action)); KASSERT(!SIGISMEMBER(td->td_sigmask, sig), ("postsig action: blocked sig %d", sig)); /* * Set the new mask value and also defer further * occurrences of this signal. * * Special case: user has done a sigsuspend. 
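 * (Illustrative userland shape of that case, not part of this file:
 *
 *	sigprocmask(SIG_BLOCK, &set, &oset);
 *	...
 *	sigsuspend(&oset);
 *
 * The pre-sigsuspend mask was parked in td_oldsigmask with
 * TDP_OLDMASK set.)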
Here the * current mask is not of interest, but rather the * mask from before the sigsuspend is what we want * restored after the signal processing is completed. */ if (td->td_pflags & TDP_OLDMASK) { returnmask = td->td_oldsigmask; td->td_pflags &= ~TDP_OLDMASK; } else returnmask = td->td_sigmask; if (p->p_sig == sig) { p->p_sig = 0; } (*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask); postsig_done(sig, td, ps); } return (1); } int sig_ast_checksusp(struct thread *td) { struct proc *p __diagused; int ret; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); if (!td_ast_pending(td, TDA_SUSPEND)) return (0); ret = thread_suspend_check(1); MPASS(ret == 0 || ret == EINTR || ret == ERESTART); return (ret); } int sig_ast_needsigchk(struct thread *td) { struct proc *p; struct sigacts *ps; int ret, sig; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); if (!td_ast_pending(td, TDA_SIG)) return (0); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); sig = cursig(td); if (sig == -1) { mtx_unlock(&ps->ps_mtx); KASSERT((td->td_flags & TDF_SBDRY) != 0, ("lost TDF_SBDRY")); KASSERT(TD_SBDRY_INTR(td), ("lost TDF_SERESTART of TDF_SEINTR")); KASSERT((td->td_flags & (TDF_SEINTR | TDF_SERESTART)) != (TDF_SEINTR | TDF_SERESTART), ("both TDF_SEINTR and TDF_SERESTART")); ret = TD_SBDRY_ERRNO(td); } else if (sig != 0) { ret = SIGISMEMBER(ps->ps_sigintr, sig) ? EINTR : ERESTART; mtx_unlock(&ps->ps_mtx); } else { mtx_unlock(&ps->ps_mtx); ret = 0; } /* * Do not go into sleep if this thread was the ptrace(2) * attach leader. cursig() consumed SIGSTOP from PT_ATTACH, * but we usually act on the signal by interrupting sleep, and * should do that here as well. */ if ((td->td_dbgflags & TDB_FSTP) != 0) { if (ret == 0) ret = EINTR; td->td_dbgflags &= ~TDB_FSTP; } return (ret); } int sig_intr(void) { struct thread *td; struct proc *p; int ret; td = curthread; if (!td_ast_pending(td, TDA_SIG) && !td_ast_pending(td, TDA_SUSPEND)) return (0); p = td->td_proc; PROC_LOCK(p); ret = sig_ast_checksusp(td); if (ret == 0) ret = sig_ast_needsigchk(td); PROC_UNLOCK(p); return (ret); } bool curproc_sigkilled(void) { struct thread *td; struct proc *p; struct sigacts *ps; bool res; td = curthread; if (!td_ast_pending(td, TDA_SIG)) return (false); p = td->td_proc; PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); res = SIGISMEMBER(td->td_sigqueue.sq_signals, SIGKILL) || SIGISMEMBER(p->p_sigqueue.sq_signals, SIGKILL); mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (res); } void proc_wkilled(struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_flag & P_WKILLED) == 0) { p->p_flag |= P_WKILLED; /* * Notify swapper that there is a process to swap in. * The notification is racy, at worst it would take 10 * seconds for the swapper process to notice. */ if ((p->p_flag & (P_INMEM | P_SWAPPINGIN)) == 0) wakeup(&proc0); } } /* * Kill the current process for stated reason. */ void killproc(struct proc *p, const char *why) { PROC_LOCK_ASSERT(p, MA_OWNED); CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)", p, p->p_pid, p->p_comm); log(LOG_ERR, "pid %d (%s), jid %d, uid %d, was killed: %s\n", p->p_pid, p->p_comm, p->p_ucred->cr_prison->pr_id, p->p_ucred->cr_uid, why); proc_wkilled(p); kern_psignal(p, SIGKILL); } /* * Force the current process to exit with the specified signal, dumping core * if appropriate. We bypass the normal tests for masked and caught signals, * allowing unrecoverable failures to terminate the process without changing * signal state. Mark the accounting record with the signal termination. 
* If dumping core, save the signal number for the debugger. Calls exit and * does not return. */ void sigexit(struct thread *td, int sig) { struct proc *p = td->td_proc; const char *coreinfo; int rv; PROC_LOCK_ASSERT(p, MA_OWNED); proc_set_p2_wexit(p); p->p_acflag |= AXSIG; /* * We must be single-threading to generate a core dump. This * ensures that the registers in the core file are up-to-date. * Also, the ELF dump handler assumes that the thread list doesn't * change out from under it. * * XXX If another thread attempts to single-thread before us * (e.g. via fork()), we won't get a dump at all. */ if ((sigprop(sig) & SIGPROP_CORE) && thread_single(p, SINGLE_NO_EXIT) == 0) { p->p_sig = sig; /* * Log signals which would cause core dumps * (Log as LOG_INFO to appease those who don't want * these messages.) * XXX : Todo, as well as euid, write out ruid too * Note that coredump() drops proc lock. */ rv = coredump(td); switch (rv) { case 0: sig |= WCOREFLAG; coreinfo = " (core dumped)"; break; case EFAULT: coreinfo = " (no core dump - bad address)"; break; case EINVAL: coreinfo = " (no core dump - invalid argument)"; break; case EFBIG: coreinfo = " (no core dump - too large)"; break; default: coreinfo = " (no core dump - other error)"; break; } if (kern_logsigexit) log(LOG_INFO, "pid %d (%s), jid %d, uid %d: exited on " "signal %d%s\n", p->p_pid, p->p_comm, p->p_ucred->cr_prison->pr_id, td->td_ucred->cr_uid, sig &~ WCOREFLAG, coreinfo); } else PROC_UNLOCK(p); exit1(td, 0, sig); /* NOTREACHED */ } /* * Send queued SIGCHLD to parent when child process's state * is changed. */ static void sigparent(struct proc *p, int reason, int status) { PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); if (p->p_ksi != NULL) { p->p_ksi->ksi_signo = SIGCHLD; p->p_ksi->ksi_code = reason; p->p_ksi->ksi_status = status; p->p_ksi->ksi_pid = p->p_pid; p->p_ksi->ksi_uid = p->p_ucred->cr_ruid; if (KSI_ONQ(p->p_ksi)) return; } pksignal(p->p_pptr, SIGCHLD, p->p_ksi); } static void childproc_jobstate(struct proc *p, int reason, int sig) { struct sigacts *ps; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); /* * Wake up parent sleeping in kern_wait(), also send * SIGCHLD to parent, but SIGCHLD does not guarantee * that parent will awake, because parent may masked * the signal. */ p->p_pptr->p_flag |= P_STATCHILD; wakeup(p->p_pptr); ps = p->p_pptr->p_sigacts; mtx_lock(&ps->ps_mtx); if ((ps->ps_flag & PS_NOCLDSTOP) == 0) { mtx_unlock(&ps->ps_mtx); sigparent(p, reason, sig); } else mtx_unlock(&ps->ps_mtx); } void childproc_stopped(struct proc *p, int reason) { childproc_jobstate(p, reason, p->p_xsig); } void childproc_continued(struct proc *p) { childproc_jobstate(p, CLD_CONTINUED, SIGCONT); } void childproc_exited(struct proc *p) { int reason, status; if (WCOREDUMP(p->p_xsig)) { reason = CLD_DUMPED; status = WTERMSIG(p->p_xsig); } else if (WIFSIGNALED(p->p_xsig)) { reason = CLD_KILLED; status = WTERMSIG(p->p_xsig); } else { reason = CLD_EXITED; status = p->p_xexit; } /* * XXX avoid calling wakeup(p->p_pptr), the work is * done in exit1(). 
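 *
 * The parent sees the classification above in its SIGCHLD siginfo:
 * si_code is CLD_EXITED, CLD_KILLED or CLD_DUMPED and si_status is
 * the exit value or terminating signal (see sigparent()).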
*/ sigparent(p, reason, status); } #define MAX_NUM_CORE_FILES 100000 #ifndef NUM_CORE_FILES #define NUM_CORE_FILES 5 #endif CTASSERT(NUM_CORE_FILES >= 0 && NUM_CORE_FILES <= MAX_NUM_CORE_FILES); static int num_cores = NUM_CORE_FILES; static int sysctl_debug_num_cores_check (SYSCTL_HANDLER_ARGS) { int error; int new_val; new_val = num_cores; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val > MAX_NUM_CORE_FILES) new_val = MAX_NUM_CORE_FILES; if (new_val < 0) new_val = 0; num_cores = new_val; return (0); } SYSCTL_PROC(_debug, OID_AUTO, ncores, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int), sysctl_debug_num_cores_check, "I", "Maximum number of generated process corefiles while using index format"); #define GZIP_SUFFIX ".gz" #define ZSTD_SUFFIX ".zst" int compress_user_cores = 0; static int sysctl_compress_user_cores(SYSCTL_HANDLER_ARGS) { int error, val; val = compress_user_cores; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val != 0 && !compressor_avail(val)) return (EINVAL); compress_user_cores = val; return (error); } SYSCTL_PROC(_kern, OID_AUTO, compress_user_cores, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, 0, sizeof(int), sysctl_compress_user_cores, "I", "Enable compression of user corefiles (" __XSTRING(COMPRESS_GZIP) " = gzip, " __XSTRING(COMPRESS_ZSTD) " = zstd)"); int compress_user_cores_level = 6; SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_level, CTLFLAG_RWTUN, &compress_user_cores_level, 0, "Corefile compression level"); /* * Protect the access to corefilename[] by allproc_lock. */ #define corefilename_lock allproc_lock static char corefilename[MAXPATHLEN] = {"%N.core"}; TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename)); static int sysctl_kern_corefile(SYSCTL_HANDLER_ARGS) { int error; sx_xlock(&corefilename_lock); error = sysctl_handle_string(oidp, corefilename, sizeof(corefilename), req); sx_xunlock(&corefilename_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, corefile, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_corefile, "A", "Process corefile name format string"); static void vnode_close_locked(struct thread *td, struct vnode *vp) { VOP_UNLOCK(vp); vn_close(vp, FWRITE, td->td_ucred, td); } /* * If the core format has a %I in it, then we need to check * for existing corefiles before defining a name. * To do this we iterate over 0..ncores to find a * non-existing core file name to use. If all core files are * already used we choose the oldest one. */ static int corefile_open_last(struct thread *td, char *name, int indexpos, int indexlen, int ncores, struct vnode **vpp) { struct vnode *oldvp, *nextvp, *vp; struct vattr vattr; struct nameidata nd; int error, i, flags, oflags, cmode; char ch; struct timespec lasttime; nextvp = oldvp = NULL; cmode = S_IRUSR | S_IWUSR; oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE | (capmode_coredump ? 
VN_OPEN_NOCAPCHECK : 0); for (i = 0; i < ncores; i++) { flags = O_CREAT | FWRITE | O_NOFOLLOW; ch = name[indexpos + indexlen]; (void)snprintf(name + indexpos, indexlen + 1, "%.*u", indexlen, i); name[indexpos + indexlen] = ch; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name); error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, NULL); if (error != 0) break; vp = nd.ni_vp; NDFREE_PNBUF(&nd); if ((flags & O_CREAT) == O_CREAT) { nextvp = vp; break; } error = VOP_GETATTR(vp, &vattr, td->td_ucred); if (error != 0) { vnode_close_locked(td, vp); break; } if (oldvp == NULL || lasttime.tv_sec > vattr.va_mtime.tv_sec || (lasttime.tv_sec == vattr.va_mtime.tv_sec && lasttime.tv_nsec >= vattr.va_mtime.tv_nsec)) { if (oldvp != NULL) vn_close(oldvp, FWRITE, td->td_ucred, td); oldvp = vp; VOP_UNLOCK(oldvp); lasttime = vattr.va_mtime; } else { vnode_close_locked(td, vp); } } if (oldvp != NULL) { if (nextvp == NULL) { if ((td->td_proc->p_flag & P_SUGID) != 0) { error = EFAULT; vn_close(oldvp, FWRITE, td->td_ucred, td); } else { nextvp = oldvp; error = vn_lock(nextvp, LK_EXCLUSIVE); if (error != 0) { vn_close(nextvp, FWRITE, td->td_ucred, td); nextvp = NULL; } } } else { vn_close(oldvp, FWRITE, td->td_ucred, td); } } if (error != 0) { if (nextvp != NULL) vnode_close_locked(td, oldvp); } else { *vpp = nextvp; } return (error); } /* * corefile_open(comm, uid, pid, td, compress, vpp, namep) * Expand the name described in corefilename, using name, uid, and pid * and open/create core file. * corefilename is a printf-like string, with three format specifiers: * %N name of process ("name") * %P process id (pid) * %U user id (uid) * For example, "%N.core" is the default; they can be disabled completely * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P". * This is controlled by the sysctl variable kern.corefile (see above). 
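 *
 * Illustrative example (paths and ids are hypothetical): with
 *	kern.corefile=/var/coredumps/%U/%N.%P.core
 * a crash of pid 1234 of process "mydaemon" running as uid 1001 would
 * be written to /var/coredumps/1001/mydaemon.1234.core.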
*/ static int corefile_open(const char *comm, uid_t uid, pid_t pid, struct thread *td, int compress, int signum, struct vnode **vpp, char **namep) { struct sbuf sb; struct nameidata nd; const char *format; char *hostname, *name; int cmode, error, flags, i, indexpos, indexlen, oflags, ncores; hostname = NULL; format = corefilename; name = malloc(MAXPATHLEN, M_TEMP, M_WAITOK | M_ZERO); indexlen = 0; indexpos = -1; ncores = num_cores; (void)sbuf_new(&sb, name, MAXPATHLEN, SBUF_FIXEDLEN); sx_slock(&corefilename_lock); for (i = 0; format[i] != '\0'; i++) { switch (format[i]) { case '%': /* Format character */ i++; switch (format[i]) { case '%': sbuf_putc(&sb, '%'); break; case 'H': /* hostname */ if (hostname == NULL) { hostname = malloc(MAXHOSTNAMELEN, M_TEMP, M_WAITOK); } getcredhostname(td->td_ucred, hostname, MAXHOSTNAMELEN); - sbuf_printf(&sb, "%s", hostname); + sbuf_cat(&sb, hostname); break; case 'I': /* autoincrementing index */ if (indexpos != -1) { sbuf_printf(&sb, "%%I"); break; } indexpos = sbuf_len(&sb); sbuf_printf(&sb, "%u", ncores - 1); indexlen = sbuf_len(&sb) - indexpos; break; case 'N': /* process name */ sbuf_printf(&sb, "%s", comm); break; case 'P': /* process id */ sbuf_printf(&sb, "%u", pid); break; case 'S': /* signal number */ sbuf_printf(&sb, "%i", signum); break; case 'U': /* user id */ sbuf_printf(&sb, "%u", uid); break; default: log(LOG_ERR, "Unknown format character %c in " "corename `%s'\n", format[i], format); break; } break; default: sbuf_putc(&sb, format[i]); break; } } sx_sunlock(&corefilename_lock); free(hostname, M_TEMP); if (compress == COMPRESS_GZIP) - sbuf_printf(&sb, GZIP_SUFFIX); + sbuf_cat(&sb, GZIP_SUFFIX); else if (compress == COMPRESS_ZSTD) - sbuf_printf(&sb, ZSTD_SUFFIX); + sbuf_cat(&sb, ZSTD_SUFFIX); if (sbuf_error(&sb) != 0) { log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too " "long\n", (long)pid, comm, (u_long)uid); sbuf_delete(&sb); free(name, M_TEMP); return (ENOMEM); } sbuf_finish(&sb); sbuf_delete(&sb); if (indexpos != -1) { error = corefile_open_last(td, name, indexpos, indexlen, ncores, vpp); if (error != 0) { log(LOG_ERR, "pid %d (%s), uid (%u): Path `%s' failed " "on initial open test, error = %d\n", pid, comm, uid, name, error); } } else { cmode = S_IRUSR | S_IWUSR; oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE | (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0); flags = O_CREAT | FWRITE | O_NOFOLLOW; if ((td->td_proc->p_flag & P_SUGID) != 0) flags |= O_EXCL; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name); error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, NULL); if (error == 0) { *vpp = nd.ni_vp; NDFREE_PNBUF(&nd); } } if (error != 0) { #ifdef AUDIT audit_proc_coredump(td, name, error); #endif free(name, M_TEMP); return (error); } *namep = name; return (0); } /* * Dump a process' core. The main routine does some * policy checking, and creates the name of the coredump; * then it passes on a vnode and a size limit to the process-specific * coredump routine if there is one; if there _is not_ one, it returns * ENOSYS; otherwise it returns the error from the process-specific routine. 
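 *
 * The policy checks below refuse to dump when core dumps are disabled
 * (do_coredump), when the process is set-id and sugid_coredump is not
 * enabled, when tracing is disallowed (P2_NOTRACE), or when the
 * RLIMIT_CORE limit (or the racct core allowance) is zero.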
*/ static int coredump(struct thread *td) { struct proc *p = td->td_proc; struct ucred *cred = td->td_ucred; struct vnode *vp; struct flock lf; struct vattr vattr; size_t fullpathsize; int error, error1, locked; char *name; /* name of corefile */ void *rl_cookie; off_t limit; char *fullpath, *freepath = NULL; struct sbuf *sb; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td); if (!do_coredump || (!sugid_coredump && (p->p_flag & P_SUGID) != 0) || (p->p_flag2 & P2_NOTRACE) != 0) { PROC_UNLOCK(p); return (EFAULT); } /* * Note that the bulk of limit checking is done after * the corefile is created. The exception is if the limit * for corefiles is 0, in which case we don't bother * creating the corefile at all. This layout means that * a corefile is truncated instead of not being created, * if it is larger than the limit. */ limit = (off_t)lim_cur(td, RLIMIT_CORE); if (limit == 0 || racct_get_available(p, RACCT_CORE) == 0) { PROC_UNLOCK(p); return (EFBIG); } PROC_UNLOCK(p); error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td, compress_user_cores, p->p_sig, &vp, &name); if (error != 0) return (error); /* * Don't dump to non-regular files or files with links. * Do not dump into system files. Effective user must own the corefile. */ if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred) != 0 || vattr.va_nlink != 1 || (vp->v_vflag & VV_SYSTEM) != 0 || vattr.va_uid != cred->cr_uid) { VOP_UNLOCK(vp); error = EFAULT; goto out; } VOP_UNLOCK(vp); /* Postpone other writers, including core dumps of other processes. */ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_WRLCK; locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0); VATTR_NULL(&vattr); vattr.va_size = 0; if (set_core_nodump_flag) vattr.va_flags = UF_NODUMP; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VOP_SETATTR(vp, &vattr, cred); VOP_UNLOCK(vp); PROC_LOCK(p); p->p_acflag |= ACORE; PROC_UNLOCK(p); if (p->p_sysent->sv_coredump != NULL) { error = p->p_sysent->sv_coredump(td, vp, limit, 0); } else { error = ENOSYS; } if (locked) { lf.l_type = F_UNLCK; VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); } vn_rangelock_unlock(vp, rl_cookie); /* * Notify the userland helper that a process triggered a core dump. * This allows the helper to run an automated debugging session. */ if (error != 0 || coredump_devctl == 0) goto out; sb = sbuf_new_auto(); if (vn_fullpath_global(p->p_textvp, &fullpath, &freepath) != 0) goto out2; - sbuf_printf(sb, "comm=\""); + sbuf_cat(sb, "comm=\""); devctl_safe_quote_sb(sb, fullpath); free(freepath, M_TEMP); - sbuf_printf(sb, "\" core=\""); + sbuf_cat(sb, "\" core=\""); /* * We can't lookup core file vp directly. When we're replacing a core, and * other random times, we flush the name cache, so it will fail. Instead, * if the path of the core is relative, add the current dir in front if it. 
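 * The resulting notification has the form (paths hypothetical)
 *	comm="/usr/local/bin/mydaemon" core="/tmp/mydaemon.core"
 * and is posted via devctl_notify("kernel", "signal", "coredump", ...).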
*/ if (name[0] != '/') { fullpathsize = MAXPATHLEN; freepath = malloc(fullpathsize, M_TEMP, M_WAITOK); if (vn_getcwd(freepath, &fullpath, &fullpathsize) != 0) { free(freepath, M_TEMP); goto out2; } devctl_safe_quote_sb(sb, fullpath); free(freepath, M_TEMP); sbuf_putc(sb, '/'); } devctl_safe_quote_sb(sb, name); - sbuf_printf(sb, "\""); + sbuf_putc(sb, '"'); if (sbuf_finish(sb) == 0) devctl_notify("kernel", "signal", "coredump", sbuf_data(sb)); out2: sbuf_delete(sb); out: error1 = vn_close(vp, FWRITE, cred, td); if (error == 0) error = error1; #ifdef AUDIT audit_proc_coredump(td, name, error); #endif free(name, M_TEMP); return (error); } /* * Nonexistent system call-- signal process (may want to handle it). Flag * error in case process won't see signal immediately (blocked or ignored). */ #ifndef _SYS_SYSPROTO_H_ struct nosys_args { int dummy; }; #endif /* ARGSUSED */ int nosys(struct thread *td, struct nosys_args *args) { struct proc *p; p = td->td_proc; if (SV_PROC_FLAG(p, SV_SIGSYS) != 0 && kern_signosys) { PROC_LOCK(p); tdsignal(td, SIGSYS); PROC_UNLOCK(p); } if (kern_lognosys == 1 || kern_lognosys == 3) { uprintf("pid %d comm %s: nosys %d\n", p->p_pid, p->p_comm, td->td_sa.code); } if (kern_lognosys == 2 || kern_lognosys == 3 || (p->p_pid == 1 && (kern_lognosys & 3) == 0)) { printf("pid %d comm %s: nosys %d\n", p->p_pid, p->p_comm, td->td_sa.code); } return (ENOSYS); } /* * Send a SIGIO or SIGURG signal to a process or process group using stored * credentials rather than those of the current process. */ void pgsigio(struct sigio **sigiop, int sig, int checkctty) { ksiginfo_t ksi; struct sigio *sigio; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; SIGIO_LOCK(); sigio = *sigiop; if (sigio == NULL) { SIGIO_UNLOCK(); return; } if (sigio->sio_pgid > 0) { PROC_LOCK(sigio->sio_proc); if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred)) kern_psignal(sigio->sio_proc, sig); PROC_UNLOCK(sigio->sio_proc); } else if (sigio->sio_pgid < 0) { struct proc *p; PGRP_LOCK(sigio->sio_pgrp); LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && CANSIGIO(sigio->sio_ucred, p->p_ucred) && (checkctty == 0 || (p->p_flag & P_CONTROLT))) kern_psignal(p, sig); PROC_UNLOCK(p); } PGRP_UNLOCK(sigio->sio_pgrp); } SIGIO_UNLOCK(); } static int filt_sigattach(struct knote *kn) { struct proc *p = curproc; kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ knlist_add(p->p_klist, kn, 0); return (0); } static void filt_sigdetach(struct knote *kn) { struct proc *p = kn->kn_ptr.p_proc; knlist_remove(p->p_klist, kn, 0); } /* * signal knotes are shared with proc knotes, so we apply a mask to * the hint in order to differentiate them from process hints. This * could be avoided by using a signal-specific knote list, but probably * isn't worth the trouble. 
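 *
 * Illustrative consumer of this filter (a sketch, not part of this
 * file; assumes <sys/event.h> and an existing kqueue descriptor kq):
 *
 *	struct kevent kev;
 *	EV_SET(&kev, SIGUSR1, EVFILT_SIGNAL, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * kn_data then reports the number of SIGUSR1 deliveries since the
 * event was last retrieved (EV_CLEAR is set by filt_sigattach()).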
*/ static int filt_signal(struct knote *kn, long hint) { if (hint & NOTE_SIGNAL) { hint &= ~NOTE_SIGNAL; if (kn->kn_id == hint) kn->kn_data++; } return (kn->kn_data != 0); } struct sigacts * sigacts_alloc(void) { struct sigacts *ps; ps = malloc(sizeof(struct sigacts), M_SUBPROC, M_WAITOK | M_ZERO); refcount_init(&ps->ps_refcnt, 1); mtx_init(&ps->ps_mtx, "sigacts", NULL, MTX_DEF); return (ps); } void sigacts_free(struct sigacts *ps) { if (refcount_release(&ps->ps_refcnt) == 0) return; mtx_destroy(&ps->ps_mtx); free(ps, M_SUBPROC); } struct sigacts * sigacts_hold(struct sigacts *ps) { refcount_acquire(&ps->ps_refcnt); return (ps); } void sigacts_copy(struct sigacts *dest, struct sigacts *src) { KASSERT(dest->ps_refcnt == 1, ("sigacts_copy to shared dest")); mtx_lock(&src->ps_mtx); bcopy(src, dest, offsetof(struct sigacts, ps_refcnt)); mtx_unlock(&src->ps_mtx); } int sigacts_shared(struct sigacts *ps) { return (ps->ps_refcnt > 1); } void sig_drop_caught(struct proc *p) { int sig; struct sigacts *ps; ps = p->p_sigacts; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&ps->ps_mtx, MA_OWNED); SIG_FOREACH(sig, &ps->ps_sigcatch) { sigdflt(ps, sig); if ((sigprop(sig) & SIGPROP_IGNORE) != 0) sigqueue_delete_proc(p, sig); } } static void sigfastblock_failed(struct thread *td, bool sendsig, bool write) { ksiginfo_t ksi; /* * Prevent further fetches and SIGSEGVs, allowing thread to * issue syscalls despite corruption. */ sigfastblock_clear(td); if (!sendsig) return; ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGSEGV; ksi.ksi_code = write ? SEGV_ACCERR : SEGV_MAPERR; ksi.ksi_addr = td->td_sigblock_ptr; trapsignal(td, &ksi); } static bool sigfastblock_fetch_sig(struct thread *td, bool sendsig, uint32_t *valp) { uint32_t res; if ((td->td_pflags & TDP_SIGFASTBLOCK) == 0) return (true); if (fueword32((void *)td->td_sigblock_ptr, &res) == -1) { sigfastblock_failed(td, sendsig, false); return (false); } *valp = res; td->td_sigblock_val = res & ~SIGFASTBLOCK_FLAGS; return (true); } static void sigfastblock_resched(struct thread *td, bool resched) { struct proc *p; if (resched) { p = td->td_proc; PROC_LOCK(p); reschedule_signals(p, td->td_sigmask, 0); PROC_UNLOCK(p); } ast_sched(td, TDA_SIG); } int sys_sigfastblock(struct thread *td, struct sigfastblock_args *uap) { struct proc *p; int error, res; uint32_t oldval; error = 0; p = td->td_proc; switch (uap->cmd) { case SIGFASTBLOCK_SETPTR: if ((td->td_pflags & TDP_SIGFASTBLOCK) != 0) { error = EBUSY; break; } if (((uintptr_t)(uap->ptr) & (sizeof(uint32_t) - 1)) != 0) { error = EINVAL; break; } td->td_pflags |= TDP_SIGFASTBLOCK; td->td_sigblock_ptr = uap->ptr; break; case SIGFASTBLOCK_UNBLOCK: if ((td->td_pflags & TDP_SIGFASTBLOCK) == 0) { error = EINVAL; break; } for (;;) { res = casueword32(td->td_sigblock_ptr, SIGFASTBLOCK_PEND, &oldval, 0); if (res == -1) { error = EFAULT; sigfastblock_failed(td, false, true); break; } if (res == 0) break; MPASS(res == 1); if (oldval != SIGFASTBLOCK_PEND) { error = EBUSY; break; } error = thread_check_susp(td, false); if (error != 0) break; } if (error != 0) break; /* * td_sigblock_val is cleared there, but not on a * syscall exit. The end effect is that a single * interruptible sleep, while user sigblock word is * set, might return EINTR or ERESTART to usermode * without delivering signal. All further sleeps, * until userspace clears the word and does * sigfastblock(UNBLOCK), observe current word and no * longer get interrupted. It is slight * non-conformance, with alternative to have read the * sigblock word on each syscall entry. 
*/ td->td_sigblock_val = 0; /* * Rely on normal ast mechanism to deliver pending * signals to current thread. But notify others about * fake unblock. */ sigfastblock_resched(td, error == 0 && p->p_numthreads != 1); break; case SIGFASTBLOCK_UNSETPTR: if ((td->td_pflags & TDP_SIGFASTBLOCK) == 0) { error = EINVAL; break; } if (!sigfastblock_fetch_sig(td, false, &oldval)) { error = EFAULT; break; } if (oldval != 0 && oldval != SIGFASTBLOCK_PEND) { error = EBUSY; break; } sigfastblock_clear(td); break; default: error = EINVAL; break; } return (error); } void sigfastblock_clear(struct thread *td) { bool resched; if ((td->td_pflags & TDP_SIGFASTBLOCK) == 0) return; td->td_sigblock_val = 0; resched = (td->td_pflags & TDP_SIGFASTPENDING) != 0 || SIGPENDING(td); td->td_pflags &= ~(TDP_SIGFASTBLOCK | TDP_SIGFASTPENDING); sigfastblock_resched(td, resched); } void sigfastblock_fetch(struct thread *td) { uint32_t val; (void)sigfastblock_fetch_sig(td, true, &val); } static void sigfastblock_setpend1(struct thread *td) { int res; uint32_t oldval; if ((td->td_pflags & TDP_SIGFASTPENDING) == 0) return; res = fueword32((void *)td->td_sigblock_ptr, &oldval); if (res == -1) { sigfastblock_failed(td, true, false); return; } for (;;) { res = casueword32(td->td_sigblock_ptr, oldval, &oldval, oldval | SIGFASTBLOCK_PEND); if (res == -1) { sigfastblock_failed(td, true, true); return; } if (res == 0) { td->td_sigblock_val = oldval & ~SIGFASTBLOCK_FLAGS; td->td_pflags &= ~TDP_SIGFASTPENDING; break; } MPASS(res == 1); if (thread_check_susp(td, false) != 0) break; } } static void sigfastblock_setpend(struct thread *td, bool resched) { struct proc *p; sigfastblock_setpend1(td); if (resched) { p = td->td_proc; PROC_LOCK(p); reschedule_signals(p, fastblock_mask, SIGPROCMASK_FASTBLK); PROC_UNLOCK(p); } } diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c index a4bfe8e21aed..8baa78951501 100644 --- a/sys/kern/kern_sysctl.c +++ b/sys/kern/kern_sysctl.c @@ -1,3139 +1,3139 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Mike Karels at Berkeley Software Design, Inc. * * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD * project, to make these variables more userfriendly. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 */ #include #include "opt_capsicum.h" #include "opt_ddb.h" #include "opt_ktrace.h" #include "opt_sysctl.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #ifdef DDB #include #include #endif #include #include #include #include static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal magic"); static MALLOC_DEFINE(M_SYSCTLOID, "sysctloid", "sysctl dynamic oids"); static MALLOC_DEFINE(M_SYSCTLTMP, "sysctltmp", "sysctl temp output buffer"); RB_GENERATE(sysctl_oid_list, sysctl_oid, oid_link, cmp_sysctl_oid); /* * The sysctllock protects the MIB tree. It also protects sysctl * contexts used with dynamic sysctls. The sysctl_register_oid() and * sysctl_unregister_oid() routines require the sysctllock to already * be held, so the sysctl_wlock() and sysctl_wunlock() routines are * provided for the few places in the kernel which need to use that * API rather than using the dynamic API. Use of the dynamic API is * strongly encouraged for most code. * * The sysctlmemlock is used to limit the amount of user memory wired for * sysctl requests. This is implemented by serializing any userland * sysctl requests larger than a single page via an exclusive lock. * * The sysctlstringlock is used to protect concurrent access to writable * string nodes in sysctl_handle_string(). 
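 *
 * As an illustrative aside (not part of this change), a subsystem that
 * needs the static registration API described above would follow this
 * pattern, with a hypothetical statically allocated oid:
 *
 *	sysctl_wlock();
 *	sysctl_register_oid(&example_oid);
 *	sysctl_wunlock();
 *
 * Most code should instead use the dynamic API (sysctl_add_oid() and the
 * SYSCTL_ADD_* wrappers), which takes the lock internally.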
*/ static struct rmlock sysctllock; static struct sx __exclusive_cache_line sysctlmemlock; static struct sx sysctlstringlock; #define SYSCTL_WLOCK() rm_wlock(&sysctllock) #define SYSCTL_WUNLOCK() rm_wunlock(&sysctllock) #define SYSCTL_RLOCK(tracker) rm_rlock(&sysctllock, (tracker)) #define SYSCTL_RUNLOCK(tracker) rm_runlock(&sysctllock, (tracker)) #define SYSCTL_WLOCKED() rm_wowned(&sysctllock) #define SYSCTL_ASSERT_LOCKED() rm_assert(&sysctllock, RA_LOCKED) #define SYSCTL_ASSERT_WLOCKED() rm_assert(&sysctllock, RA_WLOCKED) #define SYSCTL_ASSERT_RLOCKED() rm_assert(&sysctllock, RA_RLOCKED) #define SYSCTL_INIT() rm_init_flags(&sysctllock, "sysctl lock", \ RM_SLEEPABLE) #define SYSCTL_SLEEP(ch, wmesg, timo) \ rm_sleep(ch, &sysctllock, 0, wmesg, timo) static int sysctl_root(SYSCTL_HANDLER_ARGS); /* Root list */ struct sysctl_oid_list sysctl__children = RB_INITIALIZER(&sysctl__children); static char* sysctl_escape_name(const char*); static int sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del, int recurse); static int sysctl_old_kernel(struct sysctl_req *, const void *, size_t); static int sysctl_new_kernel(struct sysctl_req *, void *, size_t); static int name2oid(const char *, int *, int *, struct sysctl_oid **); static struct sysctl_oid * sysctl_find_oidname(const char *name, struct sysctl_oid_list *list) { struct sysctl_oid *oidp; SYSCTL_ASSERT_LOCKED(); SYSCTL_FOREACH(oidp, list) { if (strcmp(oidp->oid_name, name) == 0) { return (oidp); } } return (NULL); } static struct sysctl_oid * sysctl_find_oidnamelen(const char *name, size_t len, struct sysctl_oid_list *list) { struct sysctl_oid *oidp; SYSCTL_ASSERT_LOCKED(); SYSCTL_FOREACH(oidp, list) { if (strncmp(oidp->oid_name, name, len) == 0 && oidp->oid_name[len] == '\0') return (oidp); } return (NULL); } /* * Initialization of the MIB tree. * * Order by number in each list. */ void sysctl_wlock(void) { SYSCTL_WLOCK(); } void sysctl_wunlock(void) { SYSCTL_WUNLOCK(); } static int sysctl_root_handler_locked(struct sysctl_oid *oid, void *arg1, intmax_t arg2, struct sysctl_req *req, struct rm_priotracker *tracker) { int error; if (oid->oid_kind & CTLFLAG_DYN) atomic_add_int(&oid->oid_running, 1); if (tracker != NULL) SYSCTL_RUNLOCK(tracker); else SYSCTL_WUNLOCK(); /* * Treat set CTLFLAG_NEEDGIANT and unset CTLFLAG_MPSAFE flags the same, * untill we're ready to remove all traces of Giant from sysctl(9). 
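 *
 * For illustration only (not part of this change): a handler declared
 * with CTLFLAG_MPSAFE, e.g. the hypothetical
 *
 *	SYSCTL_PROC(_kern, OID_AUTO, example, CTLTYPE_INT | CTLFLAG_RW |
 *	    CTLFLAG_MPSAFE, 0, 0, example_sysctl, "I", "example knob");
 *
 * runs here without Giant, while a handler lacking CTLFLAG_MPSAFE (or
 * marked CTLFLAG_NEEDGIANT) is bracketed by mtx_lock(&Giant) and
 * mtx_unlock(&Giant) below.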
*/ if ((oid->oid_kind & CTLFLAG_NEEDGIANT) || (!(oid->oid_kind & CTLFLAG_MPSAFE))) mtx_lock(&Giant); error = oid->oid_handler(oid, arg1, arg2, req); if ((oid->oid_kind & CTLFLAG_NEEDGIANT) || (!(oid->oid_kind & CTLFLAG_MPSAFE))) mtx_unlock(&Giant); KFAIL_POINT_ERROR(_debug_fail_point, sysctl_running, error); if (tracker != NULL) SYSCTL_RLOCK(tracker); else SYSCTL_WLOCK(); if (oid->oid_kind & CTLFLAG_DYN) { if (atomic_fetchadd_int(&oid->oid_running, -1) == 1 && (oid->oid_kind & CTLFLAG_DYING) != 0) wakeup(&oid->oid_running); } return (error); } static void sysctl_load_tunable_by_oid_locked(struct sysctl_oid *oidp) { struct sysctl_req req; struct sysctl_oid *curr; char *penv = NULL; char path[96]; ssize_t rem = sizeof(path); ssize_t len; uint8_t data[512] __aligned(sizeof(uint64_t)); int size; int error; path[--rem] = 0; for (curr = oidp; curr != NULL; curr = SYSCTL_PARENT(curr)) { len = strlen(curr->oid_name); rem -= len; if (curr != oidp) rem -= 1; if (rem < 0) { printf("OID path exceeds %d bytes\n", (int)sizeof(path)); return; } memcpy(path + rem, curr->oid_name, len); if (curr != oidp) path[rem + len] = '.'; } memset(&req, 0, sizeof(req)); req.td = curthread; req.oldfunc = sysctl_old_kernel; req.newfunc = sysctl_new_kernel; req.lock = REQ_UNWIRED; switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_INT: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_UINT: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_LONG: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(long), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_ULONG: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(long), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_S8: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int8_t), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_S16: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int16_t), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_S32: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int32_t), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_S64: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int64_t), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_U8: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(uint8_t), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_U16: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(uint16_t), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_U32: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(uint32_t), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_U64: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(uint64_t), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_STRING: penv = kern_getenv(path + rem); if (penv == NULL) return; req.newlen = strlen(penv); req.newptr = penv; break; default: return; } error = sysctl_root_handler_locked(oidp, oidp->oid_arg1, oidp->oid_arg2, &req, NULL); if (error != 0) 
printf("Setting sysctl %s failed: %d\n", path + rem, error); if (penv != NULL) freeenv(penv); } /* * Locate the path to a given oid. Returns the length of the resulting path, * or -1 if the oid was not found. nodes must have room for CTL_MAXNAME * elements. */ static int sysctl_search_oid(struct sysctl_oid **nodes, struct sysctl_oid *needle) { int indx; SYSCTL_ASSERT_LOCKED(); indx = 0; /* * Do a depth-first search of the oid tree, looking for 'needle'. Start * with the first child of the root. */ nodes[indx] = RB_MIN(sysctl_oid_list, &sysctl__children); for (;;) { if (nodes[indx] == needle) return (indx + 1); if (nodes[indx] == NULL) { /* Node has no more siblings, so back up to parent. */ if (indx-- == 0) { /* Retreat to root, so give up. */ break; } } else if ((nodes[indx]->oid_kind & CTLTYPE) == CTLTYPE_NODE) { /* Node has children. */ if (++indx == CTL_MAXNAME) { /* Max search depth reached, so give up. */ break; } /* Start with the first child. */ nodes[indx] = RB_MIN(sysctl_oid_list, &nodes[indx - 1]->oid_children); continue; } /* Consider next sibling. */ nodes[indx] = RB_NEXT(sysctl_oid_list, NULL, nodes[indx]); } return (-1); } static void sysctl_warn_reuse(const char *func, struct sysctl_oid *leaf) { struct sysctl_oid *nodes[CTL_MAXNAME]; char buf[128]; struct sbuf sb; int rc, i; (void)sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN | SBUF_INCLUDENUL); sbuf_set_drain(&sb, sbuf_printf_drain, NULL); sbuf_printf(&sb, "%s: can't re-use a leaf (", __func__); rc = sysctl_search_oid(nodes, leaf); if (rc > 0) { for (i = 0; i < rc; i++) sbuf_printf(&sb, "%s%.*s", nodes[i]->oid_name, i != (rc - 1), "."); } else { - sbuf_printf(&sb, "%s", leaf->oid_name); + sbuf_cat(&sb, leaf->oid_name); } - sbuf_printf(&sb, ")!\n"); + sbuf_cat(&sb, ")!\n"); (void)sbuf_finish(&sb); } #ifdef SYSCTL_DEBUG static int sysctl_reuse_test(SYSCTL_HANDLER_ARGS) { struct rm_priotracker tracker; SYSCTL_RLOCK(&tracker); sysctl_warn_reuse(__func__, oidp); SYSCTL_RUNLOCK(&tracker); return (0); } SYSCTL_PROC(_sysctl, OID_AUTO, reuse_test, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_reuse_test, "-", ""); #endif void sysctl_register_oid(struct sysctl_oid *oidp) { struct sysctl_oid_list *parent = oidp->oid_parent; struct sysctl_oid *p, key; int oid_number; int timeout = 2; /* * First check if another oid with the same name already * exists in the parent's list. */ SYSCTL_ASSERT_WLOCKED(); p = sysctl_find_oidname(oidp->oid_name, parent); if (p != NULL) { if ((p->oid_kind & CTLTYPE) == CTLTYPE_NODE) { p->oid_refcnt++; return; } else { sysctl_warn_reuse(__func__, p); return; } } /* get current OID number */ oid_number = oidp->oid_number; #if (OID_AUTO >= 0) #error "OID_AUTO is expected to be a negative value" #endif /* * Any negative OID number qualifies as OID_AUTO. Valid OID * numbers should always be positive. * * NOTE: DO NOT change the starting value here, change it in * , and make sure it is at least 256 to * accommodate e.g. net.inet.raw as a static sysctl node. */ if (oid_number < 0) { static int newoid; /* * By decrementing the next OID number we spend less * time inserting the OIDs into a sorted list. */ if (--newoid < CTL_AUTO_START) newoid = 0x7fffffff; oid_number = newoid; } /* * Insert the OID into the parent's list sorted by OID number. 
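 *
 * Illustrative aside (not part of this change): once inserted, the oid
 * may also be initialized from the kernel environment.  The CTLFLAG_TUN
 * handling below is what lets a hypothetical declaration such as
 *
 *	static int example_limit = 64;
 *	SYSCTL_INT(_kern, OID_AUTO, example_limit, CTLFLAG_RWTUN,
 *	    &example_limit, 0, "example tunable");
 *
 * pick up a loader.conf(5) line like "kern.example_limit=128" at
 * registration time via sysctl_load_tunable_by_oid_locked() above.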
*/ key.oid_number = oid_number; p = RB_NFIND(sysctl_oid_list, parent, &key); while (p != NULL && oid_number == p->oid_number) { /* get the next valid OID number */ if (oid_number < CTL_AUTO_START || oid_number == 0x7fffffff) { /* wraparound - restart */ oid_number = CTL_AUTO_START; /* don't loop forever */ if (!timeout--) panic("sysctl: Out of OID numbers\n"); key.oid_number = oid_number; p = RB_NFIND(sysctl_oid_list, parent, &key); continue; } p = RB_NEXT(sysctl_oid_list, NULL, p); oid_number++; } /* check for non-auto OID number collision */ if (oidp->oid_number >= 0 && oidp->oid_number < CTL_AUTO_START && oid_number >= CTL_AUTO_START) { printf("sysctl: OID number(%d) is already in use for '%s'\n", oidp->oid_number, oidp->oid_name); } /* update the OID number, if any */ oidp->oid_number = oid_number; RB_INSERT(sysctl_oid_list, parent, oidp); if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE && (oidp->oid_kind & CTLFLAG_TUN) != 0 && (oidp->oid_kind & CTLFLAG_NOFETCH) == 0) { #ifdef VIMAGE /* * Can fetch value multiple times for VNET loader tunables. * Only fetch once for non-VNET loader tunables. */ if ((oidp->oid_kind & CTLFLAG_VNET) == 0) #endif oidp->oid_kind |= CTLFLAG_NOFETCH; /* try to fetch value from kernel environment */ sysctl_load_tunable_by_oid_locked(oidp); } } void sysctl_register_disabled_oid(struct sysctl_oid *oidp) { /* * Mark the leaf as dormant if it's not to be immediately enabled. * We do not disable nodes as they can be shared between modules * and it is always safe to access a node. */ KASSERT((oidp->oid_kind & CTLFLAG_DORMANT) == 0, ("internal flag is set in oid_kind")); if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) oidp->oid_kind |= CTLFLAG_DORMANT; sysctl_register_oid(oidp); } void sysctl_enable_oid(struct sysctl_oid *oidp) { SYSCTL_ASSERT_WLOCKED(); if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { KASSERT((oidp->oid_kind & CTLFLAG_DORMANT) == 0, ("sysctl node is marked as dormant")); return; } KASSERT((oidp->oid_kind & CTLFLAG_DORMANT) != 0, ("enabling already enabled sysctl oid")); oidp->oid_kind &= ~CTLFLAG_DORMANT; } void sysctl_unregister_oid(struct sysctl_oid *oidp) { int error; SYSCTL_ASSERT_WLOCKED(); if (oidp->oid_number == OID_AUTO) { error = EINVAL; } else { error = ENOENT; if (RB_REMOVE(sysctl_oid_list, oidp->oid_parent, oidp)) error = 0; } /* * This can happen when a module fails to register and is * being unloaded afterwards. It should not be a panic() * for normal use. */ if (error) { printf("%s: failed(%d) to unregister sysctl(%s)\n", __func__, error, oidp->oid_name); } } /* Initialize a new context to keep track of dynamically added sysctls. */ int sysctl_ctx_init(struct sysctl_ctx_list *c) { if (c == NULL) { return (EINVAL); } /* * No locking here, the caller is responsible for not adding * new nodes to a context until after this function has * returned. */ TAILQ_INIT(c); return (0); } /* Free the context, and destroy all dynamic oids registered in this context */ int sysctl_ctx_free(struct sysctl_ctx_list *clist) { struct sysctl_ctx_entry *e, *e1; int error; error = 0; /* * First perform a "dry run" to check if it's ok to remove oids. * XXX FIXME * XXX This algorithm is a hack. But I don't know any * XXX better solution for now... */ SYSCTL_WLOCK(); TAILQ_FOREACH(e, clist, link) { error = sysctl_remove_oid_locked(e->entry, 0, 0); if (error) break; } /* * Restore deregistered entries, either from the end, * or from the place where error occurred. 
* e contains the entry that was not unregistered */ if (error) e1 = TAILQ_PREV(e, sysctl_ctx_list, link); else e1 = TAILQ_LAST(clist, sysctl_ctx_list); while (e1 != NULL) { sysctl_register_oid(e1->entry); e1 = TAILQ_PREV(e1, sysctl_ctx_list, link); } if (error) { SYSCTL_WUNLOCK(); return(EBUSY); } /* Now really delete the entries */ e = TAILQ_FIRST(clist); while (e != NULL) { e1 = TAILQ_NEXT(e, link); error = sysctl_remove_oid_locked(e->entry, 1, 0); if (error) panic("sysctl_remove_oid: corrupt tree, entry: %s", e->entry->oid_name); free(e, M_SYSCTLOID); e = e1; } SYSCTL_WUNLOCK(); return (error); } /* Add an entry to the context */ struct sysctl_ctx_entry * sysctl_ctx_entry_add(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; SYSCTL_ASSERT_WLOCKED(); if (clist == NULL || oidp == NULL) return(NULL); e = malloc(sizeof(struct sysctl_ctx_entry), M_SYSCTLOID, M_WAITOK); e->entry = oidp; TAILQ_INSERT_HEAD(clist, e, link); return (e); } /* Find an entry in the context */ struct sysctl_ctx_entry * sysctl_ctx_entry_find(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; SYSCTL_ASSERT_WLOCKED(); if (clist == NULL || oidp == NULL) return(NULL); TAILQ_FOREACH(e, clist, link) { if (e->entry == oidp) return(e); } return (e); } /* * Delete an entry from the context. * NOTE: this function doesn't free oidp! You have to remove it * with sysctl_remove_oid(). */ int sysctl_ctx_entry_del(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; if (clist == NULL || oidp == NULL) return (EINVAL); SYSCTL_WLOCK(); e = sysctl_ctx_entry_find(clist, oidp); if (e != NULL) { TAILQ_REMOVE(clist, e, link); SYSCTL_WUNLOCK(); free(e, M_SYSCTLOID); return (0); } else { SYSCTL_WUNLOCK(); return (ENOENT); } } /* * Remove dynamically created sysctl trees. * oidp - top of the tree to be removed * del - if 0 - just deregister, otherwise free up entries as well * recurse - if != 0 traverse the subtree to be deleted */ int sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse) { int error; SYSCTL_WLOCK(); error = sysctl_remove_oid_locked(oidp, del, recurse); SYSCTL_WUNLOCK(); return (error); } int sysctl_remove_name(struct sysctl_oid *parent, const char *name, int del, int recurse) { struct sysctl_oid *p; int error; error = ENOENT; SYSCTL_WLOCK(); p = sysctl_find_oidname(name, &parent->oid_children); if (p) error = sysctl_remove_oid_locked(p, del, recurse); SYSCTL_WUNLOCK(); return (error); } /* * Duplicate the provided string, escaping any illegal characters. The result * must be freed when no longer in use. * * The list of illegal characters is ".". */ static char* sysctl_escape_name(const char* orig) { int i, s = 0, d = 0, nillegals = 0; char *new; /* First count the number of illegal characters */ for (i = 0; orig[i] != '\0'; i++) { if (orig[i] == '.') nillegals++; } /* Allocate storage for new string */ new = malloc(i + 2 * nillegals + 1, M_SYSCTLOID, M_WAITOK); /* Copy the name, escaping characters as we go */ while (orig[s] != '\0') { if (orig[s] == '.') { /* %25 is the hexadecimal representation of '.' 
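 * For example, sysctl_escape_name("a.b") produces "a%25b": the escape is
 * the literal three-character sequence "%25" substituted for each '.' in
 * a component name (strictly, 0x2e is the ASCII code for '.', while 0x25
 * is '%').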
*/ new[d++] = '%'; new[d++] = '2'; new[d++] = '5'; s++; } else { new[d++] = orig[s++]; } } /* Finally, nul-terminate */ new[d] = '\0'; return (new); } static int sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del, int recurse) { struct sysctl_oid *p, *tmp; int error; SYSCTL_ASSERT_WLOCKED(); if (oidp == NULL) return(EINVAL); if ((oidp->oid_kind & CTLFLAG_DYN) == 0) { printf("Warning: can't remove non-dynamic nodes (%s)!\n", oidp->oid_name); return (EINVAL); } /* * WARNING: normal method to do this should be through * sysctl_ctx_free(). Use recursing as the last resort * method to purge your sysctl tree of leftovers... * However, if some other code still references these nodes, * it will panic. */ if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { if (oidp->oid_refcnt == 1) { for(p = RB_MIN(sysctl_oid_list, &oidp->oid_children); p != NULL; p = tmp) { if (!recurse) { printf("Warning: failed attempt to " "remove oid %s with child %s\n", oidp->oid_name, p->oid_name); return (ENOTEMPTY); } tmp = RB_NEXT(sysctl_oid_list, &oidp->oid_children, p); error = sysctl_remove_oid_locked(p, del, recurse); if (error) return (error); } } } if (oidp->oid_refcnt > 1 ) { oidp->oid_refcnt--; } else { if (oidp->oid_refcnt == 0) { printf("Warning: bad oid_refcnt=%u (%s)!\n", oidp->oid_refcnt, oidp->oid_name); return (EINVAL); } sysctl_unregister_oid(oidp); if (del) { /* * Wait for all threads running the handler to drain. * This preserves the previous behavior when the * sysctl lock was held across a handler invocation, * and is necessary for module unload correctness. */ while (oidp->oid_running > 0) { oidp->oid_kind |= CTLFLAG_DYING; SYSCTL_SLEEP(&oidp->oid_running, "oidrm", 0); } if (oidp->oid_descr) free(__DECONST(char *, oidp->oid_descr), M_SYSCTLOID); if (oidp->oid_label) free(__DECONST(char *, oidp->oid_label), M_SYSCTLOID); free(__DECONST(char *, oidp->oid_name), M_SYSCTLOID); free(oidp, M_SYSCTLOID); } } return (0); } /* * Create new sysctls at run time. * clist may point to a valid context initialized with sysctl_ctx_init(). */ struct sysctl_oid * sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent, int number, const char *name, int kind, void *arg1, intmax_t arg2, int (*handler)(SYSCTL_HANDLER_ARGS), const char *fmt, const char *descr, const char *label) { struct sysctl_oid *oidp; char *escaped; /* You have to hook up somewhere.. 
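 *
 * As a sketch only (not part of this change), the usual consumers of
 * sysctl_add_oid() are the SYSCTL_ADD_* wrappers, e.g. from a
 * hypothetical driver attach routine:
 *
 *	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(dev);
 *	struct sysctl_oid *tree = device_get_sysctl_tree(dev);
 *
 *	SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "depth",
 *	    CTLFLAG_RW, &sc->depth, 0, "example queue depth");
 *
 * Destroying the context with sysctl_ctx_free() on detach then removes
 * every oid created through it.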
*/ if (parent == NULL) return(NULL); escaped = sysctl_escape_name(name); /* Check if the node already exists, otherwise create it */ SYSCTL_WLOCK(); oidp = sysctl_find_oidname(escaped, parent); if (oidp != NULL) { free(escaped, M_SYSCTLOID); if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { oidp->oid_refcnt++; /* Update the context */ if (clist != NULL) sysctl_ctx_entry_add(clist, oidp); SYSCTL_WUNLOCK(); return (oidp); } else { sysctl_warn_reuse(__func__, oidp); SYSCTL_WUNLOCK(); return (NULL); } } oidp = malloc(sizeof(struct sysctl_oid), M_SYSCTLOID, M_WAITOK|M_ZERO); oidp->oid_parent = parent; RB_INIT(&oidp->oid_children); oidp->oid_number = number; oidp->oid_refcnt = 1; oidp->oid_name = escaped; oidp->oid_handler = handler; oidp->oid_kind = CTLFLAG_DYN | kind; oidp->oid_arg1 = arg1; oidp->oid_arg2 = arg2; oidp->oid_fmt = fmt; if (descr != NULL) oidp->oid_descr = strdup(descr, M_SYSCTLOID); if (label != NULL) oidp->oid_label = strdup(label, M_SYSCTLOID); /* Update the context, if used */ if (clist != NULL) sysctl_ctx_entry_add(clist, oidp); /* Register this oid */ sysctl_register_oid(oidp); SYSCTL_WUNLOCK(); return (oidp); } /* * Rename an existing oid. */ void sysctl_rename_oid(struct sysctl_oid *oidp, const char *name) { char *newname; char *oldname; newname = strdup(name, M_SYSCTLOID); SYSCTL_WLOCK(); oldname = __DECONST(char *, oidp->oid_name); oidp->oid_name = newname; SYSCTL_WUNLOCK(); free(oldname, M_SYSCTLOID); } /* * Reparent an existing oid. */ int sysctl_move_oid(struct sysctl_oid *oid, struct sysctl_oid_list *parent) { struct sysctl_oid *oidp; SYSCTL_WLOCK(); if (oid->oid_parent == parent) { SYSCTL_WUNLOCK(); return (0); } oidp = sysctl_find_oidname(oid->oid_name, parent); if (oidp != NULL) { SYSCTL_WUNLOCK(); return (EEXIST); } sysctl_unregister_oid(oid); oid->oid_parent = parent; oid->oid_number = OID_AUTO; sysctl_register_oid(oid); SYSCTL_WUNLOCK(); return (0); } /* * Register the kernel's oids on startup. */ SET_DECLARE(sysctl_set, struct sysctl_oid); static void sysctl_register_all(void *arg) { struct sysctl_oid **oidp; sx_init(&sysctlmemlock, "sysctl mem"); sx_init(&sysctlstringlock, "sysctl string handler"); SYSCTL_INIT(); SYSCTL_WLOCK(); SET_FOREACH(oidp, sysctl_set) sysctl_register_oid(*oidp); SYSCTL_WUNLOCK(); } SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_FIRST, sysctl_register_all, NULL); #ifdef VIMAGE static void sysctl_setenv_vnet(void *arg __unused, const char *name) { struct sysctl_oid *oidp; int oid[CTL_MAXNAME]; int error, nlen; SYSCTL_WLOCK(); error = name2oid(name, oid, &nlen, &oidp); if (error) goto out; if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE && (oidp->oid_kind & CTLFLAG_VNET) != 0 && (oidp->oid_kind & CTLFLAG_TUN) != 0 && (oidp->oid_kind & CTLFLAG_NOFETCH) == 0) { /* Update value from kernel environment */ sysctl_load_tunable_by_oid_locked(oidp); } out: SYSCTL_WUNLOCK(); } static void sysctl_unsetenv_vnet(void *arg __unused, const char *name) { struct sysctl_oid *oidp; int oid[CTL_MAXNAME]; int error, nlen; SYSCTL_WLOCK(); /* * The setenv / unsetenv event handlers are invoked by kern_setenv() / * kern_unsetenv() without exclusive locks. It is rare but still possible * that the invoke order of event handlers is different from that of * kern_setenv() and kern_unsetenv(). * Re-check environment variable string to make sure it is unset. 
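 *
 * Illustrative aside (not part of this change): the oids affected here
 * are VNET loader tunables, declared along the lines of the hypothetical
 *
 *	VNET_DEFINE(int, example_limit) = 16;
 *	SYSCTL_INT(_net, OID_AUTO, example_limit,
 *	    CTLFLAG_VNET | CTLFLAG_RWTUN, &VNET_NAME(example_limit), 0,
 *	    "per-vnet example limit");
 *
 * Setting "net.example_limit" with kenv(1) makes the setenv handler
 * above re-fetch the tunable; unsetting it has vnet_restore_init() below
 * fall back to the compiled-in initializer value.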
*/ if (testenv(name)) goto out; error = name2oid(name, oid, &nlen, &oidp); if (error) goto out; if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE && (oidp->oid_kind & CTLFLAG_VNET) != 0 && (oidp->oid_kind & CTLFLAG_TUN) != 0 && (oidp->oid_kind & CTLFLAG_NOFETCH) == 0) { size_t size; switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_INT: case CTLTYPE_UINT: size = sizeof(int); break; case CTLTYPE_LONG: case CTLTYPE_ULONG: size = sizeof(long); break; case CTLTYPE_S8: case CTLTYPE_U8: size = sizeof(int8_t); break; case CTLTYPE_S16: case CTLTYPE_U16: size = sizeof(int16_t); break; case CTLTYPE_S32: case CTLTYPE_U32: size = sizeof(int32_t); break; case CTLTYPE_S64: case CTLTYPE_U64: size = sizeof(int64_t); break; case CTLTYPE_STRING: MPASS(oidp->oid_arg2 > 0); size = oidp->oid_arg2; break; default: goto out; } vnet_restore_init(oidp->oid_arg1, size); } out: SYSCTL_WUNLOCK(); } /* * Register the kernel's setenv / unsetenv events. */ EVENTHANDLER_DEFINE(setenv, sysctl_setenv_vnet, NULL, EVENTHANDLER_PRI_ANY); EVENTHANDLER_DEFINE(unsetenv, sysctl_unsetenv_vnet, NULL, EVENTHANDLER_PRI_ANY); #endif /* * "Staff-functions" * * These functions implement a presently undocumented interface * used by the sysctl program to walk the tree, and get the type * so it can print the value. * This interface is under work and consideration, and should probably * be killed with a big axe by the first person who can find the time. * (be aware though, that the proper interface isn't as obvious as it * may seem, there are various conflicting requirements. * * {CTL_SYSCTL, CTL_SYSCTL_DEBUG} printf the entire MIB-tree. * {CTL_SYSCTL, CTL_SYSCTL_NAME, ...} return the name of the "..." * OID. * {CTL_SYSCTL, CTL_SYSCTL_NEXT, ...} return the next OID, honoring * CTLFLAG_SKIP. * {CTL_SYSCTL, CTL_SYSCTL_NAME2OID} return the OID of the name in * "new" * {CTL_SYSCTL, CTL_SYSCTL_OIDFMT, ...} return the kind & format info * for the "..." OID. * {CTL_SYSCTL, CTL_SYSCTL_OIDDESCR, ...} return the description of the * "..." OID. * {CTL_SYSCTL, CTL_SYSCTL_OIDLABEL, ...} return the aggregation label of * the "..." OID. * {CTL_SYSCTL, CTL_SYSCTL_NEXTNOSKIP, ...} return the next OID, ignoring * CTLFLAG_SKIP. */ #ifdef SYSCTL_DEBUG static void sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i) { int k; struct sysctl_oid *oidp; SYSCTL_ASSERT_LOCKED(); SYSCTL_FOREACH(oidp, l) { for (k=0; koid_number, oidp->oid_name); printf("%c%c", oidp->oid_kind & CTLFLAG_RD ? 'R':' ', oidp->oid_kind & CTLFLAG_WR ? 
'W':' '); if (oidp->oid_handler) printf(" *Handler"); switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_NODE: printf(" Node\n"); if (!oidp->oid_handler) { sysctl_sysctl_debug_dump_node( SYSCTL_CHILDREN(oidp), i + 2); } break; case CTLTYPE_INT: printf(" Int\n"); break; case CTLTYPE_UINT: printf(" u_int\n"); break; case CTLTYPE_LONG: printf(" Long\n"); break; case CTLTYPE_ULONG: printf(" u_long\n"); break; case CTLTYPE_STRING: printf(" String\n"); break; case CTLTYPE_S8: printf(" int8_t\n"); break; case CTLTYPE_S16: printf(" int16_t\n"); break; case CTLTYPE_S32: printf(" int32_t\n"); break; case CTLTYPE_S64: printf(" int64_t\n"); break; case CTLTYPE_U8: printf(" uint8_t\n"); break; case CTLTYPE_U16: printf(" uint16_t\n"); break; case CTLTYPE_U32: printf(" uint32_t\n"); break; case CTLTYPE_U64: printf(" uint64_t\n"); break; case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break; default: printf("\n"); } } } static int sysctl_sysctl_debug(SYSCTL_HANDLER_ARGS) { struct rm_priotracker tracker; int error; error = priv_check(req->td, PRIV_SYSCTL_DEBUG); if (error) return (error); SYSCTL_RLOCK(&tracker); sysctl_sysctl_debug_dump_node(&sysctl__children, 0); SYSCTL_RUNLOCK(&tracker); return (ENOENT); } SYSCTL_PROC(_sysctl, CTL_SYSCTL_DEBUG, debug, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_sysctl_debug, "-", ""); #endif static int sysctl_sysctl_name(SYSCTL_HANDLER_ARGS) { int *name = (int *) arg1; u_int namelen = arg2; int error; struct sysctl_oid *oid, key; struct sysctl_oid_list *lsp = &sysctl__children, *lsp2; struct rm_priotracker tracker; char buf[10]; error = sysctl_wire_old_buffer(req, 0); if (error) return (error); SYSCTL_RLOCK(&tracker); while (namelen) { if (!lsp) { snprintf(buf,sizeof(buf),"%d",*name); if (req->oldidx) error = SYSCTL_OUT(req, ".", 1); if (!error) error = SYSCTL_OUT(req, buf, strlen(buf)); if (error) goto out; namelen--; name++; continue; } lsp2 = NULL; key.oid_number = *name; oid = RB_FIND(sysctl_oid_list, lsp, &key); if (oid) { if (req->oldidx) error = SYSCTL_OUT(req, ".", 1); if (!error) error = SYSCTL_OUT(req, oid->oid_name, strlen(oid->oid_name)); if (error) goto out; namelen--; name++; if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE && !oid->oid_handler) lsp2 = SYSCTL_CHILDREN(oid); } lsp = lsp2; } error = SYSCTL_OUT(req, "", 1); out: SYSCTL_RUNLOCK(&tracker); return (error); } /* * XXXRW/JA: Shouldn't return name data for nodes that we don't permit in * capability mode. */ static SYSCTL_NODE(_sysctl, CTL_SYSCTL_NAME, name, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_name, ""); enum sysctl_iter_action { ITER_SIBLINGS, /* Not matched, continue iterating siblings */ ITER_CHILDREN, /* Node has children we need to iterate over them */ ITER_FOUND, /* Matching node was found */ }; /* * Tries to find the next node for @name and @namelen. * * Returns next action to take. */ static enum sysctl_iter_action sysctl_sysctl_next_node(struct sysctl_oid *oidp, int *name, unsigned int namelen, bool honor_skip) { if ((oidp->oid_kind & CTLFLAG_DORMANT) != 0) return (ITER_SIBLINGS); if (honor_skip && (oidp->oid_kind & CTLFLAG_SKIP) != 0) return (ITER_SIBLINGS); if (namelen == 0) { /* * We have reached a node with a full name match and are * looking for the next oid in its children. * * For CTL_SYSCTL_NEXTNOSKIP we are done. * * For CTL_SYSCTL_NEXT we skip CTLTYPE_NODE (unless it * has a handler) and move on to the children. 
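 *
 * For orientation only (not part of this change), this is the kernel
 * half of the usual userland tree walk, roughly (with the customary
 * sys/sysctl.h and string.h includes):
 *
 *	int qoid[CTL_MAXNAME + 2] = { CTL_SYSCTL, CTL_SYSCTL_NEXT, CTL_KERN };
 *	int next[CTL_MAXNAME];
 *	u_int qlen = 3;
 *	size_t outlen;
 *
 *	for (;;) {
 *		outlen = sizeof(next);
 *		if (sysctl(qoid, qlen, next, &outlen, NULL, 0) == -1)
 *			break;
 *		memcpy(qoid + 2, next, outlen);
 *		qlen = 2 + outlen / sizeof(int);
 *	}
 *
 * ENOENT from the kernel ends the walk; each returned OID is normally
 * resolved to a name via CTL_SYSCTL_NAME before printing.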
*/ if (!honor_skip) return (ITER_FOUND); if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) return (ITER_FOUND); /* If node does not have an iterator, treat it as leaf */ if (oidp->oid_handler) return (ITER_FOUND); /* Report oid as a node to iterate */ return (ITER_CHILDREN); } /* * No match yet. Continue seeking the given name. * * We are iterating in order by oid_number, so skip oids lower * than the one we are looking for. * * When the current oid_number is higher than the one we seek, * that means we have reached the next oid in the sequence and * should return it. * * If the oid_number matches the name at this level then we * have to find a node to continue searching at the next level. */ if (oidp->oid_number < *name) return (ITER_SIBLINGS); if (oidp->oid_number > *name) { /* * We have reached the next oid. * * For CTL_SYSCTL_NEXTNOSKIP we are done. * * For CTL_SYSCTL_NEXT we skip CTLTYPE_NODE (unless it * has a handler) and move on to the children. */ if (!honor_skip) return (ITER_FOUND); if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) return (ITER_FOUND); /* If node does not have an iterator, treat it as leaf */ if (oidp->oid_handler) return (ITER_FOUND); return (ITER_CHILDREN); } /* match at a current level */ if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) return (ITER_SIBLINGS); if (oidp->oid_handler) return (ITER_SIBLINGS); return (ITER_CHILDREN); } /* * Recursively walk the sysctl subtree at lsp until we find the given name. * Returns true and fills in next oid data in @next and @len if oid is found. */ static bool sysctl_sysctl_next_action(struct sysctl_oid_list *lsp, int *name, u_int namelen, int *next, int *len, int level, bool honor_skip) { struct sysctl_oid_list *next_lsp; struct sysctl_oid *oidp = NULL, key; bool success = false; enum sysctl_iter_action action; SYSCTL_ASSERT_LOCKED(); /* * Start the search at the requested oid. But if not found, then scan * through all children. */ if (namelen > 0) { key.oid_number = *name; oidp = RB_FIND(sysctl_oid_list, lsp, &key); } if (!oidp) oidp = RB_MIN(sysctl_oid_list, lsp); for(; oidp != NULL; oidp = RB_NEXT(sysctl_oid_list, lsp, oidp)) { action = sysctl_sysctl_next_node(oidp, name, namelen, honor_skip); if (action == ITER_SIBLINGS) continue; if (action == ITER_FOUND) { success = true; break; } KASSERT((action== ITER_CHILDREN), ("ret(%d)!=ITER_CHILDREN", action)); next_lsp = SYSCTL_CHILDREN(oidp); if (namelen == 0) { success = sysctl_sysctl_next_action(next_lsp, NULL, 0, next + 1, len, level + 1, honor_skip); } else { success = sysctl_sysctl_next_action(next_lsp, name + 1, namelen - 1, next + 1, len, level + 1, honor_skip); if (!success) { /* * We maintain the invariant that current node oid * is >= the oid provided in @name. * As there are no usable children at this node, * current node oid is strictly > than the requested * oid. * Hence, reduce namelen to 0 to allow for picking first * nodes/leafs in the next node in list. 
*/ namelen = 0; } } if (success) break; } if (success) { *next = oidp->oid_number; if (level > *len) *len = level; } return (success); } static int sysctl_sysctl_next(SYSCTL_HANDLER_ARGS) { int *name = (int *) arg1; u_int namelen = arg2; int len, error; bool success; struct sysctl_oid_list *lsp = &sysctl__children; struct rm_priotracker tracker; int next[CTL_MAXNAME]; len = 0; SYSCTL_RLOCK(&tracker); success = sysctl_sysctl_next_action(lsp, name, namelen, next, &len, 1, oidp->oid_number == CTL_SYSCTL_NEXT); SYSCTL_RUNLOCK(&tracker); if (!success) return (ENOENT); error = SYSCTL_OUT(req, next, len * sizeof (int)); return (error); } /* * XXXRW/JA: Shouldn't return next data for nodes that we don't permit in * capability mode. */ static SYSCTL_NODE(_sysctl, CTL_SYSCTL_NEXT, next, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_next, ""); static SYSCTL_NODE(_sysctl, CTL_SYSCTL_NEXTNOSKIP, nextnoskip, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_next, ""); static int name2oid(const char *name, int *oid, int *len, struct sysctl_oid **oidpp) { struct sysctl_oid *oidp; struct sysctl_oid_list *lsp = &sysctl__children; const char *n; SYSCTL_ASSERT_LOCKED(); for (*len = 0; *len < CTL_MAXNAME;) { n = strchrnul(name, '.'); oidp = sysctl_find_oidnamelen(name, n - name, lsp); if (oidp == NULL) return (ENOENT); *oid++ = oidp->oid_number; (*len)++; name = n; if (*name == '.') name++; if (*name == '\0') { if (oidpp) *oidpp = oidp; return (0); } if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) break; if (oidp->oid_handler) break; lsp = SYSCTL_CHILDREN(oidp); } return (ENOENT); } static int sysctl_sysctl_name2oid(SYSCTL_HANDLER_ARGS) { char *p; int error, oid[CTL_MAXNAME], len = 0; struct sysctl_oid *op = NULL; struct rm_priotracker tracker; char buf[32]; if (!req->newlen) return (ENOENT); if (req->newlen >= MAXPATHLEN) /* XXX arbitrary, undocumented */ return (ENAMETOOLONG); p = buf; if (req->newlen >= sizeof(buf)) p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK); error = SYSCTL_IN(req, p, req->newlen); if (error) { if (p != buf) free(p, M_SYSCTL); return (error); } p [req->newlen] = '\0'; SYSCTL_RLOCK(&tracker); error = name2oid(p, oid, &len, &op); SYSCTL_RUNLOCK(&tracker); if (p != buf) free(p, M_SYSCTL); if (error) return (error); error = SYSCTL_OUT(req, oid, len * sizeof *oid); return (error); } /* * XXXRW/JA: Shouldn't return name2oid data for nodes that we don't permit in * capability mode. 
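 *
 * (Illustrative, not part of this change: this node is what
 * sysctlnametomib(3) and kernel_sysctlbyname() drive, roughly
 *
 *	int qoid[2] = { CTL_SYSCTL, CTL_SYSCTL_NAME2OID };
 *	int oid[CTL_MAXNAME];
 *	size_t len = sizeof(oid);
 *
 *	sysctl(qoid, 2, oid, &len, "kern.ostype", strlen("kern.ostype"));
 *
 * after which oid[0 .. len / sizeof(int) - 1] holds the numeric MIB.)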
*/ SYSCTL_PROC(_sysctl, CTL_SYSCTL_NAME2OID, name2oid, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE | CTLFLAG_CAPRW, 0, 0, sysctl_sysctl_name2oid, "I", ""); static int sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; struct rm_priotracker tracker; int error; error = sysctl_wire_old_buffer(req, 0); if (error) return (error); SYSCTL_RLOCK(&tracker); error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); if (error) goto out; if (oid->oid_fmt == NULL) { error = ENOENT; goto out; } error = SYSCTL_OUT(req, &oid->oid_kind, sizeof(oid->oid_kind)); if (error) goto out; error = SYSCTL_OUT(req, oid->oid_fmt, strlen(oid->oid_fmt) + 1); out: SYSCTL_RUNLOCK(&tracker); return (error); } static SYSCTL_NODE(_sysctl, CTL_SYSCTL_OIDFMT, oidfmt, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_oidfmt, ""); static int sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; struct rm_priotracker tracker; int error; error = sysctl_wire_old_buffer(req, 0); if (error) return (error); SYSCTL_RLOCK(&tracker); error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); if (error) goto out; if (oid->oid_descr == NULL) { error = ENOENT; goto out; } error = SYSCTL_OUT(req, oid->oid_descr, strlen(oid->oid_descr) + 1); out: SYSCTL_RUNLOCK(&tracker); return (error); } static SYSCTL_NODE(_sysctl, CTL_SYSCTL_OIDDESCR, oiddescr, CTLFLAG_RD | CTLFLAG_MPSAFE|CTLFLAG_CAPRD, sysctl_sysctl_oiddescr, ""); static int sysctl_sysctl_oidlabel(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; struct rm_priotracker tracker; int error; error = sysctl_wire_old_buffer(req, 0); if (error) return (error); SYSCTL_RLOCK(&tracker); error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); if (error) goto out; if (oid->oid_label == NULL) { error = ENOENT; goto out; } error = SYSCTL_OUT(req, oid->oid_label, strlen(oid->oid_label) + 1); out: SYSCTL_RUNLOCK(&tracker); return (error); } static SYSCTL_NODE(_sysctl, CTL_SYSCTL_OIDLABEL, oidlabel, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_oidlabel, ""); /* * Default "handler" functions. */ /* * Handle a bool. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_bool(SYSCTL_HANDLER_ARGS) { uint8_t temp; int error; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) temp = *(bool *)arg1 ? 1 : 0; else temp = arg2 ? 1 : 0; error = SYSCTL_OUT(req, &temp, sizeof(temp)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else { error = SYSCTL_IN(req, &temp, sizeof(temp)); if (!error) *(bool *)arg1 = temp ? 1 : 0; } return (error); } /* * Handle an int8_t, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_8(SYSCTL_HANDLER_ARGS) { int8_t tmpout; int error = 0; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(int8_t *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(tmpout)); return (error); } /* * Handle an int16_t, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_16(SYSCTL_HANDLER_ARGS) { int16_t tmpout; int error = 0; /* * Attempt to get a coherent snapshot by making a copy of the data. 
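 *
 * (Aside, not part of this change: these helpers are also the usual
 * building blocks of private handlers.  A hypothetical validating
 * wrapper follows the same copy-out then copy-in shape:
 *
 *	static int example_val;
 *
 *	static int
 *	example_sysctl(SYSCTL_HANDLER_ARGS)
 *	{
 *		int error, val;
 *
 *		val = example_val;
 *		error = sysctl_handle_int(oidp, &val, 0, req);
 *		if (error != 0 || req->newptr == NULL)
 *			return (error);
 *		if (val < 0)
 *			return (EINVAL);
 *		example_val = val;
 *		return (0);
 *	}
 *
 * sysctl_msec_to_ticks() below uses exactly this pattern.)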
*/ if (arg1) tmpout = *(int16_t *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(tmpout)); return (error); } /* * Handle an int32_t, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_32(SYSCTL_HANDLER_ARGS) { int32_t tmpout; int error = 0; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(int32_t *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(tmpout)); return (error); } /* * Handle an int, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_int(SYSCTL_HANDLER_ARGS) { int tmpout, error = 0; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(int *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(int)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(int)); return (error); } /* * Based on sysctl_handle_int() convert milliseconds into ticks. * Note: this is used by TCP. */ int sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS) { int error, s, tt; tt = *(int *)arg1; s = (int)((int64_t)tt * 1000 / hz); error = sysctl_handle_int(oidp, &s, 0, req); if (error || !req->newptr) return (error); tt = (int)((int64_t)s * hz / 1000); if (tt < 1) return (EINVAL); *(int *)arg1 = tt; return (0); } /* * Handle a long, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_long(SYSCTL_HANDLER_ARGS) { int error = 0; long tmplong; #ifdef SCTL_MASK32 int tmpint; #endif /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmplong = *(long *)arg1; else tmplong = arg2; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { tmpint = tmplong; error = SYSCTL_OUT(req, &tmpint, sizeof(int)); } else #endif error = SYSCTL_OUT(req, &tmplong, sizeof(long)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; #ifdef SCTL_MASK32 else if (req->flags & SCTL_MASK32) { error = SYSCTL_IN(req, &tmpint, sizeof(int)); *(long *)arg1 = (long)tmpint; } #endif else error = SYSCTL_IN(req, arg1, sizeof(long)); return (error); } /* * Handle a 64 bit int, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_64(SYSCTL_HANDLER_ARGS) { int error = 0; uint64_t tmpout; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(uint64_t *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(uint64_t)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(uint64_t)); return (error); } /* * Handle our generic '\0' terminated 'C' string. * Two cases: * a variable string: point arg1 at it, arg2 is max length. * a constant string: point arg1 at it, arg2 is zero. */ int sysctl_handle_string(SYSCTL_HANDLER_ARGS) { char *tmparg; size_t outlen; int error = 0, ro_string = 0; /* * If the sysctl isn't writable and isn't a preallocated tunable that * can be modified by kenv(2), microoptimise and treat it as a * read-only string. * A zero-length buffer indicates a fixed size read-only * string. 
In ddb, don't worry about trying to make a malloced * snapshot. */ if ((oidp->oid_kind & (CTLFLAG_WR | CTLFLAG_TUN)) == 0 || arg2 == 0 || kdb_active) { arg2 = strlen((char *)arg1) + 1; ro_string = 1; } if (req->oldptr != NULL) { if (ro_string) { tmparg = arg1; outlen = strlen(tmparg) + 1; } else { tmparg = malloc(arg2, M_SYSCTLTMP, M_WAITOK); sx_slock(&sysctlstringlock); memcpy(tmparg, arg1, arg2); sx_sunlock(&sysctlstringlock); outlen = strlen(tmparg) + 1; } error = SYSCTL_OUT(req, tmparg, outlen); if (!ro_string) free(tmparg, M_SYSCTLTMP); } else { if (!ro_string) sx_slock(&sysctlstringlock); outlen = strlen((char *)arg1) + 1; if (!ro_string) sx_sunlock(&sysctlstringlock); error = SYSCTL_OUT(req, NULL, outlen); } if (error || !req->newptr) return (error); if (req->newlen - req->newidx >= arg2 || req->newlen - req->newidx < 0) { error = EINVAL; } else if (req->newlen - req->newidx == 0) { sx_xlock(&sysctlstringlock); ((char *)arg1)[0] = '\0'; sx_xunlock(&sysctlstringlock); } else if (req->newfunc == sysctl_new_kernel) { arg2 = req->newlen - req->newidx; sx_xlock(&sysctlstringlock); error = SYSCTL_IN(req, arg1, arg2); if (error == 0) { ((char *)arg1)[arg2] = '\0'; req->newidx += arg2; } sx_xunlock(&sysctlstringlock); } else { arg2 = req->newlen - req->newidx; tmparg = malloc(arg2, M_SYSCTLTMP, M_WAITOK); error = SYSCTL_IN(req, tmparg, arg2); if (error) { free(tmparg, M_SYSCTLTMP); return (error); } sx_xlock(&sysctlstringlock); memcpy(arg1, tmparg, arg2); ((char *)arg1)[arg2] = '\0'; sx_xunlock(&sysctlstringlock); free(tmparg, M_SYSCTLTMP); req->newidx += arg2; } return (error); } /* * Handle any kind of opaque data. * arg1 points to it, arg2 is the size. */ int sysctl_handle_opaque(SYSCTL_HANDLER_ARGS) { int error, tries; u_int generation; struct sysctl_req req2; /* * Attempt to get a coherent snapshot, by using the thread * pre-emption counter updated from within mi_switch() to * determine if we were pre-empted during a bcopy() or * copyout(). Make 3 attempts at doing this before giving up. * If we encounter an error, stop immediately. */ tries = 0; req2 = *req; retry: generation = curthread->td_generation; error = SYSCTL_OUT(req, arg1, arg2); if (error) return (error); tries++; if (generation != curthread->td_generation && tries < 3) { *req = req2; goto retry; } error = SYSCTL_IN(req, arg1, arg2); return (error); } /* * Based on sysctl_handle_64() convert microseconds to a sbintime. */ int sysctl_usec_to_sbintime(SYSCTL_HANDLER_ARGS) { int error; int64_t usec; usec = sbttous(*(sbintime_t *)arg1); error = sysctl_handle_64(oidp, &usec, 0, req); if (error || !req->newptr) return (error); *(sbintime_t *)arg1 = ustosbt(usec); return (0); } /* * Based on sysctl_handle_64() convert milliseconds to a sbintime. */ int sysctl_msec_to_sbintime(SYSCTL_HANDLER_ARGS) { int error; int64_t msec; msec = sbttoms(*(sbintime_t *)arg1); error = sysctl_handle_64(oidp, &msec, 0, req); if (error || !req->newptr) return (error); *(sbintime_t *)arg1 = mstosbt(msec); return (0); } /* * Convert seconds to a struct timeval. Intended for use with * intervals and thus does not permit negative seconds. */ int sysctl_sec_to_timeval(SYSCTL_HANDLER_ARGS) { struct timeval *tv; int error, secs; tv = arg1; secs = tv->tv_sec; error = sysctl_handle_int(oidp, &secs, 0, req); if (error || req->newptr == NULL) return (error); if (secs < 0) return (EINVAL); tv->tv_sec = secs; return (0); } /* * Transfer functions to/from kernel space. 
* XXX: rather untested at this point */ static int sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l) { size_t i = 0; if (req->oldptr) { i = l; if (req->oldlen <= req->oldidx) i = 0; else if (i > req->oldlen - req->oldidx) i = req->oldlen - req->oldidx; if (i > 0) bcopy(p, (char *)req->oldptr + req->oldidx, i); } req->oldidx += l; if (req->oldptr && i != l) return (ENOMEM); return (0); } static int sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l) { if (!req->newptr) return (0); if (req->newlen - req->newidx < l) return (EINVAL); bcopy((const char *)req->newptr + req->newidx, p, l); req->newidx += l; return (0); } int kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags) { int error = 0; struct sysctl_req req; bzero(&req, sizeof req); req.td = td; req.flags = flags; if (oldlenp) { req.oldlen = *oldlenp; } req.validlen = req.oldlen; if (old) { req.oldptr= old; } if (new != NULL) { req.newlen = newlen; req.newptr = new; } req.oldfunc = sysctl_old_kernel; req.newfunc = sysctl_new_kernel; req.lock = REQ_UNWIRED; error = sysctl_root(0, name, namelen, &req); if (req.lock == REQ_WIRED && req.validlen > 0) vsunlock(req.oldptr, req.validlen); if (error && error != ENOMEM) return (error); if (retval) { if (req.oldptr && req.oldidx > req.validlen) *retval = req.validlen; else *retval = req.oldidx; } return (error); } int kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags) { int oid[CTL_MAXNAME]; size_t oidlen, plen; int error; oid[0] = CTL_SYSCTL; oid[1] = CTL_SYSCTL_NAME2OID; oidlen = sizeof(oid); error = kernel_sysctl(td, oid, 2, oid, &oidlen, (void *)name, strlen(name), &plen, flags); if (error) return (error); error = kernel_sysctl(td, oid, plen / sizeof(int), old, oldlenp, new, newlen, retval, flags); return (error); } /* * Transfer function to/from user space. */ static int sysctl_old_user(struct sysctl_req *req, const void *p, size_t l) { size_t i, len, origidx; int error; origidx = req->oldidx; req->oldidx += l; if (req->oldptr == NULL) return (0); /* * If we have not wired the user supplied buffer and we are currently * holding locks, drop a witness warning, as it's possible that * write operations to the user page can sleep. */ if (req->lock != REQ_WIRED) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "sysctl_old_user()"); i = l; len = req->validlen; if (len <= origidx) i = 0; else { if (i > len - origidx) i = len - origidx; if (req->lock == REQ_WIRED) { error = copyout_nofault(p, (char *)req->oldptr + origidx, i); } else error = copyout(p, (char *)req->oldptr + origidx, i); if (error != 0) return (error); } if (i < l) return (ENOMEM); return (0); } static int sysctl_new_user(struct sysctl_req *req, void *p, size_t l) { int error; if (!req->newptr) return (0); if (req->newlen - req->newidx < l) return (EINVAL); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "sysctl_new_user()"); error = copyin((const char *)req->newptr + req->newidx, p, l); req->newidx += l; return (error); } /* * Wire the user space destination buffer. If set to a value greater than * zero, the len parameter limits the maximum amount of wired memory. */ int sysctl_wire_old_buffer(struct sysctl_req *req, size_t len) { int ret; size_t wiredlen; wiredlen = (len > 0 && len < req->oldlen) ? 
len : req->oldlen; ret = 0; if (req->lock != REQ_WIRED && req->oldptr && req->oldfunc == sysctl_old_user) { if (wiredlen != 0) { ret = vslock(req->oldptr, wiredlen); if (ret != 0) { if (ret != ENOMEM) return (ret); wiredlen = 0; } } req->lock = REQ_WIRED; req->validlen = wiredlen; } return (0); } int sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid, int *nindx, struct sysctl_req *req) { struct sysctl_oid_list *lsp; struct sysctl_oid *oid; struct sysctl_oid key; int indx; SYSCTL_ASSERT_LOCKED(); lsp = &sysctl__children; indx = 0; while (indx < CTL_MAXNAME) { key.oid_number = name[indx]; oid = RB_FIND(sysctl_oid_list, lsp, &key); if (oid == NULL) return (ENOENT); indx++; if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { if (oid->oid_handler != NULL || indx == namelen) { *noid = oid; if (nindx != NULL) *nindx = indx; KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0, ("%s found DYING node %p", __func__, oid)); return (0); } lsp = SYSCTL_CHILDREN(oid); } else if (indx == namelen) { if ((oid->oid_kind & CTLFLAG_DORMANT) != 0) return (ENOENT); *noid = oid; if (nindx != NULL) *nindx = indx; KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0, ("%s found DYING node %p", __func__, oid)); return (0); } else { return (ENOTDIR); } } return (ENOENT); } /* * Traverse our tree, and find the right node, execute whatever it points * to, and return the resulting error code. */ static int sysctl_root(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; struct rm_priotracker tracker; int error, indx, lvl; SYSCTL_RLOCK(&tracker); error = sysctl_find_oid(arg1, arg2, &oid, &indx, req); if (error) goto out; if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { /* * You can't call a sysctl when it's a node, but has * no handler. Inform the user that it's a node. * The indx may or may not be the same as namelen. */ if (oid->oid_handler == NULL) { error = EISDIR; goto out; } } /* Is this sysctl writable? */ if (req->newptr && !(oid->oid_kind & CTLFLAG_WR)) { error = EPERM; goto out; } KASSERT(req->td != NULL, ("sysctl_root(): req->td == NULL")); #ifdef CAPABILITY_MODE /* * If the process is in capability mode, then don't permit reading or * writing unless specifically granted for the node. */ if (IN_CAPABILITY_MODE(req->td)) { if ((req->oldptr && !(oid->oid_kind & CTLFLAG_CAPRD)) || (req->newptr && !(oid->oid_kind & CTLFLAG_CAPWR))) { error = EPERM; goto out; } } #endif /* Is this sysctl sensitive to securelevels? */ if (req->newptr && (oid->oid_kind & CTLFLAG_SECURE)) { lvl = (oid->oid_kind & CTLMASK_SECURE) >> CTLSHIFT_SECURE; error = securelevel_gt(req->td->td_ucred, lvl); if (error) goto out; } /* Is this sysctl writable by only privileged users? 
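 * For illustration (not part of this change): a node carrying
 * CTLFLAG_ANYBODY skips this check entirely, while the hypothetical
 *
 *	static int example;
 *	SYSCTL_INT(_kern, OID_AUTO, example, CTLFLAG_RW | CTLFLAG_PRISON,
 *	    &example, 0, "writable by prison root");
 *
 * only requires PRIV_SYSCTL_WRITEJAIL, which a jailed root normally
 * holds.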
*/ if (req->newptr && !(oid->oid_kind & CTLFLAG_ANYBODY)) { int priv; if (oid->oid_kind & CTLFLAG_PRISON) priv = PRIV_SYSCTL_WRITEJAIL; #ifdef VIMAGE else if ((oid->oid_kind & CTLFLAG_VNET) && prison_owns_vnet(req->td->td_ucred)) priv = PRIV_SYSCTL_WRITEJAIL; #endif else priv = PRIV_SYSCTL_WRITE; error = priv_check(req->td, priv); if (error) goto out; } if (!oid->oid_handler) { error = EINVAL; goto out; } if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { arg1 = (int *)arg1 + indx; arg2 -= indx; } else { arg1 = oid->oid_arg1; arg2 = oid->oid_arg2; } #ifdef MAC error = mac_system_check_sysctl(req->td->td_ucred, oid, arg1, arg2, req); if (error != 0) goto out; #endif #ifdef VIMAGE if ((oid->oid_kind & CTLFLAG_VNET) && arg1 != NULL) arg1 = (void *)(curvnet->vnet_data_base + (uintptr_t)arg1); #endif error = sysctl_root_handler_locked(oid, arg1, arg2, req, &tracker); out: SYSCTL_RUNLOCK(&tracker); return (error); } #ifndef _SYS_SYSPROTO_H_ struct __sysctl_args { int *name; u_int namelen; void *old; size_t *oldlenp; void *new; size_t newlen; }; #endif int sys___sysctl(struct thread *td, struct __sysctl_args *uap) { int error, i, name[CTL_MAXNAME]; size_t j; if (uap->namelen > CTL_MAXNAME || uap->namelen < 2) return (EINVAL); error = copyin(uap->name, &name, uap->namelen * sizeof(int)); if (error) return (error); error = userland_sysctl(td, name, uap->namelen, uap->old, uap->oldlenp, 0, uap->new, uap->newlen, &j, 0); if (error && error != ENOMEM) return (error); if (uap->oldlenp) { i = copyout(&j, uap->oldlenp, sizeof(j)); if (i) return (i); } return (error); } int kern___sysctlbyname(struct thread *td, const char *oname, size_t namelen, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags, bool inkernel) { int oid[CTL_MAXNAME]; char namebuf[16]; char *name; size_t oidlen; int error; if (namelen > MAXPATHLEN || namelen == 0) return (EINVAL); name = namebuf; if (namelen > sizeof(namebuf)) name = malloc(namelen, M_SYSCTL, M_WAITOK); error = copyin(oname, name, namelen); if (error != 0) goto out; oid[0] = CTL_SYSCTL; oid[1] = CTL_SYSCTL_NAME2OID; oidlen = sizeof(oid); error = kernel_sysctl(td, oid, 2, oid, &oidlen, (void *)name, namelen, retval, flags); if (error != 0) goto out; error = userland_sysctl(td, oid, *retval / sizeof(int), old, oldlenp, inkernel, new, newlen, retval, flags); out: if (namelen > sizeof(namebuf)) free(name, M_SYSCTL); return (error); } #ifndef _SYS_SYSPROTO_H_ struct __sysctlbyname_args { const char *name; size_t namelen; void *old; size_t *oldlenp; void *new; size_t newlen; }; #endif int sys___sysctlbyname(struct thread *td, struct __sysctlbyname_args *uap) { size_t rv; int error; error = kern___sysctlbyname(td, uap->name, uap->namelen, uap->old, uap->oldlenp, uap->new, uap->newlen, &rv, 0, 0); if (error != 0) return (error); if (uap->oldlenp != NULL) error = copyout(&rv, uap->oldlenp, sizeof(rv)); return (error); } /* * This is used from various compatibility syscalls too. That's why name * must be in kernel space. 
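 *
 * (Not part of this change, for orientation: purely in-kernel callers
 * normally go through kernel_sysctl() or kernel_sysctlbyname() above,
 * where every pointer is a kernel pointer, e.g. the hypothetical
 *
 *	char buf[64];
 *	size_t len = sizeof(buf);
 *
 *	error = kernel_sysctlbyname(curthread, "kern.ostype", buf, &len,
 *	    NULL, 0, NULL, 0);
 *
 * Compatibility syscalls instead copyin() the numeric name and hand it,
 * already in kernel space, to userland_sysctl() below.)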
*/ int userland_sysctl(struct thread *td, int *name, u_int namelen, void *old, size_t *oldlenp, int inkernel, const void *new, size_t newlen, size_t *retval, int flags) { int error = 0, memlocked; struct sysctl_req req; bzero(&req, sizeof req); req.td = td; req.flags = flags; if (oldlenp) { if (inkernel) { req.oldlen = *oldlenp; } else { error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp)); if (error) return (error); } } req.validlen = req.oldlen; req.oldptr = old; if (new != NULL) { req.newlen = newlen; req.newptr = new; } req.oldfunc = sysctl_old_user; req.newfunc = sysctl_new_user; req.lock = REQ_UNWIRED; #ifdef KTRACE if (KTRPOINT(curthread, KTR_SYSCTL)) ktrsysctl(name, namelen); #endif memlocked = 0; if (req.oldptr && req.oldlen > 4 * PAGE_SIZE) { memlocked = 1; sx_xlock(&sysctlmemlock); } CURVNET_SET(TD_TO_VNET(td)); for (;;) { req.oldidx = 0; req.newidx = 0; error = sysctl_root(0, name, namelen, &req); if (error != EAGAIN) break; kern_yield(PRI_USER); } CURVNET_RESTORE(); if (req.lock == REQ_WIRED && req.validlen > 0) vsunlock(req.oldptr, req.validlen); if (memlocked) sx_xunlock(&sysctlmemlock); if (error && error != ENOMEM) return (error); if (retval) { if (req.oldptr && req.oldidx > req.validlen) *retval = req.validlen; else *retval = req.oldidx; } return (error); } /* * Drain into a sysctl struct. The user buffer should be wired if a page * fault would cause issue. */ static int sbuf_sysctl_drain(void *arg, const char *data, int len) { struct sysctl_req *req = arg; int error; error = SYSCTL_OUT(req, data, len); KASSERT(error >= 0, ("Got unexpected negative value %d", error)); return (error == 0 ? len : -error); } struct sbuf * sbuf_new_for_sysctl(struct sbuf *s, char *buf, int length, struct sysctl_req *req) { /* Supply a default buffer size if none given. 
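 *
 * (Illustrative only, not part of this change: the intended use is a
 * handler that formats into the sbuf and lets the drain push the data
 * out, for example the hypothetical
 *
 *	static int
 *	example_stats_sysctl(SYSCTL_HANDLER_ARGS)
 *	{
 *		struct sbuf sb;
 *		int error;
 *
 *		error = sysctl_wire_old_buffer(req, 0);
 *		if (error != 0)
 *			return (error);
 *		sbuf_new_for_sysctl(&sb, NULL, 128, req);
 *		sbuf_printf(&sb, "requests %ju\n", (uintmax_t)example_requests);
 *		error = sbuf_finish(&sb);
 *		sbuf_delete(&sb);
 *		return (error);
 *	}
 *
 * which is a common pattern for variable-length output.)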
*/ if (buf == NULL && length == 0) length = 64; s = sbuf_new(s, buf, length, SBUF_FIXEDLEN | SBUF_INCLUDENUL); sbuf_set_drain(s, sbuf_sysctl_drain, req); return (s); } #ifdef DDB /* The current OID the debugger is working with */ static struct sysctl_oid *g_ddb_oid; /* The current flags specified by the user */ static int g_ddb_sysctl_flags; /* Check to see if the last sysctl printed */ static int g_ddb_sysctl_printed; static const int ctl_sign[CTLTYPE+1] = { [CTLTYPE_INT] = 1, [CTLTYPE_LONG] = 1, [CTLTYPE_S8] = 1, [CTLTYPE_S16] = 1, [CTLTYPE_S32] = 1, [CTLTYPE_S64] = 1, }; static const int ctl_size[CTLTYPE+1] = { [CTLTYPE_INT] = sizeof(int), [CTLTYPE_UINT] = sizeof(u_int), [CTLTYPE_LONG] = sizeof(long), [CTLTYPE_ULONG] = sizeof(u_long), [CTLTYPE_S8] = sizeof(int8_t), [CTLTYPE_S16] = sizeof(int16_t), [CTLTYPE_S32] = sizeof(int32_t), [CTLTYPE_S64] = sizeof(int64_t), [CTLTYPE_U8] = sizeof(uint8_t), [CTLTYPE_U16] = sizeof(uint16_t), [CTLTYPE_U32] = sizeof(uint32_t), [CTLTYPE_U64] = sizeof(uint64_t), }; #define DB_SYSCTL_NAME_ONLY 0x001 /* Compare with -N */ #define DB_SYSCTL_VALUE_ONLY 0x002 /* Compare with -n */ #define DB_SYSCTL_OPAQUE 0x004 /* Compare with -o */ #define DB_SYSCTL_HEX 0x008 /* Compare with -x */ #define DB_SYSCTL_SAFE_ONLY 0x100 /* Only simple types */ static const char db_sysctl_modifs[] = { 'N', 'n', 'o', 'x', }; static const int db_sysctl_modif_values[] = { DB_SYSCTL_NAME_ONLY, DB_SYSCTL_VALUE_ONLY, DB_SYSCTL_OPAQUE, DB_SYSCTL_HEX, }; /* Handlers considered safe to print while recursing */ static int (* const db_safe_handlers[])(SYSCTL_HANDLER_ARGS) = { sysctl_handle_bool, sysctl_handle_8, sysctl_handle_16, sysctl_handle_32, sysctl_handle_64, sysctl_handle_int, sysctl_handle_long, sysctl_handle_string, sysctl_handle_opaque, }; /* * Use in place of sysctl_old_kernel to print sysctl values. 
* * Compare to the output handling in show_var from sbin/sysctl/sysctl.c */ static int sysctl_old_ddb(struct sysctl_req *req, const void *ptr, size_t len) { const u_char *val, *p; const char *sep1; size_t intlen, slen; uintmax_t umv; intmax_t mv; int sign, ctltype, hexlen, xflag, error; /* Suppress false-positive GCC uninitialized variable warnings */ mv = 0; umv = 0; slen = len; val = p = ptr; if (ptr == NULL) { error = 0; goto out; } /* We are going to print */ g_ddb_sysctl_printed = 1; xflag = g_ddb_sysctl_flags & DB_SYSCTL_HEX; ctltype = (g_ddb_oid->oid_kind & CTLTYPE); sign = ctl_sign[ctltype]; intlen = ctl_size[ctltype]; switch (ctltype) { case CTLTYPE_NODE: case CTLTYPE_STRING: db_printf("%.*s", (int) len, (const char *) p); error = 0; goto out; case CTLTYPE_INT: case CTLTYPE_UINT: case CTLTYPE_LONG: case CTLTYPE_ULONG: case CTLTYPE_S8: case CTLTYPE_S16: case CTLTYPE_S32: case CTLTYPE_S64: case CTLTYPE_U8: case CTLTYPE_U16: case CTLTYPE_U32: case CTLTYPE_U64: hexlen = 2 + (intlen * CHAR_BIT + 3) / 4; sep1 = ""; while (len >= intlen) { switch (ctltype) { case CTLTYPE_INT: case CTLTYPE_UINT: umv = *(const u_int *)p; mv = *(const int *)p; break; case CTLTYPE_LONG: case CTLTYPE_ULONG: umv = *(const u_long *)p; mv = *(const long *)p; break; case CTLTYPE_S8: case CTLTYPE_U8: umv = *(const uint8_t *)p; mv = *(const int8_t *)p; break; case CTLTYPE_S16: case CTLTYPE_U16: umv = *(const uint16_t *)p; mv = *(const int16_t *)p; break; case CTLTYPE_S32: case CTLTYPE_U32: umv = *(const uint32_t *)p; mv = *(const int32_t *)p; break; case CTLTYPE_S64: case CTLTYPE_U64: umv = *(const uint64_t *)p; mv = *(const int64_t *)p; break; } db_printf("%s", sep1); if (xflag) db_printf("%#0*jx", hexlen, umv); else if (!sign) db_printf("%ju", umv); else if (g_ddb_oid->oid_fmt[1] == 'K') { /* Kelvins are currently unsupported. */ error = EOPNOTSUPP; goto out; } else db_printf("%jd", mv); sep1 = " "; len -= intlen; p += intlen; } error = 0; goto out; case CTLTYPE_OPAQUE: /* TODO: Support struct functions. */ /* FALLTHROUGH */ default: db_printf("Format:%s Length:%zu Dump:0x", g_ddb_oid->oid_fmt, len); while (len-- && (xflag || p < val + 16)) db_printf("%02x", *p++); if (!xflag && len > 16) db_printf("..."); error = 0; goto out; } out: req->oldidx += slen; return (error); } /* * Avoid setting new sysctl values from the debugger */ static int sysctl_new_ddb(struct sysctl_req *req, void *p, size_t l) { if (!req->newptr) return (0); /* Changing sysctls from the debugger is currently unsupported */ return (EPERM); } /* * Run a sysctl handler with the DDB oldfunc and newfunc attached. * Instead of copying any output to a buffer we'll dump it right to * the console. 
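 *
 * For example, the "sysctl vm.v_free_min" invocation shown in
 * db_sysctl_cmd_usage() below reaches this function via db_sysctlbyname()
 * and db_show_oid(), and the value is printed by sysctl_old_ddb() rather
 * than being copied out to a buffer.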
*/ static int db_sysctl(struct sysctl_oid *oidp, int *name, u_int namelen, void *old, size_t *oldlenp, size_t *retval, int flags) { struct sysctl_req req; int error; /* Setup the request */ bzero(&req, sizeof req); req.td = kdb_thread; req.oldfunc = sysctl_old_ddb; req.newfunc = sysctl_new_ddb; req.lock = REQ_UNWIRED; if (oldlenp) { req.oldlen = *oldlenp; } req.validlen = req.oldlen; if (old) { req.oldptr = old; } /* Setup our globals for sysctl_old_ddb */ g_ddb_oid = oidp; g_ddb_sysctl_flags = flags; g_ddb_sysctl_printed = 0; error = sysctl_root(0, name, namelen, &req); /* Reset globals */ g_ddb_oid = NULL; g_ddb_sysctl_flags = 0; if (retval) { if (req.oldptr && req.oldidx > req.validlen) *retval = req.validlen; else *retval = req.oldidx; } return (error); } /* * Show a sysctl's name */ static void db_show_oid_name(int *oid, size_t nlen) { struct sysctl_oid *oidp; int qoid[CTL_MAXNAME + 2]; int error; qoid[0] = CTL_SYSCTL; qoid[1] = CTL_SYSCTL_NAME; memcpy(qoid + 2, oid, nlen * sizeof(int)); error = sysctl_find_oid(qoid, nlen + 2, &oidp, NULL, NULL); if (error) db_error("sysctl name oid"); error = db_sysctl(oidp, qoid, nlen + 2, NULL, NULL, NULL, 0); if (error) db_error("sysctl name"); } /* * Check to see if an OID is safe to print from ddb. */ static bool db_oid_safe(const struct sysctl_oid *oidp) { for (unsigned int i = 0; i < nitems(db_safe_handlers); ++i) { if (oidp->oid_handler == db_safe_handlers[i]) return (true); } return (false); } /* * Show a sysctl at a specific OID * Compare to the input handling in show_var from sbin/sysctl/sysctl.c */ static int db_show_oid(struct sysctl_oid *oidp, int *oid, size_t nlen, int flags) { int error, xflag, oflag, Nflag, nflag; size_t len; xflag = flags & DB_SYSCTL_HEX; oflag = flags & DB_SYSCTL_OPAQUE; nflag = flags & DB_SYSCTL_VALUE_ONLY; Nflag = flags & DB_SYSCTL_NAME_ONLY; if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_OPAQUE && (!xflag && !oflag)) return (0); if (Nflag) { db_show_oid_name(oid, nlen); error = 0; goto out; } if (!nflag) { db_show_oid_name(oid, nlen); db_printf(": "); } if ((flags & DB_SYSCTL_SAFE_ONLY) && !db_oid_safe(oidp)) { db_printf("Skipping, unsafe to print while recursing."); error = 0; goto out; } /* Try once, and ask about the size */ len = 0; error = db_sysctl(oidp, oid, nlen, NULL, NULL, &len, flags); if (error) goto out; if (!g_ddb_sysctl_printed) /* Lie about the size */ error = db_sysctl(oidp, oid, nlen, (void *) 1, &len, NULL, flags); out: db_printf("\n"); return (error); } /* * Show all sysctls under a specific OID * Compare to sysctl_all from sbin/sysctl/sysctl.c */ static int db_show_sysctl_all(int *oid, size_t len, int flags) { struct sysctl_oid *oidp; int qoid[CTL_MAXNAME + 2], next[CTL_MAXNAME]; size_t nlen; qoid[0] = CTL_SYSCTL; qoid[1] = CTL_SYSCTL_NEXT; if (len) { nlen = len; memcpy(&qoid[2], oid, nlen * sizeof(int)); } else { nlen = 1; qoid[2] = CTL_KERN; } for (;;) { int error; size_t nextsize = sizeof(next); error = kernel_sysctl(kdb_thread, qoid, nlen + 2, next, &nextsize, NULL, 0, &nlen, 0); if (error != 0) { if (error == ENOENT) return (0); else db_error("sysctl(next)"); } nlen /= sizeof(int); if (nlen < (unsigned int)len) return (0); if (memcmp(&oid[0], &next[0], len * sizeof(int)) != 0) return (0); /* Find the OID in question */ error = sysctl_find_oid(next, nlen, &oidp, NULL, NULL); if (error) return (error); (void)db_show_oid(oidp, next, nlen, flags | DB_SYSCTL_SAFE_ONLY); if (db_pager_quit) return (0); memcpy(&qoid[2 + len], &next[len], (nlen - len) * sizeof(int)); } } /* * Show a sysctl by its 
user facing string */ static int db_sysctlbyname(const char *name, int flags) { struct sysctl_oid *oidp; int oid[CTL_MAXNAME]; int error, nlen; error = name2oid(name, oid, &nlen, &oidp); if (error) { return (error); } if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { db_show_sysctl_all(oid, nlen, flags); } else { error = db_show_oid(oidp, oid, nlen, flags); } return (error); } static void db_sysctl_cmd_usage(void) { db_printf( " sysctl [/Nnox] \n" " \n" " The name of the sysctl to show. \n" " \n" " Show a sysctl by hooking into SYSCTL_IN and SYSCTL_OUT. \n" " This will work for most sysctls, but should not be used \n" " with sysctls that are known to malloc. \n" " \n" " While recursing any \"unsafe\" sysctls will be skipped. \n" " Call sysctl directly on the sysctl to try printing the \n" " skipped sysctl. This is unsafe and may make the ddb \n" " session unusable. \n" " \n" " Arguments: \n" " /N Display only the name of the sysctl. \n" " /n Display only the value of the sysctl. \n" " /o Display opaque values. \n" " /x Display the sysctl in hex. \n" " \n" "For example: \n" "sysctl vm.v_free_min \n" "vn.v_free_min: 12669 \n" ); } /* * Show a specific sysctl similar to sysctl (8). */ DB_COMMAND_FLAGS(sysctl, db_sysctl_cmd, CS_OWN) { char name[TOK_STRING_SIZE]; int error, i, t, flags; /* Parse the modifiers */ t = db_read_token(); if (t == tSLASH || t == tMINUS) { t = db_read_token(); if (t != tIDENT) { db_printf("Bad modifier\n"); error = EINVAL; goto out; } db_strcpy(modif, db_tok_string); } else { db_unread_token(t); modif[0] = '\0'; } flags = 0; for (i = 0; i < nitems(db_sysctl_modifs); i++) { if (strchr(modif, db_sysctl_modifs[i])) { flags |= db_sysctl_modif_values[i]; } } /* Parse the sysctl names */ t = db_read_token(); if (t != tIDENT) { db_printf("Need sysctl name\n"); error = EINVAL; goto out; } /* Copy the name into a temporary buffer */ db_strcpy(name, db_tok_string); /* Ensure there is no trailing cruft */ t = db_read_token(); if (t != tEOL) { db_printf("Unexpected sysctl argument\n"); error = EINVAL; goto out; } error = db_sysctlbyname(name, flags); if (error == ENOENT) { db_printf("unknown oid: '%s'\n", db_tok_string); goto out; } else if (error) { db_printf("%s: error: %d\n", db_tok_string, error); goto out; } out: /* Ensure we eat all of our text */ db_flush_lex(); if (error == EINVAL) { db_sysctl_cmd_usage(); } } #endif /* DDB */ diff --git a/sys/kern/kern_tslog.c b/sys/kern/kern_tslog.c index a22370b85b02..7b0847d5d187 100644 --- a/sys/kern/kern_tslog.c +++ b/sys/kern/kern_tslog.c @@ -1,223 +1,223 @@ /*- * Copyright (c) 2017 Colin Percival * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #ifndef TSLOGSIZE #define TSLOGSIZE 262144 #endif static volatile long nrecs = 0; static struct timestamp { void * td; int type; const char * f; const char * s; uint64_t tsc; } timestamps[TSLOGSIZE]; void tslog(void * td, int type, const char * f, const char * s) { uint64_t tsc = get_cyclecount(); long pos; /* A NULL thread is thread0 before curthread is set. */ if (td == NULL) td = &thread0; /* Grab a slot. */ pos = atomic_fetchadd_long(&nrecs, 1); /* Store record. */ if (pos < nitems(timestamps)) { timestamps[pos].td = td; timestamps[pos].type = type; timestamps[pos].f = f; timestamps[pos].s = s; timestamps[pos].tsc = tsc; } } static int sysctl_debug_tslog(SYSCTL_HANDLER_ARGS) { int error; struct sbuf *sb; size_t i, limit; caddr_t loader_tslog; void * loader_tslog_buf; size_t loader_tslog_len; /* * This code can race against the code in tslog() which stores * records: Theoretically we could end up reading a record after * its slots have been reserved but before it has been written. * Since this code takes orders of magnitude longer to run than * tslog() takes to write a record, it is highly unlikely that * anyone will ever experience this race. */ sb = sbuf_new_for_sysctl(NULL, NULL, 1024, req); /* Get data from the boot loader, if it provided any. */ loader_tslog = preload_search_by_type("TSLOG data"); if (loader_tslog != NULL) { loader_tslog_buf = preload_fetch_addr(loader_tslog); loader_tslog_len = preload_fetch_size(loader_tslog); sbuf_bcat(sb, loader_tslog_buf, loader_tslog_len); } /* Add data logged within the kernel. */ limit = MIN(nrecs, nitems(timestamps)); for (i = 0; i < limit; i++) { sbuf_printf(sb, "%p", timestamps[i].td); sbuf_printf(sb, " %llu", (unsigned long long)timestamps[i].tsc); switch (timestamps[i].type) { case TS_ENTER: - sbuf_printf(sb, " ENTER"); + sbuf_cat(sb, " ENTER"); break; case TS_EXIT: - sbuf_printf(sb, " EXIT"); + sbuf_cat(sb, " EXIT"); break; case TS_THREAD: - sbuf_printf(sb, " THREAD"); + sbuf_cat(sb, " THREAD"); break; case TS_EVENT: - sbuf_printf(sb, " EVENT"); + sbuf_cat(sb, " EVENT"); break; } sbuf_printf(sb, " %s", timestamps[i].f ? timestamps[i].f : "(null)"); if (timestamps[i].s) sbuf_printf(sb, " %s\n", timestamps[i].s); else - sbuf_printf(sb, "\n"); + sbuf_putc(sb, '\n'); } error = sbuf_finish(sb); sbuf_delete(sb); return (error); } SYSCTL_PROC(_debug, OID_AUTO, tslog, CTLTYPE_STRING|CTLFLAG_RD|CTLFLAG_MPSAFE|CTLFLAG_SKIP, 0, 0, sysctl_debug_tslog, "", "Dump recorded event timestamps"); MALLOC_DEFINE(M_TSLOGUSER, "tsloguser", "Strings used by userland tslog"); static struct procdata { pid_t ppid; uint64_t tsc_forked; uint64_t tsc_exited; char * execname; char * namei; int reused; } procs[PID_MAX + 1]; void tslog_user(pid_t pid, pid_t ppid, const char * execname, const char * namei) { uint64_t tsc = get_cyclecount(); /* If we wrapped, do nothing. */ if (procs[pid].reused) return; /* If we have a ppid, we're recording a fork. 
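 * A ppid of (pid_t)-1 means this is not a fork record; those cases
 * (execname, namei, and exit) are handled further below.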
*/ if (ppid != (pid_t)(-1)) { /* If we have a ppid already, we wrapped. */ if (procs[pid].ppid) { procs[pid].reused = 1; return; } /* Fill in some fields. */ procs[pid].ppid = ppid; procs[pid].tsc_forked = tsc; return; } /* If we have an execname, record it. */ if (execname != NULL) { if (procs[pid].execname != NULL) free(procs[pid].execname, M_TSLOGUSER); procs[pid].execname = strdup(execname, M_TSLOGUSER); return; } /* Record the first namei for the process. */ if (namei != NULL) { if (procs[pid].namei == NULL) procs[pid].namei = strdup(namei, M_TSLOGUSER); return; } /* Otherwise we're recording an exit. */ procs[pid].tsc_exited = tsc; } static int sysctl_debug_tslog_user(SYSCTL_HANDLER_ARGS) { int error; struct sbuf *sb; pid_t pid; sb = sbuf_new_for_sysctl(NULL, NULL, 1024, req); /* Export the data we logged. */ for (pid = 0; pid <= PID_MAX; pid++) { sbuf_printf(sb, "%zu", (size_t)pid); sbuf_printf(sb, " %zu", (size_t)procs[pid].ppid); sbuf_printf(sb, " %llu", (unsigned long long)procs[pid].tsc_forked); sbuf_printf(sb, " %llu", (unsigned long long)procs[pid].tsc_exited); sbuf_printf(sb, " \"%s\"", procs[pid].execname ? procs[pid].execname : ""); sbuf_printf(sb, " \"%s\"", procs[pid].namei ? procs[pid].namei : ""); - sbuf_printf(sb, "\n"); + sbuf_putc(sb, '\n'); } error = sbuf_finish(sb); sbuf_delete(sb); return (error); } SYSCTL_PROC(_debug, OID_AUTO, tslog_user, CTLTYPE_STRING|CTLFLAG_RD|CTLFLAG_MPSAFE|CTLFLAG_SKIP, 0, 0, sysctl_debug_tslog_user, "", "Dump recorded userland event timestamps"); diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c index 20515f4e430b..ebd7139fa612 100644 --- a/sys/kern/sched_ule.c +++ b/sys/kern/sched_ule.c @@ -1,3358 +1,3358 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2002-2007, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * This file implements the ULE scheduler. ULE supports independent CPU * run queues and fine grain locking. It has superior interactive * performance under load even on uni-processor systems. * * etymology: * ULE is the last three letters in schedule. It owes its name to a * generic user created for a scheduling system by Paul Mikesell at * Isilon Systems and a general lack of creativity on the part of the author. 
*/ #include #include "opt_hwpmc_hooks.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #ifdef KDTRACE_HOOKS #include int __read_mostly dtrace_vtime_active; dtrace_vtime_switch_func_t dtrace_vtime_switch_func; #endif #include #include #define KTR_ULE 0 #define TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX))) #define TDQ_NAME_LEN (sizeof("sched lock ") + sizeof(__XSTRING(MAXCPU))) #define TDQ_LOADNAME_LEN (sizeof("CPU ") + sizeof(__XSTRING(MAXCPU)) - 1 + sizeof(" load")) /* * Thread scheduler specific section. All fields are protected * by the thread lock. */ struct td_sched { struct runq *ts_runq; /* Run-queue we're queued on. */ short ts_flags; /* TSF_* flags. */ int ts_cpu; /* CPU that we have affinity for. */ int ts_rltick; /* Real last tick, for affinity. */ int ts_slice; /* Ticks of slice remaining. */ u_int ts_slptime; /* Number of ticks we vol. slept */ u_int ts_runtime; /* Number of ticks we were running */ int ts_ltick; /* Last tick that we were running on */ int ts_ftick; /* First tick that we were running on */ int ts_ticks; /* Tick count */ #ifdef KTR char ts_name[TS_NAME_LEN]; #endif }; /* flags kept in ts_flags */ #define TSF_BOUND 0x0001 /* Thread can not migrate. */ #define TSF_XFERABLE 0x0002 /* Thread was added as transferable. */ #define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0) #define THREAD_CAN_SCHED(td, cpu) \ CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask) _Static_assert(sizeof(struct thread) + sizeof(struct td_sched) <= sizeof(struct thread0_storage), "increase struct thread0_storage.t0st_sched size"); /* * Priority ranges used for interactive and non-interactive timeshare * threads. The timeshare priorities are split up into four ranges. * The first range handles interactive threads. The last three ranges * (NHALF, x, and NHALF) handle non-interactive threads with the outer * ranges supporting nice values. */ #define PRI_TIMESHARE_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1) #define PRI_INTERACT_RANGE ((PRI_TIMESHARE_RANGE - SCHED_PRI_NRESV) / 2) #define PRI_BATCH_RANGE (PRI_TIMESHARE_RANGE - PRI_INTERACT_RANGE) #define PRI_MIN_INTERACT PRI_MIN_TIMESHARE #define PRI_MAX_INTERACT (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE - 1) #define PRI_MIN_BATCH (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE) #define PRI_MAX_BATCH PRI_MAX_TIMESHARE /* * Cpu percentage computation macros and defines. * * SCHED_TICK_SECS: Number of seconds to average the cpu usage across. * SCHED_TICK_TARG: Number of hz ticks to average the cpu usage across. * SCHED_TICK_MAX: Maximum number of ticks before scaling back. * SCHED_TICK_SHIFT: Shift factor to avoid rounding away results. * SCHED_TICK_HZ: Compute the number of hz ticks for a given ticks count. * SCHED_TICK_TOTAL: Gives the amount of time we've been recording ticks. */ #define SCHED_TICK_SECS 10 #define SCHED_TICK_TARG (hz * SCHED_TICK_SECS) #define SCHED_TICK_MAX (SCHED_TICK_TARG + hz) #define SCHED_TICK_SHIFT 10 #define SCHED_TICK_HZ(ts) ((ts)->ts_ticks >> SCHED_TICK_SHIFT) #define SCHED_TICK_TOTAL(ts) (max((ts)->ts_ltick - (ts)->ts_ftick, hz)) /* * These macros determine priorities for non-interactive threads. They are * assigned a priority based on their recent cpu utilization as expressed * by the ratio of ticks to the tick total. 
NHALF priorities at the start * and end of the MIN to MAX timeshare range are only reachable with negative * or positive nice respectively. * * PRI_RANGE: Priority range for utilization dependent priorities. * PRI_NRESV: Number of nice values. * PRI_TICKS: Compute a priority in PRI_RANGE from the ticks count and total. * PRI_NICE: Determines the part of the priority inherited from nice. */ #define SCHED_PRI_NRESV (PRIO_MAX - PRIO_MIN) #define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2) #define SCHED_PRI_MIN (PRI_MIN_BATCH + SCHED_PRI_NHALF) #define SCHED_PRI_MAX (PRI_MAX_BATCH - SCHED_PRI_NHALF) #define SCHED_PRI_RANGE (SCHED_PRI_MAX - SCHED_PRI_MIN + 1) #define SCHED_PRI_TICKS(ts) \ (SCHED_TICK_HZ((ts)) / \ (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE)) #define SCHED_PRI_NICE(nice) (nice) /* * These determine the interactivity of a process. Interactivity differs from * cpu utilization in that it expresses the voluntary time slept vs time ran * while cpu utilization includes all time not running. This more accurately * models the intent of the thread. * * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate * before throttling back. * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time. * INTERACT_MAX: Maximum interactivity value. Smaller is better. * INTERACT_THRESH: Threshold for placement on the current runq. */ #define SCHED_SLP_RUN_MAX ((hz * 5) << SCHED_TICK_SHIFT) #define SCHED_SLP_RUN_FORK ((hz / 2) << SCHED_TICK_SHIFT) #define SCHED_INTERACT_MAX (100) #define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2) #define SCHED_INTERACT_THRESH (30) /* * These parameters determine the slice behavior for batch work. */ #define SCHED_SLICE_DEFAULT_DIVISOR 10 /* ~94 ms, 12 stathz ticks. */ #define SCHED_SLICE_MIN_DIVISOR 6 /* DEFAULT/MIN = ~16 ms. */ /* Flags kept in td_flags. */ #define TDF_PICKCPU TDF_SCHED0 /* Thread should pick new CPU. */ #define TDF_SLICEEND TDF_SCHED2 /* Thread time slice is over. */ /* * tickincr: Converts a stathz tick into a hz domain scaled by * the shift factor. Without the shift the error rate * due to rounding would be unacceptably high. * realstathz: stathz is sometimes 0 and run off of hz. * sched_slice: Runtime of each thread before rescheduling. * preempt_thresh: Priority threshold for preemption and remote IPIs. */ static u_int __read_mostly sched_interact = SCHED_INTERACT_THRESH; static int __read_mostly tickincr = 8 << SCHED_TICK_SHIFT; static int __read_mostly realstathz = 127; /* reset during boot. */ static int __read_mostly sched_slice = 10; /* reset during boot. */ static int __read_mostly sched_slice_min = 1; /* reset during boot. */ #ifdef PREEMPTION #ifdef FULL_PREEMPTION static int __read_mostly preempt_thresh = PRI_MAX_IDLE; #else static int __read_mostly preempt_thresh = PRI_MIN_KERN; #endif #else static int __read_mostly preempt_thresh = 0; #endif static int __read_mostly static_boost = PRI_MIN_BATCH; static int __read_mostly sched_idlespins = 10000; static int __read_mostly sched_idlespinthresh = -1; /* * tdq - per processor runqs and statistics. A mutex synchronizes access to * most fields. Some fields are loaded or modified without the mutex. 
* * Locking protocols: * (c) constant after initialization * (f) flag, set with the tdq lock held, cleared on local CPU * (l) all accesses are CPU-local * (ls) stores are performed by the local CPU, loads may be lockless * (t) all accesses are protected by the tdq mutex * (ts) stores are serialized by the tdq mutex, loads may be lockless */ struct tdq { /* * Ordered to improve efficiency of cpu_search() and switch(). * tdq_lock is padded to avoid false sharing with tdq_load and * tdq_cpu_idle. */ struct mtx_padalign tdq_lock; /* run queue lock. */ struct cpu_group *tdq_cg; /* (c) Pointer to cpu topology. */ struct thread *tdq_curthread; /* (t) Current executing thread. */ int tdq_load; /* (ts) Aggregate load. */ int tdq_sysload; /* (ts) For loadavg, !ITHD load. */ int tdq_cpu_idle; /* (ls) cpu_idle() is active. */ int tdq_transferable; /* (ts) Transferable thread count. */ short tdq_switchcnt; /* (l) Switches this tick. */ short tdq_oldswitchcnt; /* (l) Switches last tick. */ u_char tdq_lowpri; /* (ts) Lowest priority thread. */ u_char tdq_owepreempt; /* (f) Remote preemption pending. */ u_char tdq_idx; /* (t) Current insert index. */ u_char tdq_ridx; /* (t) Current removal index. */ int tdq_id; /* (c) cpuid. */ struct runq tdq_realtime; /* (t) real-time run queue. */ struct runq tdq_timeshare; /* (t) timeshare run queue. */ struct runq tdq_idle; /* (t) Queue of IDLE threads. */ char tdq_name[TDQ_NAME_LEN]; #ifdef KTR char tdq_loadname[TDQ_LOADNAME_LEN]; #endif }; /* Idle thread states and config. */ #define TDQ_RUNNING 1 #define TDQ_IDLE 2 /* Lockless accessors. */ #define TDQ_LOAD(tdq) atomic_load_int(&(tdq)->tdq_load) #define TDQ_TRANSFERABLE(tdq) atomic_load_int(&(tdq)->tdq_transferable) #define TDQ_SWITCHCNT(tdq) (atomic_load_short(&(tdq)->tdq_switchcnt) + \ atomic_load_short(&(tdq)->tdq_oldswitchcnt)) #define TDQ_SWITCHCNT_INC(tdq) (atomic_store_short(&(tdq)->tdq_switchcnt, \ atomic_load_short(&(tdq)->tdq_switchcnt) + 1)) #ifdef SMP struct cpu_group __read_mostly *cpu_top; /* CPU topology */ #define SCHED_AFFINITY_DEFAULT (max(1, hz / 1000)) #define SCHED_AFFINITY(ts, t) ((ts)->ts_rltick > ticks - ((t) * affinity)) /* * Run-time tunables. */ static int rebalance = 1; static int balance_interval = 128; /* Default set in sched_initticks(). */ static int __read_mostly affinity; static int __read_mostly steal_idle = 1; static int __read_mostly steal_thresh = 2; static int __read_mostly always_steal = 0; static int __read_mostly trysteal_limit = 2; /* * One thread queue per processor. 
*/ static struct tdq __read_mostly *balance_tdq; static int balance_ticks; DPCPU_DEFINE_STATIC(struct tdq, tdq); DPCPU_DEFINE_STATIC(uint32_t, randomval); #define TDQ_SELF() ((struct tdq *)PCPU_GET(sched)) #define TDQ_CPU(x) (DPCPU_ID_PTR((x), tdq)) #define TDQ_ID(x) ((x)->tdq_id) #else /* !SMP */ static struct tdq tdq_cpu; #define TDQ_ID(x) (0) #define TDQ_SELF() (&tdq_cpu) #define TDQ_CPU(x) (&tdq_cpu) #endif #define TDQ_LOCK_ASSERT(t, type) mtx_assert(TDQ_LOCKPTR((t)), (type)) #define TDQ_LOCK(t) mtx_lock_spin(TDQ_LOCKPTR((t))) #define TDQ_LOCK_FLAGS(t, f) mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f)) #define TDQ_TRYLOCK(t) mtx_trylock_spin(TDQ_LOCKPTR((t))) #define TDQ_TRYLOCK_FLAGS(t, f) mtx_trylock_spin_flags(TDQ_LOCKPTR((t)), (f)) #define TDQ_UNLOCK(t) mtx_unlock_spin(TDQ_LOCKPTR((t))) #define TDQ_LOCKPTR(t) ((struct mtx *)(&(t)->tdq_lock)) static void sched_setpreempt(int); static void sched_priority(struct thread *); static void sched_thread_priority(struct thread *, u_char); static int sched_interact_score(struct thread *); static void sched_interact_update(struct thread *); static void sched_interact_fork(struct thread *); static void sched_pctcpu_update(struct td_sched *, int); /* Operations on per processor queues */ static struct thread *tdq_choose(struct tdq *); static void tdq_setup(struct tdq *, int i); static void tdq_load_add(struct tdq *, struct thread *); static void tdq_load_rem(struct tdq *, struct thread *); static __inline void tdq_runq_add(struct tdq *, struct thread *, int); static __inline void tdq_runq_rem(struct tdq *, struct thread *); static inline int sched_shouldpreempt(int, int, int); static void tdq_print(int cpu); static void runq_print(struct runq *rq); static int tdq_add(struct tdq *, struct thread *, int); #ifdef SMP static int tdq_move(struct tdq *, struct tdq *); static int tdq_idled(struct tdq *); static void tdq_notify(struct tdq *, int lowpri); static struct thread *tdq_steal(struct tdq *, int); static struct thread *runq_steal(struct runq *, int); static int sched_pickcpu(struct thread *, int); static void sched_balance(void); static bool sched_balance_pair(struct tdq *, struct tdq *); static inline struct tdq *sched_setcpu(struct thread *, int, int); static inline void thread_unblock_switch(struct thread *, struct mtx *); static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS); static int sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg, int indent); #endif static void sched_setup(void *dummy); SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL); static void sched_initticks(void *dummy); SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL); SDT_PROVIDER_DEFINE(sched); SDT_PROBE_DEFINE3(sched, , , change__pri, "struct thread *", "struct proc *", "uint8_t"); SDT_PROBE_DEFINE3(sched, , , dequeue, "struct thread *", "struct proc *", "void *"); SDT_PROBE_DEFINE4(sched, , , enqueue, "struct thread *", "struct proc *", "void *", "int"); SDT_PROBE_DEFINE4(sched, , , lend__pri, "struct thread *", "struct proc *", "uint8_t", "struct thread *"); SDT_PROBE_DEFINE2(sched, , , load__change, "int", "int"); SDT_PROBE_DEFINE2(sched, , , off__cpu, "struct thread *", "struct proc *"); SDT_PROBE_DEFINE(sched, , , on__cpu); SDT_PROBE_DEFINE(sched, , , remain__cpu); SDT_PROBE_DEFINE2(sched, , , surrender, "struct thread *", "struct proc *"); /* * Print the threads waiting on a run-queue. 
*/ static void runq_print(struct runq *rq) { struct rqhead *rqh; struct thread *td; int pri; int j; int i; for (i = 0; i < RQB_LEN; i++) { printf("\t\trunq bits %d 0x%zx\n", i, rq->rq_status.rqb_bits[i]); for (j = 0; j < RQB_BPW; j++) if (rq->rq_status.rqb_bits[i] & (1ul << j)) { pri = j + (i << RQB_L2BPW); rqh = &rq->rq_queues[pri]; TAILQ_FOREACH(td, rqh, td_runq) { printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n", td, td->td_name, td->td_priority, td->td_rqindex, pri); } } } } /* * Print the status of a per-cpu thread queue. Should be a ddb show cmd. */ static void __unused tdq_print(int cpu) { struct tdq *tdq; tdq = TDQ_CPU(cpu); printf("tdq %d:\n", TDQ_ID(tdq)); printf("\tlock %p\n", TDQ_LOCKPTR(tdq)); printf("\tLock name: %s\n", tdq->tdq_name); printf("\tload: %d\n", tdq->tdq_load); printf("\tswitch cnt: %d\n", tdq->tdq_switchcnt); printf("\told switch cnt: %d\n", tdq->tdq_oldswitchcnt); printf("\ttimeshare idx: %d\n", tdq->tdq_idx); printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx); printf("\tload transferable: %d\n", tdq->tdq_transferable); printf("\tlowest priority: %d\n", tdq->tdq_lowpri); printf("\trealtime runq:\n"); runq_print(&tdq->tdq_realtime); printf("\ttimeshare runq:\n"); runq_print(&tdq->tdq_timeshare); printf("\tidle runq:\n"); runq_print(&tdq->tdq_idle); } static inline int sched_shouldpreempt(int pri, int cpri, int remote) { /* * If the new priority is not better than the current priority there is * nothing to do. */ if (pri >= cpri) return (0); /* * Always preempt idle. */ if (cpri >= PRI_MIN_IDLE) return (1); /* * If preemption is disabled don't preempt others. */ if (preempt_thresh == 0) return (0); /* * Preempt if we exceed the threshold. */ if (pri <= preempt_thresh) return (1); /* * If we're interactive or better and there is non-interactive * or worse running preempt only remote processors. */ if (remote && pri <= PRI_MAX_INTERACT && cpri > PRI_MAX_INTERACT) return (1); return (0); } /* * Add a thread to the actual run-queue. Keeps transferable counts up to * date with what is actually on the run-queue. Selects the correct * queue position for timeshare threads. */ static __inline void tdq_runq_add(struct tdq *tdq, struct thread *td, int flags) { struct td_sched *ts; u_char pri; TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); pri = td->td_priority; ts = td_get_sched(td); TD_SET_RUNQ(td); if (THREAD_CAN_MIGRATE(td)) { tdq->tdq_transferable++; ts->ts_flags |= TSF_XFERABLE; } if (pri < PRI_MIN_BATCH) { ts->ts_runq = &tdq->tdq_realtime; } else if (pri <= PRI_MAX_BATCH) { ts->ts_runq = &tdq->tdq_timeshare; KASSERT(pri <= PRI_MAX_BATCH && pri >= PRI_MIN_BATCH, ("Invalid priority %d on timeshare runq", pri)); /* * This queue contains only priorities between MIN and MAX * batch. Use the whole queue to represent these values. */ if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) { pri = RQ_NQS * (pri - PRI_MIN_BATCH) / PRI_BATCH_RANGE; pri = (pri + tdq->tdq_idx) % RQ_NQS; /* * This effectively shortens the queue by one so we * can have a one slot difference between idx and * ridx while we wait for threads to drain. */ if (tdq->tdq_ridx != tdq->tdq_idx && pri == tdq->tdq_ridx) pri = (unsigned char)(pri - 1) % RQ_NQS; } else pri = tdq->tdq_ridx; runq_add_pri(ts->ts_runq, td, pri, flags); return; } else ts->ts_runq = &tdq->tdq_idle; runq_add(ts->ts_runq, td, flags); } /* * Remove a thread from a run-queue. This typically happens when a thread * is selected to run. 
Running threads are not on the queue and the * transferable count does not reflect them. */ static __inline void tdq_runq_rem(struct tdq *tdq, struct thread *td) { struct td_sched *ts; ts = td_get_sched(td); TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); KASSERT(ts->ts_runq != NULL, ("tdq_runq_remove: thread %p null ts_runq", td)); if (ts->ts_flags & TSF_XFERABLE) { tdq->tdq_transferable--; ts->ts_flags &= ~TSF_XFERABLE; } if (ts->ts_runq == &tdq->tdq_timeshare) { if (tdq->tdq_idx != tdq->tdq_ridx) runq_remove_idx(ts->ts_runq, td, &tdq->tdq_ridx); else runq_remove_idx(ts->ts_runq, td, NULL); } else runq_remove(ts->ts_runq, td); } /* * Load is maintained for all threads RUNNING and ON_RUNQ. Add the load * for this thread to the referenced thread queue. */ static void tdq_load_add(struct tdq *tdq, struct thread *td) { TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); tdq->tdq_load++; if ((td->td_flags & TDF_NOLOAD) == 0) tdq->tdq_sysload++; KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load); SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load); } /* * Remove the load from a thread that is transitioning to a sleep state or * exiting. */ static void tdq_load_rem(struct tdq *tdq, struct thread *td) { TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); KASSERT(tdq->tdq_load != 0, ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq))); tdq->tdq_load--; if ((td->td_flags & TDF_NOLOAD) == 0) tdq->tdq_sysload--; KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load); SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load); } /* * Bound timeshare latency by decreasing slice size as load increases. We * consider the maximum latency as the sum of the threads waiting to run * aside from curthread and target no more than sched_slice latency but * no less than sched_slice_min runtime. */ static inline int tdq_slice(struct tdq *tdq) { int load; /* * It is safe to use sys_load here because this is called from * contexts where timeshare threads are running and so there * cannot be higher priority load in the system. */ load = tdq->tdq_sysload - 1; if (load >= SCHED_SLICE_MIN_DIVISOR) return (sched_slice_min); if (load <= 1) return (sched_slice); return (sched_slice / load); } /* * Set lowpri to its exact value by searching the run-queue and * evaluating curthread. curthread may be passed as an optimization. */ static void tdq_setlowpri(struct tdq *tdq, struct thread *ctd) { struct thread *td; TDQ_LOCK_ASSERT(tdq, MA_OWNED); if (ctd == NULL) ctd = tdq->tdq_curthread; td = tdq_choose(tdq); if (td == NULL || td->td_priority > ctd->td_priority) tdq->tdq_lowpri = ctd->td_priority; else tdq->tdq_lowpri = td->td_priority; } #ifdef SMP /* * We need some randomness. Implement a classic Linear Congruential * Generator X_{n+1}=(aX_n+c) mod m. These values are optimized for * m = 2^32, a = 69069 and c = 5. We only return the upper 16 bits * of the random state (in the low bits of our answer) to keep * the maximum randomness. */ static uint32_t sched_random(void) { uint32_t *rndptr; rndptr = DPCPU_PTR(randomval); *rndptr = *rndptr * 69069 + 5; return (*rndptr >> 16); } struct cpu_search { cpuset_t *cs_mask; /* The mask of allowed CPUs to choose from. */ int cs_prefer; /* Prefer this CPU and groups including it. */ int cs_running; /* The thread is now running at cs_prefer. */ int cs_pri; /* Min priority for low. */ int cs_load; /* Max load for low, min load for high. 
*/ int cs_trans; /* Min transferable load for high. */ }; struct cpu_search_res { int csr_cpu; /* The best CPU found. */ int csr_load; /* The load of cs_cpu. */ }; /* * Search the tree of cpu_groups for the lowest or highest loaded CPU. * These routines actually compare the load on all paths through the tree * and find the least loaded cpu on the least loaded path, which may differ * from the least loaded cpu in the system. This balances work among caches * and buses. */ static int cpu_search_lowest(const struct cpu_group *cg, const struct cpu_search *s, struct cpu_search_res *r) { struct cpu_search_res lr; struct tdq *tdq; int c, bload, l, load, p, total; total = 0; bload = INT_MAX; r->csr_cpu = -1; /* Loop through children CPU groups if there are any. */ if (cg->cg_children > 0) { for (c = cg->cg_children - 1; c >= 0; c--) { load = cpu_search_lowest(&cg->cg_child[c], s, &lr); total += load; /* * When balancing do not prefer SMT groups with load >1. * It allows round-robin between SMT groups with equal * load within parent group for more fair scheduling. */ if (__predict_false(s->cs_running) && (cg->cg_child[c].cg_flags & CG_FLAG_THREAD) && load >= 128 && (load & 128) != 0) load += 128; if (lr.csr_cpu >= 0 && (load < bload || (load == bload && lr.csr_load < r->csr_load))) { bload = load; r->csr_cpu = lr.csr_cpu; r->csr_load = lr.csr_load; } } return (total); } /* Loop through children CPUs otherwise. */ for (c = cg->cg_last; c >= cg->cg_first; c--) { if (!CPU_ISSET(c, &cg->cg_mask)) continue; tdq = TDQ_CPU(c); l = TDQ_LOAD(tdq); if (c == s->cs_prefer) { if (__predict_false(s->cs_running)) l--; p = 128; } else p = 0; load = l * 256; total += load - p; /* * Check this CPU is acceptable. * If the threads is already on the CPU, don't look on the TDQ * priority, since it can be the priority of the thread itself. */ if (l > s->cs_load || (atomic_load_char(&tdq->tdq_lowpri) <= s->cs_pri && (!s->cs_running || c != s->cs_prefer)) || !CPU_ISSET(c, s->cs_mask)) continue; /* * When balancing do not prefer CPUs with load > 1. * It allows round-robin between CPUs with equal load * within the CPU group for more fair scheduling. */ if (__predict_false(s->cs_running) && l > 0) p = 0; load -= sched_random() % 128; if (bload > load - p) { bload = load - p; r->csr_cpu = c; r->csr_load = load; } } return (total); } static int cpu_search_highest(const struct cpu_group *cg, const struct cpu_search *s, struct cpu_search_res *r) { struct cpu_search_res lr; struct tdq *tdq; int c, bload, l, load, total; total = 0; bload = INT_MIN; r->csr_cpu = -1; /* Loop through children CPU groups if there are any. */ if (cg->cg_children > 0) { for (c = cg->cg_children - 1; c >= 0; c--) { load = cpu_search_highest(&cg->cg_child[c], s, &lr); total += load; if (lr.csr_cpu >= 0 && (load > bload || (load == bload && lr.csr_load > r->csr_load))) { bload = load; r->csr_cpu = lr.csr_cpu; r->csr_load = lr.csr_load; } } return (total); } /* Loop through children CPUs otherwise. */ for (c = cg->cg_last; c >= cg->cg_first; c--) { if (!CPU_ISSET(c, &cg->cg_mask)) continue; tdq = TDQ_CPU(c); l = TDQ_LOAD(tdq); load = l * 256; total += load; /* * Check this CPU is acceptable. */ if (l < s->cs_load || TDQ_TRANSFERABLE(tdq) < s->cs_trans || !CPU_ISSET(c, s->cs_mask)) continue; load -= sched_random() % 256; if (load > bload) { bload = load; r->csr_cpu = c; } } r->csr_load = bload; return (total); } /* * Find the cpu with the least load via the least loaded path that has a * lowpri greater than pri pri. 
A pri of -1 indicates any priority is * acceptable. */ static inline int sched_lowest(const struct cpu_group *cg, cpuset_t *mask, int pri, int maxload, int prefer, int running) { struct cpu_search s; struct cpu_search_res r; s.cs_prefer = prefer; s.cs_running = running; s.cs_mask = mask; s.cs_pri = pri; s.cs_load = maxload; cpu_search_lowest(cg, &s, &r); return (r.csr_cpu); } /* * Find the cpu with the highest load via the highest loaded path. */ static inline int sched_highest(const struct cpu_group *cg, cpuset_t *mask, int minload, int mintrans) { struct cpu_search s; struct cpu_search_res r; s.cs_mask = mask; s.cs_load = minload; s.cs_trans = mintrans; cpu_search_highest(cg, &s, &r); return (r.csr_cpu); } static void sched_balance_group(struct cpu_group *cg) { struct tdq *tdq; struct thread *td; cpuset_t hmask, lmask; int high, low, anylow; CPU_FILL(&hmask); for (;;) { high = sched_highest(cg, &hmask, 1, 0); /* Stop if there is no more CPU with transferrable threads. */ if (high == -1) break; CPU_CLR(high, &hmask); CPU_COPY(&hmask, &lmask); /* Stop if there is no more CPU left for low. */ if (CPU_EMPTY(&lmask)) break; tdq = TDQ_CPU(high); if (TDQ_LOAD(tdq) == 1) { /* * There is only one running thread. We can't move * it from here, so tell it to pick new CPU by itself. */ TDQ_LOCK(tdq); td = tdq->tdq_curthread; if (td->td_lock == TDQ_LOCKPTR(tdq) && (td->td_flags & TDF_IDLETD) == 0 && THREAD_CAN_MIGRATE(td)) { td->td_flags |= TDF_PICKCPU; ast_sched_locked(td, TDA_SCHED); if (high != curcpu) ipi_cpu(high, IPI_AST); } TDQ_UNLOCK(tdq); break; } anylow = 1; nextlow: if (TDQ_TRANSFERABLE(tdq) == 0) continue; low = sched_lowest(cg, &lmask, -1, TDQ_LOAD(tdq) - 1, high, 1); /* Stop if we looked well and found no less loaded CPU. */ if (anylow && low == -1) break; /* Go to next high if we found no less loaded CPU. */ if (low == -1) continue; /* Transfer thread from high to low. */ if (sched_balance_pair(tdq, TDQ_CPU(low))) { /* CPU that got thread can no longer be a donor. */ CPU_CLR(low, &hmask); } else { /* * If failed, then there is no threads on high * that can run on this low. Drop low from low * mask and look for different one. */ CPU_CLR(low, &lmask); anylow = 0; goto nextlow; } } } static void sched_balance(void) { struct tdq *tdq; balance_ticks = max(balance_interval / 2, 1) + (sched_random() % balance_interval); tdq = TDQ_SELF(); TDQ_UNLOCK(tdq); sched_balance_group(cpu_top); TDQ_LOCK(tdq); } /* * Lock two thread queues using their address to maintain lock order. */ static void tdq_lock_pair(struct tdq *one, struct tdq *two) { if (one < two) { TDQ_LOCK(one); TDQ_LOCK_FLAGS(two, MTX_DUPOK); } else { TDQ_LOCK(two); TDQ_LOCK_FLAGS(one, MTX_DUPOK); } } /* * Unlock two thread queues. Order is not important here. */ static void tdq_unlock_pair(struct tdq *one, struct tdq *two) { TDQ_UNLOCK(one); TDQ_UNLOCK(two); } /* * Transfer load between two imbalanced thread queues. Returns true if a thread * was moved between the queues, and false otherwise. */ static bool sched_balance_pair(struct tdq *high, struct tdq *low) { int cpu, lowpri; bool ret; ret = false; tdq_lock_pair(high, low); /* * Transfer a thread from high to low. */ if (high->tdq_transferable != 0 && high->tdq_load > low->tdq_load) { lowpri = tdq_move(high, low); if (lowpri != -1) { /* * In case the target isn't the current CPU notify it of * the new load, possibly sending an IPI to force it to * reschedule. Otherwise maybe schedule a preemption. 
*/ cpu = TDQ_ID(low); if (cpu != PCPU_GET(cpuid)) tdq_notify(low, lowpri); else sched_setpreempt(low->tdq_lowpri); ret = true; } } tdq_unlock_pair(high, low); return (ret); } /* * Move a thread from one thread queue to another. Returns -1 if the source * queue was empty, else returns the maximum priority of all threads in * the destination queue prior to the addition of the new thread. In the latter * case, this priority can be used to determine whether an IPI needs to be * delivered. */ static int tdq_move(struct tdq *from, struct tdq *to) { struct thread *td; int cpu; TDQ_LOCK_ASSERT(from, MA_OWNED); TDQ_LOCK_ASSERT(to, MA_OWNED); cpu = TDQ_ID(to); td = tdq_steal(from, cpu); if (td == NULL) return (-1); /* * Although the run queue is locked the thread may be * blocked. We can not set the lock until it is unblocked. */ thread_lock_block_wait(td); sched_rem(td); THREAD_LOCKPTR_ASSERT(td, TDQ_LOCKPTR(from)); td->td_lock = TDQ_LOCKPTR(to); td_get_sched(td)->ts_cpu = cpu; return (tdq_add(to, td, SRQ_YIELDING)); } /* * This tdq has idled. Try to steal a thread from another cpu and switch * to it. */ static int tdq_idled(struct tdq *tdq) { struct cpu_group *cg, *parent; struct tdq *steal; cpuset_t mask; int cpu, switchcnt, goup; if (smp_started == 0 || steal_idle == 0 || tdq->tdq_cg == NULL) return (1); CPU_FILL(&mask); CPU_CLR(PCPU_GET(cpuid), &mask); restart: switchcnt = TDQ_SWITCHCNT(tdq); for (cg = tdq->tdq_cg, goup = 0; ; ) { cpu = sched_highest(cg, &mask, steal_thresh, 1); /* * We were assigned a thread but not preempted. Returning * 0 here will cause our caller to switch to it. */ if (TDQ_LOAD(tdq)) return (0); /* * We found no CPU to steal from in this group. Escalate to * the parent and repeat. But if parent has only two children * groups we can avoid searching this group again by searching * the other one specifically and then escalating two levels. */ if (cpu == -1) { if (goup) { cg = cg->cg_parent; goup = 0; } parent = cg->cg_parent; if (parent == NULL) return (1); if (parent->cg_children == 2) { if (cg == &parent->cg_child[0]) cg = &parent->cg_child[1]; else cg = &parent->cg_child[0]; goup = 1; } else cg = parent; continue; } steal = TDQ_CPU(cpu); /* * The data returned by sched_highest() is stale and * the chosen CPU no longer has an eligible thread. * * Testing this ahead of tdq_lock_pair() only catches * this situation about 20% of the time on an 8 core * 16 thread Ryzen 7, but it still helps performance. */ if (TDQ_LOAD(steal) < steal_thresh || TDQ_TRANSFERABLE(steal) == 0) goto restart; /* * Try to lock both queues. If we are assigned a thread while * waited for the lock, switch to it now instead of stealing. * If we can't get the lock, then somebody likely got there * first so continue searching. */ TDQ_LOCK(tdq); if (tdq->tdq_load > 0) { mi_switch(SW_VOL | SWT_IDLE); return (0); } if (TDQ_TRYLOCK_FLAGS(steal, MTX_DUPOK) == 0) { TDQ_UNLOCK(tdq); CPU_CLR(cpu, &mask); continue; } /* * The data returned by sched_highest() is stale and * the chosen CPU no longer has an eligible thread, or * we were preempted and the CPU loading info may be out * of date. The latter is rare. In either case restart * the search. */ if (TDQ_LOAD(steal) < steal_thresh || TDQ_TRANSFERABLE(steal) == 0 || switchcnt != TDQ_SWITCHCNT(tdq)) { tdq_unlock_pair(tdq, steal); goto restart; } /* * Steal the thread and switch to it. */ if (tdq_move(steal, tdq) != -1) break; /* * We failed to acquire a thread even though it looked * like one was available. 
This could be due to affinity * restrictions or for other reasons. Loop again after * removing this CPU from the set. The restart logic * above does not restore this CPU to the set due to the * likelyhood of failing here again. */ CPU_CLR(cpu, &mask); tdq_unlock_pair(tdq, steal); } TDQ_UNLOCK(steal); mi_switch(SW_VOL | SWT_IDLE); return (0); } /* * Notify a remote cpu of new work. Sends an IPI if criteria are met. * * "lowpri" is the minimum scheduling priority among all threads on * the queue prior to the addition of the new thread. */ static void tdq_notify(struct tdq *tdq, int lowpri) { int cpu; TDQ_LOCK_ASSERT(tdq, MA_OWNED); KASSERT(tdq->tdq_lowpri <= lowpri, ("tdq_notify: lowpri %d > tdq_lowpri %d", lowpri, tdq->tdq_lowpri)); if (tdq->tdq_owepreempt) return; /* * Check to see if the newly added thread should preempt the one * currently running. */ if (!sched_shouldpreempt(tdq->tdq_lowpri, lowpri, 1)) return; /* * Make sure that our caller's earlier update to tdq_load is * globally visible before we read tdq_cpu_idle. Idle thread * accesses both of them without locks, and the order is important. */ atomic_thread_fence_seq_cst(); /* * Try to figure out if we can signal the idle thread instead of sending * an IPI. This check is racy; at worst, we will deliever an IPI * unnecessarily. */ cpu = TDQ_ID(tdq); if (TD_IS_IDLETHREAD(tdq->tdq_curthread) && (atomic_load_int(&tdq->tdq_cpu_idle) == 0 || cpu_idle_wakeup(cpu))) return; /* * The run queues have been updated, so any switch on the remote CPU * will satisfy the preemption request. */ tdq->tdq_owepreempt = 1; ipi_cpu(cpu, IPI_PREEMPT); } /* * Steals load from a timeshare queue. Honors the rotating queue head * index. */ static struct thread * runq_steal_from(struct runq *rq, int cpu, u_char start) { struct rqbits *rqb; struct rqhead *rqh; struct thread *td, *first; int bit; int i; rqb = &rq->rq_status; bit = start & (RQB_BPW -1); first = NULL; again: for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) { if (rqb->rqb_bits[i] == 0) continue; if (bit == 0) bit = RQB_FFS(rqb->rqb_bits[i]); for (; bit < RQB_BPW; bit++) { if ((rqb->rqb_bits[i] & (1ul << bit)) == 0) continue; rqh = &rq->rq_queues[bit + (i << RQB_L2BPW)]; TAILQ_FOREACH(td, rqh, td_runq) { if (first) { if (THREAD_CAN_MIGRATE(td) && THREAD_CAN_SCHED(td, cpu)) return (td); } else first = td; } } } if (start != 0) { start = 0; goto again; } if (first && THREAD_CAN_MIGRATE(first) && THREAD_CAN_SCHED(first, cpu)) return (first); return (NULL); } /* * Steals load from a standard linear queue. */ static struct thread * runq_steal(struct runq *rq, int cpu) { struct rqhead *rqh; struct rqbits *rqb; struct thread *td; int word; int bit; rqb = &rq->rq_status; for (word = 0; word < RQB_LEN; word++) { if (rqb->rqb_bits[word] == 0) continue; for (bit = 0; bit < RQB_BPW; bit++) { if ((rqb->rqb_bits[word] & (1ul << bit)) == 0) continue; rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)]; TAILQ_FOREACH(td, rqh, td_runq) if (THREAD_CAN_MIGRATE(td) && THREAD_CAN_SCHED(td, cpu)) return (td); } } return (NULL); } /* * Attempt to steal a thread in priority order from a thread queue. */ static struct thread * tdq_steal(struct tdq *tdq, int cpu) { struct thread *td; TDQ_LOCK_ASSERT(tdq, MA_OWNED); if ((td = runq_steal(&tdq->tdq_realtime, cpu)) != NULL) return (td); if ((td = runq_steal_from(&tdq->tdq_timeshare, cpu, tdq->tdq_ridx)) != NULL) return (td); return (runq_steal(&tdq->tdq_idle, cpu)); } /* * Sets the thread lock and ts_cpu to match the requested cpu. 
Unlocks the * current lock and returns with the assigned queue locked. */ static inline struct tdq * sched_setcpu(struct thread *td, int cpu, int flags) { struct tdq *tdq; struct mtx *mtx; THREAD_LOCK_ASSERT(td, MA_OWNED); tdq = TDQ_CPU(cpu); td_get_sched(td)->ts_cpu = cpu; /* * If the lock matches just return the queue. */ if (td->td_lock == TDQ_LOCKPTR(tdq)) { KASSERT((flags & SRQ_HOLD) == 0, ("sched_setcpu: Invalid lock for SRQ_HOLD")); return (tdq); } /* * The hard case, migration, we need to block the thread first to * prevent order reversals with other cpus locks. */ spinlock_enter(); mtx = thread_lock_block(td); if ((flags & SRQ_HOLD) == 0) mtx_unlock_spin(mtx); TDQ_LOCK(tdq); thread_lock_unblock(td, TDQ_LOCKPTR(tdq)); spinlock_exit(); return (tdq); } SCHED_STAT_DEFINE(pickcpu_intrbind, "Soft interrupt binding"); SCHED_STAT_DEFINE(pickcpu_idle_affinity, "Picked idle cpu based on affinity"); SCHED_STAT_DEFINE(pickcpu_affinity, "Picked cpu based on affinity"); SCHED_STAT_DEFINE(pickcpu_lowest, "Selected lowest load"); SCHED_STAT_DEFINE(pickcpu_local, "Migrated to current cpu"); SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration"); static int sched_pickcpu(struct thread *td, int flags) { struct cpu_group *cg, *ccg; struct td_sched *ts; struct tdq *tdq; cpuset_t *mask; int cpu, pri, r, self, intr; self = PCPU_GET(cpuid); ts = td_get_sched(td); KASSERT(!CPU_ABSENT(ts->ts_cpu), ("sched_pickcpu: Start scheduler on " "absent CPU %d for thread %s.", ts->ts_cpu, td->td_name)); if (smp_started == 0) return (self); /* * Don't migrate a running thread from sched_switch(). */ if ((flags & SRQ_OURSELF) || !THREAD_CAN_MIGRATE(td)) return (ts->ts_cpu); /* * Prefer to run interrupt threads on the processors that generate * the interrupt. */ if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) && curthread->td_intr_nesting_level) { tdq = TDQ_SELF(); if (tdq->tdq_lowpri >= PRI_MIN_IDLE) { SCHED_STAT_INC(pickcpu_idle_affinity); return (self); } ts->ts_cpu = self; intr = 1; cg = tdq->tdq_cg; goto llc; } else { intr = 0; tdq = TDQ_CPU(ts->ts_cpu); cg = tdq->tdq_cg; } /* * If the thread can run on the last cpu and the affinity has not * expired and it is idle, run it there. */ if (THREAD_CAN_SCHED(td, ts->ts_cpu) && atomic_load_char(&tdq->tdq_lowpri) >= PRI_MIN_IDLE && SCHED_AFFINITY(ts, CG_SHARE_L2)) { if (cg->cg_flags & CG_FLAG_THREAD) { /* Check all SMT threads for being idle. */ for (cpu = cg->cg_first; cpu <= cg->cg_last; cpu++) { pri = atomic_load_char(&TDQ_CPU(cpu)->tdq_lowpri); if (CPU_ISSET(cpu, &cg->cg_mask) && pri < PRI_MIN_IDLE) break; } if (cpu > cg->cg_last) { SCHED_STAT_INC(pickcpu_idle_affinity); return (ts->ts_cpu); } } else { SCHED_STAT_INC(pickcpu_idle_affinity); return (ts->ts_cpu); } } llc: /* * Search for the last level cache CPU group in the tree. * Skip SMT, identical groups and caches with expired affinity. * Interrupt threads affinity is explicit and never expires. */ for (ccg = NULL; cg != NULL; cg = cg->cg_parent) { if (cg->cg_flags & CG_FLAG_THREAD) continue; if (cg->cg_children == 1 || cg->cg_count == 1) continue; if (cg->cg_level == CG_SHARE_NONE || (!intr && !SCHED_AFFINITY(ts, cg->cg_level))) continue; ccg = cg; } /* Found LLC shared by all CPUs, so do a global search. */ if (ccg == cpu_top) ccg = NULL; cpu = -1; mask = &td->td_cpuset->cs_mask; pri = td->td_priority; r = TD_IS_RUNNING(td); /* * Try hard to keep interrupts within found LLC. Search the LLC for * the least loaded CPU we can run now. 
For NUMA systems it should * be within target domain, and it also reduces scheduling overhead. */ if (ccg != NULL && intr) { cpu = sched_lowest(ccg, mask, pri, INT_MAX, ts->ts_cpu, r); if (cpu >= 0) SCHED_STAT_INC(pickcpu_intrbind); } else /* Search the LLC for the least loaded idle CPU we can run now. */ if (ccg != NULL) { cpu = sched_lowest(ccg, mask, max(pri, PRI_MAX_TIMESHARE), INT_MAX, ts->ts_cpu, r); if (cpu >= 0) SCHED_STAT_INC(pickcpu_affinity); } /* Search globally for the least loaded CPU we can run now. */ if (cpu < 0) { cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu, r); if (cpu >= 0) SCHED_STAT_INC(pickcpu_lowest); } /* Search globally for the least loaded CPU. */ if (cpu < 0) { cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu, r); if (cpu >= 0) SCHED_STAT_INC(pickcpu_lowest); } KASSERT(cpu >= 0, ("sched_pickcpu: Failed to find a cpu.")); KASSERT(!CPU_ABSENT(cpu), ("sched_pickcpu: Picked absent CPU %d.", cpu)); /* * Compare the lowest loaded cpu to current cpu. */ tdq = TDQ_CPU(cpu); if (THREAD_CAN_SCHED(td, self) && TDQ_SELF()->tdq_lowpri > pri && atomic_load_char(&tdq->tdq_lowpri) < PRI_MIN_IDLE && TDQ_LOAD(TDQ_SELF()) <= TDQ_LOAD(tdq) + 1) { SCHED_STAT_INC(pickcpu_local); cpu = self; } if (cpu != ts->ts_cpu) SCHED_STAT_INC(pickcpu_migration); return (cpu); } #endif /* * Pick the highest priority task we have and return it. */ static struct thread * tdq_choose(struct tdq *tdq) { struct thread *td; TDQ_LOCK_ASSERT(tdq, MA_OWNED); td = runq_choose(&tdq->tdq_realtime); if (td != NULL) return (td); td = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx); if (td != NULL) { KASSERT(td->td_priority >= PRI_MIN_BATCH, ("tdq_choose: Invalid priority on timeshare queue %d", td->td_priority)); return (td); } td = runq_choose(&tdq->tdq_idle); if (td != NULL) { KASSERT(td->td_priority >= PRI_MIN_IDLE, ("tdq_choose: Invalid priority on idle queue %d", td->td_priority)); return (td); } return (NULL); } /* * Initialize a thread queue. */ static void tdq_setup(struct tdq *tdq, int id) { if (bootverbose) printf("ULE: setup cpu %d\n", id); runq_init(&tdq->tdq_realtime); runq_init(&tdq->tdq_timeshare); runq_init(&tdq->tdq_idle); tdq->tdq_id = id; snprintf(tdq->tdq_name, sizeof(tdq->tdq_name), "sched lock %d", (int)TDQ_ID(tdq)); mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock", MTX_SPIN); #ifdef KTR snprintf(tdq->tdq_loadname, sizeof(tdq->tdq_loadname), "CPU %d load", (int)TDQ_ID(tdq)); #endif } #ifdef SMP static void sched_setup_smp(void) { struct tdq *tdq; int i; cpu_top = smp_topo(); CPU_FOREACH(i) { tdq = DPCPU_ID_PTR(i, tdq); tdq_setup(tdq, i); tdq->tdq_cg = smp_topo_find(cpu_top, i); if (tdq->tdq_cg == NULL) panic("Can't find cpu group for %d\n", i); DPCPU_ID_SET(i, randomval, i * 69069 + 5); } PCPU_SET(sched, DPCPU_PTR(tdq)); balance_tdq = TDQ_SELF(); } #endif /* * Setup the thread queues and initialize the topology based on MD * information. */ static void sched_setup(void *dummy) { struct tdq *tdq; #ifdef SMP sched_setup_smp(); #else tdq_setup(TDQ_SELF(), 0); #endif tdq = TDQ_SELF(); /* Add thread0's load since it's running. */ TDQ_LOCK(tdq); thread0.td_lock = TDQ_LOCKPTR(tdq); tdq_load_add(tdq, &thread0); tdq->tdq_curthread = &thread0; tdq->tdq_lowpri = thread0.td_priority; TDQ_UNLOCK(tdq); } /* * This routine determines time constants after stathz and hz are setup. */ /* ARGSUSED */ static void sched_initticks(void *dummy) { int incr; realstathz = stathz ? 
stathz : hz; sched_slice = realstathz / SCHED_SLICE_DEFAULT_DIVISOR; sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR; hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) / realstathz); /* * tickincr is shifted out by 10 to avoid rounding errors due to * hz not being evenly divisible by stathz on all platforms. */ incr = (hz << SCHED_TICK_SHIFT) / realstathz; /* * This does not work for values of stathz that are more than * 1 << SCHED_TICK_SHIFT * hz. In practice this does not happen. */ if (incr == 0) incr = 1; tickincr = incr; #ifdef SMP /* * Set the default balance interval now that we know * what realstathz is. */ balance_interval = realstathz; balance_ticks = balance_interval; affinity = SCHED_AFFINITY_DEFAULT; #endif if (sched_idlespinthresh < 0) sched_idlespinthresh = 2 * max(10000, 6 * hz) / realstathz; } /* * This is the core of the interactivity algorithm. Determines a score based * on past behavior. It is the ratio of sleep time to run time scaled to * a [0, 100] integer. This is the voluntary sleep time of a process, which * differs from the cpu usage because it does not account for time spent * waiting on a run-queue. Would be prettier if we had floating point. * * When a thread's sleep time is greater than its run time the * calculation is: * * scaling factor * interactivity score = --------------------- * sleep time / run time * * * When a thread's run time is greater than its sleep time the * calculation is: * * scaling factor * interactivity score = 2 * scaling factor - --------------------- * run time / sleep time */ static int sched_interact_score(struct thread *td) { struct td_sched *ts; int div; ts = td_get_sched(td); /* * The score is only needed if this is likely to be an interactive * task. Don't go through the expense of computing it if there's * no chance. */ if (sched_interact <= SCHED_INTERACT_HALF && ts->ts_runtime >= ts->ts_slptime) return (SCHED_INTERACT_HALF); if (ts->ts_runtime > ts->ts_slptime) { div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF); return (SCHED_INTERACT_HALF + (SCHED_INTERACT_HALF - (ts->ts_slptime / div))); } if (ts->ts_slptime > ts->ts_runtime) { div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF); return (ts->ts_runtime / div); } /* runtime == slptime */ if (ts->ts_runtime) return (SCHED_INTERACT_HALF); /* * This can happen if slptime and runtime are 0. */ return (0); } /* * Scale the scheduling priority according to the "interactivity" of this * process. */ static void sched_priority(struct thread *td) { u_int pri, score; if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE) return; /* * If the score is interactive we place the thread in the realtime * queue with a priority that is less than kernel and interrupt * priorities. These threads are not subject to nice restrictions. * * Scores greater than this are placed on the normal timeshare queue * where the priority is partially decided by the most recent cpu * utilization and the rest is decided by nice value. * * The nice value of the process has a linear effect on the calculated * score. Negative nice values make it easier for a thread to be * considered interactive. 
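 *
 * A worked example (illustrative, assuming the stock SCHED_INTERACT_HALF
 * of 50 and an interactivity threshold of 30): a thread that sleeps four
 * times as long as it runs scores roughly 50 / 4 = 12, which with nice 0
 * stays below the threshold and is placed in the interactive range.  A
 * thread that runs four times as long as it sleeps scores roughly
 * 2 * 50 - 50 / 4 = 88 and goes to the timeshare queue, where recent CPU
 * ticks and nice pick the exact priority.
 *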
*/ score = imax(0, sched_interact_score(td) + td->td_proc->p_nice); if (score < sched_interact) { pri = PRI_MIN_INTERACT; pri += (PRI_MAX_INTERACT - PRI_MIN_INTERACT + 1) * score / sched_interact; KASSERT(pri >= PRI_MIN_INTERACT && pri <= PRI_MAX_INTERACT, ("sched_priority: invalid interactive priority %u score %u", pri, score)); } else { pri = SCHED_PRI_MIN; if (td_get_sched(td)->ts_ticks) pri += min(SCHED_PRI_TICKS(td_get_sched(td)), SCHED_PRI_RANGE - 1); pri += SCHED_PRI_NICE(td->td_proc->p_nice); KASSERT(pri >= PRI_MIN_BATCH && pri <= PRI_MAX_BATCH, ("sched_priority: invalid priority %u: nice %d, " "ticks %d ftick %d ltick %d tick pri %d", pri, td->td_proc->p_nice, td_get_sched(td)->ts_ticks, td_get_sched(td)->ts_ftick, td_get_sched(td)->ts_ltick, SCHED_PRI_TICKS(td_get_sched(td)))); } sched_user_prio(td, pri); return; } /* * This routine enforces a maximum limit on the amount of scheduling history * kept. It is called after either the slptime or runtime is adjusted. This * function is ugly due to integer math. */ static void sched_interact_update(struct thread *td) { struct td_sched *ts; u_int sum; ts = td_get_sched(td); sum = ts->ts_runtime + ts->ts_slptime; if (sum < SCHED_SLP_RUN_MAX) return; /* * This only happens from two places: * 1) We have added an unusual amount of run time from fork_exit. * 2) We have added an unusual amount of sleep time from sched_sleep(). */ if (sum > SCHED_SLP_RUN_MAX * 2) { if (ts->ts_runtime > ts->ts_slptime) { ts->ts_runtime = SCHED_SLP_RUN_MAX; ts->ts_slptime = 1; } else { ts->ts_slptime = SCHED_SLP_RUN_MAX; ts->ts_runtime = 1; } return; } /* * If we have exceeded by more than 1/5th then the algorithm below * will not bring us back into range. Dividing by two here forces * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX] */ if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) { ts->ts_runtime /= 2; ts->ts_slptime /= 2; return; } ts->ts_runtime = (ts->ts_runtime / 5) * 4; ts->ts_slptime = (ts->ts_slptime / 5) * 4; } /* * Scale back the interactivity history when a child thread is created. The * history is inherited from the parent but the thread may behave totally * differently. For example, a shell spawning a compiler process. We want * to learn that the compiler is behaving badly very quickly. */ static void sched_interact_fork(struct thread *td) { struct td_sched *ts; int ratio; int sum; ts = td_get_sched(td); sum = ts->ts_runtime + ts->ts_slptime; if (sum > SCHED_SLP_RUN_FORK) { ratio = sum / SCHED_SLP_RUN_FORK; ts->ts_runtime /= ratio; ts->ts_slptime /= ratio; } } /* * Called from proc0_init() to setup the scheduler fields. */ void schedinit(void) { struct td_sched *ts0; /* * Set up the scheduler specific parts of thread0. */ ts0 = td_get_sched(&thread0); ts0->ts_ltick = ticks; ts0->ts_ftick = ticks; ts0->ts_slice = 0; ts0->ts_cpu = curcpu; /* set valid CPU number */ } /* * schedinit_ap() is needed prior to calling sched_throw(NULL) to ensure that * the pcpu requirements are met for any calls in the period between curthread * initialization and sched_throw(). One can safely add threads to the queue * before sched_throw(), for instance, as long as the thread lock is setup * correctly. * * TDQ_SELF() relies on the below sched pcpu setting; it may be used only * after schedinit_ap(). 
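 *
 * Illustrative AP bring-up ordering (a sketch based on the code below,
 * not a statement about any particular MD port): the AP startup path is
 * expected to call schedinit_ap() first, which publishes the per-CPU tdq
 * pointer and points the idle thread's lock at it, and only then enter
 * the scheduler via sched_ap_entry(), which immediately uses TDQ_SELF()
 * and TDQ_LOCK().
 *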
*/ void schedinit_ap(void) { #ifdef SMP PCPU_SET(sched, DPCPU_PTR(tdq)); #endif PCPU_GET(idlethread)->td_lock = TDQ_LOCKPTR(TDQ_SELF()); } /* * This is only somewhat accurate since given many processes of the same * priority they will switch when their slices run out, which will be * at most sched_slice stathz ticks. */ int sched_rr_interval(void) { /* Convert sched_slice from stathz to hz. */ return (imax(1, (sched_slice * hz + realstathz / 2) / realstathz)); } /* * Update the percent cpu tracking information when it is requested or * the total history exceeds the maximum. We keep a sliding history of * tick counts that slowly decays. This is less precise than the 4BSD * mechanism since it happens with less regular and frequent events. */ static void sched_pctcpu_update(struct td_sched *ts, int run) { int t = ticks; /* * The signed difference may be negative if the thread hasn't run for * over half of the ticks rollover period. */ if ((u_int)(t - ts->ts_ltick) >= SCHED_TICK_TARG) { ts->ts_ticks = 0; ts->ts_ftick = t - SCHED_TICK_TARG; } else if (t - ts->ts_ftick >= SCHED_TICK_MAX) { ts->ts_ticks = (ts->ts_ticks / (ts->ts_ltick - ts->ts_ftick)) * (ts->ts_ltick - (t - SCHED_TICK_TARG)); ts->ts_ftick = t - SCHED_TICK_TARG; } if (run) ts->ts_ticks += (t - ts->ts_ltick) << SCHED_TICK_SHIFT; ts->ts_ltick = t; } /* * Adjust the priority of a thread. Move it to the appropriate run-queue * if necessary. This is the back-end for several priority related * functions. */ static void sched_thread_priority(struct thread *td, u_char prio) { struct tdq *tdq; int oldpri; KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "prio", "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED, sched_tdname(curthread)); SDT_PROBE3(sched, , , change__pri, td, td->td_proc, prio); if (td != curthread && prio < td->td_priority) { KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread), "lend prio", "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , lend__pri, td, td->td_proc, prio, curthread); } THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority == prio) return; /* * If the priority has been elevated due to priority * propagation, we may have to move ourselves to a new * queue. This could be optimized to not re-add in some * cases. */ if (TD_ON_RUNQ(td) && prio < td->td_priority) { sched_rem(td); td->td_priority = prio; sched_add(td, SRQ_BORROWING | SRQ_HOLDTD); return; } /* * If the thread is currently running we may have to adjust the lowpri * information so other cpus are aware of our current priority. */ if (TD_IS_RUNNING(td)) { tdq = TDQ_CPU(td_get_sched(td)->ts_cpu); oldpri = td->td_priority; td->td_priority = prio; if (prio < tdq->tdq_lowpri) tdq->tdq_lowpri = prio; else if (tdq->tdq_lowpri == oldpri) tdq_setlowpri(tdq, td); return; } td->td_priority = prio; } /* * Update a thread's priority when it is lent another thread's * priority. */ void sched_lend_prio(struct thread *td, u_char prio) { td->td_flags |= TDF_BORROWING; sched_thread_priority(td, prio); } /* * Restore a thread's priority when priority propagation is * over. The prio argument is the minimum priority the thread * needs to have to satisfy other possible priority lending * requests. If the thread's regular priority is less * important than prio, the thread will keep a priority boost * of prio. 
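 *
 * Illustrative example: a thread with an effective base priority of 150
 * that was lent priority 100 through a turnstile.  If the remaining
 * waiters still require priority 120, unlending with prio == 120 keeps
 * the thread boosted at 120 with TDF_BORROWING still set; once the
 * remaining requirement is no better than 150, the flag is cleared and
 * the thread falls back to its base priority.
 *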
*/ void sched_unlend_prio(struct thread *td, u_char prio) { u_char base_pri; if (td->td_base_pri >= PRI_MIN_TIMESHARE && td->td_base_pri <= PRI_MAX_TIMESHARE) base_pri = td->td_user_pri; else base_pri = td->td_base_pri; if (prio >= base_pri) { td->td_flags &= ~TDF_BORROWING; sched_thread_priority(td, base_pri); } else sched_lend_prio(td, prio); } /* * Standard entry for setting the priority to an absolute value. */ void sched_prio(struct thread *td, u_char prio) { u_char oldprio; /* First, update the base priority. */ td->td_base_pri = prio; /* * If the thread is borrowing another thread's priority, don't * ever lower the priority. */ if (td->td_flags & TDF_BORROWING && td->td_priority < prio) return; /* Change the real priority. */ oldprio = td->td_priority; sched_thread_priority(td, prio); /* * If the thread is on a turnstile, then let the turnstile update * its state. */ if (TD_ON_LOCK(td) && oldprio != prio) turnstile_adjust(td, oldprio); } /* * Set the base interrupt thread priority. */ void sched_ithread_prio(struct thread *td, u_char prio) { THREAD_LOCK_ASSERT(td, MA_OWNED); MPASS(td->td_pri_class == PRI_ITHD); td->td_base_ithread_pri = prio; sched_prio(td, prio); } /* * Set the base user priority, does not effect current running priority. */ void sched_user_prio(struct thread *td, u_char prio) { td->td_base_user_pri = prio; if (td->td_lend_user_pri <= prio) return; td->td_user_pri = prio; } void sched_lend_user_prio(struct thread *td, u_char prio) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_lend_user_pri = prio; td->td_user_pri = min(prio, td->td_base_user_pri); if (td->td_priority > td->td_user_pri) sched_prio(td, td->td_user_pri); else if (td->td_priority != td->td_user_pri) ast_sched_locked(td, TDA_SCHED); } /* * Like the above but first check if there is anything to do. */ void sched_lend_user_prio_cond(struct thread *td, u_char prio) { if (td->td_lend_user_pri != prio) goto lend; if (td->td_user_pri != min(prio, td->td_base_user_pri)) goto lend; if (td->td_priority != td->td_user_pri) goto lend; return; lend: thread_lock(td); sched_lend_user_prio(td, prio); thread_unlock(td); } #ifdef SMP /* * This tdq is about to idle. Try to steal a thread from another CPU before * choosing the idle thread. */ static void tdq_trysteal(struct tdq *tdq) { struct cpu_group *cg, *parent; struct tdq *steal; cpuset_t mask; int cpu, i, goup; if (smp_started == 0 || steal_idle == 0 || trysteal_limit == 0 || tdq->tdq_cg == NULL) return; CPU_FILL(&mask); CPU_CLR(PCPU_GET(cpuid), &mask); /* We don't want to be preempted while we're iterating. */ spinlock_enter(); TDQ_UNLOCK(tdq); for (i = 1, cg = tdq->tdq_cg, goup = 0; ; ) { cpu = sched_highest(cg, &mask, steal_thresh, 1); /* * If a thread was added while interrupts were disabled don't * steal one here. */ if (TDQ_LOAD(tdq) > 0) { TDQ_LOCK(tdq); break; } /* * We found no CPU to steal from in this group. Escalate to * the parent and repeat. But if parent has only two children * groups we can avoid searching this group again by searching * the other one specifically and then escalating two levels. 
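 *
 * Illustrative walk: with two SMT pairs sharing an L2 group, a search
 * that fails in our own pair is redirected at the sibling pair (goup is
 * set).  If that fails as well, both children of the L2 group have been
 * covered, so the loop resumes from the L2 group's parent instead of
 * re-scanning the L2 group itself.
 *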
*/ if (cpu == -1) { if (goup) { cg = cg->cg_parent; goup = 0; } if (++i > trysteal_limit) { TDQ_LOCK(tdq); break; } parent = cg->cg_parent; if (parent == NULL) { TDQ_LOCK(tdq); break; } if (parent->cg_children == 2) { if (cg == &parent->cg_child[0]) cg = &parent->cg_child[1]; else cg = &parent->cg_child[0]; goup = 1; } else cg = parent; continue; } steal = TDQ_CPU(cpu); /* * The data returned by sched_highest() is stale and * the chosen CPU no longer has an eligible thread. * At this point unconditionally exit the loop to bound * the time spent in the critcal section. */ if (TDQ_LOAD(steal) < steal_thresh || TDQ_TRANSFERABLE(steal) == 0) continue; /* * Try to lock both queues. If we are assigned a thread while * waited for the lock, switch to it now instead of stealing. * If we can't get the lock, then somebody likely got there * first. */ TDQ_LOCK(tdq); if (tdq->tdq_load > 0) break; if (TDQ_TRYLOCK_FLAGS(steal, MTX_DUPOK) == 0) break; /* * The data returned by sched_highest() is stale and * the chosen CPU no longer has an eligible thread. */ if (TDQ_LOAD(steal) < steal_thresh || TDQ_TRANSFERABLE(steal) == 0) { TDQ_UNLOCK(steal); break; } /* * If we fail to acquire one due to affinity restrictions, * bail out and let the idle thread to a more complete search * outside of a critical section. */ if (tdq_move(steal, tdq) == -1) { TDQ_UNLOCK(steal); break; } TDQ_UNLOCK(steal); break; } spinlock_exit(); } #endif /* * Handle migration from sched_switch(). This happens only for * cpu binding. */ static struct mtx * sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags) { struct tdq *tdn; #ifdef SMP int lowpri; #endif KASSERT(THREAD_CAN_MIGRATE(td) || (td_get_sched(td)->ts_flags & TSF_BOUND) != 0, ("Thread %p shouldn't migrate", td)); KASSERT(!CPU_ABSENT(td_get_sched(td)->ts_cpu), ("sched_switch_migrate: " "thread %s queued on absent CPU %d.", td->td_name, td_get_sched(td)->ts_cpu)); tdn = TDQ_CPU(td_get_sched(td)->ts_cpu); #ifdef SMP tdq_load_rem(tdq, td); /* * Do the lock dance required to avoid LOR. We have an * extra spinlock nesting from sched_switch() which will * prevent preemption while we're holding neither run-queue lock. */ TDQ_UNLOCK(tdq); TDQ_LOCK(tdn); lowpri = tdq_add(tdn, td, flags); tdq_notify(tdn, lowpri); TDQ_UNLOCK(tdn); TDQ_LOCK(tdq); #endif return (TDQ_LOCKPTR(tdn)); } /* * thread_lock_unblock() that does not assume td_lock is blocked. */ static inline void thread_unblock_switch(struct thread *td, struct mtx *mtx) { atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock, (uintptr_t)mtx); } /* * Switch threads. This function has to handle threads coming in while * blocked for some reason, running, or idle. It also must deal with * migrating a thread from one queue to another as running threads may * be assigned elsewhere via binding. 
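 *
 * In outline (an illustrative summary of the cases below): an idle
 * thread is simply marked able to run again; a still-runnable thread is
 * either put back on this CPU's run-queue or handed to
 * sched_switch_migrate() when its ts_cpu points elsewhere; any other
 * thread is assumed to be going to sleep, so its load is removed and,
 * if this queue is then empty, we try to steal work before the idle
 * thread is chosen.
 *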
*/ void sched_switch(struct thread *td, int flags) { struct thread *newtd; struct tdq *tdq; struct td_sched *ts; struct mtx *mtx; int srqflag; int cpuid, preempted; #ifdef SMP int pickcpu; #endif THREAD_LOCK_ASSERT(td, MA_OWNED); cpuid = PCPU_GET(cpuid); tdq = TDQ_SELF(); ts = td_get_sched(td); sched_pctcpu_update(ts, 1); #ifdef SMP pickcpu = (td->td_flags & TDF_PICKCPU) != 0; if (pickcpu) ts->ts_rltick = ticks - affinity * MAX_CACHE_LEVELS; else ts->ts_rltick = ticks; #endif td->td_lastcpu = td->td_oncpu; preempted = (td->td_flags & TDF_SLICEEND) == 0 && (flags & SW_PREEMPT) != 0; td->td_flags &= ~(TDF_PICKCPU | TDF_SLICEEND); ast_unsched_locked(td, TDA_SCHED); td->td_owepreempt = 0; atomic_store_char(&tdq->tdq_owepreempt, 0); if (!TD_IS_IDLETHREAD(td)) TDQ_SWITCHCNT_INC(tdq); /* * Always block the thread lock so we can drop the tdq lock early. */ mtx = thread_lock_block(td); spinlock_enter(); if (TD_IS_IDLETHREAD(td)) { MPASS(mtx == TDQ_LOCKPTR(tdq)); TD_SET_CAN_RUN(td); } else if (TD_IS_RUNNING(td)) { MPASS(mtx == TDQ_LOCKPTR(tdq)); srqflag = preempted ? SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : SRQ_OURSELF|SRQ_YIELDING; #ifdef SMP if (THREAD_CAN_MIGRATE(td) && (!THREAD_CAN_SCHED(td, ts->ts_cpu) || pickcpu)) ts->ts_cpu = sched_pickcpu(td, 0); #endif if (ts->ts_cpu == cpuid) tdq_runq_add(tdq, td, srqflag); else mtx = sched_switch_migrate(tdq, td, srqflag); } else { /* This thread must be going to sleep. */ if (mtx != TDQ_LOCKPTR(tdq)) { mtx_unlock_spin(mtx); TDQ_LOCK(tdq); } tdq_load_rem(tdq, td); #ifdef SMP if (tdq->tdq_load == 0) tdq_trysteal(tdq); #endif } #if (KTR_COMPILE & KTR_SCHED) != 0 if (TD_IS_IDLETHREAD(td)) KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle", "prio:%d", td->td_priority); else KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td), "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg, "lockname:\"%s\"", td->td_lockname); #endif /* * We enter here with the thread blocked and assigned to the * appropriate cpu run-queue or sleep-queue and with the current * thread-queue locked. */ TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED); MPASS(td == tdq->tdq_curthread); newtd = choosethread(); sched_pctcpu_update(td_get_sched(newtd), 0); TDQ_UNLOCK(tdq); /* * Call the MD code to switch contexts if necessary. */ if (td != newtd) { #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc); #ifdef KDTRACE_HOOKS /* * If DTrace has set the active vtime enum to anything * other than INACTIVE (0), then it should have set the * function to call. */ if (dtrace_vtime_active) (*dtrace_vtime_switch_func)(newtd); #endif td->td_oncpu = NOCPU; cpu_switch(td, newtd, mtx); cpuid = td->td_oncpu = PCPU_GET(cpuid); SDT_PROBE0(sched, , , on__cpu); #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN); #endif } else { thread_unblock_switch(td, mtx); SDT_PROBE0(sched, , , remain__cpu); } KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count %d", curthread->td_md.md_spinlock_count)); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running", "prio:%d", td->td_priority); } /* * Adjust thread priorities as a result of a nice request. 
*/ void sched_nice(struct proc *p, int nice) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_nice = nice; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); sched_priority(td); sched_prio(td, td->td_base_user_pri); thread_unlock(td); } } /* * Record the sleep time for the interactivity scorer. */ void sched_sleep(struct thread *td, int prio) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_slptick = ticks; if (TD_IS_SUSPENDED(td) || prio >= PSOCK) td->td_flags |= TDF_CANSWAP; if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE) return; if (static_boost == 1 && prio) sched_prio(td, prio); else if (static_boost && td->td_priority > static_boost) sched_prio(td, static_boost); } /* * Schedule a thread to resume execution and record how long it voluntarily * slept. We also update the pctcpu, interactivity, and priority. * * Requires the thread lock on entry, drops on exit. */ void sched_wakeup(struct thread *td, int srqflags) { struct td_sched *ts; int slptick; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td_get_sched(td); td->td_flags &= ~TDF_CANSWAP; /* * If we slept for more than a tick update our interactivity and * priority. */ slptick = td->td_slptick; td->td_slptick = 0; if (slptick && slptick != ticks) { ts->ts_slptime += (ticks - slptick) << SCHED_TICK_SHIFT; sched_interact_update(td); sched_pctcpu_update(ts, 0); } /* * When resuming an idle ithread, restore its base ithread * priority. */ if (PRI_BASE(td->td_pri_class) == PRI_ITHD && td->td_priority != td->td_base_ithread_pri) sched_prio(td, td->td_base_ithread_pri); /* * Reset the slice value since we slept and advanced the round-robin. */ ts->ts_slice = 0; sched_add(td, SRQ_BORING | srqflags); } /* * Penalize the parent for creating a new child and initialize the child's * priority. */ void sched_fork(struct thread *td, struct thread *child) { THREAD_LOCK_ASSERT(td, MA_OWNED); sched_pctcpu_update(td_get_sched(td), 1); sched_fork_thread(td, child); /* * Penalize the parent and child for forking. */ sched_interact_fork(child); sched_priority(child); td_get_sched(td)->ts_runtime += tickincr; sched_interact_update(td); sched_priority(td); } /* * Fork a new thread, may be within the same process. */ void sched_fork_thread(struct thread *td, struct thread *child) { struct td_sched *ts; struct td_sched *ts2; struct tdq *tdq; tdq = TDQ_SELF(); THREAD_LOCK_ASSERT(td, MA_OWNED); /* * Initialize child. */ ts = td_get_sched(td); ts2 = td_get_sched(child); child->td_oncpu = NOCPU; child->td_lastcpu = NOCPU; child->td_lock = TDQ_LOCKPTR(tdq); child->td_cpuset = cpuset_ref(td->td_cpuset); child->td_domain.dr_policy = td->td_cpuset->cs_domain; ts2->ts_cpu = ts->ts_cpu; ts2->ts_flags = 0; /* * Grab our parents cpu estimation information. */ ts2->ts_ticks = ts->ts_ticks; ts2->ts_ltick = ts->ts_ltick; ts2->ts_ftick = ts->ts_ftick; /* * Do not inherit any borrowed priority from the parent. */ child->td_priority = child->td_base_pri; /* * And update interactivity score. */ ts2->ts_slptime = ts->ts_slptime; ts2->ts_runtime = ts->ts_runtime; /* Attempt to quickly learn interactivity. */ ts2->ts_slice = tdq_slice(tdq) - sched_slice_min; #ifdef KTR bzero(ts2->ts_name, sizeof(ts2->ts_name)); #endif } /* * Adjust the priority class of a thread. */ void sched_class(struct thread *td, int class) { THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_pri_class == class) return; td->td_pri_class = class; } /* * Return some of the child's priority and interactivity to the parent. 
*/ void sched_exit(struct proc *p, struct thread *child) { struct thread *td; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "proc exit", "prio:%d", child->td_priority); PROC_LOCK_ASSERT(p, MA_OWNED); td = FIRST_THREAD_IN_PROC(p); sched_exit_thread(td, child); } /* * Penalize another thread for the time spent on this one. This helps to * worsen the priority and interactivity of processes which schedule batch * jobs such as make. This has little effect on the make process itself but * causes new processes spawned by it to receive worse scores immediately. */ void sched_exit_thread(struct thread *td, struct thread *child) { KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "thread exit", "prio:%d", child->td_priority); /* * Give the child's runtime to the parent without returning the * sleep time as a penalty to the parent. This causes shells that * launch expensive things to mark their children as expensive. */ thread_lock(td); td_get_sched(td)->ts_runtime += td_get_sched(child)->ts_runtime; sched_interact_update(td); sched_priority(td); thread_unlock(td); } void sched_preempt(struct thread *td) { struct tdq *tdq; int flags; SDT_PROBE2(sched, , , surrender, td, td->td_proc); thread_lock(td); tdq = TDQ_SELF(); TDQ_LOCK_ASSERT(tdq, MA_OWNED); if (td->td_priority > tdq->tdq_lowpri) { if (td->td_critnest == 1) { flags = SW_INVOL | SW_PREEMPT; flags |= TD_IS_IDLETHREAD(td) ? SWT_REMOTEWAKEIDLE : SWT_REMOTEPREEMPT; mi_switch(flags); /* Switch dropped thread lock. */ return; } td->td_owepreempt = 1; } else { tdq->tdq_owepreempt = 0; } thread_unlock(td); } /* * Fix priorities on return to user-space. Priorities may be elevated due * to static priorities in msleep() or similar. */ void sched_userret_slowpath(struct thread *td) { thread_lock(td); td->td_priority = td->td_user_pri; td->td_base_pri = td->td_user_pri; tdq_setlowpri(TDQ_SELF(), td); thread_unlock(td); } SCHED_STAT_DEFINE(ithread_demotions, "Interrupt thread priority demotions"); SCHED_STAT_DEFINE(ithread_preemptions, "Interrupt thread preemptions due to time-sharing"); /* * Return time slice for a given thread. For ithreads this is * sched_slice. For other threads it is tdq_slice(tdq). */ static inline int td_slice(struct thread *td, struct tdq *tdq) { if (PRI_BASE(td->td_pri_class) == PRI_ITHD) return (sched_slice); return (tdq_slice(tdq)); } /* * Handle a stathz tick. This is really only relevant for timeshare * and interrupt threads. */ void sched_clock(struct thread *td, int cnt) { struct tdq *tdq; struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); tdq = TDQ_SELF(); #ifdef SMP /* * We run the long term load balancer infrequently on the first cpu. */ if (balance_tdq == tdq && smp_started != 0 && rebalance != 0 && balance_ticks != 0) { balance_ticks -= cnt; if (balance_ticks <= 0) sched_balance(); } #endif /* * Save the old switch count so we have a record of the last ticks * activity. Initialize the new switch count based on our load. * If there is some activity seed it to reflect that. */ tdq->tdq_oldswitchcnt = tdq->tdq_switchcnt; tdq->tdq_switchcnt = tdq->tdq_load; /* * Advance the insert index once for each tick to ensure that all * threads get a chance to run. 
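 *
 * Illustrative picture (see tdq_runq_add() for the insertion side):
 * batch threads are inserted at an offset from tdq_idx while
 * tdq_choose() drains the queue at tdq_ridx, so nudging tdq_idx forward
 * whenever it catches up with tdq_ridx makes newly queued threads land
 * behind those already waiting, and the removal index eventually sweeps
 * every timeshare queue.
 *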
*/ if (tdq->tdq_idx == tdq->tdq_ridx) { tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS; if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx])) tdq->tdq_ridx = tdq->tdq_idx; } ts = td_get_sched(td); sched_pctcpu_update(ts, 1); if ((td->td_pri_class & PRI_FIFO_BIT) || TD_IS_IDLETHREAD(td)) return; if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) { /* * We used a tick; charge it to the thread so * that we can compute our interactivity. */ td_get_sched(td)->ts_runtime += tickincr * cnt; sched_interact_update(td); sched_priority(td); } /* * Force a context switch if the current thread has used up a full * time slice (default is 100ms). */ ts->ts_slice += cnt; if (ts->ts_slice >= td_slice(td, tdq)) { ts->ts_slice = 0; /* * If an ithread uses a full quantum, demote its * priority and preempt it. */ if (PRI_BASE(td->td_pri_class) == PRI_ITHD) { SCHED_STAT_INC(ithread_preemptions); td->td_owepreempt = 1; if (td->td_base_pri + RQ_PPQ < PRI_MAX_ITHD) { SCHED_STAT_INC(ithread_demotions); sched_prio(td, td->td_base_pri + RQ_PPQ); } } else { ast_sched_locked(td, TDA_SCHED); td->td_flags |= TDF_SLICEEND; } } } u_int sched_estcpu(struct thread *td __unused) { return (0); } /* * Return whether the current CPU has runnable tasks. Used for in-kernel * cooperative idle threads. */ int sched_runnable(void) { struct tdq *tdq; int load; load = 1; tdq = TDQ_SELF(); if ((curthread->td_flags & TDF_IDLETD) != 0) { if (TDQ_LOAD(tdq) > 0) goto out; } else if (TDQ_LOAD(tdq) - 1 > 0) goto out; load = 0; out: return (load); } /* * Choose the highest priority thread to run. The thread is removed from * the run-queue while running however the load remains. */ struct thread * sched_choose(void) { struct thread *td; struct tdq *tdq; tdq = TDQ_SELF(); TDQ_LOCK_ASSERT(tdq, MA_OWNED); td = tdq_choose(tdq); if (td != NULL) { tdq_runq_rem(tdq, td); tdq->tdq_lowpri = td->td_priority; } else { tdq->tdq_lowpri = PRI_MAX_IDLE; td = PCPU_GET(idlethread); } tdq->tdq_curthread = td; return (td); } /* * Set owepreempt if the currently running thread has lower priority than "pri". * Preemption never happens directly in ULE, we always request it once we exit a * critical section. */ static void sched_setpreempt(int pri) { struct thread *ctd; int cpri; ctd = curthread; THREAD_LOCK_ASSERT(ctd, MA_OWNED); cpri = ctd->td_priority; if (pri < cpri) ast_sched_locked(ctd, TDA_SCHED); if (KERNEL_PANICKED() || pri >= cpri || cold || TD_IS_INHIBITED(ctd)) return; if (!sched_shouldpreempt(pri, cpri, 0)) return; ctd->td_owepreempt = 1; } /* * Add a thread to a thread queue. Select the appropriate runq and add the * thread to it. This is the internal function called when the tdq is * predetermined. */ static int tdq_add(struct tdq *tdq, struct thread *td, int flags) { int lowpri; TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), ("sched_add: bad thread state")); KASSERT(td->td_flags & TDF_INMEM, ("sched_add: thread swapped out")); lowpri = tdq->tdq_lowpri; if (td->td_priority < lowpri) tdq->tdq_lowpri = td->td_priority; tdq_runq_add(tdq, td, flags); tdq_load_add(tdq, td); return (lowpri); } /* * Select the target thread queue and add a thread to it. Request * preemption or IPI a remote processor if required. * * Requires the thread lock on entry, drops on exit. 
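 *
 * Illustrative SMP path (a sketch of the calls below): a wakeup goes
 * sched_wakeup() -> sched_add() -> sched_pickcpu() to choose a CPU,
 * sched_setcpu() to move the thread lock to that CPU's tdq, tdq_add()
 * to enqueue, and then either tdq_notify() to kick the remote CPU or
 * sched_setpreempt() to request local preemption.
 *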
*/ void sched_add(struct thread *td, int flags) { struct tdq *tdq; #ifdef SMP int cpu, lowpri; #endif KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add", "prio:%d", td->td_priority, KTR_ATTR_LINKED, sched_tdname(curthread)); KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup", KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, flags & SRQ_PREEMPTED); THREAD_LOCK_ASSERT(td, MA_OWNED); /* * Recalculate the priority before we select the target cpu or * run-queue. */ if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) sched_priority(td); #ifdef SMP /* * Pick the destination cpu and if it isn't ours transfer to the * target cpu. */ cpu = sched_pickcpu(td, flags); tdq = sched_setcpu(td, cpu, flags); lowpri = tdq_add(tdq, td, flags); if (cpu != PCPU_GET(cpuid)) tdq_notify(tdq, lowpri); else if (!(flags & SRQ_YIELDING)) sched_setpreempt(td->td_priority); #else tdq = TDQ_SELF(); /* * Now that the thread is moving to the run-queue, set the lock * to the scheduler's lock. */ if (td->td_lock != TDQ_LOCKPTR(tdq)) { TDQ_LOCK(tdq); if ((flags & SRQ_HOLD) != 0) td->td_lock = TDQ_LOCKPTR(tdq); else thread_lock_set(td, TDQ_LOCKPTR(tdq)); } (void)tdq_add(tdq, td, flags); if (!(flags & SRQ_YIELDING)) sched_setpreempt(td->td_priority); #endif if (!(flags & SRQ_HOLDTD)) thread_unlock(td); } /* * Remove a thread from a run-queue without running it. This is used * when we're stealing a thread from a remote queue. Otherwise all threads * exit by calling sched_exit_thread() and sched_throw() themselves. */ void sched_rem(struct thread *td) { struct tdq *tdq; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "runq rem", "prio:%d", td->td_priority); SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL); tdq = TDQ_CPU(td_get_sched(td)->ts_cpu); TDQ_LOCK_ASSERT(tdq, MA_OWNED); MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); KASSERT(TD_ON_RUNQ(td), ("sched_rem: thread not on run queue")); tdq_runq_rem(tdq, td); tdq_load_rem(tdq, td); TD_SET_CAN_RUN(td); if (td->td_priority == tdq->tdq_lowpri) tdq_setlowpri(tdq, NULL); } /* * Fetch cpu utilization information. Updates on demand. */ fixpt_t sched_pctcpu(struct thread *td) { fixpt_t pctcpu; struct td_sched *ts; pctcpu = 0; ts = td_get_sched(td); THREAD_LOCK_ASSERT(td, MA_OWNED); sched_pctcpu_update(ts, TD_IS_RUNNING(td)); if (ts->ts_ticks) { int rtick; /* How many rtick per second ? */ rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz); pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT; } return (pctcpu); } /* * Enforce affinity settings for a thread. Called after adjustments to * cpumask. */ void sched_affinity(struct thread *td) { #ifdef SMP struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td_get_sched(td); if (THREAD_CAN_SCHED(td, ts->ts_cpu)) return; if (TD_ON_RUNQ(td)) { sched_rem(td); sched_add(td, SRQ_BORING | SRQ_HOLDTD); return; } if (!TD_IS_RUNNING(td)) return; /* * Force a switch before returning to userspace. If the * target thread is not running locally send an ipi to force * the issue. */ ast_sched_locked(td, TDA_SCHED); if (td != curthread) ipi_cpu(ts->ts_cpu, IPI_PREEMPT); #endif } /* * Bind a thread to a target cpu. 
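 *
 * Illustrative usage sketch (the usual pattern for code that must run on
 * a particular CPU; 'cpu' is a hypothetical target).  sched_bind()
 * returns already running on 'cpu':
 *
 *	thread_lock(curthread);
 *	sched_bind(curthread, cpu);
 *	thread_unlock(curthread);
 *	... per-CPU work ...
 *	thread_lock(curthread);
 *	sched_unbind(curthread);
 *	thread_unlock(curthread);
 *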
*/ void sched_bind(struct thread *td, int cpu) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED); KASSERT(td == curthread, ("sched_bind: can only bind curthread")); ts = td_get_sched(td); if (ts->ts_flags & TSF_BOUND) sched_unbind(td); KASSERT(THREAD_CAN_MIGRATE(td), ("%p must be migratable", td)); ts->ts_flags |= TSF_BOUND; sched_pin(); if (PCPU_GET(cpuid) == cpu) return; ts->ts_cpu = cpu; /* When we return from mi_switch we'll be on the correct cpu. */ mi_switch(SW_VOL | SWT_BIND); thread_lock(td); } /* * Release a bound thread. */ void sched_unbind(struct thread *td) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(td == curthread, ("sched_unbind: can only bind curthread")); ts = td_get_sched(td); if ((ts->ts_flags & TSF_BOUND) == 0) return; ts->ts_flags &= ~TSF_BOUND; sched_unpin(); } int sched_is_bound(struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); return (td_get_sched(td)->ts_flags & TSF_BOUND); } /* * Basic yield call. */ void sched_relinquish(struct thread *td) { thread_lock(td); mi_switch(SW_VOL | SWT_RELINQUISH); } /* * Return the total system load. */ int sched_load(void) { #ifdef SMP int total; int i; total = 0; CPU_FOREACH(i) total += atomic_load_int(&TDQ_CPU(i)->tdq_sysload); return (total); #else return (atomic_load_int(&TDQ_SELF()->tdq_sysload)); #endif } int sched_sizeof_proc(void) { return (sizeof(struct proc)); } int sched_sizeof_thread(void) { return (sizeof(struct thread) + sizeof(struct td_sched)); } #ifdef SMP #define TDQ_IDLESPIN(tdq) \ ((tdq)->tdq_cg != NULL && ((tdq)->tdq_cg->cg_flags & CG_FLAG_THREAD) == 0) #else #define TDQ_IDLESPIN(tdq) 1 #endif /* * The actual idle process. */ void sched_idletd(void *dummy) { struct thread *td; struct tdq *tdq; int oldswitchcnt, switchcnt; int i; mtx_assert(&Giant, MA_NOTOWNED); td = curthread; tdq = TDQ_SELF(); THREAD_NO_SLEEPING(); oldswitchcnt = -1; for (;;) { if (TDQ_LOAD(tdq)) { thread_lock(td); mi_switch(SW_VOL | SWT_IDLE); } switchcnt = TDQ_SWITCHCNT(tdq); #ifdef SMP if (always_steal || switchcnt != oldswitchcnt) { oldswitchcnt = switchcnt; if (tdq_idled(tdq) == 0) continue; } switchcnt = TDQ_SWITCHCNT(tdq); #else oldswitchcnt = switchcnt; #endif /* * If we're switching very frequently, spin while checking * for load rather than entering a low power state that * may require an IPI. However, don't do any busy * loops while on SMT machines as this simply steals * cycles from cores doing useful work. */ if (TDQ_IDLESPIN(tdq) && switchcnt > sched_idlespinthresh) { for (i = 0; i < sched_idlespins; i++) { if (TDQ_LOAD(tdq)) break; cpu_spinwait(); } } /* If there was context switch during spin, restart it. */ switchcnt = TDQ_SWITCHCNT(tdq); if (TDQ_LOAD(tdq) != 0 || switchcnt != oldswitchcnt) continue; /* Run main MD idle handler. */ atomic_store_int(&tdq->tdq_cpu_idle, 1); /* * Make sure that the tdq_cpu_idle update is globally visible * before cpu_idle() reads tdq_load. The order is important * to avoid races with tdq_notify(). */ atomic_thread_fence_seq_cst(); /* * Checking for again after the fence picks up assigned * threads often enough to make it worthwhile to do so in * order to avoid calling cpu_idle(). */ if (TDQ_LOAD(tdq) != 0) { atomic_store_int(&tdq->tdq_cpu_idle, 0); continue; } cpu_idle(switchcnt * 4 > sched_idlespinthresh); atomic_store_int(&tdq->tdq_cpu_idle, 0); /* * Account thread-less hardware interrupts and * other wakeup reasons equal to context switches. 
*/ switchcnt = TDQ_SWITCHCNT(tdq); if (switchcnt != oldswitchcnt) continue; TDQ_SWITCHCNT_INC(tdq); oldswitchcnt++; } } /* * sched_throw_grab() chooses a thread from the queue to switch to * next. It returns with the tdq lock dropped in a spinlock section to * keep interrupts disabled until the CPU is running in a proper threaded * context. */ static struct thread * sched_throw_grab(struct tdq *tdq) { struct thread *newtd; newtd = choosethread(); spinlock_enter(); TDQ_UNLOCK(tdq); KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count %d", curthread->td_md.md_spinlock_count)); return (newtd); } /* * A CPU is entering for the first time. */ void sched_ap_entry(void) { struct thread *newtd; struct tdq *tdq; tdq = TDQ_SELF(); /* This should have been setup in schedinit_ap(). */ THREAD_LOCKPTR_ASSERT(curthread, TDQ_LOCKPTR(tdq)); TDQ_LOCK(tdq); /* Correct spinlock nesting. */ spinlock_exit(); PCPU_SET(switchtime, cpu_ticks()); PCPU_SET(switchticks, ticks); newtd = sched_throw_grab(tdq); /* doesn't return */ cpu_throw(NULL, newtd); } /* * A thread is exiting. */ void sched_throw(struct thread *td) { struct thread *newtd; struct tdq *tdq; tdq = TDQ_SELF(); MPASS(td != NULL); THREAD_LOCK_ASSERT(td, MA_OWNED); THREAD_LOCKPTR_ASSERT(td, TDQ_LOCKPTR(tdq)); tdq_load_rem(tdq, td); td->td_lastcpu = td->td_oncpu; td->td_oncpu = NOCPU; thread_lock_block(td); newtd = sched_throw_grab(tdq); /* doesn't return */ cpu_switch(td, newtd, TDQ_LOCKPTR(tdq)); } /* * This is called from fork_exit(). Just acquire the correct locks and * let fork do the rest of the work. */ void sched_fork_exit(struct thread *td) { struct tdq *tdq; int cpuid; /* * Finish setting up thread glue so that it begins execution in a * non-nested critical section with the scheduler lock held. */ KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count %d", curthread->td_md.md_spinlock_count)); cpuid = PCPU_GET(cpuid); tdq = TDQ_SELF(); TDQ_LOCK(tdq); spinlock_exit(); MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); td->td_oncpu = cpuid; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running", "prio:%d", td->td_priority); SDT_PROBE0(sched, , , on__cpu); } /* * Create on first use to catch odd startup conditions. */ char * sched_tdname(struct thread *td) { #ifdef KTR struct td_sched *ts; ts = td_get_sched(td); if (ts->ts_name[0] == '\0') snprintf(ts->ts_name, sizeof(ts->ts_name), "%s tid %d", td->td_name, td->td_tid); return (ts->ts_name); #else return (td->td_name); #endif } #ifdef KTR void sched_clear_tdname(struct thread *td) { struct td_sched *ts; ts = td_get_sched(td); ts->ts_name[0] = '\0'; } #endif #ifdef SMP /* * Build the CPU topology dump string. Is recursively called to collect * the topology tree. 
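 *
 * For example, the result can be read from userland with
 * "sysctl kern.sched.topology_spec": each group is emitted with its CPU
 * count and mask, its cache level, any HTT/THREAD/SMT/NUMA flags, and
 * its child groups nested beneath it.
 *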
*/ static int sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg, int indent) { char cpusetbuf[CPUSETBUFSIZ]; int i, first; sbuf_printf(sb, "%*s\n", indent, "", 1 + indent / 2, cg->cg_level); sbuf_printf(sb, "%*s ", indent, "", cg->cg_count, cpusetobj_strprint(cpusetbuf, &cg->cg_mask)); first = TRUE; for (i = cg->cg_first; i <= cg->cg_last; i++) { if (CPU_ISSET(i, &cg->cg_mask)) { if (!first) - sbuf_printf(sb, ", "); + sbuf_cat(sb, ", "); else first = FALSE; sbuf_printf(sb, "%d", i); } } - sbuf_printf(sb, "\n"); + sbuf_cat(sb, "\n"); if (cg->cg_flags != 0) { sbuf_printf(sb, "%*s ", indent, ""); if ((cg->cg_flags & CG_FLAG_HTT) != 0) - sbuf_printf(sb, "HTT group"); + sbuf_cat(sb, "HTT group"); if ((cg->cg_flags & CG_FLAG_THREAD) != 0) - sbuf_printf(sb, "THREAD group"); + sbuf_cat(sb, "THREAD group"); if ((cg->cg_flags & CG_FLAG_SMT) != 0) - sbuf_printf(sb, "SMT group"); + sbuf_cat(sb, "SMT group"); if ((cg->cg_flags & CG_FLAG_NODE) != 0) - sbuf_printf(sb, "NUMA node"); - sbuf_printf(sb, "\n"); + sbuf_cat(sb, "NUMA node"); + sbuf_cat(sb, "\n"); } if (cg->cg_children > 0) { sbuf_printf(sb, "%*s \n", indent, ""); for (i = 0; i < cg->cg_children; i++) sysctl_kern_sched_topology_spec_internal(sb, &cg->cg_child[i], indent+2); sbuf_printf(sb, "%*s \n", indent, ""); } sbuf_printf(sb, "%*s\n", indent, ""); return (0); } /* * Sysctl handler for retrieving topology dump. It's a wrapper for * the recursive sysctl_kern_smp_topology_spec_internal(). */ static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS) { struct sbuf *topo; int err; KASSERT(cpu_top != NULL, ("cpu_top isn't initialized")); topo = sbuf_new_for_sysctl(NULL, NULL, 512, req); if (topo == NULL) return (ENOMEM); - sbuf_printf(topo, "\n"); + sbuf_cat(topo, "\n"); err = sysctl_kern_sched_topology_spec_internal(topo, cpu_top, 1); - sbuf_printf(topo, "\n"); + sbuf_cat(topo, "\n"); if (err == 0) { err = sbuf_finish(topo); } sbuf_delete(topo); return (err); } #endif static int sysctl_kern_quantum(SYSCTL_HANDLER_ARGS) { int error, new_val, period; period = 1000000 / realstathz; new_val = period * sched_slice; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val <= 0) return (EINVAL); sched_slice = imax(1, (new_val + period / 2) / period); sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR; hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) / realstathz); return (0); } SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Scheduler"); SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0, "Scheduler name"); SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_quantum, "I", "Quantum for timeshare threads in microseconds"); SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, "Quantum for timeshare threads in stathz ticks"); SYSCTL_UINT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0, "Interactivity score threshold"); SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh, 0, "Maximal (lowest) priority for preemption"); SYSCTL_INT(_kern_sched, OID_AUTO, static_boost, CTLFLAG_RW, &static_boost, 0, "Assign static kernel priorities to sleeping threads"); SYSCTL_INT(_kern_sched, OID_AUTO, idlespins, CTLFLAG_RW, &sched_idlespins, 0, "Number of times idle thread will spin waiting for new work"); SYSCTL_INT(_kern_sched, OID_AUTO, idlespinthresh, CTLFLAG_RW, &sched_idlespinthresh, 0, "Threshold before we will 
permit idle thread spinning"); #ifdef SMP SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0, "Number of hz ticks to keep thread affinity for"); SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0, "Enables the long-term load balancer"); SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW, &balance_interval, 0, "Average period in stathz ticks to run the long-term balancer"); SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0, "Attempts to steal work from other cores before idling"); SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0, "Minimum load on remote CPU before we'll steal"); SYSCTL_INT(_kern_sched, OID_AUTO, trysteal_limit, CTLFLAG_RW, &trysteal_limit, 0, "Topological distance limit for stealing threads in sched_switch()"); SYSCTL_INT(_kern_sched, OID_AUTO, always_steal, CTLFLAG_RW, &always_steal, 0, "Always run the stealer from the idle thread"); SYSCTL_PROC(_kern_sched, OID_AUTO, topology_spec, CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_kern_sched_topology_spec, "A", "XML dump of detected CPU topology"); #endif /* ps compat. All cpu percentages from ULE are weighted. */ static int ccpu = 0; SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "Decay factor used for updating %CPU in 4BSD scheduler"); diff --git a/sys/kern/subr_blist.c b/sys/kern/subr_blist.c index 9b04518010e3..ac9d73ce3c6c 100644 --- a/sys/kern/subr_blist.c +++ b/sys/kern/subr_blist.c @@ -1,1177 +1,1177 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting * * This module implements a general bitmap allocator/deallocator. The * allocator eats around 2 bits per 'block'. The module does not * try to interpret the meaning of a 'block' other than to return * SWAPBLK_NONE on an allocation failure. 
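 *
 * Rough arithmetic behind the "around 2 bits per block" figure
 * (illustrative, assuming a 64-bit u_daddr_t, so BLIST_RADIX == 64, and
 * a 16-byte blmeta_t): tracking 2^20 blocks takes 16384 leaves plus
 * 256 + 4 + 1 interior nodes, about 16.6k nodes or roughly 260 KB,
 * i.e. close to 2 bits per block.
 *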
* * A radix tree controls access to pieces of the bitmap, and includes * auxiliary information at each interior node about the availabilty of * contiguous free blocks in the subtree rooted at that node. A radix * constant defines the size of the bitmaps contained in a leaf node * and the number of descendents of each of the meta (interior) nodes. * Each subtree is associated with a range of blocks. The root of any * subtree stores a hint field that defines an upper bound on the size * of the largest allocation that can begin in the associated block * range. A hint is an upper bound on a potential allocation, but not * necessarily a tight upper bound. * * The bitmap field in each node directs the search for available blocks. * For a leaf node, a bit is set if the corresponding block is free. For a * meta node, a bit is set if the corresponding subtree contains a free * block somewhere within it. The search at a meta node considers only * children of that node that represent a range that includes a free block. * * The hinting greatly increases code efficiency for allocations while * the general radix structure optimizes both allocations and frees. The * radix tree should be able to operate well no matter how much * fragmentation there is and no matter how large a bitmap is used. * * The blist code wires all necessary memory at creation time. Neither * allocations nor frees require interaction with the memory subsystem. * The non-blocking nature of allocations and frees is required by swap * code (vm/swap_pager.c). * * LAYOUT: The radix tree is laid out recursively using a linear array. * Each meta node is immediately followed (laid out sequentially in * memory) by BLIST_RADIX lower-level nodes. This is a recursive * structure but one that can be easily scanned through a very simple * 'skip' calculation. The memory allocation is only large enough to * cover the number of blocks requested at creation time. Nodes that * represent blocks beyond that limit, nodes that would never be read * or written, are not allocated, so that the last of the * BLIST_RADIX lower-level nodes of a some nodes may not be allocated. * * NOTE: the allocator cannot currently allocate more than * BLIST_RADIX blocks per call. It will panic with 'allocation too * large' if you try. This is an area that could use improvement. The * radix is large enough that this restriction does not effect the swap * system, though. Currently only the allocation code is affected by * this algorithmic unfeature. The freeing code can handle arbitrary * ranges. * * This code can be compiled stand-alone for debugging. */ #include #ifdef _KERNEL #include #include #include #include #include #include #include #include #include #else #ifndef BLIST_NO_DEBUG #define BLIST_DEBUG #endif #include #include #include #include #include #include #include #include #include #include #include #define bitcount64(x) __bitcount64((uint64_t)(x)) #define malloc(a,b,c) calloc(a, 1) #define free(a,b) free(a) #define ummin(a,b) ((a) < (b) ? (a) : (b)) #define imin(a,b) ((a) < (b) ? 
(a) : (b)) #define KASSERT(a,b) assert(a) #include #endif /* * static support functions */ static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int *count, int maxcount); static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count, int maxcount, u_daddr_t radix); static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count); static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix); static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest, daddr_t count); static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count); static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix); #ifndef _KERNEL static void blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab); #endif #ifdef _KERNEL static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space"); #endif #define BLIST_MASK (BLIST_RADIX - 1) /* * For a subtree that can represent the state of up to 'radix' blocks, the * number of leaf nodes of the subtree is L=radix/BLIST_RADIX. If 'm' * is short for BLIST_RADIX, then for a tree of height h with L=m**h * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h, * or, equivalently, (m**(h+1)-1)/(m-1). This quantity is called 'skip' * in the 'meta' functions that process subtrees. Since integer division * discards remainders, we can express this computation as * skip = (m * m**h) / (m - 1) * skip = (m * (radix / m)) / (m - 1) * skip = radix / (m - 1) * so that simple integer division by a constant can safely be used for the * calculation. */ static inline daddr_t radix_to_skip(daddr_t radix) { return (radix / BLIST_MASK); } /* * Provide a mask with count bits set, starting as position n. */ static inline u_daddr_t bitrange(int n, int count) { return (((u_daddr_t)-1 << n) & ((u_daddr_t)-1 >> (BLIST_RADIX - (n + count)))); } static inline int bitpos(u_daddr_t mask) { _Static_assert(sizeof(long long) >= sizeof(mask), "mask too big for ffsll()"); return (ffsll(mask) - 1); } /* * blist_create() - create a blist capable of handling up to the specified * number of blocks * * blocks - must be greater than 0 * flags - malloc flags * * The smallest blist consists of a single leaf node capable of * managing BLIST_RADIX blocks. */ blist_t blist_create(daddr_t blocks, int flags) { blist_t bl; u_daddr_t nodes, radix; KASSERT(blocks > 0, ("invalid block count")); /* * Calculate the radix and node count used for scanning. */ nodes = 1; for (radix = 1; (blocks - 1) / BLIST_RADIX / radix > 0; radix *= BLIST_RADIX) nodes += 1 + (blocks - 1) / BLIST_RADIX / radix; /* * Include a sentinel node to ensure that cross-leaf scans stay within * the bounds of the allocation. */ if (blocks % BLIST_RADIX == 0) nodes++; bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags | M_ZERO); if (bl == NULL) return (NULL); bl->bl_blocks = blocks; bl->bl_radix = radix; #if defined(BLIST_DEBUG) printf( "BLIST representing %lld blocks (%lld MB of swap)" ", requiring %lldK of ram\n", (long long)bl->bl_blocks, (long long)bl->bl_blocks * 4 / 1024, (long long)(nodes * sizeof(blmeta_t) + 1023) / 1024 ); printf("BLIST raw radix tree contains %lld records\n", (long long)nodes); #endif return (bl); } void blist_destroy(blist_t bl) { free(bl, M_SWAP); } /* * blist_alloc() - reserve space in the block bitmap. Return the base * of a contiguous region or SWAPBLK_NONE if space could * not be allocated. 
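 *
 * An illustrative (hypothetical) usage sketch; 'nblocks' stands in for
 * the caller's device size.  The request below asks for 16 blocks but
 * will take up to 32, and on success *count is updated to the number of
 * blocks actually granted:
 *
 *	int count = 16;
 *	daddr_t blk;
 *	blist_t bl;
 *
 *	bl = blist_create(nblocks, M_WAITOK);
 *	blk = blist_alloc(bl, &count, 32);
 *	if (blk != SWAPBLK_NONE)
 *		blist_free(bl, blk, count);
 *	blist_destroy(bl);
 *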
*/ daddr_t blist_alloc(blist_t bl, int *count, int maxcount) { daddr_t blk, cursor; KASSERT(*count <= maxcount, ("invalid parameters %d > %d", *count, maxcount)); KASSERT(*count <= BLIST_MAX_ALLOC, ("minimum allocation too large: %d", *count)); /* * This loop iterates at most twice. An allocation failure in the * first iteration leads to a second iteration only if the cursor was * non-zero. When the cursor is zero, an allocation failure will * stop further iterations. */ for (cursor = bl->bl_cursor;; cursor = 0) { blk = blst_meta_alloc(bl->bl_root, cursor, count, maxcount, bl->bl_radix); if (blk != SWAPBLK_NONE) { bl->bl_avail -= *count; bl->bl_cursor = blk + *count; if (bl->bl_cursor == bl->bl_blocks) bl->bl_cursor = 0; return (blk); } if (cursor == 0) return (SWAPBLK_NONE); } } /* * blist_avail() - return the number of free blocks. */ daddr_t blist_avail(blist_t bl) { return (bl->bl_avail); } /* * blist_free() - free up space in the block bitmap. Return the base * of a contiguous region. */ void blist_free(blist_t bl, daddr_t blkno, daddr_t count) { KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks, ("freeing invalid range: blkno %jx, count %d, blocks %jd", (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks)); blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix); bl->bl_avail += count; } /* * blist_fill() - mark a region in the block bitmap as off-limits * to the allocator (i.e. allocate it), ignoring any * existing allocations. Return the number of blocks * actually filled that were free before the call. */ daddr_t blist_fill(blist_t bl, daddr_t blkno, daddr_t count) { daddr_t filled; KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks, ("filling invalid range: blkno %jx, count %d, blocks %jd", (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks)); filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix); bl->bl_avail -= filled; return (filled); } /* * blist_resize() - resize an existing radix tree to handle the * specified number of blocks. This will reallocate * the tree and transfer the previous bitmap to the new * one. When extending the tree you can specify whether * the new blocks are to left allocated or freed. */ void blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags) { blist_t newbl = blist_create(count, flags); blist_t save = *pbl; *pbl = newbl; if (count > save->bl_blocks) count = save->bl_blocks; blst_copy(save->bl_root, 0, save->bl_radix, newbl, count); /* * If resizing upwards, should we free the new space or not? */ if (freenew && count < newbl->bl_blocks) { blist_free(newbl, count, newbl->bl_blocks - count); } blist_destroy(save); } #ifdef BLIST_DEBUG /* * blist_print() - dump radix tree */ void blist_print(blist_t bl) { printf("BLIST avail = %jd, cursor = %08jx {\n", (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor); if (bl->bl_root->bm_bitmap != 0) blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4); printf("}\n"); } #endif static const u_daddr_t fib[] = { 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811, 514229, 832040, 1346269, 2178309, 3524578, }; /* * Use 'gap' to describe a maximal range of unallocated blocks/bits. 
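 *
 * For instance, with the fib[] table above a maximal free range of 7
 * blocks is counted in the bucket covering sizes 5 through 7
 * (fib[3] == 5 up to fib[4] - 1 == 7), which is how dump_gap_stats()
 * labels its rows.
 *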
*/ struct gap_stats { daddr_t start; /* current gap start, or SWAPBLK_NONE */ daddr_t num; /* number of gaps observed */ daddr_t max; /* largest gap size */ daddr_t avg; /* average gap size */ daddr_t err; /* sum - num * avg */ daddr_t histo[nitems(fib)]; /* # gaps in each size range */ int max_bucket; /* last histo elt with nonzero val */ }; /* * gap_stats_counting() - is the state 'counting 1 bits'? * or 'skipping 0 bits'? */ static inline bool gap_stats_counting(const struct gap_stats *stats) { return (stats->start != SWAPBLK_NONE); } /* * init_gap_stats() - initialize stats on gap sizes */ static inline void init_gap_stats(struct gap_stats *stats) { bzero(stats, sizeof(*stats)); stats->start = SWAPBLK_NONE; } /* * update_gap_stats() - update stats on gap sizes */ static void update_gap_stats(struct gap_stats *stats, daddr_t posn) { daddr_t size; int hi, lo, mid; if (!gap_stats_counting(stats)) { stats->start = posn; return; } size = posn - stats->start; stats->start = SWAPBLK_NONE; if (size > stats->max) stats->max = size; /* * Find the fibonacci range that contains size, * expecting to find it in an early range. */ lo = 0; hi = 1; while (hi < nitems(fib) && fib[hi] <= size) { lo = hi; hi *= 2; } if (hi >= nitems(fib)) hi = nitems(fib); while (lo + 1 != hi) { mid = (lo + hi) >> 1; if (fib[mid] <= size) lo = mid; else hi = mid; } stats->histo[lo]++; if (lo > stats->max_bucket) stats->max_bucket = lo; stats->err += size - stats->avg; stats->num++; stats->avg += stats->err / stats->num; stats->err %= stats->num; } /* * dump_gap_stats() - print stats on gap sizes */ static inline void dump_gap_stats(const struct gap_stats *stats, struct sbuf *s) { int i; sbuf_printf(s, "number of maximal free ranges: %jd\n", (intmax_t)stats->num); sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max); sbuf_printf(s, "average maximal free range size: %jd\n", (intmax_t)stats->avg); - sbuf_printf(s, "number of maximal free ranges of different sizes:\n"); - sbuf_printf(s, " count | size range\n"); - sbuf_printf(s, " ----- | ----------\n"); + sbuf_cat(s, "number of maximal free ranges of different sizes:\n"); + sbuf_cat(s, " count | size range\n"); + sbuf_cat(s, " ----- | ----------\n"); for (i = 0; i < stats->max_bucket; i++) { if (stats->histo[i] != 0) { sbuf_printf(s, "%20jd | ", (intmax_t)stats->histo[i]); if (fib[i] != fib[i + 1] - 1) sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i], (intmax_t)fib[i + 1] - 1); else sbuf_printf(s, "%jd\n", (intmax_t)fib[i]); } } sbuf_printf(s, "%20jd | ", (intmax_t)stats->histo[i]); if (stats->histo[i] > 1) sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i], (intmax_t)stats->max); else sbuf_printf(s, "%jd\n", (intmax_t)stats->max); } /* * blist_stats() - dump radix tree stats */ void blist_stats(blist_t bl, struct sbuf *s) { struct gap_stats gstats; struct gap_stats *stats = &gstats; daddr_t i, nodes, radix; u_daddr_t diff, mask; int digit; init_gap_stats(stats); nodes = 0; radix = bl->bl_radix; for (i = 0; i < bl->bl_blocks; ) { /* * Check for skippable subtrees starting at i. */ while (radix != 1) { if (bl->bl_root[nodes].bm_bitmap == 0) { if (gap_stats_counting(stats)) update_gap_stats(stats, i); break; } /* * Skip subtree root. */ nodes++; radix /= BLIST_RADIX; } if (radix == 1) { /* * Scan leaf. 
*/ mask = bl->bl_root[nodes].bm_bitmap; diff = mask ^ (mask << 1); if (gap_stats_counting(stats)) diff ^= 1; while (diff != 0) { digit = bitpos(diff); update_gap_stats(stats, i + digit); diff ^= bitrange(digit, 1); } } nodes += radix_to_skip(radix * BLIST_RADIX); i += radix * BLIST_RADIX; /* * Find max size subtree starting at i. */ for (radix = 1; ((i / BLIST_RADIX / radix) & BLIST_MASK) == 0; radix *= BLIST_RADIX) ; } update_gap_stats(stats, i); dump_gap_stats(stats, s); } /************************************************************************ * ALLOCATION SUPPORT FUNCTIONS * ************************************************************************ * * These support functions do all the actual work. They may seem * rather longish, but that's because I've commented them up. The * actual code is straight forward. * */ /* * BLST_NEXT_LEAF_ALLOC() - allocate the blocks starting with the next leaf. * * 'scan' is a leaf node, and its first block is at address 'start'. The * next leaf node could be adjacent, or several nodes away if the least * common ancestor of 'scan' and its neighbor is several levels up. Use * addresses to determine how many meta-nodes lie between the leaves. If * sequence of leaves starting with the next one has enough initial bits * set, clear them and clear the bits in the meta nodes on the path up to * the least common ancestor to mark any subtrees made completely empty. */ static int blst_next_leaf_alloc(blmeta_t *scan, daddr_t start, int count, int maxcount) { u_daddr_t radix; daddr_t blk; int avail, digit; start += BLIST_RADIX; for (blk = start; blk - start < maxcount; blk += BLIST_RADIX) { /* Skip meta-nodes, as long as they promise more free blocks. */ radix = BLIST_RADIX; while (((++scan)->bm_bitmap & 1) == 1 && ((blk / radix) & BLIST_MASK) == 0) radix *= BLIST_RADIX; if (~scan->bm_bitmap != 0) { /* * Either there is no next leaf with any free blocks, * or we've reached the next leaf and found that some * of its blocks are not free. In the first case, * bitpos() returns zero here. */ avail = blk - start + bitpos(~scan->bm_bitmap); if (avail < count || avail == 0) { /* * There isn't a next leaf with enough free * blocks at its beginning to bother * allocating. */ return (avail); } maxcount = imin(avail, maxcount); if (maxcount % BLIST_RADIX == 0) { /* * There was no next leaf. Back scan up to * last leaf. */ do { radix /= BLIST_RADIX; --scan; } while (radix != 1); blk -= BLIST_RADIX; } } } /* * 'scan' is the last leaf that provides blocks. Clear from 1 to * BLIST_RADIX bits to represent the allocation of those last blocks. */ if (maxcount % BLIST_RADIX != 0) scan->bm_bitmap &= ~bitrange(0, maxcount % BLIST_RADIX); else scan->bm_bitmap = 0; for (;;) { /* Back up over meta-nodes, clearing bits if necessary. */ blk -= BLIST_RADIX; for (radix = BLIST_RADIX; (digit = ((blk / radix) & BLIST_MASK)) == 0; radix *= BLIST_RADIX) { if ((scan--)->bm_bitmap == 0) scan->bm_bitmap ^= 1; } if ((scan--)->bm_bitmap == 0) scan[-digit * radix_to_skip(radix)].bm_bitmap ^= (u_daddr_t)1 << digit; if (blk == start) break; /* Clear all the bits of this leaf. */ scan->bm_bitmap = 0; } return (maxcount); } /* * BLST_LEAF_ALLOC() - allocate at a leaf in the radix tree (a bitmap). * * This function is the core of the allocator. Its execution time is * proportional to log(count), plus height of the tree if the allocation * crosses a leaf boundary. 
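 *
 * A small worked example of the mask-doubling step: for *count = 3,
 * count1 = 2 and num_shifts = fls(2) = 2, so mask is ORed with
 * mask >> 1 twice.  Any bit i still 0 in the resulting mask marks a
 * position where bits i, i+1 and i+2 of bm_bitmap are all set, i.e.
 * at least three consecutive free blocks within this leaf (crossing
 * into the next leaf is handled separately below).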
*/ static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int *count, int maxcount) { u_daddr_t mask; int bighint, count1, hi, lo, num_shifts; count1 = *count - 1; num_shifts = fls(count1); mask = ~scan->bm_bitmap; while ((mask & (mask + 1)) != 0 && num_shifts > 0) { /* * If bit i is 0 in mask, then bits in [i, i + (count1 >> * num_shifts)] are 1 in scan->bm_bitmap. Reduce num_shifts to * 0, while preserving this invariant. The updates to mask * leave fewer bits 0, but each bit that remains 0 represents a * longer string of consecutive 1-bits in scan->bm_bitmap. If * more updates to mask cannot set more bits, because mask is * partitioned with all 1 bits following all 0 bits, the loop * terminates immediately. */ num_shifts--; mask |= mask >> ((count1 >> num_shifts) + 1) / 2; } bighint = count1 >> num_shifts; if (~mask == 0) { /* * Update bighint. There is no allocation bigger than * count1 >> num_shifts starting in this leaf. */ scan->bm_bighint = bighint; return (SWAPBLK_NONE); } /* Discard any candidates that appear before blk. */ if ((blk & BLIST_MASK) != 0) { if ((~mask & bitrange(0, blk & BLIST_MASK)) != 0) { /* Grow bighint in case all discarded bits are set. */ bighint += blk & BLIST_MASK; mask |= bitrange(0, blk & BLIST_MASK); if (~mask == 0) { scan->bm_bighint = bighint; return (SWAPBLK_NONE); } } blk -= blk & BLIST_MASK; } /* * The least significant set bit in mask marks the start of the first * available range of sufficient size. Find its position. */ lo = bitpos(~mask); /* * Find how much space is available starting at that position. */ if ((mask & (mask + 1)) != 0) { /* Count the 1 bits starting at position lo. */ hi = bitpos(mask & (mask + 1)) + count1; if (maxcount < hi - lo) hi = lo + maxcount; *count = hi - lo; mask = ~bitrange(lo, *count); } else if (maxcount <= BLIST_RADIX - lo) { /* All the blocks we can use are available here. */ hi = lo + maxcount; *count = maxcount; mask = ~bitrange(lo, *count); if (hi == BLIST_RADIX) scan->bm_bighint = bighint; } else { /* Check next leaf for some of the blocks we want or need. */ count1 = *count - (BLIST_RADIX - lo); maxcount -= BLIST_RADIX - lo; hi = blst_next_leaf_alloc(scan, blk, count1, maxcount); if (hi < count1) /* * The next leaf cannot supply enough blocks to reach * the minimum required allocation. The hint cannot be * updated, because the same allocation request could * be satisfied later, by this leaf, if the state of * the next leaf changes, and without any changes to * this leaf. */ return (SWAPBLK_NONE); *count = BLIST_RADIX - lo + hi; scan->bm_bighint = bighint; } /* Clear the allocated bits from this leaf. */ scan->bm_bitmap &= mask; return (blk + lo); } /* * blist_meta_alloc() - allocate at a meta in the radix tree. * * Attempt to allocate at a meta node. If we can't, we update * bighint and return a failure. Updating bighint optimize future * calls that hit this node. We have to check for our collapse cases * and we have a few optimizations strewn in as well. */ static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count, int maxcount, u_daddr_t radix) { daddr_t blk, i, r, skip; u_daddr_t mask; bool scan_from_start; int digit; if (radix == 1) return (blst_leaf_alloc(scan, cursor, count, maxcount)); blk = cursor & -(radix * BLIST_RADIX); scan_from_start = (cursor == blk); skip = radix_to_skip(radix); mask = scan->bm_bitmap; /* Discard any candidates that appear before cursor. 
*/ digit = (cursor / radix) & BLIST_MASK; mask &= (u_daddr_t)-1 << digit; if (mask == 0) return (SWAPBLK_NONE); /* * If the first try is for a block that includes the cursor, pre-undo * the digit * radix offset in the first call; otherwise, ignore the * cursor entirely. */ if (((mask >> digit) & 1) == 1) cursor -= digit * radix; else cursor = blk; /* * Examine the nonempty subtree associated with each bit set in mask. */ do { digit = bitpos(mask); i = 1 + digit * skip; if (*count <= scan[i].bm_bighint) { /* * The allocation might fit beginning in the i'th subtree. */ r = blst_meta_alloc(&scan[i], cursor + digit * radix, count, maxcount, radix / BLIST_RADIX); if (r != SWAPBLK_NONE) { if (scan[i].bm_bitmap == 0) scan->bm_bitmap ^= bitrange(digit, 1); return (r); } } cursor = blk; } while ((mask ^= bitrange(digit, 1)) != 0); /* * We couldn't allocate count in this subtree. If the whole tree was * scanned, and the last tree node is allocated, update bighint. */ if (scan_from_start && !(digit == BLIST_RADIX - 1 && scan[i].bm_bighint == BLIST_MAX_ALLOC)) scan->bm_bighint = *count - 1; return (SWAPBLK_NONE); } /* * BLST_LEAF_FREE() - free allocated block from leaf bitmap * */ static void blst_leaf_free(blmeta_t *scan, daddr_t blk, int count) { u_daddr_t mask; /* * free some data in this bitmap * mask=0000111111111110000 * \_________/\__/ * count n */ mask = bitrange(blk & BLIST_MASK, count); KASSERT((scan->bm_bitmap & mask) == 0, ("freeing free block: %jx, size %d, mask %jx", (uintmax_t)blk, count, (uintmax_t)scan->bm_bitmap & mask)); scan->bm_bitmap |= mask; } /* * BLST_META_FREE() - free allocated blocks from radix tree meta info * * This support routine frees a range of blocks from the bitmap. * The range must be entirely enclosed by this radix node. If a * meta node, we break the range down recursively to free blocks * in subnodes (which means that this code can free an arbitrary * range whereas the allocation code cannot allocate an arbitrary * range). */ static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix) { daddr_t blk, endBlk, i, skip; int digit, endDigit; /* * We could probably do a better job here. We are required to make * bighint at least as large as the biggest allocable block of data. * If we just shoehorn it, a little extra overhead will be incurred * on the next allocation (but only that one typically). */ scan->bm_bighint = BLIST_MAX_ALLOC; if (radix == 1) return (blst_leaf_free(scan, freeBlk, count)); endBlk = freeBlk + count; blk = (freeBlk + radix * BLIST_RADIX) & -(radix * BLIST_RADIX); /* * blk is first block past the end of the range of this meta node, * or 0 in case of overflow. */ if (blk != 0) endBlk = ummin(endBlk, blk); skip = radix_to_skip(radix); blk = freeBlk & -radix; digit = (blk / radix) & BLIST_MASK; endDigit = 1 + (((endBlk - 1) / radix) & BLIST_MASK); scan->bm_bitmap |= bitrange(digit, endDigit - digit); for (i = 1 + digit * skip; blk < endBlk; i += skip) { blk += radix; count = ummin(blk, endBlk) - freeBlk; blst_meta_free(&scan[i], freeBlk, count, radix / BLIST_RADIX); freeBlk = blk; } } /* * BLST_COPY() - copy one radix tree to another * * Locates free space in the source tree and frees it in the destination * tree. The space may not already be free in the destination. 
*/ static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest, daddr_t count) { daddr_t endBlk, i, skip; /* * Leaf node */ if (radix == 1) { u_daddr_t v = scan->bm_bitmap; if (v == (u_daddr_t)-1) { blist_free(dest, blk, count); } else if (v != 0) { int i; for (i = 0; i < count; ++i) { if (v & ((u_daddr_t)1 << i)) blist_free(dest, blk + i, 1); } } return; } /* * Meta node */ if (scan->bm_bitmap == 0) { /* * Source all allocated, leave dest allocated */ return; } endBlk = blk + count; skip = radix_to_skip(radix); for (i = 1; blk < endBlk; i += skip) { blk += radix; count = radix; if (blk >= endBlk) count -= blk - endBlk; blst_copy(&scan[i], blk - radix, radix / BLIST_RADIX, dest, count); } } /* * BLST_LEAF_FILL() - allocate specific blocks in leaf bitmap * * This routine allocates all blocks in the specified range * regardless of any existing allocations in that range. Returns * the number of blocks allocated by the call. */ static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count) { daddr_t nblks; u_daddr_t mask; mask = bitrange(blk & BLIST_MASK, count); /* Count the number of blocks that we are allocating. */ nblks = bitcount64(scan->bm_bitmap & mask); scan->bm_bitmap &= ~mask; return (nblks); } /* * BLIST_META_FILL() - allocate specific blocks at a meta node * * This routine allocates the specified range of blocks, * regardless of any existing allocations in the range. The * range must be within the extent of this node. Returns the * number of blocks allocated by the call. */ static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix) { daddr_t blk, endBlk, i, nblks, skip; int digit; if (radix == 1) return (blst_leaf_fill(scan, allocBlk, count)); endBlk = allocBlk + count; blk = (allocBlk + radix * BLIST_RADIX) & -(radix * BLIST_RADIX); /* * blk is first block past the end of the range of this meta node, * or 0 in case of overflow. 
*/ if (blk != 0) endBlk = ummin(endBlk, blk); skip = radix_to_skip(radix); blk = allocBlk & -radix; nblks = 0; while (blk < endBlk) { digit = (blk / radix) & BLIST_MASK; i = 1 + digit * skip; blk += radix; count = ummin(blk, endBlk) - allocBlk; nblks += blst_meta_fill(&scan[i], allocBlk, count, radix / BLIST_RADIX); if (scan[i].bm_bitmap == 0) scan->bm_bitmap &= ~((u_daddr_t)1 << digit); allocBlk = blk; } return (nblks); } #ifdef BLIST_DEBUG static void blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab) { daddr_t skip; u_daddr_t mask; int digit; if (radix == 1) { printf( "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n", tab, tab, "", (long long)blk, (long long)BLIST_RADIX, (int)(1 + (BLIST_RADIX - 1) / 4), (long long)scan->bm_bitmap, (long long)scan->bm_bighint ); return; } printf( "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n", tab, tab, "", (long long)blk, (long long)radix * BLIST_RADIX, (long long)radix * BLIST_RADIX, (int)(1 + (BLIST_RADIX - 1) / 4), (long long)scan->bm_bitmap, (long long)scan->bm_bighint ); skip = radix_to_skip(radix); tab += 4; mask = scan->bm_bitmap; /* Examine the nonempty subtree associated with each bit set in mask */ do { digit = bitpos(mask); blst_radix_print(&scan[1 + digit * skip], blk + digit * radix, radix / BLIST_RADIX, tab); } while ((mask ^= bitrange(digit, 1)) != 0); tab -= 4; printf( "%*.*s}\n", tab, tab, "" ); } #endif #ifdef BLIST_DEBUG int main(int ac, char **av) { daddr_t size = BLIST_RADIX * BLIST_RADIX; int i; blist_t bl; struct sbuf *s; for (i = 1; i < ac; ++i) { const char *ptr = av[i]; if (*ptr != '-') { size = strtoll(ptr, NULL, 0); continue; } ptr += 2; fprintf(stderr, "Bad option: %s\n", ptr - 2); exit(1); } bl = blist_create(size, M_WAITOK); if (bl == NULL) { fprintf(stderr, "blist_create failed\n"); exit(1); } blist_free(bl, 0, size); for (;;) { char buf[1024]; long long da = 0; int count = 0, maxcount = 0; printf("%lld/%lld/%lld> ", (long long)blist_avail(bl), (long long)size, (long long)bl->bl_radix * BLIST_RADIX); fflush(stdout); if (fgets(buf, sizeof(buf), stdin) == NULL) break; switch(buf[0]) { case 'r': if (sscanf(buf + 1, "%d", &count) == 1) { blist_resize(&bl, count, 1, M_WAITOK); } else { printf("?\n"); } case 'p': blist_print(bl); break; case 's': s = sbuf_new_auto(); blist_stats(bl, s); sbuf_finish(s); printf("%s", sbuf_data(s)); sbuf_delete(s); break; case 'a': if (sscanf(buf + 1, "%d%d", &count, &maxcount) == 2) { daddr_t blk = blist_alloc(bl, &count, maxcount); printf(" R=%08llx, c=%08d\n", (long long)blk, count); } else { printf("?\n"); } break; case 'f': if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) { blist_free(bl, da, count); } else { printf("?\n"); } break; case 'l': if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) { printf(" n=%jd\n", (intmax_t)blist_fill(bl, da, count)); } else { printf("?\n"); } break; case '?': case 'h': puts( "p -print\n" "s -stats\n" "a %d %d -allocate\n" "f %x %d -free\n" "l %x %d -fill\n" "r %d -resize\n" "h/? -help\n" "q -quit" ); break; case 'q': break; default: printf("?\n"); break; } if (buf[0] == 'q') break; } return (0); } #endif diff --git a/sys/kern/subr_bus.c b/sys/kern/subr_bus.c index 8caab20cf709..648394abd026 100644 --- a/sys/kern/subr_bus.c +++ b/sys/kern/subr_bus.c @@ -1,5837 +1,5837 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 1997,1998,2003 Doug Rabson * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_bus.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_NODE(_hw, OID_AUTO, bus, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, NULL); SYSCTL_ROOT_NODE(OID_AUTO, dev, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, NULL); static bool disable_failed_devs = false; SYSCTL_BOOL(_hw_bus, OID_AUTO, disable_failed_devices, CTLFLAG_RWTUN, &disable_failed_devs, 0, "Do not retry attaching devices that return an error from DEVICE_ATTACH the first time"); /* * Used to attach drivers to devclasses. */ typedef struct driverlink *driverlink_t; struct driverlink { kobj_class_t driver; TAILQ_ENTRY(driverlink) link; /* list of drivers in devclass */ int pass; int flags; #define DL_DEFERRED_PROBE 1 /* Probe deferred on this */ TAILQ_ENTRY(driverlink) passlink; }; /* * Forward declarations */ typedef TAILQ_HEAD(devclass_list, devclass) devclass_list_t; typedef TAILQ_HEAD(driver_list, driverlink) driver_list_t; typedef TAILQ_HEAD(device_list, _device) device_list_t; struct devclass { TAILQ_ENTRY(devclass) link; devclass_t parent; /* parent in devclass hierarchy */ driver_list_t drivers; /* bus devclasses store drivers for bus */ char *name; device_t *devices; /* array of devices indexed by unit */ int maxunit; /* size of devices array */ int flags; #define DC_HAS_CHILDREN 1 struct sysctl_ctx_list sysctl_ctx; struct sysctl_oid *sysctl_tree; }; /** * @brief Implementation of _device. * * The structure is named "_device" instead of "device" to avoid type confusion * caused by other subsystems defining a (struct device). */ struct _device { /* * A device is a kernel object. The first field must be the * current ops table for the object. */ KOBJ_FIELDS; /* * Device hierarchy. */ TAILQ_ENTRY(_device) link; /**< list of devices in parent */ TAILQ_ENTRY(_device) devlink; /**< global device list membership */ device_t parent; /**< parent of this device */ device_list_t children; /**< list of child devices */ /* * Details of this device. 
*/ driver_t *driver; /**< current driver */ devclass_t devclass; /**< current device class */ int unit; /**< current unit number */ char* nameunit; /**< name+unit e.g. foodev0 */ char* desc; /**< driver specific description */ u_int busy; /**< count of calls to device_busy() */ device_state_t state; /**< current device state */ uint32_t devflags; /**< api level flags for device_get_flags() */ u_int flags; /**< internal device flags */ u_int order; /**< order from device_add_child_ordered() */ void *ivars; /**< instance variables */ void *softc; /**< current driver's variables */ struct sysctl_ctx_list sysctl_ctx; /**< state for sysctl variables */ struct sysctl_oid *sysctl_tree; /**< state for sysctl variables */ }; static MALLOC_DEFINE(M_BUS, "bus", "Bus data structures"); static MALLOC_DEFINE(M_BUS_SC, "bus-sc", "Bus data structures, softc"); EVENTHANDLER_LIST_DEFINE(device_attach); EVENTHANDLER_LIST_DEFINE(device_detach); EVENTHANDLER_LIST_DEFINE(device_nomatch); EVENTHANDLER_LIST_DEFINE(dev_lookup); static void devctl2_init(void); static bool device_frozen; #define DRIVERNAME(d) ((d)? d->name : "no driver") #define DEVCLANAME(d) ((d)? d->name : "no devclass") #ifdef BUS_DEBUG static int bus_debug = 1; SYSCTL_INT(_debug, OID_AUTO, bus_debug, CTLFLAG_RWTUN, &bus_debug, 0, "Bus debug level"); #define PDEBUG(a) if (bus_debug) {printf("%s:%d: ", __func__, __LINE__), printf a; printf("\n");} #define DEVICENAME(d) ((d)? device_get_name(d): "no device") /** * Produce the indenting, indent*2 spaces plus a '.' ahead of that to * prevent syslog from deleting initial spaces */ #define indentprintf(p) do { int iJ; printf("."); for (iJ=0; iJparent ? dc->parent->name : ""; break; default: return (EINVAL); } return (SYSCTL_OUT_STR(req, value)); } static void devclass_sysctl_init(devclass_t dc) { if (dc->sysctl_tree != NULL) return; sysctl_ctx_init(&dc->sysctl_ctx); dc->sysctl_tree = SYSCTL_ADD_NODE(&dc->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dev), OID_AUTO, dc->name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); SYSCTL_ADD_PROC(&dc->sysctl_ctx, SYSCTL_CHILDREN(dc->sysctl_tree), OID_AUTO, "%parent", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, dc, DEVCLASS_SYSCTL_PARENT, devclass_sysctl_handler, "A", "parent class"); } enum { DEVICE_SYSCTL_DESC, DEVICE_SYSCTL_DRIVER, DEVICE_SYSCTL_LOCATION, DEVICE_SYSCTL_PNPINFO, DEVICE_SYSCTL_PARENT, }; static int device_sysctl_handler(SYSCTL_HANDLER_ARGS) { struct sbuf sb; device_t dev = (device_t)arg1; int error; sbuf_new_for_sysctl(&sb, NULL, 1024, req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); bus_topo_lock(); switch (arg2) { case DEVICE_SYSCTL_DESC: sbuf_cat(&sb, dev->desc ? dev->desc : ""); break; case DEVICE_SYSCTL_DRIVER: sbuf_cat(&sb, dev->driver ? dev->driver->name : ""); break; case DEVICE_SYSCTL_LOCATION: bus_child_location(dev, &sb); break; case DEVICE_SYSCTL_PNPINFO: bus_child_pnpinfo(dev, &sb); break; case DEVICE_SYSCTL_PARENT: sbuf_cat(&sb, dev->parent ? 
dev->parent->nameunit : ""); break; default: error = EINVAL; goto out; } error = sbuf_finish(&sb); out: bus_topo_unlock(); sbuf_delete(&sb); return (error); } static void device_sysctl_init(device_t dev) { devclass_t dc = dev->devclass; int domain; if (dev->sysctl_tree != NULL) return; devclass_sysctl_init(dc); sysctl_ctx_init(&dev->sysctl_ctx); dev->sysctl_tree = SYSCTL_ADD_NODE_WITH_LABEL(&dev->sysctl_ctx, SYSCTL_CHILDREN(dc->sysctl_tree), OID_AUTO, dev->nameunit + strlen(dc->name), CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "", "device_index"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%desc", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, dev, DEVICE_SYSCTL_DESC, device_sysctl_handler, "A", "device description"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%driver", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, dev, DEVICE_SYSCTL_DRIVER, device_sysctl_handler, "A", "device driver name"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%location", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, dev, DEVICE_SYSCTL_LOCATION, device_sysctl_handler, "A", "device location relative to parent"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%pnpinfo", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, dev, DEVICE_SYSCTL_PNPINFO, device_sysctl_handler, "A", "device identification"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%parent", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, dev, DEVICE_SYSCTL_PARENT, device_sysctl_handler, "A", "parent device"); if (bus_get_domain(dev, &domain) == 0) SYSCTL_ADD_INT(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%domain", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, domain, "NUMA domain"); } static void device_sysctl_update(device_t dev) { devclass_t dc = dev->devclass; if (dev->sysctl_tree == NULL) return; sysctl_rename_oid(dev->sysctl_tree, dev->nameunit + strlen(dc->name)); } static void device_sysctl_fini(device_t dev) { if (dev->sysctl_tree == NULL) return; sysctl_ctx_free(&dev->sysctl_ctx); dev->sysctl_tree = NULL; } static struct device_list bus_data_devices; static int bus_data_generation = 1; static kobj_method_t null_methods[] = { KOBJMETHOD_END }; DEFINE_CLASS(null, null_methods, 0); void bus_topo_assert(void) { GIANT_REQUIRED; } struct mtx * bus_topo_mtx(void) { return (&Giant); } void bus_topo_lock(void) { mtx_lock(bus_topo_mtx()); } void bus_topo_unlock(void) { mtx_unlock(bus_topo_mtx()); } /* * Bus pass implementation */ static driver_list_t passes = TAILQ_HEAD_INITIALIZER(passes); int bus_current_pass = BUS_PASS_ROOT; /** * @internal * @brief Register the pass level of a new driver attachment * * Register a new driver attachment's pass level. If no driver * attachment with the same pass level has been added, then @p new * will be added to the global passes list. * * @param new the new driver attachment */ static void driver_register_pass(struct driverlink *new) { struct driverlink *dl; /* We only consider pass numbers during boot. */ if (bus_current_pass == BUS_PASS_DEFAULT) return; /* * Walk the passes list. If we already know about this pass * then there is nothing to do. If we don't, then insert this * driver link into the list. 
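 *
 * (The list is kept sorted by ascending pass number: the new link goes
 * in front of the first entry with a higher pass, or at the tail if
 * there is none, which lets bus_set_pass() walk the passes in order.)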
*/ TAILQ_FOREACH(dl, &passes, passlink) { if (dl->pass < new->pass) continue; if (dl->pass == new->pass) return; TAILQ_INSERT_BEFORE(dl, new, passlink); return; } TAILQ_INSERT_TAIL(&passes, new, passlink); } /** * @brief Raise the current bus pass * * Raise the current bus pass level to @p pass. Call the BUS_NEW_PASS() * method on the root bus to kick off a new device tree scan for each * new pass level that has at least one driver. */ void bus_set_pass(int pass) { struct driverlink *dl; if (bus_current_pass > pass) panic("Attempt to lower bus pass level"); TAILQ_FOREACH(dl, &passes, passlink) { /* Skip pass values below the current pass level. */ if (dl->pass <= bus_current_pass) continue; /* * Bail once we hit a driver with a pass level that is * too high. */ if (dl->pass > pass) break; /* * Raise the pass level to the next level and rescan * the tree. */ bus_current_pass = dl->pass; BUS_NEW_PASS(root_bus); } /* * If there isn't a driver registered for the requested pass, * then bus_current_pass might still be less than 'pass'. Set * it to 'pass' in that case. */ if (bus_current_pass < pass) bus_current_pass = pass; KASSERT(bus_current_pass == pass, ("Failed to update bus pass level")); } /* * Devclass implementation */ static devclass_list_t devclasses = TAILQ_HEAD_INITIALIZER(devclasses); /** * @internal * @brief Find or create a device class * * If a device class with the name @p classname exists, return it, * otherwise if @p create is non-zero create and return a new device * class. * * If @p parentname is non-NULL, the parent of the devclass is set to * the devclass of that name. * * @param classname the devclass name to find or create * @param parentname the parent devclass name or @c NULL * @param create non-zero to create a devclass */ static devclass_t devclass_find_internal(const char *classname, const char *parentname, int create) { devclass_t dc; PDEBUG(("looking for %s", classname)); if (!classname) return (NULL); TAILQ_FOREACH(dc, &devclasses, link) { if (!strcmp(dc->name, classname)) break; } if (create && !dc) { PDEBUG(("creating %s", classname)); dc = malloc(sizeof(struct devclass) + strlen(classname) + 1, M_BUS, M_NOWAIT | M_ZERO); if (!dc) return (NULL); dc->parent = NULL; dc->name = (char*) (dc + 1); strcpy(dc->name, classname); TAILQ_INIT(&dc->drivers); TAILQ_INSERT_TAIL(&devclasses, dc, link); bus_data_generation_update(); } /* * If a parent class is specified, then set that as our parent so * that this devclass will support drivers for the parent class as * well. If the parent class has the same name don't do this though * as it creates a cycle that can trigger an infinite loop in * device_probe_child() if a device exists for which there is no * suitable driver. */ if (parentname && dc && !dc->parent && strcmp(classname, parentname) != 0) { dc->parent = devclass_find_internal(parentname, NULL, TRUE); dc->parent->flags |= DC_HAS_CHILDREN; } return (dc); } /** * @brief Create a device class * * If a device class with the name @p classname exists, return it, * otherwise create and return a new device class. * * @param classname the devclass name to find or create */ devclass_t devclass_create(const char *classname) { return (devclass_find_internal(classname, NULL, TRUE)); } /** * @brief Find a device class * * If a device class with the name @p classname exists, return it, * otherwise return @c NULL. 
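 *
 * A minimal usage sketch (illustrative only; the "foo" class name and
 * unit number 0 are hypothetical):
 *
 *	devclass_t dc;
 *	device_t dev;
 *
 *	dc = devclass_find("foo");
 *	dev = (dc != NULL) ? devclass_get_device(dc, 0) : NULL;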
* * @param classname the devclass name to find */ devclass_t devclass_find(const char *classname) { return (devclass_find_internal(classname, NULL, FALSE)); } /** * @brief Register that a device driver has been added to a devclass * * Register that a device driver has been added to a devclass. This * is called by devclass_add_driver to accomplish the recursive * notification of all the children classes of dc, as well as dc. * Each layer will have BUS_DRIVER_ADDED() called for all instances of * the devclass. * * We do a full search here of the devclass list at each iteration * level to save storing children-lists in the devclass structure. If * we ever move beyond a few dozen devices doing this, we may need to * reevaluate... * * @param dc the devclass to edit * @param driver the driver that was just added */ static void devclass_driver_added(devclass_t dc, driver_t *driver) { devclass_t parent; int i; /* * Call BUS_DRIVER_ADDED for any existing buses in this class. */ for (i = 0; i < dc->maxunit; i++) if (dc->devices[i] && device_is_attached(dc->devices[i])) BUS_DRIVER_ADDED(dc->devices[i], driver); /* * Walk through the children classes. Since we only keep a * single parent pointer around, we walk the entire list of * devclasses looking for children. We set the * DC_HAS_CHILDREN flag when a child devclass is created on * the parent, so we only walk the list for those devclasses * that have children. */ if (!(dc->flags & DC_HAS_CHILDREN)) return; parent = dc; TAILQ_FOREACH(dc, &devclasses, link) { if (dc->parent == parent) devclass_driver_added(dc, driver); } } static void device_handle_nomatch(device_t dev) { BUS_PROBE_NOMATCH(dev->parent, dev); EVENTHANDLER_DIRECT_INVOKE(device_nomatch, dev); dev->flags |= DF_DONENOMATCH; } /** * @brief Add a device driver to a device class * * Add a device driver to a devclass. This is normally called * automatically by DRIVER_MODULE(). The BUS_DRIVER_ADDED() method of * all devices in the devclass will be called to allow them to attempt * to re-probe any unmatched children. * * @param dc the devclass to edit * @param driver the driver to register */ int devclass_add_driver(devclass_t dc, driver_t *driver, int pass, devclass_t *dcp) { driverlink_t dl; devclass_t child_dc; const char *parentname; PDEBUG(("%s", DRIVERNAME(driver))); /* Don't allow invalid pass values. */ if (pass <= BUS_PASS_ROOT) return (EINVAL); dl = malloc(sizeof *dl, M_BUS, M_NOWAIT|M_ZERO); if (!dl) return (ENOMEM); /* * Compile the driver's methods. Also increase the reference count * so that the class doesn't get freed when the last instance * goes. This means we can safely use static methods and avoids a * double-free in devclass_delete_driver. */ kobj_class_compile((kobj_class_t) driver); /* * If the driver has any base classes, make the * devclass inherit from the devclass of the driver's * first base class. This will allow the system to * search for drivers in both devclasses for children * of a device using this driver. 
*/ if (driver->baseclasses) parentname = driver->baseclasses[0]->name; else parentname = NULL; child_dc = devclass_find_internal(driver->name, parentname, TRUE); if (dcp != NULL) *dcp = child_dc; dl->driver = driver; TAILQ_INSERT_TAIL(&dc->drivers, dl, link); driver->refs++; /* XXX: kobj_mtx */ dl->pass = pass; driver_register_pass(dl); if (device_frozen) { dl->flags |= DL_DEFERRED_PROBE; } else { devclass_driver_added(dc, driver); } bus_data_generation_update(); return (0); } /** * @brief Register that a device driver has been deleted from a devclass * * Register that a device driver has been removed from a devclass. * This is called by devclass_delete_driver to accomplish the * recursive notification of all the children classes of busclass, as * well as busclass. Each layer will attempt to detach the driver * from any devices that are children of the bus's devclass. The function * will return an error if a device fails to detach. * * We do a full search here of the devclass list at each iteration * level to save storing children-lists in the devclass structure. If * we ever move beyond a few dozen devices doing this, we may need to * reevaluate... * * @param busclass the devclass of the parent bus * @param dc the devclass of the driver being deleted * @param driver the driver being deleted */ static int devclass_driver_deleted(devclass_t busclass, devclass_t dc, driver_t *driver) { devclass_t parent; device_t dev; int error, i; /* * Disassociate from any devices. We iterate through all the * devices in the devclass of the driver and detach any which are * using the driver and which have a parent in the devclass which * we are deleting from. * * Note that since a driver can be in multiple devclasses, we * should not detach devices which are not children of devices in * the affected devclass. * * If we're frozen, we don't generate NOMATCH events. Mark to * generate later. */ for (i = 0; i < dc->maxunit; i++) { if (dc->devices[i]) { dev = dc->devices[i]; if (dev->driver == driver && dev->parent && dev->parent->devclass == busclass) { if ((error = device_detach(dev)) != 0) return (error); if (device_frozen) { dev->flags &= ~DF_DONENOMATCH; dev->flags |= DF_NEEDNOMATCH; } else { device_handle_nomatch(dev); } } } } /* * Walk through the children classes. Since we only keep a * single parent pointer around, we walk the entire list of * devclasses looking for children. We set the * DC_HAS_CHILDREN flag when a child devclass is created on * the parent, so we only walk the list for those devclasses * that have children. */ if (!(busclass->flags & DC_HAS_CHILDREN)) return (0); parent = busclass; TAILQ_FOREACH(busclass, &devclasses, link) { if (busclass->parent == parent) { error = devclass_driver_deleted(busclass, dc, driver); if (error) return (error); } } return (0); } /** * @brief Delete a device driver from a device class * * Delete a device driver from a devclass. This is normally called * automatically by DRIVER_MODULE(). * * If the driver is currently attached to any devices, * devclass_delete_driver() will first attempt to detach from each * device. If one of the detach calls fails, the driver will not be * deleted. * * @param dc the devclass to edit * @param driver the driver to unregister */ int devclass_delete_driver(devclass_t busclass, driver_t *driver) { devclass_t dc = devclass_find(driver->name); driverlink_t dl; int error; PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass))); if (!dc) return (0); /* * Find the link structure in the bus' list of drivers. 
*/ TAILQ_FOREACH(dl, &busclass->drivers, link) { if (dl->driver == driver) break; } if (!dl) { PDEBUG(("%s not found in %s list", driver->name, busclass->name)); return (ENOENT); } error = devclass_driver_deleted(busclass, dc, driver); if (error != 0) return (error); TAILQ_REMOVE(&busclass->drivers, dl, link); free(dl, M_BUS); /* XXX: kobj_mtx */ driver->refs--; if (driver->refs == 0) kobj_class_free((kobj_class_t) driver); bus_data_generation_update(); return (0); } /** * @brief Quiesces a set of device drivers from a device class * * Quiesce a device driver from a devclass. This is normally called * automatically by DRIVER_MODULE(). * * If the driver is currently attached to any devices, * devclass_quiesece_driver() will first attempt to quiesce each * device. * * @param dc the devclass to edit * @param driver the driver to unregister */ static int devclass_quiesce_driver(devclass_t busclass, driver_t *driver) { devclass_t dc = devclass_find(driver->name); driverlink_t dl; device_t dev; int i; int error; PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass))); if (!dc) return (0); /* * Find the link structure in the bus' list of drivers. */ TAILQ_FOREACH(dl, &busclass->drivers, link) { if (dl->driver == driver) break; } if (!dl) { PDEBUG(("%s not found in %s list", driver->name, busclass->name)); return (ENOENT); } /* * Quiesce all devices. We iterate through all the devices in * the devclass of the driver and quiesce any which are using * the driver and which have a parent in the devclass which we * are quiescing. * * Note that since a driver can be in multiple devclasses, we * should not quiesce devices which are not children of * devices in the affected devclass. */ for (i = 0; i < dc->maxunit; i++) { if (dc->devices[i]) { dev = dc->devices[i]; if (dev->driver == driver && dev->parent && dev->parent->devclass == busclass) { if ((error = device_quiesce(dev)) != 0) return (error); } } } return (0); } /** * @internal */ static driverlink_t devclass_find_driver_internal(devclass_t dc, const char *classname) { driverlink_t dl; PDEBUG(("%s in devclass %s", classname, DEVCLANAME(dc))); TAILQ_FOREACH(dl, &dc->drivers, link) { if (!strcmp(dl->driver->name, classname)) return (dl); } PDEBUG(("not found")); return (NULL); } /** * @brief Return the name of the devclass */ const char * devclass_get_name(devclass_t dc) { return (dc->name); } /** * @brief Find a device given a unit number * * @param dc the devclass to search * @param unit the unit number to search for * * @returns the device with the given unit number or @c * NULL if there is no such device */ device_t devclass_get_device(devclass_t dc, int unit) { if (dc == NULL || unit < 0 || unit >= dc->maxunit) return (NULL); return (dc->devices[unit]); } /** * @brief Find the softc field of a device given a unit number * * @param dc the devclass to search * @param unit the unit number to search for * * @returns the softc field of the device with the given * unit number or @c NULL if there is no such * device */ void * devclass_get_softc(devclass_t dc, int unit) { device_t dev; dev = devclass_get_device(dc, unit); if (!dev) return (NULL); return (device_get_softc(dev)); } /** * @brief Get a list of devices in the devclass * * An array containing a list of all the devices in the given devclass * is allocated and returned in @p *devlistp. The number of devices * in the array is returned in @p *devcountp. The caller should free * the array using @c free(p, M_TEMP), even if @p *devcountp is 0. 
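 *
 * A minimal usage sketch (illustrative only; error handling beyond the
 * return value is elided):
 *
 *	device_t *devs;
 *	int i, ndevs;
 *
 *	if (devclass_get_devices(dc, &devs, &ndevs) == 0) {
 *		for (i = 0; i < ndevs; i++)
 *			device_printf(devs[i], "present\n");
 *		free(devs, M_TEMP);
 *	}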
* * @param dc the devclass to examine * @param devlistp points at location for array pointer return * value * @param devcountp points at location for array size return value * * @retval 0 success * @retval ENOMEM the array allocation failed */ int devclass_get_devices(devclass_t dc, device_t **devlistp, int *devcountp) { int count, i; device_t *list; count = devclass_get_count(dc); list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO); if (!list) return (ENOMEM); count = 0; for (i = 0; i < dc->maxunit; i++) { if (dc->devices[i]) { list[count] = dc->devices[i]; count++; } } *devlistp = list; *devcountp = count; return (0); } /** * @brief Get a list of drivers in the devclass * * An array containing a list of pointers to all the drivers in the * given devclass is allocated and returned in @p *listp. The number * of drivers in the array is returned in @p *countp. The caller should * free the array using @c free(p, M_TEMP). * * @param dc the devclass to examine * @param listp gives location for array pointer return value * @param countp gives location for number of array elements * return value * * @retval 0 success * @retval ENOMEM the array allocation failed */ int devclass_get_drivers(devclass_t dc, driver_t ***listp, int *countp) { driverlink_t dl; driver_t **list; int count; count = 0; TAILQ_FOREACH(dl, &dc->drivers, link) count++; list = malloc(count * sizeof(driver_t *), M_TEMP, M_NOWAIT); if (list == NULL) return (ENOMEM); count = 0; TAILQ_FOREACH(dl, &dc->drivers, link) { list[count] = dl->driver; count++; } *listp = list; *countp = count; return (0); } /** * @brief Get the number of devices in a devclass * * @param dc the devclass to examine */ int devclass_get_count(devclass_t dc) { int count, i; count = 0; for (i = 0; i < dc->maxunit; i++) if (dc->devices[i]) count++; return (count); } /** * @brief Get the maximum unit number used in a devclass * * Note that this is one greater than the highest currently-allocated * unit. If a null devclass_t is passed in, -1 is returned to indicate * that not even the devclass has been allocated yet. * * @param dc the devclass to examine */ int devclass_get_maxunit(devclass_t dc) { if (dc == NULL) return (-1); return (dc->maxunit); } /** * @brief Find a free unit number in a devclass * * This function searches for the first unused unit number greater * that or equal to @p unit. * * @param dc the devclass to examine * @param unit the first unit number to check */ int devclass_find_free_unit(devclass_t dc, int unit) { if (dc == NULL) return (unit); while (unit < dc->maxunit && dc->devices[unit] != NULL) unit++; return (unit); } /** * @brief Set the parent of a devclass * * The parent class is normally initialised automatically by * DRIVER_MODULE(). * * @param dc the devclass to edit * @param pdc the new parent devclass */ void devclass_set_parent(devclass_t dc, devclass_t pdc) { dc->parent = pdc; } /** * @brief Get the parent of a devclass * * @param dc the devclass to examine */ devclass_t devclass_get_parent(devclass_t dc) { return (dc->parent); } struct sysctl_ctx_list * devclass_get_sysctl_ctx(devclass_t dc) { return (&dc->sysctl_ctx); } struct sysctl_oid * devclass_get_sysctl_tree(devclass_t dc) { return (dc->sysctl_tree); } /** * @internal * @brief Allocate a unit number * * On entry, @p *unitp is the desired unit number (or @c -1 if any * will do). The allocated unit number is returned in @p *unitp. 
* @param dc the devclass to allocate from * @param unitp points at the location for the allocated unit * number * * @retval 0 success * @retval EEXIST the requested unit number is already allocated * @retval ENOMEM memory allocation failure */ static int devclass_alloc_unit(devclass_t dc, device_t dev, int *unitp) { const char *s; int unit = *unitp; PDEBUG(("unit %d in devclass %s", unit, DEVCLANAME(dc))); /* Ask the parent bus if it wants to wire this device. */ if (unit == -1) BUS_HINT_DEVICE_UNIT(device_get_parent(dev), dev, dc->name, &unit); /* If we were given a wired unit number, check for existing device */ /* XXX imp XXX */ if (unit != -1) { if (unit >= 0 && unit < dc->maxunit && dc->devices[unit] != NULL) { if (bootverbose) printf("%s: %s%d already exists; skipping it\n", dc->name, dc->name, *unitp); return (EEXIST); } } else { /* Unwired device, find the next available slot for it */ unit = 0; for (unit = 0;; unit++) { /* If this device slot is already in use, skip it. */ if (unit < dc->maxunit && dc->devices[unit] != NULL) continue; /* If there is an "at" hint for a unit then skip it. */ if (resource_string_value(dc->name, unit, "at", &s) == 0) continue; break; } } /* * We've selected a unit beyond the length of the table, so let's * extend the table to make room for all units up to and including * this one. */ if (unit >= dc->maxunit) { device_t *newlist, *oldlist; int newsize; oldlist = dc->devices; newsize = roundup((unit + 1), MAX(1, MINALLOCSIZE / sizeof(device_t))); newlist = malloc(sizeof(device_t) * newsize, M_BUS, M_NOWAIT); if (!newlist) return (ENOMEM); if (oldlist != NULL) bcopy(oldlist, newlist, sizeof(device_t) * dc->maxunit); bzero(newlist + dc->maxunit, sizeof(device_t) * (newsize - dc->maxunit)); dc->devices = newlist; dc->maxunit = newsize; if (oldlist != NULL) free(oldlist, M_BUS); } PDEBUG(("now: unit %d in devclass %s", unit, DEVCLANAME(dc))); *unitp = unit; return (0); } /** * @internal * @brief Add a device to a devclass * * A unit number is allocated for the device (using the device's * preferred unit number if any) and the device is registered in the * devclass. This allows the device to be looked up by its unit * number, e.g. by decoding a dev_t minor number. * * @param dc the devclass to add to * @param dev the device to add * * @retval 0 success * @retval EEXIST the requested unit number is already allocated * @retval ENOMEM memory allocation failure */ static int devclass_add_device(devclass_t dc, device_t dev) { int buflen, error; PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc))); buflen = snprintf(NULL, 0, "%s%d$", dc->name, INT_MAX); if (buflen < 0) return (ENOMEM); dev->nameunit = malloc(buflen, M_BUS, M_NOWAIT|M_ZERO); if (!dev->nameunit) return (ENOMEM); if ((error = devclass_alloc_unit(dc, dev, &dev->unit)) != 0) { free(dev->nameunit, M_BUS); dev->nameunit = NULL; return (error); } dc->devices[dev->unit] = dev; dev->devclass = dc; snprintf(dev->nameunit, buflen, "%s%d", dc->name, dev->unit); return (0); } /** * @internal * @brief Delete a device from a devclass * * The device is removed from the devclass's device list and its unit * number is freed. 
* @param dc the devclass to delete from * @param dev the device to delete * * @retval 0 success */ static int devclass_delete_device(devclass_t dc, device_t dev) { if (!dc || !dev) return (0); PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc))); if (dev->devclass != dc || dc->devices[dev->unit] != dev) panic("devclass_delete_device: inconsistent device class"); dc->devices[dev->unit] = NULL; if (dev->flags & DF_WILDCARD) dev->unit = -1; dev->devclass = NULL; free(dev->nameunit, M_BUS); dev->nameunit = NULL; return (0); } /** * @internal * @brief Make a new device and add it as a child of @p parent * * @param parent the parent of the new device * @param name the devclass name of the new device or @c NULL * to leave the devclass unspecified * @parem unit the unit number of the new device of @c -1 to * leave the unit number unspecified * * @returns the new device */ static device_t make_device(device_t parent, const char *name, int unit) { device_t dev; devclass_t dc; PDEBUG(("%s at %s as unit %d", name, DEVICENAME(parent), unit)); if (name) { dc = devclass_find_internal(name, NULL, TRUE); if (!dc) { printf("make_device: can't find device class %s\n", name); return (NULL); } } else { dc = NULL; } dev = malloc(sizeof(*dev), M_BUS, M_NOWAIT|M_ZERO); if (!dev) return (NULL); dev->parent = parent; TAILQ_INIT(&dev->children); kobj_init((kobj_t) dev, &null_class); dev->driver = NULL; dev->devclass = NULL; dev->unit = unit; dev->nameunit = NULL; dev->desc = NULL; dev->busy = 0; dev->devflags = 0; dev->flags = DF_ENABLED; dev->order = 0; if (unit == -1) dev->flags |= DF_WILDCARD; if (name) { dev->flags |= DF_FIXEDCLASS; if (devclass_add_device(dc, dev)) { kobj_delete((kobj_t) dev, M_BUS); return (NULL); } } if (parent != NULL && device_has_quiet_children(parent)) dev->flags |= DF_QUIET | DF_QUIET_CHILDREN; dev->ivars = NULL; dev->softc = NULL; dev->state = DS_NOTPRESENT; TAILQ_INSERT_TAIL(&bus_data_devices, dev, devlink); bus_data_generation_update(); return (dev); } /** * @internal * @brief Print a description of a device. */ static int device_print_child(device_t dev, device_t child) { int retval = 0; if (device_is_alive(child)) retval += BUS_PRINT_CHILD(dev, child); else retval += device_printf(child, " not found\n"); return (retval); } /** * @brief Create a new device * * This creates a new device and adds it as a child of an existing * parent device. The new device will be added after the last existing * child with order zero. * * @param dev the device which will be the parent of the * new child device * @param name devclass name for new device or @c NULL if not * specified * @param unit unit number for new device or @c -1 if not * specified * * @returns the new device */ device_t device_add_child(device_t dev, const char *name, int unit) { return (device_add_child_ordered(dev, 0, name, unit)); } /** * @brief Create a new device * * This creates a new device and adds it as a child of an existing * parent device. The new device will be added after the last existing * child with the same order. 
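 *
 * A typical call, sketched for illustration (the "foo" devclass name is
 * hypothetical; unit -1 asks for any free unit number):
 *
 *	device_t child;
 *
 *	child = device_add_child_ordered(dev, 0, "foo", -1);
 *	if (child == NULL)
 *		return (ENXIO);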
* * @param dev the device which will be the parent of the * new child device * @param order a value which is used to partially sort the * children of @p dev - devices created using * lower values of @p order appear first in @p * dev's list of children * @param name devclass name for new device or @c NULL if not * specified * @param unit unit number for new device or @c -1 if not * specified * * @returns the new device */ device_t device_add_child_ordered(device_t dev, u_int order, const char *name, int unit) { device_t child; device_t place; PDEBUG(("%s at %s with order %u as unit %d", name, DEVICENAME(dev), order, unit)); KASSERT(name != NULL || unit == -1, ("child device with wildcard name and specific unit number")); child = make_device(dev, name, unit); if (child == NULL) return (child); child->order = order; TAILQ_FOREACH(place, &dev->children, link) { if (place->order > order) break; } if (place) { /* * The device 'place' is the first device whose order is * greater than the new child. */ TAILQ_INSERT_BEFORE(place, child, link); } else { /* * The new child's order is greater or equal to the order of * any existing device. Add the child to the tail of the list. */ TAILQ_INSERT_TAIL(&dev->children, child, link); } bus_data_generation_update(); return (child); } /** * @brief Delete a device * * This function deletes a device along with all of its children. If * the device currently has a driver attached to it, the device is * detached first using device_detach(). * * @param dev the parent device * @param child the device to delete * * @retval 0 success * @retval non-zero a unit error code describing the error */ int device_delete_child(device_t dev, device_t child) { int error; device_t grandchild; PDEBUG(("%s from %s", DEVICENAME(child), DEVICENAME(dev))); /* detach parent before deleting children, if any */ if ((error = device_detach(child)) != 0) return (error); /* remove children second */ while ((grandchild = TAILQ_FIRST(&child->children)) != NULL) { error = device_delete_child(child, grandchild); if (error) return (error); } if (child->devclass) devclass_delete_device(child->devclass, child); if (child->parent) BUS_CHILD_DELETED(dev, child); TAILQ_REMOVE(&dev->children, child, link); TAILQ_REMOVE(&bus_data_devices, child, devlink); kobj_delete((kobj_t) child, M_BUS); bus_data_generation_update(); return (0); } /** * @brief Delete all children devices of the given device, if any. * * This function deletes all children devices of the given device, if * any, using the device_delete_child() function for each device it * finds. If a child device cannot be deleted, this function will * return an error code. * * @param dev the parent device * * @retval 0 success * @retval non-zero a device would not detach */ int device_delete_children(device_t dev) { device_t child; int error; PDEBUG(("Deleting all children of %s", DEVICENAME(dev))); error = 0; while ((child = TAILQ_FIRST(&dev->children)) != NULL) { error = device_delete_child(dev, child); if (error) { PDEBUG(("Failed deleting %s", DEVICENAME(child))); break; } } return (error); } /** * @brief Find a device given a unit number * * This is similar to devclass_get_devices() but only searches for * devices which have @p dev as a parent. * * @param dev the parent device to search * @param unit the unit number to search for. If the unit is -1, * return the first child of @p dev which has name * @p classname (that is, the one with the lowest unit.) 
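 *
 * For example, an identify routine might add a child only if one does
 * not already exist (illustrative; "foo" is a hypothetical devclass):
 *
 *	if (device_find_child(dev, "foo", -1) == NULL)
 *		device_add_child(dev, "foo", -1);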
* * @returns the device with the given unit number or @c * NULL if there is no such device */ device_t device_find_child(device_t dev, const char *classname, int unit) { devclass_t dc; device_t child; dc = devclass_find(classname); if (!dc) return (NULL); if (unit != -1) { child = devclass_get_device(dc, unit); if (child && child->parent == dev) return (child); } else { for (unit = 0; unit < devclass_get_maxunit(dc); unit++) { child = devclass_get_device(dc, unit); if (child && child->parent == dev) return (child); } } return (NULL); } /** * @internal */ static driverlink_t first_matching_driver(devclass_t dc, device_t dev) { if (dev->devclass) return (devclass_find_driver_internal(dc, dev->devclass->name)); return (TAILQ_FIRST(&dc->drivers)); } /** * @internal */ static driverlink_t next_matching_driver(devclass_t dc, device_t dev, driverlink_t last) { if (dev->devclass) { driverlink_t dl; for (dl = TAILQ_NEXT(last, link); dl; dl = TAILQ_NEXT(dl, link)) if (!strcmp(dev->devclass->name, dl->driver->name)) return (dl); return (NULL); } return (TAILQ_NEXT(last, link)); } /** * @internal */ int device_probe_child(device_t dev, device_t child) { devclass_t dc; driverlink_t best = NULL; driverlink_t dl; int result, pri = 0; /* We should preserve the devclass (or lack of) set by the bus. */ int hasclass = (child->devclass != NULL); bus_topo_assert(); dc = dev->devclass; if (!dc) panic("device_probe_child: parent device has no devclass"); /* * If the state is already probed, then return. */ if (child->state == DS_ALIVE) return (0); for (; dc; dc = dc->parent) { for (dl = first_matching_driver(dc, child); dl; dl = next_matching_driver(dc, child, dl)) { /* If this driver's pass is too high, then ignore it. */ if (dl->pass > bus_current_pass) continue; PDEBUG(("Trying %s", DRIVERNAME(dl->driver))); result = device_set_driver(child, dl->driver); if (result == ENOMEM) return (result); else if (result != 0) continue; if (!hasclass) { if (device_set_devclass(child, dl->driver->name) != 0) { char const * devname = device_get_name(child); if (devname == NULL) devname = "(unknown)"; printf("driver bug: Unable to set " "devclass (class: %s " "devname: %s)\n", dl->driver->name, devname); (void)device_set_driver(child, NULL); continue; } } /* Fetch any flags for the device before probing. */ resource_int_value(dl->driver->name, child->unit, "flags", &child->devflags); result = DEVICE_PROBE(child); /* * If the driver returns SUCCESS, there can be * no higher match for this device. */ if (result == 0) { best = dl; pri = 0; break; } /* Reset flags and devclass before the next probe. */ child->devflags = 0; if (!hasclass) (void)device_set_devclass(child, NULL); /* * Reset DF_QUIET in case this driver doesn't * end up as the best driver. */ device_verbose(child); /* * Probes that return BUS_PROBE_NOWILDCARD or lower * only match on devices whose driver was explicitly * specified. */ if (result <= BUS_PROBE_NOWILDCARD && !(child->flags & DF_FIXEDCLASS)) { result = ENXIO; } /* * The driver returned an error so it * certainly doesn't match. */ if (result > 0) { (void)device_set_driver(child, NULL); continue; } /* * A priority lower than SUCCESS, remember the * best matching driver. Initialise the value * of pri for the first match. */ if (best == NULL || result > pri) { best = dl; pri = result; continue; } } /* * If we have an unambiguous match in this devclass, * don't look in the parent. 
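 *
 * (pri == 0 means some driver's probe returned 0, an unconditional
 * claim on the device, so the drivers of the parent devclass need not
 * be consulted.)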
*/ if (best && pri == 0) break; } if (best == NULL) return (ENXIO); /* * If we found a driver, change state and initialise the devclass. */ if (pri < 0) { /* Set the winning driver, devclass, and flags. */ result = device_set_driver(child, best->driver); if (result != 0) return (result); if (!child->devclass) { result = device_set_devclass(child, best->driver->name); if (result != 0) { (void)device_set_driver(child, NULL); return (result); } } resource_int_value(best->driver->name, child->unit, "flags", &child->devflags); /* * A bit bogus. Call the probe method again to make sure * that we have the right description. */ result = DEVICE_PROBE(child); if (result > 0) { if (!hasclass) (void)device_set_devclass(child, NULL); (void)device_set_driver(child, NULL); return (result); } } child->state = DS_ALIVE; bus_data_generation_update(); return (0); } /** * @brief Return the parent of a device */ device_t device_get_parent(device_t dev) { return (dev->parent); } /** * @brief Get a list of children of a device * * An array containing a list of all the children of the given device * is allocated and returned in @p *devlistp. The number of devices * in the array is returned in @p *devcountp. The caller should free * the array using @c free(p, M_TEMP). * * @param dev the device to examine * @param devlistp points at location for array pointer return * value * @param devcountp points at location for array size return value * * @retval 0 success * @retval ENOMEM the array allocation failed */ int device_get_children(device_t dev, device_t **devlistp, int *devcountp) { int count; device_t child; device_t *list; count = 0; TAILQ_FOREACH(child, &dev->children, link) { count++; } if (count == 0) { *devlistp = NULL; *devcountp = 0; return (0); } list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO); if (!list) return (ENOMEM); count = 0; TAILQ_FOREACH(child, &dev->children, link) { list[count] = child; count++; } *devlistp = list; *devcountp = count; return (0); } /** * @brief Return the current driver for the device or @c NULL if there * is no driver currently attached */ driver_t * device_get_driver(device_t dev) { return (dev->driver); } /** * @brief Return the current devclass for the device or @c NULL if * there is none. */ devclass_t device_get_devclass(device_t dev) { return (dev->devclass); } /** * @brief Return the name of the device's devclass or @c NULL if there * is none. */ const char * device_get_name(device_t dev) { if (dev != NULL && dev->devclass) return (devclass_get_name(dev->devclass)); return (NULL); } /** * @brief Return a string containing the device's devclass name * followed by an ascii representation of the device's unit number * (e.g. @c "foo2"). */ const char * device_get_nameunit(device_t dev) { return (dev->nameunit); } /** * @brief Return the device's unit number. 
*/ int device_get_unit(device_t dev) { return (dev->unit); } /** * @brief Return the device's description string */ const char * device_get_desc(device_t dev) { return (dev->desc); } /** * @brief Return the device's flags */ uint32_t device_get_flags(device_t dev) { return (dev->devflags); } struct sysctl_ctx_list * device_get_sysctl_ctx(device_t dev) { return (&dev->sysctl_ctx); } struct sysctl_oid * device_get_sysctl_tree(device_t dev) { return (dev->sysctl_tree); } /** * @brief Print the name of the device followed by a colon and a space * * @returns the number of characters printed */ int device_print_prettyname(device_t dev) { const char *name = device_get_name(dev); if (name == NULL) return (printf("unknown: ")); return (printf("%s%d: ", name, device_get_unit(dev))); } /** * @brief Print the name of the device followed by a colon, a space * and the result of calling vprintf() with the value of @p fmt and * the following arguments. * * @returns the number of characters printed */ int device_printf(device_t dev, const char * fmt, ...) { char buf[128]; struct sbuf sb; const char *name; va_list ap; size_t retval; retval = 0; sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); sbuf_set_drain(&sb, sbuf_printf_drain, &retval); name = device_get_name(dev); if (name == NULL) sbuf_cat(&sb, "unknown: "); else sbuf_printf(&sb, "%s%d: ", name, device_get_unit(dev)); va_start(ap, fmt); sbuf_vprintf(&sb, fmt, ap); va_end(ap); sbuf_finish(&sb); sbuf_delete(&sb); return (retval); } /** * @brief Print the name of the device followed by a colon, a space * and the result of calling log() with the value of @p fmt and * the following arguments. * * @returns the number of characters printed */ int device_log(device_t dev, int pri, const char * fmt, ...) { char buf[128]; struct sbuf sb; const char *name; va_list ap; size_t retval; retval = 0; sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); name = device_get_name(dev); if (name == NULL) sbuf_cat(&sb, "unknown: "); else sbuf_printf(&sb, "%s%d: ", name, device_get_unit(dev)); va_start(ap, fmt); sbuf_vprintf(&sb, fmt, ap); va_end(ap); sbuf_finish(&sb); log(pri, "%.*s", (int) sbuf_len(&sb), sbuf_data(&sb)); retval = sbuf_len(&sb); sbuf_delete(&sb); return (retval); } /** * @internal */ static void device_set_desc_internal(device_t dev, const char* desc, int copy) { if (dev->desc && (dev->flags & DF_DESCMALLOCED)) { free(dev->desc, M_BUS); dev->flags &= ~DF_DESCMALLOCED; dev->desc = NULL; } if (copy && desc) { dev->desc = malloc(strlen(desc) + 1, M_BUS, M_NOWAIT); if (dev->desc) { strcpy(dev->desc, desc); dev->flags |= DF_DESCMALLOCED; } } else { /* Avoid a -Wcast-qual warning */ dev->desc = (char *)(uintptr_t) desc; } bus_data_generation_update(); } /** * @brief Set the device's description * * The value of @c desc should be a string constant that will not * change (at least until the description is changed in a subsequent * call to device_set_desc() or device_set_desc_copy()). */ void device_set_desc(device_t dev, const char* desc) { device_set_desc_internal(dev, desc, FALSE); } /** * @brief Set the device's description * * The string pointed to by @c desc is copied. Use this function if * the device description is generated, (e.g. with sprintf()). 
*/ void device_set_desc_copy(device_t dev, const char* desc) { device_set_desc_internal(dev, desc, TRUE); } /** * @brief Set the device's flags */ void device_set_flags(device_t dev, uint32_t flags) { dev->devflags = flags; } /** * @brief Return the device's softc field * * The softc is allocated and zeroed when a driver is attached, based * on the size field of the driver. */ void * device_get_softc(device_t dev) { return (dev->softc); } /** * @brief Set the device's softc field * * Most drivers do not need to use this since the softc is allocated * automatically when the driver is attached. */ void device_set_softc(device_t dev, void *softc) { if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC)) free(dev->softc, M_BUS_SC); dev->softc = softc; if (dev->softc) dev->flags |= DF_EXTERNALSOFTC; else dev->flags &= ~DF_EXTERNALSOFTC; } /** * @brief Free claimed softc * * Most drivers do not need to use this since the softc is freed * automatically when the driver is detached. */ void device_free_softc(void *softc) { free(softc, M_BUS_SC); } /** * @brief Claim softc * * This function can be used to let the driver free the automatically * allocated softc using "device_free_softc()". This function is * useful when the driver is refcounting the softc and the softc * cannot be freed when the "device_detach" method is called. */ void device_claim_softc(device_t dev) { if (dev->softc) dev->flags |= DF_EXTERNALSOFTC; else dev->flags &= ~DF_EXTERNALSOFTC; } /** * @brief Get the device's ivars field * * The ivars field is used by the parent device to store per-device * state (e.g. the physical location of the device or a list of * resources). */ void * device_get_ivars(device_t dev) { KASSERT(dev != NULL, ("device_get_ivars(NULL, ...)")); return (dev->ivars); } /** * @brief Set the device's ivars field */ void device_set_ivars(device_t dev, void * ivars) { KASSERT(dev != NULL, ("device_set_ivars(NULL, ...)")); dev->ivars = ivars; } /** * @brief Return the device's state */ device_state_t device_get_state(device_t dev) { return (dev->state); } /** * @brief Set the DF_ENABLED flag for the device */ void device_enable(device_t dev) { dev->flags |= DF_ENABLED; } /** * @brief Clear the DF_ENABLED flag for the device */ void device_disable(device_t dev) { dev->flags &= ~DF_ENABLED; } /** * @brief Increment the busy counter for the device */ void device_busy(device_t dev) { /* * Mark the device as busy, recursively up the tree if this busy count * goes 0->1. */ if (refcount_acquire(&dev->busy) == 0 && dev->parent != NULL) device_busy(dev->parent); } /** * @brief Decrement the busy counter for the device */ void device_unbusy(device_t dev) { /* * Mark the device as unbsy, recursively if this is the last busy count. */ if (refcount_release(&dev->busy) && dev->parent != NULL) device_unbusy(dev->parent); } /** * @brief Set the DF_QUIET flag for the device */ void device_quiet(device_t dev) { dev->flags |= DF_QUIET; } /** * @brief Set the DF_QUIET_CHILDREN flag for the device */ void device_quiet_children(device_t dev) { dev->flags |= DF_QUIET_CHILDREN; } /** * @brief Clear the DF_QUIET flag for the device */ void device_verbose(device_t dev) { dev->flags &= ~DF_QUIET; } ssize_t device_get_property(device_t dev, const char *prop, void *val, size_t sz, device_property_type_t type) { device_t bus = device_get_parent(dev); switch (type) { case DEVICE_PROP_ANY: case DEVICE_PROP_BUFFER: case DEVICE_PROP_HANDLE: /* Size checks done in implementation. 
*/ break; case DEVICE_PROP_UINT32: if (sz % 4 != 0) return (-1); break; case DEVICE_PROP_UINT64: if (sz % 8 != 0) return (-1); break; default: return (-1); } return (BUS_GET_PROPERTY(bus, dev, prop, val, sz, type)); } bool device_has_property(device_t dev, const char *prop) { return (device_get_property(dev, prop, NULL, 0, DEVICE_PROP_ANY) >= 0); } /** * @brief Return non-zero if the DF_QUIET_CHIDLREN flag is set on the device */ int device_has_quiet_children(device_t dev) { return ((dev->flags & DF_QUIET_CHILDREN) != 0); } /** * @brief Return non-zero if the DF_QUIET flag is set on the device */ int device_is_quiet(device_t dev) { return ((dev->flags & DF_QUIET) != 0); } /** * @brief Return non-zero if the DF_ENABLED flag is set on the device */ int device_is_enabled(device_t dev) { return ((dev->flags & DF_ENABLED) != 0); } /** * @brief Return non-zero if the device was successfully probed */ int device_is_alive(device_t dev) { return (dev->state >= DS_ALIVE); } /** * @brief Return non-zero if the device currently has a driver * attached to it */ int device_is_attached(device_t dev) { return (dev->state >= DS_ATTACHED); } /** * @brief Return non-zero if the device is currently suspended. */ int device_is_suspended(device_t dev) { return ((dev->flags & DF_SUSPENDED) != 0); } /** * @brief Set the devclass of a device * @see devclass_add_device(). */ int device_set_devclass(device_t dev, const char *classname) { devclass_t dc; int error; if (!classname) { if (dev->devclass) devclass_delete_device(dev->devclass, dev); return (0); } if (dev->devclass) { printf("device_set_devclass: device class already set\n"); return (EINVAL); } dc = devclass_find_internal(classname, NULL, TRUE); if (!dc) return (ENOMEM); error = devclass_add_device(dc, dev); bus_data_generation_update(); return (error); } /** * @brief Set the devclass of a device and mark the devclass fixed. * @see device_set_devclass() */ int device_set_devclass_fixed(device_t dev, const char *classname) { int error; if (classname == NULL) return (EINVAL); error = device_set_devclass(dev, classname); if (error) return (error); dev->flags |= DF_FIXEDCLASS; return (0); } /** * @brief Query the device to determine if it's of a fixed devclass * @see device_set_devclass_fixed() */ bool device_is_devclass_fixed(device_t dev) { return ((dev->flags & DF_FIXEDCLASS) != 0); } /** * @brief Set the driver of a device * * @retval 0 success * @retval EBUSY the device already has a driver attached * @retval ENOMEM a memory allocation failure occurred */ int device_set_driver(device_t dev, driver_t *driver) { int domain; struct domainset *policy; if (dev->state >= DS_ATTACHED) return (EBUSY); if (dev->driver == driver) return (0); if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC)) { free(dev->softc, M_BUS_SC); dev->softc = NULL; } device_set_desc(dev, NULL); kobj_delete((kobj_t) dev, NULL); dev->driver = driver; if (driver) { kobj_init((kobj_t) dev, (kobj_class_t) driver); if (!(dev->flags & DF_EXTERNALSOFTC) && driver->size > 0) { if (bus_get_domain(dev, &domain) == 0) policy = DOMAINSET_PREF(domain); else policy = DOMAINSET_RR(); dev->softc = malloc_domainset(driver->size, M_BUS_SC, policy, M_NOWAIT | M_ZERO); if (!dev->softc) { kobj_delete((kobj_t) dev, NULL); kobj_init((kobj_t) dev, &null_class); dev->driver = NULL; return (ENOMEM); } } } else { kobj_init((kobj_t) dev, &null_class); } bus_data_generation_update(); return (0); } /** * @brief Probe a device, and return this status. * * This function is the core of the device autoconfiguration * system. 
Its purpose is to select a suitable driver for a device and * then call that driver to initialise the hardware appropriately. The * driver is selected by calling the DEVICE_PROBE() method of a set of * candidate drivers and then choosing the driver which returned the * best value. This driver is then attached to the device using * device_attach(). * * The set of suitable drivers is taken from the list of drivers in * the parent device's devclass. If the device was originally created * with a specific class name (see device_add_child()), only drivers * with that name are probed, otherwise all drivers in the devclass * are probed. If no drivers return successful probe values in the * parent devclass, the search continues in the parent of that * devclass (see devclass_get_parent()) if any. * * @param dev the device to initialise * * @retval 0 success * @retval ENXIO no driver was found * @retval ENOMEM memory allocation failure * @retval non-zero some other unix error code * @retval -1 Device already attached */ int device_probe(device_t dev) { int error; bus_topo_assert(); if (dev->state >= DS_ALIVE) return (-1); if (!(dev->flags & DF_ENABLED)) { if (bootverbose && device_get_name(dev) != NULL) { device_print_prettyname(dev); printf("not probed (disabled)\n"); } return (-1); } if ((error = device_probe_child(dev->parent, dev)) != 0) { if (bus_current_pass == BUS_PASS_DEFAULT && !(dev->flags & DF_DONENOMATCH)) { device_handle_nomatch(dev); } return (error); } return (0); } /** * @brief Probe a device and attach a driver if possible * * calls device_probe() and attaches if that was successful. */ int device_probe_and_attach(device_t dev) { int error; bus_topo_assert(); error = device_probe(dev); if (error == -1) return (0); else if (error != 0) return (error); CURVNET_SET_QUIET(vnet0); error = device_attach(dev); CURVNET_RESTORE(); return error; } /** * @brief Attach a device driver to a device * * This function is a wrapper around the DEVICE_ATTACH() driver * method. In addition to calling DEVICE_ATTACH(), it initialises the * device's sysctl tree, optionally prints a description of the device * and queues a notification event for user-based device management * services. * * Normally this function is only called internally from * device_probe_and_attach(). * * @param dev the device to initialise * * @retval 0 success * @retval ENXIO no driver was found * @retval ENOMEM memory allocation failure * @retval non-zero some other unix error code */ int device_attach(device_t dev) { uint64_t attachtime; uint16_t attachentropy; int error; if (resource_disabled(dev->driver->name, dev->unit)) { device_disable(dev); if (bootverbose) device_printf(dev, "disabled via hints entry\n"); return (ENXIO); } device_sysctl_init(dev); if (!device_is_quiet(dev)) device_print_child(dev->parent, dev); attachtime = get_cyclecount(); dev->state = DS_ATTACHING; if ((error = DEVICE_ATTACH(dev)) != 0) { printf("device_attach: %s%d attach returned %d\n", dev->driver->name, dev->unit, error); if (disable_failed_devs) { /* * When the user has asked to disable failed devices, we * directly disable the device, but leave it in the * attaching state. It will not try to probe/attach the * device further. This leaves the device numbering * intact for other similar devices in the system. It * can be removed from this state with devctl. */ device_disable(dev); } else { /* * Otherwise, when attach fails, tear down the state * around that so we can retry when, for example, new * drivers are loaded. 
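 * The teardown below mirrors device_detach(): drop the devclass (unless it
 * was wired with device_set_devclass_fixed()), clear the driver and softc
 * via device_set_driver(NULL) and tear down the sysctl tree, leaving the
 * device in DS_NOTPRESENT.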
*/ if (!(dev->flags & DF_FIXEDCLASS)) devclass_delete_device(dev->devclass, dev); (void)device_set_driver(dev, NULL); device_sysctl_fini(dev); KASSERT(dev->busy == 0, ("attach failed but busy")); dev->state = DS_NOTPRESENT; } return (error); } dev->flags |= DF_ATTACHED_ONCE; /* * We only need the low bits of this time, but ranges from tens to thousands * have been seen, so keep 2 bytes' worth. */ attachentropy = (uint16_t)(get_cyclecount() - attachtime); random_harvest_direct(&attachentropy, sizeof(attachentropy), RANDOM_ATTACH); device_sysctl_update(dev); dev->state = DS_ATTACHED; dev->flags &= ~DF_DONENOMATCH; EVENTHANDLER_DIRECT_INVOKE(device_attach, dev); return (0); } /** * @brief Detach a driver from a device * * This function is a wrapper around the DEVICE_DETACH() driver * method. If the call to DEVICE_DETACH() succeeds, it calls * BUS_CHILD_DETACHED() for the parent of @p dev, queues a * notification event for user-based device management services and * cleans up the device's sysctl tree. * * @param dev the device to un-initialise * * @retval 0 success * @retval ENXIO no driver was found * @retval ENOMEM memory allocation failure * @retval non-zero some other unix error code */ int device_detach(device_t dev) { int error; bus_topo_assert(); PDEBUG(("%s", DEVICENAME(dev))); if (dev->busy > 0) return (EBUSY); if (dev->state == DS_ATTACHING) { device_printf(dev, "device in attaching state! Deferring detach.\n"); return (EBUSY); } if (dev->state != DS_ATTACHED) return (0); EVENTHANDLER_DIRECT_INVOKE(device_detach, dev, EVHDEV_DETACH_BEGIN); if ((error = DEVICE_DETACH(dev)) != 0) { EVENTHANDLER_DIRECT_INVOKE(device_detach, dev, EVHDEV_DETACH_FAILED); return (error); } else { EVENTHANDLER_DIRECT_INVOKE(device_detach, dev, EVHDEV_DETACH_COMPLETE); } if (!device_is_quiet(dev)) device_printf(dev, "detached\n"); if (dev->parent) BUS_CHILD_DETACHED(dev->parent, dev); if (!(dev->flags & DF_FIXEDCLASS)) devclass_delete_device(dev->devclass, dev); device_verbose(dev); dev->state = DS_NOTPRESENT; (void)device_set_driver(dev, NULL); device_sysctl_fini(dev); return (0); } /** * @brief Tells a driver to quiesce itself. * * This function is a wrapper around the DEVICE_QUIESCE() driver * method. If the call to DEVICE_QUIESCE() succeeds. * * @param dev the device to quiesce * * @retval 0 success * @retval ENXIO no driver was found * @retval ENOMEM memory allocation failure * @retval non-zero some other unix error code */ int device_quiesce(device_t dev) { PDEBUG(("%s", DEVICENAME(dev))); if (dev->busy > 0) return (EBUSY); if (dev->state != DS_ATTACHED) return (0); return (DEVICE_QUIESCE(dev)); } /** * @brief Notify a device of system shutdown * * This function calls the DEVICE_SHUTDOWN() driver method if the * device currently has an attached driver. * * @returns the value returned by DEVICE_SHUTDOWN() */ int device_shutdown(device_t dev) { if (dev->state < DS_ATTACHED) return (0); return (DEVICE_SHUTDOWN(dev)); } /** * @brief Set the unit number of a device * * This function can be used to override the unit number used for a * device (e.g. to wire a device to a pre-configured unit number). 
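 *
 * A hedged sketch (@c wired_unit is a hypothetical, hint-derived value):
 * @code
 *	if (device_set_unit(child, wired_unit) != 0)
 *		device_printf(child, "cannot wire unit %d\n", wired_unit);
 * @endcode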
*/ int device_set_unit(device_t dev, int unit) { devclass_t dc; int err; if (unit == dev->unit) return (0); dc = device_get_devclass(dev); if (unit < dc->maxunit && dc->devices[unit]) return (EBUSY); err = devclass_delete_device(dc, dev); if (err) return (err); dev->unit = unit; err = devclass_add_device(dc, dev); if (err) return (err); bus_data_generation_update(); return (0); } /*======================================*/ /* * Some useful method implementations to make life easier for bus drivers. */ void resource_init_map_request_impl(struct resource_map_request *args, size_t sz) { bzero(args, sz); args->size = sz; args->memattr = VM_MEMATTR_DEVICE; } /** * @brief Initialise a resource list. * * @param rl the resource list to initialise */ void resource_list_init(struct resource_list *rl) { STAILQ_INIT(rl); } /** * @brief Reclaim memory used by a resource list. * * This function frees the memory for all resource entries on the list * (if any). * * @param rl the resource list to free */ void resource_list_free(struct resource_list *rl) { struct resource_list_entry *rle; while ((rle = STAILQ_FIRST(rl)) != NULL) { if (rle->res) panic("resource_list_free: resource entry is busy"); STAILQ_REMOVE_HEAD(rl, link); free(rle, M_BUS); } } /** * @brief Add a resource entry. * * This function adds a resource entry using the given @p type, @p * start, @p end and @p count values. A rid value is chosen by * searching sequentially for the first unused rid starting at zero. * * @param rl the resource list to edit * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param start the start address of the resource * @param end the end address of the resource * @param count XXX end-start+1 */ int resource_list_add_next(struct resource_list *rl, int type, rman_res_t start, rman_res_t end, rman_res_t count) { int rid; rid = 0; while (resource_list_find(rl, type, rid) != NULL) rid++; resource_list_add(rl, type, rid, start, end, count); return (rid); } /** * @brief Add or modify a resource entry. * * If an existing entry exists with the same type and rid, it will be * modified using the given values of @p start, @p end and @p * count. If no entry exists, a new one will be created using the * given values. The resource list entry that matches is then returned. * * @param rl the resource list to edit * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param rid the resource identifier * @param start the start address of the resource * @param end the end address of the resource * @param count XXX end-start+1 */ struct resource_list_entry * resource_list_add(struct resource_list *rl, int type, int rid, rman_res_t start, rman_res_t end, rman_res_t count) { struct resource_list_entry *rle; rle = resource_list_find(rl, type, rid); if (!rle) { rle = malloc(sizeof(struct resource_list_entry), M_BUS, M_NOWAIT); if (!rle) panic("resource_list_add: can't record entry"); STAILQ_INSERT_TAIL(rl, rle, link); rle->type = type; rle->rid = rid; rle->res = NULL; rle->flags = 0; } if (rle->res) panic("resource_list_add: resource entry is busy"); rle->start = start; rle->end = end; rle->count = count; return (rle); } /** * @brief Determine if a resource entry is busy. * * Returns true if a resource entry is busy meaning that it has an * associated resource that is not an unallocated "reserved" resource. * * @param rl the resource list to search * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param rid the resource identifier * * @returns Non-zero if the entry is busy, zero otherwise. 
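 *
 * Usage sketch (@c rl is assumed to be the bus's resource list for a child):
 * @code
 *	if (resource_list_busy(rl, SYS_RES_MEMORY, 0))
 *		return (EBUSY);
 * @endcode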
*/ int resource_list_busy(struct resource_list *rl, int type, int rid) { struct resource_list_entry *rle; rle = resource_list_find(rl, type, rid); if (rle == NULL || rle->res == NULL) return (0); if ((rle->flags & (RLE_RESERVED | RLE_ALLOCATED)) == RLE_RESERVED) { KASSERT(!(rman_get_flags(rle->res) & RF_ACTIVE), ("reserved resource is active")); return (0); } return (1); } /** * @brief Determine if a resource entry is reserved. * * Returns true if a resource entry is reserved meaning that it has an * associated "reserved" resource. The resource can either be * allocated or unallocated. * * @param rl the resource list to search * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param rid the resource identifier * * @returns Non-zero if the entry is reserved, zero otherwise. */ int resource_list_reserved(struct resource_list *rl, int type, int rid) { struct resource_list_entry *rle; rle = resource_list_find(rl, type, rid); if (rle != NULL && rle->flags & RLE_RESERVED) return (1); return (0); } /** * @brief Find a resource entry by type and rid. * * @param rl the resource list to search * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param rid the resource identifier * * @returns the resource entry pointer or NULL if there is no such * entry. */ struct resource_list_entry * resource_list_find(struct resource_list *rl, int type, int rid) { struct resource_list_entry *rle; STAILQ_FOREACH(rle, rl, link) { if (rle->type == type && rle->rid == rid) return (rle); } return (NULL); } /** * @brief Delete a resource entry. * * @param rl the resource list to edit * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param rid the resource identifier */ void resource_list_delete(struct resource_list *rl, int type, int rid) { struct resource_list_entry *rle = resource_list_find(rl, type, rid); if (rle) { if (rle->res != NULL) panic("resource_list_delete: resource has not been released"); STAILQ_REMOVE(rl, rle, resource_list_entry, link); free(rle, M_BUS); } } /** * @brief Allocate a reserved resource * * This can be used by buses to force the allocation of resources * that are always active in the system even if they are not allocated * by a driver (e.g. PCI BARs). This function is usually called when * adding a new child to the bus. The resource is allocated from the * parent bus when it is reserved. The resource list entry is marked * with RLE_RESERVED to note that it is a reserved resource. * * Subsequent attempts to allocate the resource with * resource_list_alloc() will succeed the first time and will set * RLE_ALLOCATED to note that it has been allocated. When a reserved * resource that has been allocated is released with * resource_list_release() the resource RLE_ALLOCATED is cleared, but * the actual resource remains allocated. The resource can be released to * the parent bus by calling resource_list_unreserve(). 
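 *
 * Lifecycle sketch (illustrative only; @c bus, @c child, @c rl, @c start,
 * @c end and @c count are assumed to be set up by the caller):
 * @code
 *	rid = 0;
 *	r = resource_list_reserve(rl, bus, child, SYS_RES_MEMORY, &rid,
 *	    start, end, count, 0);		// reserved, inactive
 *	r = resource_list_alloc(rl, bus, child, SYS_RES_MEMORY, &rid,
 *	    0, ~0, 1, RF_ACTIVE);		// RLE_ALLOCATED is set
 *	resource_list_release(rl, bus, child, SYS_RES_MEMORY, rid, r);
 *						// allocated flag cleared, still reserved
 *	resource_list_unreserve(rl, bus, child, SYS_RES_MEMORY, rid);
 *						// returned to the parent bus
 * @endcode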
* * @param rl the resource list to allocate from * @param bus the parent device of @p child * @param child the device for which the resource is being reserved * @param type the type of resource to allocate * @param rid a pointer to the resource identifier * @param start hint at the start of the resource range - pass * @c 0 for any start address * @param end hint at the end of the resource range - pass * @c ~0 for any end address * @param count hint at the size of range required - pass @c 1 * for any size * @param flags any extra flags to control the resource * allocation - see @c RF_XXX flags in * for details * * @returns the resource which was allocated or @c NULL if no * resource could be allocated */ struct resource * resource_list_reserve(struct resource_list *rl, device_t bus, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct resource_list_entry *rle = NULL; int passthrough = (device_get_parent(child) != bus); struct resource *r; if (passthrough) panic( "resource_list_reserve() should only be called for direct children"); if (flags & RF_ACTIVE) panic( "resource_list_reserve() should only reserve inactive resources"); r = resource_list_alloc(rl, bus, child, type, rid, start, end, count, flags); if (r != NULL) { rle = resource_list_find(rl, type, *rid); rle->flags |= RLE_RESERVED; } return (r); } /** * @brief Helper function for implementing BUS_ALLOC_RESOURCE() * * Implement BUS_ALLOC_RESOURCE() by looking up a resource from the list * and passing the allocation up to the parent of @p bus. This assumes * that the first entry of @c device_get_ivars(child) is a struct * resource_list. This also handles 'passthrough' allocations where a * child is a remote descendant of bus by passing the allocation up to * the parent of bus. * * Typically, a bus driver would store a list of child resources * somewhere in the child device's ivars (see device_get_ivars()) and * its implementation of BUS_ALLOC_RESOURCE() would find that list and * then call resource_list_alloc() to perform the allocation. 
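 *
 * Sketch of a bus method built on this helper (hypothetical "foo" bus whose
 * per-child ivars, struct foo_devinfo, contain a struct resource_list rl):
 * @code
 *	static struct resource *
 *	foo_alloc_resource(device_t bus, device_t child, int type, int *rid,
 *	    rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
 *	{
 *		struct foo_devinfo *dinfo = device_get_ivars(child);
 *
 *		return (resource_list_alloc(&dinfo->rl, bus, child, type, rid,
 *		    start, end, count, flags));
 *	}
 * @endcode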
* * @param rl the resource list to allocate from * @param bus the parent device of @p child * @param child the device which is requesting an allocation * @param type the type of resource to allocate * @param rid a pointer to the resource identifier * @param start hint at the start of the resource range - pass * @c 0 for any start address * @param end hint at the end of the resource range - pass * @c ~0 for any end address * @param count hint at the size of range required - pass @c 1 * for any size * @param flags any extra flags to control the resource * allocation - see @c RF_XXX flags in * for details * * @returns the resource which was allocated or @c NULL if no * resource could be allocated */ struct resource * resource_list_alloc(struct resource_list *rl, device_t bus, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct resource_list_entry *rle = NULL; int passthrough = (device_get_parent(child) != bus); int isdefault = RMAN_IS_DEFAULT_RANGE(start, end); if (passthrough) { return (BUS_ALLOC_RESOURCE(device_get_parent(bus), child, type, rid, start, end, count, flags)); } rle = resource_list_find(rl, type, *rid); if (!rle) return (NULL); /* no resource of that type/rid */ if (rle->res) { if (rle->flags & RLE_RESERVED) { if (rle->flags & RLE_ALLOCATED) return (NULL); if ((flags & RF_ACTIVE) && bus_activate_resource(child, type, *rid, rle->res) != 0) return (NULL); rle->flags |= RLE_ALLOCATED; return (rle->res); } device_printf(bus, "resource entry %#x type %d for child %s is busy\n", *rid, type, device_get_nameunit(child)); return (NULL); } if (isdefault) { start = rle->start; count = ulmax(count, rle->count); end = ulmax(rle->end, start + count - 1); } rle->res = BUS_ALLOC_RESOURCE(device_get_parent(bus), child, type, rid, start, end, count, flags); /* * Record the new range. */ if (rle->res) { rle->start = rman_get_start(rle->res); rle->end = rman_get_end(rle->res); rle->count = count; } return (rle->res); } /** * @brief Helper function for implementing BUS_RELEASE_RESOURCE() * * Implement BUS_RELEASE_RESOURCE() using a resource list. Normally * used with resource_list_alloc(). 
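 *
 * The matching release method for the sketch above would be (again,
 * struct foo_devinfo is hypothetical):
 * @code
 *	return (resource_list_release(&dinfo->rl, bus, child, type, rid, r));
 * @endcode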
* * @param rl the resource list which was allocated from * @param bus the parent device of @p child * @param child the device which is requesting a release * @param type the type of resource to release * @param rid the resource identifier * @param res the resource to release * * @retval 0 success * @retval non-zero a standard unix error code indicating what * error condition prevented the operation */ int resource_list_release(struct resource_list *rl, device_t bus, device_t child, int type, int rid, struct resource *res) { struct resource_list_entry *rle = NULL; int passthrough = (device_get_parent(child) != bus); int error; if (passthrough) { return (BUS_RELEASE_RESOURCE(device_get_parent(bus), child, type, rid, res)); } rle = resource_list_find(rl, type, rid); if (!rle) panic("resource_list_release: can't find resource"); if (!rle->res) panic("resource_list_release: resource entry is not busy"); if (rle->flags & RLE_RESERVED) { if (rle->flags & RLE_ALLOCATED) { if (rman_get_flags(res) & RF_ACTIVE) { error = bus_deactivate_resource(child, type, rid, res); if (error) return (error); } rle->flags &= ~RLE_ALLOCATED; return (0); } return (EINVAL); } error = BUS_RELEASE_RESOURCE(device_get_parent(bus), child, type, rid, res); if (error) return (error); rle->res = NULL; return (0); } /** * @brief Release all active resources of a given type * * Release all active resources of a specified type. This is intended * to be used to cleanup resources leaked by a driver after detach or * a failed attach. * * @param rl the resource list which was allocated from * @param bus the parent device of @p child * @param child the device whose active resources are being released * @param type the type of resources to release * * @retval 0 success * @retval EBUSY at least one resource was active */ int resource_list_release_active(struct resource_list *rl, device_t bus, device_t child, int type) { struct resource_list_entry *rle; int error, retval; retval = 0; STAILQ_FOREACH(rle, rl, link) { if (rle->type != type) continue; if (rle->res == NULL) continue; if ((rle->flags & (RLE_RESERVED | RLE_ALLOCATED)) == RLE_RESERVED) continue; retval = EBUSY; error = resource_list_release(rl, bus, child, type, rman_get_rid(rle->res), rle->res); if (error != 0) device_printf(bus, "Failed to release active resource: %d\n", error); } return (retval); } /** * @brief Fully release a reserved resource * * Fully releases a resource reserved via resource_list_reserve(). 
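 *
 * The entry must not currently be allocated by the child; release it with
 * resource_list_release() first, otherwise EBUSY is returned.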
* * @param rl the resource list which was allocated from * @param bus the parent device of @p child * @param child the device whose reserved resource is being released * @param type the type of resource to release * @param rid the resource identifier * @param res the resource to release * * @retval 0 success * @retval non-zero a standard unix error code indicating what * error condition prevented the operation */ int resource_list_unreserve(struct resource_list *rl, device_t bus, device_t child, int type, int rid) { struct resource_list_entry *rle = NULL; int passthrough = (device_get_parent(child) != bus); if (passthrough) panic( "resource_list_unreserve() should only be called for direct children"); rle = resource_list_find(rl, type, rid); if (!rle) panic("resource_list_unreserve: can't find resource"); if (!(rle->flags & RLE_RESERVED)) return (EINVAL); if (rle->flags & RLE_ALLOCATED) return (EBUSY); rle->flags &= ~RLE_RESERVED; return (resource_list_release(rl, bus, child, type, rid, rle->res)); } /** * @brief Print a description of resources in a resource list * * Print all resources of a specified type, for use in BUS_PRINT_CHILD(). * The name is printed if at least one resource of the given type is available. * The format is used to print resource start and end. * * @param rl the resource list to print * @param name the name of @p type, e.g. @c "memory" * @param type type type of resource entry to print * @param format printf(9) format string to print resource * start and end values * * @returns the number of characters printed */ int resource_list_print_type(struct resource_list *rl, const char *name, int type, const char *format) { struct resource_list_entry *rle; int printed, retval; printed = 0; retval = 0; /* Yes, this is kinda cheating */ STAILQ_FOREACH(rle, rl, link) { if (rle->type == type) { if (printed == 0) retval += printf(" %s ", name); else retval += printf(","); printed++; retval += printf(format, rle->start); if (rle->count > 1) { retval += printf("-"); retval += printf(format, rle->start + rle->count - 1); } } } return (retval); } /** * @brief Releases all the resources in a list. * * @param rl The resource list to purge. * * @returns nothing */ void resource_list_purge(struct resource_list *rl) { struct resource_list_entry *rle; while ((rle = STAILQ_FIRST(rl)) != NULL) { if (rle->res) bus_release_resource(rman_get_device(rle->res), rle->type, rle->rid, rle->res); STAILQ_REMOVE_HEAD(rl, link); free(rle, M_BUS); } } device_t bus_generic_add_child(device_t dev, u_int order, const char *name, int unit) { return (device_add_child_ordered(dev, order, name, unit)); } /** * @brief Helper function for implementing DEVICE_PROBE() * * This function can be used to help implement the DEVICE_PROBE() for * a bus (i.e. a device which has other devices attached to it). It * calls the DEVICE_IDENTIFY() method of each driver in the device's * devclass. */ int bus_generic_probe(device_t dev) { devclass_t dc = dev->devclass; driverlink_t dl; TAILQ_FOREACH(dl, &dc->drivers, link) { /* * If this driver's pass is too high, then ignore it. * For most drivers in the default pass, this will * never be true. For early-pass drivers they will * only call the identify routines of eligible drivers * when this routine is called. Drivers for later * passes should have their identify routines called * on early-pass buses during BUS_NEW_PASS(). 
*/ if (dl->pass > bus_current_pass) continue; DEVICE_IDENTIFY(dl->driver, dev); } return (0); } /** * @brief Helper function for implementing DEVICE_ATTACH() * * This function can be used to help implement the DEVICE_ATTACH() for * a bus. It calls device_probe_and_attach() for each of the device's * children. */ int bus_generic_attach(device_t dev) { device_t child; TAILQ_FOREACH(child, &dev->children, link) { device_probe_and_attach(child); } return (0); } /** * @brief Helper function for delaying attaching children * * Many buses can't run transactions on the bus which children need to probe and * attach until after interrupts and/or timers are running. This function * delays their attach until interrupts and timers are enabled. */ int bus_delayed_attach_children(device_t dev) { /* Probe and attach the bus children when interrupts are available */ config_intrhook_oneshot((ich_func_t)bus_generic_attach, dev); return (0); } /** * @brief Helper function for implementing DEVICE_DETACH() * * This function can be used to help implement the DEVICE_DETACH() for * a bus. It calls device_detach() for each of the device's * children. */ int bus_generic_detach(device_t dev) { device_t child; int error; if (dev->state != DS_ATTACHED) return (EBUSY); /* * Detach children in the reverse order. * See bus_generic_suspend for details. */ TAILQ_FOREACH_REVERSE(child, &dev->children, device_list, link) { if ((error = device_detach(child)) != 0) return (error); } return (0); } /** * @brief Helper function for implementing DEVICE_SHUTDOWN() * * This function can be used to help implement the DEVICE_SHUTDOWN() * for a bus. It calls device_shutdown() for each of the device's * children. */ int bus_generic_shutdown(device_t dev) { device_t child; /* * Shut down children in the reverse order. * See bus_generic_suspend for details. */ TAILQ_FOREACH_REVERSE(child, &dev->children, device_list, link) { device_shutdown(child); } return (0); } /** * @brief Default function for suspending a child device. * * This function is to be used by a bus's DEVICE_SUSPEND_CHILD(). */ int bus_generic_suspend_child(device_t dev, device_t child) { int error; error = DEVICE_SUSPEND(child); if (error == 0) child->flags |= DF_SUSPENDED; return (error); } /** * @brief Default function for resuming a child device. * * This function is to be used by a bus's DEVICE_RESUME_CHILD(). */ int bus_generic_resume_child(device_t dev, device_t child) { DEVICE_RESUME(child); child->flags &= ~DF_SUSPENDED; return (0); } /** * @brief Helper function for implementing DEVICE_SUSPEND() * * This function can be used to help implement the DEVICE_SUSPEND() * for a bus. It calls DEVICE_SUSPEND() for each of the device's * children. If any call to DEVICE_SUSPEND() fails, the suspend * operation is aborted and any devices which were suspended are * resumed immediately by calling their DEVICE_RESUME() methods. */ int bus_generic_suspend(device_t dev) { int error; device_t child; /* * Suspend children in the reverse order. * For most buses all children are equal, so the order does not matter. * Other buses, such as acpi, carefully order their child devices to * express implicit dependencies between them. For such buses it is * safer to bring down devices in the reverse order. 
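 * If any child fails to suspend, the loop below walks forward from the
 * failing child's successor and resumes every device that had already been
 * suspended before returning the error.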
*/ TAILQ_FOREACH_REVERSE(child, &dev->children, device_list, link) { error = BUS_SUSPEND_CHILD(dev, child); if (error != 0) { child = TAILQ_NEXT(child, link); if (child != NULL) { TAILQ_FOREACH_FROM(child, &dev->children, link) BUS_RESUME_CHILD(dev, child); } return (error); } } return (0); } /** * @brief Helper function for implementing DEVICE_RESUME() * * This function can be used to help implement the DEVICE_RESUME() for * a bus. It calls DEVICE_RESUME() on each of the device's children. */ int bus_generic_resume(device_t dev) { device_t child; TAILQ_FOREACH(child, &dev->children, link) { BUS_RESUME_CHILD(dev, child); /* if resume fails, there's nothing we can usefully do... */ } return (0); } /** * @brief Helper function for implementing BUS_RESET_POST * * Bus can use this function to implement common operations of * re-attaching or resuming the children after the bus itself was * reset, and after restoring bus-unique state of children. * * @param dev The bus * #param flags DEVF_RESET_* */ int bus_helper_reset_post(device_t dev, int flags) { device_t child; int error, error1; error = 0; TAILQ_FOREACH(child, &dev->children,link) { BUS_RESET_POST(dev, child); error1 = (flags & DEVF_RESET_DETACH) != 0 ? device_probe_and_attach(child) : BUS_RESUME_CHILD(dev, child); if (error == 0 && error1 != 0) error = error1; } return (error); } static void bus_helper_reset_prepare_rollback(device_t dev, device_t child, int flags) { child = TAILQ_NEXT(child, link); if (child == NULL) return; TAILQ_FOREACH_FROM(child, &dev->children,link) { BUS_RESET_POST(dev, child); if ((flags & DEVF_RESET_DETACH) != 0) device_probe_and_attach(child); else BUS_RESUME_CHILD(dev, child); } } /** * @brief Helper function for implementing BUS_RESET_PREPARE * * Bus can use this function to implement common operations of * detaching or suspending the children before the bus itself is * reset, and then save bus-unique state of children that must * persists around reset. * * @param dev The bus * #param flags DEVF_RESET_* */ int bus_helper_reset_prepare(device_t dev, int flags) { device_t child; int error; if (dev->state != DS_ATTACHED) return (EBUSY); TAILQ_FOREACH_REVERSE(child, &dev->children, device_list, link) { if ((flags & DEVF_RESET_DETACH) != 0) { error = device_get_state(child) == DS_ATTACHED ? device_detach(child) : 0; } else { error = BUS_SUSPEND_CHILD(dev, child); } if (error == 0) { error = BUS_RESET_PREPARE(dev, child); if (error != 0) { if ((flags & DEVF_RESET_DETACH) != 0) device_probe_and_attach(child); else BUS_RESUME_CHILD(dev, child); } } if (error != 0) { bus_helper_reset_prepare_rollback(dev, child, flags); return (error); } } return (0); } /** * @brief Helper function for implementing BUS_PRINT_CHILD(). * * This function prints the first part of the ascii representation of * @p child, including its name, unit and description (if any - see * device_set_desc()). * * @returns the number of characters printed */ int bus_print_child_header(device_t dev, device_t child) { int retval = 0; if (device_get_desc(child)) { retval += device_printf(child, "<%s>", device_get_desc(child)); } else { retval += printf("%s", device_get_nameunit(child)); } return (retval); } /** * @brief Helper function for implementing BUS_PRINT_CHILD(). * * This function prints the last part of the ascii representation of * @p child, which consists of the string @c " on " followed by the * name and unit of the @p dev. 
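 *
 * Together with bus_print_child_header() this produces a line of the
 * (illustrative) form "foo0: <Foo controller> on bar0" for a hypothetical
 * child foo0 attached to bus bar0.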
* * @returns the number of characters printed */ int bus_print_child_footer(device_t dev, device_t child) { return (printf(" on %s\n", device_get_nameunit(dev))); } /** * @brief Helper function for implementing BUS_PRINT_CHILD(). * * This function prints out the VM domain for the given device. * * @returns the number of characters printed */ int bus_print_child_domain(device_t dev, device_t child) { int domain; /* No domain? Don't print anything */ if (BUS_GET_DOMAIN(dev, child, &domain) != 0) return (0); return (printf(" numa-domain %d", domain)); } /** * @brief Helper function for implementing BUS_PRINT_CHILD(). * * This function simply calls bus_print_child_header() followed by * bus_print_child_footer(). * * @returns the number of characters printed */ int bus_generic_print_child(device_t dev, device_t child) { int retval = 0; retval += bus_print_child_header(dev, child); retval += bus_print_child_domain(dev, child); retval += bus_print_child_footer(dev, child); return (retval); } /** * @brief Stub function for implementing BUS_READ_IVAR(). * * @returns ENOENT */ int bus_generic_read_ivar(device_t dev, device_t child, int index, uintptr_t * result) { return (ENOENT); } /** * @brief Stub function for implementing BUS_WRITE_IVAR(). * * @returns ENOENT */ int bus_generic_write_ivar(device_t dev, device_t child, int index, uintptr_t value) { return (ENOENT); } /** * @brief Helper function for implementing BUS_GET_PROPERTY(). * * This simply calls the BUS_GET_PROPERTY of the parent of dev, * until a non-default implementation is found. */ ssize_t bus_generic_get_property(device_t dev, device_t child, const char *propname, void *propvalue, size_t size, device_property_type_t type) { if (device_get_parent(dev) != NULL) return (BUS_GET_PROPERTY(device_get_parent(dev), child, propname, propvalue, size, type)); return (-1); } /** * @brief Stub function for implementing BUS_GET_RESOURCE_LIST(). * * @returns NULL */ struct resource_list * bus_generic_get_resource_list(device_t dev, device_t child) { return (NULL); } /** * @brief Helper function for implementing BUS_DRIVER_ADDED(). * * This implementation of BUS_DRIVER_ADDED() simply calls the driver's * DEVICE_IDENTIFY() method to allow it to add new children to the bus * and then calls device_probe_and_attach() for each unattached child. */ void bus_generic_driver_added(device_t dev, driver_t *driver) { device_t child; DEVICE_IDENTIFY(driver, dev); TAILQ_FOREACH(child, &dev->children, link) { if (child->state == DS_NOTPRESENT) device_probe_and_attach(child); } } /** * @brief Helper function for implementing BUS_NEW_PASS(). * * This implementing of BUS_NEW_PASS() first calls the identify * routines for any drivers that probe at the current pass. Then it * walks the list of devices for this bus. If a device is already * attached, then it calls BUS_NEW_PASS() on that device. If the * device is not already attached, it attempts to attach a driver to * it. */ void bus_generic_new_pass(device_t dev) { driverlink_t dl; devclass_t dc; device_t child; dc = dev->devclass; TAILQ_FOREACH(dl, &dc->drivers, link) { if (dl->pass == bus_current_pass) DEVICE_IDENTIFY(dl->driver, dev); } TAILQ_FOREACH(child, &dev->children, link) { if (child->state >= DS_ATTACHED) BUS_NEW_PASS(child); else if (child->state == DS_NOTPRESENT) device_probe_and_attach(child); } } /** * @brief Helper function for implementing BUS_SETUP_INTR(). * * This simple implementation of BUS_SETUP_INTR() simply calls the * BUS_SETUP_INTR() method of the parent of @p dev. 
*/ int bus_generic_setup_intr(device_t dev, device_t child, struct resource *irq, int flags, driver_filter_t *filter, driver_intr_t *intr, void *arg, void **cookiep) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_SETUP_INTR(dev->parent, child, irq, flags, filter, intr, arg, cookiep)); return (EINVAL); } /** * @brief Helper function for implementing BUS_TEARDOWN_INTR(). * * This simple implementation of BUS_TEARDOWN_INTR() simply calls the * BUS_TEARDOWN_INTR() method of the parent of @p dev. */ int bus_generic_teardown_intr(device_t dev, device_t child, struct resource *irq, void *cookie) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_TEARDOWN_INTR(dev->parent, child, irq, cookie)); return (EINVAL); } /** * @brief Helper function for implementing BUS_SUSPEND_INTR(). * * This simple implementation of BUS_SUSPEND_INTR() simply calls the * BUS_SUSPEND_INTR() method of the parent of @p dev. */ int bus_generic_suspend_intr(device_t dev, device_t child, struct resource *irq) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_SUSPEND_INTR(dev->parent, child, irq)); return (EINVAL); } /** * @brief Helper function for implementing BUS_RESUME_INTR(). * * This simple implementation of BUS_RESUME_INTR() simply calls the * BUS_RESUME_INTR() method of the parent of @p dev. */ int bus_generic_resume_intr(device_t dev, device_t child, struct resource *irq) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_RESUME_INTR(dev->parent, child, irq)); return (EINVAL); } /** * @brief Helper function for implementing BUS_ADJUST_RESOURCE(). * * This simple implementation of BUS_ADJUST_RESOURCE() simply calls the * BUS_ADJUST_RESOURCE() method of the parent of @p dev. */ int bus_generic_adjust_resource(device_t dev, device_t child, int type, struct resource *r, rman_res_t start, rman_res_t end) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_ADJUST_RESOURCE(dev->parent, child, type, r, start, end)); return (EINVAL); } /* * @brief Helper function for implementing BUS_TRANSLATE_RESOURCE(). * * This simple implementation of BUS_TRANSLATE_RESOURCE() simply calls the * BUS_TRANSLATE_RESOURCE() method of the parent of @p dev. If there is no * parent, no translation happens. */ int bus_generic_translate_resource(device_t dev, int type, rman_res_t start, rman_res_t *newstart) { if (dev->parent) return (BUS_TRANSLATE_RESOURCE(dev->parent, type, start, newstart)); *newstart = start; return (0); } /** * @brief Helper function for implementing BUS_ALLOC_RESOURCE(). * * This simple implementation of BUS_ALLOC_RESOURCE() simply calls the * BUS_ALLOC_RESOURCE() method of the parent of @p dev. */ struct resource * bus_generic_alloc_resource(device_t dev, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_ALLOC_RESOURCE(dev->parent, child, type, rid, start, end, count, flags)); return (NULL); } /** * @brief Helper function for implementing BUS_RELEASE_RESOURCE(). * * This simple implementation of BUS_RELEASE_RESOURCE() simply calls the * BUS_RELEASE_RESOURCE() method of the parent of @p dev. */ int bus_generic_release_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { /* Propagate up the bus hierarchy until someone handles it. 
*/ if (dev->parent) return (BUS_RELEASE_RESOURCE(dev->parent, child, type, rid, r)); return (EINVAL); } /** * @brief Helper function for implementing BUS_ACTIVATE_RESOURCE(). * * This simple implementation of BUS_ACTIVATE_RESOURCE() simply calls the * BUS_ACTIVATE_RESOURCE() method of the parent of @p dev. */ int bus_generic_activate_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_ACTIVATE_RESOURCE(dev->parent, child, type, rid, r)); return (EINVAL); } /** * @brief Helper function for implementing BUS_DEACTIVATE_RESOURCE(). * * This simple implementation of BUS_DEACTIVATE_RESOURCE() simply calls the * BUS_DEACTIVATE_RESOURCE() method of the parent of @p dev. */ int bus_generic_deactivate_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_DEACTIVATE_RESOURCE(dev->parent, child, type, rid, r)); return (EINVAL); } /** * @brief Helper function for implementing BUS_MAP_RESOURCE(). * * This simple implementation of BUS_MAP_RESOURCE() simply calls the * BUS_MAP_RESOURCE() method of the parent of @p dev. */ int bus_generic_map_resource(device_t dev, device_t child, int type, struct resource *r, struct resource_map_request *args, struct resource_map *map) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_MAP_RESOURCE(dev->parent, child, type, r, args, map)); return (EINVAL); } /** * @brief Helper function for implementing BUS_UNMAP_RESOURCE(). * * This simple implementation of BUS_UNMAP_RESOURCE() simply calls the * BUS_UNMAP_RESOURCE() method of the parent of @p dev. */ int bus_generic_unmap_resource(device_t dev, device_t child, int type, struct resource *r, struct resource_map *map) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_UNMAP_RESOURCE(dev->parent, child, type, r, map)); return (EINVAL); } /** * @brief Helper function for implementing BUS_BIND_INTR(). * * This simple implementation of BUS_BIND_INTR() simply calls the * BUS_BIND_INTR() method of the parent of @p dev. */ int bus_generic_bind_intr(device_t dev, device_t child, struct resource *irq, int cpu) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_BIND_INTR(dev->parent, child, irq, cpu)); return (EINVAL); } /** * @brief Helper function for implementing BUS_CONFIG_INTR(). * * This simple implementation of BUS_CONFIG_INTR() simply calls the * BUS_CONFIG_INTR() method of the parent of @p dev. */ int bus_generic_config_intr(device_t dev, int irq, enum intr_trigger trig, enum intr_polarity pol) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_CONFIG_INTR(dev->parent, irq, trig, pol)); return (EINVAL); } /** * @brief Helper function for implementing BUS_DESCRIBE_INTR(). * * This simple implementation of BUS_DESCRIBE_INTR() simply calls the * BUS_DESCRIBE_INTR() method of the parent of @p dev. */ int bus_generic_describe_intr(device_t dev, device_t child, struct resource *irq, void *cookie, const char *descr) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_DESCRIBE_INTR(dev->parent, child, irq, cookie, descr)); return (EINVAL); } /** * @brief Helper function for implementing BUS_GET_CPUS(). 
* * This simple implementation of BUS_GET_CPUS() simply calls the * BUS_GET_CPUS() method of the parent of @p dev. */ int bus_generic_get_cpus(device_t dev, device_t child, enum cpu_sets op, size_t setsize, cpuset_t *cpuset) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent != NULL) return (BUS_GET_CPUS(dev->parent, child, op, setsize, cpuset)); return (EINVAL); } /** * @brief Helper function for implementing BUS_GET_DMA_TAG(). * * This simple implementation of BUS_GET_DMA_TAG() simply calls the * BUS_GET_DMA_TAG() method of the parent of @p dev. */ bus_dma_tag_t bus_generic_get_dma_tag(device_t dev, device_t child) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent != NULL) return (BUS_GET_DMA_TAG(dev->parent, child)); return (NULL); } /** * @brief Helper function for implementing BUS_GET_BUS_TAG(). * * This simple implementation of BUS_GET_BUS_TAG() simply calls the * BUS_GET_BUS_TAG() method of the parent of @p dev. */ bus_space_tag_t bus_generic_get_bus_tag(device_t dev, device_t child) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent != NULL) return (BUS_GET_BUS_TAG(dev->parent, child)); return ((bus_space_tag_t)0); } /** * @brief Helper function for implementing BUS_GET_RESOURCE(). * * This implementation of BUS_GET_RESOURCE() uses the * resource_list_find() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list to * search. */ int bus_generic_rl_get_resource(device_t dev, device_t child, int type, int rid, rman_res_t *startp, rman_res_t *countp) { struct resource_list * rl = NULL; struct resource_list_entry * rle = NULL; rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return (EINVAL); rle = resource_list_find(rl, type, rid); if (!rle) return (ENOENT); if (startp) *startp = rle->start; if (countp) *countp = rle->count; return (0); } /** * @brief Helper function for implementing BUS_SET_RESOURCE(). * * This implementation of BUS_SET_RESOURCE() uses the * resource_list_add() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list to * edit. */ int bus_generic_rl_set_resource(device_t dev, device_t child, int type, int rid, rman_res_t start, rman_res_t count) { struct resource_list * rl = NULL; rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return (EINVAL); resource_list_add(rl, type, rid, start, (start + count - 1), count); return (0); } /** * @brief Helper function for implementing BUS_DELETE_RESOURCE(). * * This implementation of BUS_DELETE_RESOURCE() uses the * resource_list_delete() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list to * edit. */ void bus_generic_rl_delete_resource(device_t dev, device_t child, int type, int rid) { struct resource_list * rl = NULL; rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return; resource_list_delete(rl, type, rid); return; } /** * @brief Helper function for implementing BUS_RELEASE_RESOURCE(). * * This implementation of BUS_RELEASE_RESOURCE() uses the * resource_list_release() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list. 
*/ int bus_generic_rl_release_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { struct resource_list * rl = NULL; if (device_get_parent(child) != dev) return (BUS_RELEASE_RESOURCE(device_get_parent(dev), child, type, rid, r)); rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return (EINVAL); return (resource_list_release(rl, dev, child, type, rid, r)); } /** * @brief Helper function for implementing BUS_ALLOC_RESOURCE(). * * This implementation of BUS_ALLOC_RESOURCE() uses the * resource_list_alloc() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list. */ struct resource * bus_generic_rl_alloc_resource(device_t dev, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct resource_list * rl = NULL; if (device_get_parent(child) != dev) return (BUS_ALLOC_RESOURCE(device_get_parent(dev), child, type, rid, start, end, count, flags)); rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return (NULL); return (resource_list_alloc(rl, dev, child, type, rid, start, end, count, flags)); } /** * @brief Helper function for implementing BUS_CHILD_PRESENT(). * * This simple implementation of BUS_CHILD_PRESENT() simply calls the * BUS_CHILD_PRESENT() method of the parent of @p dev. */ int bus_generic_child_present(device_t dev, device_t child) { return (BUS_CHILD_PRESENT(device_get_parent(dev), dev)); } int bus_generic_get_domain(device_t dev, device_t child, int *domain) { if (dev->parent) return (BUS_GET_DOMAIN(dev->parent, dev, domain)); return (ENOENT); } /** * @brief Helper function to implement normal BUS_GET_DEVICE_PATH() * * This function knows how to (a) pass the request up the tree if there's * a parent and (b) Knows how to supply a FreeBSD locator. * * @param bus bus in the walk up the tree * @param child leaf node to print information about * @param locator BUS_LOCATOR_xxx string for locator * @param sb Buffer to print information into */ int bus_generic_get_device_path(device_t bus, device_t child, const char *locator, struct sbuf *sb) { int rv = 0; device_t parent; /* * We don't recurse on ACPI since either we know the handle for the * device or we don't. And if we're in the generic routine, we don't * have a ACPI override. All other locators build up a path by having * their parents create a path and then adding the path element for this * node. That's why we recurse with parent, bus rather than the typical * parent, child: each spot in the tree is independent of what our child * will do with this path. */ parent = device_get_parent(bus); if (parent != NULL && strcmp(locator, BUS_LOCATOR_ACPI) != 0) { rv = BUS_GET_DEVICE_PATH(parent, bus, locator, sb); } if (strcmp(locator, BUS_LOCATOR_FREEBSD) == 0) { if (rv == 0) { sbuf_printf(sb, "/%s", device_get_nameunit(child)); } return (rv); } /* * Don't know what to do. So assume we do nothing. Not sure that's * the right thing, but keeps us from having a big list here. */ return (0); } /** * @brief Helper function for implementing BUS_RESCAN(). * * This null implementation of BUS_RESCAN() always fails to indicate * the bus does not support rescanning. */ int bus_null_rescan(device_t dev) { return (ENODEV); } /* * Some convenience functions to make it easier for drivers to use the * resource-management functions. All these really do is hide the * indirection through the parent's method table, making for slightly * less-wordy code. 
In the future, it might make sense for this code * to maintain some sort of a list of resources allocated by each device. */ int bus_alloc_resources(device_t dev, struct resource_spec *rs, struct resource **res) { int i; for (i = 0; rs[i].type != -1; i++) res[i] = NULL; for (i = 0; rs[i].type != -1; i++) { res[i] = bus_alloc_resource_any(dev, rs[i].type, &rs[i].rid, rs[i].flags); if (res[i] == NULL && !(rs[i].flags & RF_OPTIONAL)) { bus_release_resources(dev, rs, res); return (ENXIO); } } return (0); } void bus_release_resources(device_t dev, const struct resource_spec *rs, struct resource **res) { int i; for (i = 0; rs[i].type != -1; i++) if (res[i] != NULL) { bus_release_resource( dev, rs[i].type, rs[i].rid, res[i]); res[i] = NULL; } } /** * @brief Wrapper function for BUS_ALLOC_RESOURCE(). * * This function simply calls the BUS_ALLOC_RESOURCE() method of the * parent of @p dev. */ struct resource * bus_alloc_resource(device_t dev, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct resource *res; if (dev->parent == NULL) return (NULL); res = BUS_ALLOC_RESOURCE(dev->parent, dev, type, rid, start, end, count, flags); return (res); } /** * @brief Wrapper function for BUS_ADJUST_RESOURCE(). * * This function simply calls the BUS_ADJUST_RESOURCE() method of the * parent of @p dev. */ int bus_adjust_resource(device_t dev, int type, struct resource *r, rman_res_t start, rman_res_t end) { if (dev->parent == NULL) return (EINVAL); return (BUS_ADJUST_RESOURCE(dev->parent, dev, type, r, start, end)); } /** * @brief Wrapper function for BUS_TRANSLATE_RESOURCE(). * * This function simply calls the BUS_TRANSLATE_RESOURCE() method of the * parent of @p dev. */ int bus_translate_resource(device_t dev, int type, rman_res_t start, rman_res_t *newstart) { if (dev->parent == NULL) return (EINVAL); return (BUS_TRANSLATE_RESOURCE(dev->parent, type, start, newstart)); } /** * @brief Wrapper function for BUS_ACTIVATE_RESOURCE(). * * This function simply calls the BUS_ACTIVATE_RESOURCE() method of the * parent of @p dev. */ int bus_activate_resource(device_t dev, int type, int rid, struct resource *r) { if (dev->parent == NULL) return (EINVAL); return (BUS_ACTIVATE_RESOURCE(dev->parent, dev, type, rid, r)); } /** * @brief Wrapper function for BUS_DEACTIVATE_RESOURCE(). * * This function simply calls the BUS_DEACTIVATE_RESOURCE() method of the * parent of @p dev. */ int bus_deactivate_resource(device_t dev, int type, int rid, struct resource *r) { if (dev->parent == NULL) return (EINVAL); return (BUS_DEACTIVATE_RESOURCE(dev->parent, dev, type, rid, r)); } /** * @brief Wrapper function for BUS_MAP_RESOURCE(). * * This function simply calls the BUS_MAP_RESOURCE() method of the * parent of @p dev. */ int bus_map_resource(device_t dev, int type, struct resource *r, struct resource_map_request *args, struct resource_map *map) { if (dev->parent == NULL) return (EINVAL); return (BUS_MAP_RESOURCE(dev->parent, dev, type, r, args, map)); } /** * @brief Wrapper function for BUS_UNMAP_RESOURCE(). * * This function simply calls the BUS_UNMAP_RESOURCE() method of the * parent of @p dev. */ int bus_unmap_resource(device_t dev, int type, struct resource *r, struct resource_map *map) { if (dev->parent == NULL) return (EINVAL); return (BUS_UNMAP_RESOURCE(dev->parent, dev, type, r, map)); } /** * @brief Wrapper function for BUS_RELEASE_RESOURCE(). * * This function simply calls the BUS_RELEASE_RESOURCE() method of the * parent of @p dev. 
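 *
 * Typical pairing with the allocation wrapper (illustrative only; the type
 * and rid are hypothetical):
 *
 *	int rid = 0;
 *	struct resource *res;
 *
 *	res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE);
 *	if (res == NULL)
 *		return (ENXIO);
 *	...
 *	bus_release_resource(dev, SYS_RES_MEMORY, rid, res);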
*/ int bus_release_resource(device_t dev, int type, int rid, struct resource *r) { int rv; if (dev->parent == NULL) return (EINVAL); rv = BUS_RELEASE_RESOURCE(dev->parent, dev, type, rid, r); return (rv); } /** * @brief Wrapper function for BUS_SETUP_INTR(). * * This function simply calls the BUS_SETUP_INTR() method of the * parent of @p dev. */ int bus_setup_intr(device_t dev, struct resource *r, int flags, driver_filter_t filter, driver_intr_t handler, void *arg, void **cookiep) { int error; if (dev->parent == NULL) return (EINVAL); error = BUS_SETUP_INTR(dev->parent, dev, r, flags, filter, handler, arg, cookiep); if (error != 0) return (error); if (handler != NULL && !(flags & INTR_MPSAFE)) device_printf(dev, "[GIANT-LOCKED]\n"); return (0); } /** * @brief Wrapper function for BUS_TEARDOWN_INTR(). * * This function simply calls the BUS_TEARDOWN_INTR() method of the * parent of @p dev. */ int bus_teardown_intr(device_t dev, struct resource *r, void *cookie) { if (dev->parent == NULL) return (EINVAL); return (BUS_TEARDOWN_INTR(dev->parent, dev, r, cookie)); } /** * @brief Wrapper function for BUS_SUSPEND_INTR(). * * This function simply calls the BUS_SUSPEND_INTR() method of the * parent of @p dev. */ int bus_suspend_intr(device_t dev, struct resource *r) { if (dev->parent == NULL) return (EINVAL); return (BUS_SUSPEND_INTR(dev->parent, dev, r)); } /** * @brief Wrapper function for BUS_RESUME_INTR(). * * This function simply calls the BUS_RESUME_INTR() method of the * parent of @p dev. */ int bus_resume_intr(device_t dev, struct resource *r) { if (dev->parent == NULL) return (EINVAL); return (BUS_RESUME_INTR(dev->parent, dev, r)); } /** * @brief Wrapper function for BUS_BIND_INTR(). * * This function simply calls the BUS_BIND_INTR() method of the * parent of @p dev. */ int bus_bind_intr(device_t dev, struct resource *r, int cpu) { if (dev->parent == NULL) return (EINVAL); return (BUS_BIND_INTR(dev->parent, dev, r, cpu)); } /** * @brief Wrapper function for BUS_DESCRIBE_INTR(). * * This function first formats the requested description into a * temporary buffer and then calls the BUS_DESCRIBE_INTR() method of * the parent of @p dev. */ int bus_describe_intr(device_t dev, struct resource *irq, void *cookie, const char *fmt, ...) { va_list ap; char descr[MAXCOMLEN + 1]; if (dev->parent == NULL) return (EINVAL); va_start(ap, fmt); vsnprintf(descr, sizeof(descr), fmt, ap); va_end(ap); return (BUS_DESCRIBE_INTR(dev->parent, dev, irq, cookie, descr)); } /** * @brief Wrapper function for BUS_SET_RESOURCE(). * * This function simply calls the BUS_SET_RESOURCE() method of the * parent of @p dev. */ int bus_set_resource(device_t dev, int type, int rid, rman_res_t start, rman_res_t count) { return (BUS_SET_RESOURCE(device_get_parent(dev), dev, type, rid, start, count)); } /** * @brief Wrapper function for BUS_GET_RESOURCE(). * * This function simply calls the BUS_GET_RESOURCE() method of the * parent of @p dev. */ int bus_get_resource(device_t dev, int type, int rid, rman_res_t *startp, rman_res_t *countp) { return (BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid, startp, countp)); } /** * @brief Wrapper function for BUS_GET_RESOURCE(). * * This function simply calls the BUS_GET_RESOURCE() method of the * parent of @p dev and returns the start value. 
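 *
 * Illustrative use (hypothetical type and rid), e.g. to inspect a range
 * recorded earlier by a hint or by bus_set_resource(); a zero count from
 * the companion bus_get_resource_count() means no entry was found:
 *
 *	rman_res_t base, len;
 *
 *	base = bus_get_resource_start(dev, SYS_RES_IOPORT, 0);
 *	len = bus_get_resource_count(dev, SYS_RES_IOPORT, 0);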
*/ rman_res_t bus_get_resource_start(device_t dev, int type, int rid) { rman_res_t start; rman_res_t count; int error; error = BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid, &start, &count); if (error) return (0); return (start); } /** * @brief Wrapper function for BUS_GET_RESOURCE(). * * This function simply calls the BUS_GET_RESOURCE() method of the * parent of @p dev and returns the count value. */ rman_res_t bus_get_resource_count(device_t dev, int type, int rid) { rman_res_t start; rman_res_t count; int error; error = BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid, &start, &count); if (error) return (0); return (count); } /** * @brief Wrapper function for BUS_DELETE_RESOURCE(). * * This function simply calls the BUS_DELETE_RESOURCE() method of the * parent of @p dev. */ void bus_delete_resource(device_t dev, int type, int rid) { BUS_DELETE_RESOURCE(device_get_parent(dev), dev, type, rid); } /** * @brief Wrapper function for BUS_CHILD_PRESENT(). * * This function simply calls the BUS_CHILD_PRESENT() method of the * parent of @p dev. */ int bus_child_present(device_t child) { return (BUS_CHILD_PRESENT(device_get_parent(child), child)); } /** * @brief Wrapper function for BUS_CHILD_PNPINFO(). * * This function simply calls the BUS_CHILD_PNPINFO() method of the parent of @p * dev. */ int bus_child_pnpinfo(device_t child, struct sbuf *sb) { device_t parent; parent = device_get_parent(child); if (parent == NULL) return (0); return (BUS_CHILD_PNPINFO(parent, child, sb)); } /** * @brief Generic implementation that does nothing for bus_child_pnpinfo * * This function has the right signature and returns 0 since the sbuf is passed * to us to append to. */ int bus_generic_child_pnpinfo(device_t dev, device_t child, struct sbuf *sb) { return (0); } /** * @brief Wrapper function for BUS_CHILD_LOCATION(). * * This function simply calls the BUS_CHILD_LOCATION() method of the parent of * @p dev. */ int bus_child_location(device_t child, struct sbuf *sb) { device_t parent; parent = device_get_parent(child); if (parent == NULL) return (0); return (BUS_CHILD_LOCATION(parent, child, sb)); } /** * @brief Generic implementation that does nothing for bus_child_location * * This function has the right signature and returns 0 since the sbuf is passed * to us to append to. */ int bus_generic_child_location(device_t dev, device_t child, struct sbuf *sb) { return (0); } /** * @brief Wrapper function for BUS_GET_CPUS(). * * This function simply calls the BUS_GET_CPUS() method of the * parent of @p dev. */ int bus_get_cpus(device_t dev, enum cpu_sets op, size_t setsize, cpuset_t *cpuset) { device_t parent; parent = device_get_parent(dev); if (parent == NULL) return (EINVAL); return (BUS_GET_CPUS(parent, dev, op, setsize, cpuset)); } /** * @brief Wrapper function for BUS_GET_DMA_TAG(). * * This function simply calls the BUS_GET_DMA_TAG() method of the * parent of @p dev. */ bus_dma_tag_t bus_get_dma_tag(device_t dev) { device_t parent; parent = device_get_parent(dev); if (parent == NULL) return (NULL); return (BUS_GET_DMA_TAG(parent, dev)); } /** * @brief Wrapper function for BUS_GET_BUS_TAG(). * * This function simply calls the BUS_GET_BUS_TAG() method of the * parent of @p dev. */ bus_space_tag_t bus_get_bus_tag(device_t dev) { device_t parent; parent = device_get_parent(dev); if (parent == NULL) return ((bus_space_tag_t)0); return (BUS_GET_BUS_TAG(parent, dev)); } /** * @brief Wrapper function for BUS_GET_DOMAIN(). 
* * This function simply calls the BUS_GET_DOMAIN() method of the * parent of @p dev. */ int bus_get_domain(device_t dev, int *domain) { return (BUS_GET_DOMAIN(device_get_parent(dev), dev, domain)); } /* Resume all devices and then notify userland that we're up again. */ static int root_resume(device_t dev) { int error; error = bus_generic_resume(dev); if (error == 0) { devctl_notify("kernel", "power", "resume", NULL); } return (error); } static int root_print_child(device_t dev, device_t child) { int retval = 0; retval += bus_print_child_header(dev, child); retval += printf("\n"); return (retval); } static int root_setup_intr(device_t dev, device_t child, struct resource *irq, int flags, driver_filter_t *filter, driver_intr_t *intr, void *arg, void **cookiep) { /* * If an interrupt mapping gets to here something bad has happened. */ panic("root_setup_intr"); } /* * If we get here, assume that the device is permanent and really is * present in the system. Removable bus drivers are expected to intercept * this call long before it gets here. We return -1 so that drivers that * really care can check vs -1 or some ERRNO returned higher in the food * chain. */ static int root_child_present(device_t dev, device_t child) { return (-1); } static int root_get_cpus(device_t dev, device_t child, enum cpu_sets op, size_t setsize, cpuset_t *cpuset) { switch (op) { case INTR_CPUS: /* Default to returning the set of all CPUs. */ if (setsize != sizeof(cpuset_t)) return (EINVAL); *cpuset = all_cpus; return (0); default: return (EINVAL); } } static kobj_method_t root_methods[] = { /* Device interface */ KOBJMETHOD(device_shutdown, bus_generic_shutdown), KOBJMETHOD(device_suspend, bus_generic_suspend), KOBJMETHOD(device_resume, root_resume), /* Bus interface */ KOBJMETHOD(bus_print_child, root_print_child), KOBJMETHOD(bus_read_ivar, bus_generic_read_ivar), KOBJMETHOD(bus_write_ivar, bus_generic_write_ivar), KOBJMETHOD(bus_setup_intr, root_setup_intr), KOBJMETHOD(bus_child_present, root_child_present), KOBJMETHOD(bus_get_cpus, root_get_cpus), KOBJMETHOD_END }; static driver_t root_driver = { "root", root_methods, 1, /* no softc */ }; device_t root_bus; devclass_t root_devclass; static int root_bus_module_handler(module_t mod, int what, void* arg) { switch (what) { case MOD_LOAD: TAILQ_INIT(&bus_data_devices); kobj_class_compile((kobj_class_t) &root_driver); root_bus = make_device(NULL, "root", 0); root_bus->desc = "System root bus"; kobj_init((kobj_t) root_bus, (kobj_class_t) &root_driver); root_bus->driver = &root_driver; root_bus->state = DS_ATTACHED; root_devclass = devclass_find_internal("root", NULL, FALSE); devctl2_init(); return (0); case MOD_SHUTDOWN: device_shutdown(root_bus); return (0); default: return (EOPNOTSUPP); } return (0); } static moduledata_t root_bus_mod = { "rootbus", root_bus_module_handler, NULL }; DECLARE_MODULE(rootbus, root_bus_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST); /** * @brief Automatically configure devices * * This function begins the autoconfiguration process by calling * device_probe_and_attach() for each child of the @c root0 device. */ void root_bus_configure(void) { PDEBUG((".")); /* Eventually this will be split up, but this is sufficient for now. */ bus_set_pass(BUS_PASS_DEFAULT); } /** * @brief Module handler for registering device drivers * * This module handler is used to automatically register device * drivers when modules are loaded. 
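 *
 * Drivers do not normally call this handler directly; the DRIVER_MODULE()
 * macro family emits the moduledata_t and driver_module_data that point
 * here, e.g. (hypothetical driver and bus names; the macro's argument list
 * has varied across FreeBSD versions):
 *
 *	DRIVER_MODULE(mydrv, pci, mydrv_driver, NULL, NULL);
 *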
If @p what is MOD_LOAD, it calls * devclass_add_driver() for the driver described by the * driver_module_data structure pointed to by @p arg */ int driver_module_handler(module_t mod, int what, void *arg) { struct driver_module_data *dmd; devclass_t bus_devclass; kobj_class_t driver; int error, pass; dmd = (struct driver_module_data *)arg; bus_devclass = devclass_find_internal(dmd->dmd_busname, NULL, TRUE); error = 0; switch (what) { case MOD_LOAD: if (dmd->dmd_chainevh) error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg); pass = dmd->dmd_pass; driver = dmd->dmd_driver; PDEBUG(("Loading module: driver %s on bus %s (pass %d)", DRIVERNAME(driver), dmd->dmd_busname, pass)); error = devclass_add_driver(bus_devclass, driver, pass, dmd->dmd_devclass); break; case MOD_UNLOAD: PDEBUG(("Unloading module: driver %s from bus %s", DRIVERNAME(dmd->dmd_driver), dmd->dmd_busname)); error = devclass_delete_driver(bus_devclass, dmd->dmd_driver); if (!error && dmd->dmd_chainevh) error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg); break; case MOD_QUIESCE: PDEBUG(("Quiesce module: driver %s from bus %s", DRIVERNAME(dmd->dmd_driver), dmd->dmd_busname)); error = devclass_quiesce_driver(bus_devclass, dmd->dmd_driver); if (!error && dmd->dmd_chainevh) error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg); break; default: error = EOPNOTSUPP; break; } return (error); } /** * @brief Enumerate all hinted devices for this bus. * * Walks through the hints for this bus and calls the bus_hinted_child * routine for each one it fines. It searches first for the specific * bus that's being probed for hinted children (eg isa0), and then for * generic children (eg isa). * * @param dev bus device to enumerate */ void bus_enumerate_hinted_children(device_t bus) { int i; const char *dname, *busname; int dunit; /* * enumerate all devices on the specific bus */ busname = device_get_nameunit(bus); i = 0; while (resource_find_match(&i, &dname, &dunit, "at", busname) == 0) BUS_HINTED_CHILD(bus, dname, dunit); /* * and all the generic ones. */ busname = device_get_name(bus); i = 0; while (resource_find_match(&i, &dname, &dunit, "at", busname) == 0) BUS_HINTED_CHILD(bus, dname, dunit); } #ifdef BUS_DEBUG /* the _short versions avoid iteration by not calling anything that prints * more than oneliners. I love oneliners. */ static void print_device_short(device_t dev, int indent) { if (!dev) return; indentprintf(("device %d: <%s> %sparent,%schildren,%s%s%s%s%s,%sivars,%ssoftc,busy=%d\n", dev->unit, dev->desc, (dev->parent? "":"no "), (TAILQ_EMPTY(&dev->children)? "no ":""), (dev->flags&DF_ENABLED? "enabled,":"disabled,"), (dev->flags&DF_FIXEDCLASS? "fixed,":""), (dev->flags&DF_WILDCARD? "wildcard,":""), (dev->flags&DF_DESCMALLOCED? "descmalloced,":""), (dev->flags&DF_SUSPENDED? "suspended,":""), (dev->ivars? "":"no "), (dev->softc? 
"":"no "), dev->busy)); } static void print_device(device_t dev, int indent) { if (!dev) return; print_device_short(dev, indent); indentprintf(("Parent:\n")); print_device_short(dev->parent, indent+1); indentprintf(("Driver:\n")); print_driver_short(dev->driver, indent+1); indentprintf(("Devclass:\n")); print_devclass_short(dev->devclass, indent+1); } void print_device_tree_short(device_t dev, int indent) /* print the device and all its children (indented) */ { device_t child; if (!dev) return; print_device_short(dev, indent); TAILQ_FOREACH(child, &dev->children, link) { print_device_tree_short(child, indent+1); } } void print_device_tree(device_t dev, int indent) /* print the device and all its children (indented) */ { device_t child; if (!dev) return; print_device(dev, indent); TAILQ_FOREACH(child, &dev->children, link) { print_device_tree(child, indent+1); } } static void print_driver_short(driver_t *driver, int indent) { if (!driver) return; indentprintf(("driver %s: softc size = %zd\n", driver->name, driver->size)); } static void print_driver(driver_t *driver, int indent) { if (!driver) return; print_driver_short(driver, indent); } static void print_driver_list(driver_list_t drivers, int indent) { driverlink_t driver; TAILQ_FOREACH(driver, &drivers, link) { print_driver(driver->driver, indent); } } static void print_devclass_short(devclass_t dc, int indent) { if ( !dc ) return; indentprintf(("devclass %s: max units = %d\n", dc->name, dc->maxunit)); } static void print_devclass(devclass_t dc, int indent) { int i; if ( !dc ) return; print_devclass_short(dc, indent); indentprintf(("Drivers:\n")); print_driver_list(dc->drivers, indent+1); indentprintf(("Devices:\n")); for (i = 0; i < dc->maxunit; i++) if (dc->devices[i]) print_device(dc->devices[i], indent+1); } void print_devclass_list_short(void) { devclass_t dc; printf("Short listing of devclasses, drivers & devices:\n"); TAILQ_FOREACH(dc, &devclasses, link) { print_devclass_short(dc, 0); } } void print_devclass_list(void) { devclass_t dc; printf("Full listing of devclasses, drivers & devices:\n"); TAILQ_FOREACH(dc, &devclasses, link) { print_devclass(dc, 0); } } #endif /* * User-space access to the device tree. * * We implement a small set of nodes: * * hw.bus Single integer read method to obtain the * current generation count. * hw.bus.devices Reads the entire device tree in flat space. * hw.bus.rman Resource manager interface * * We might like to add the ability to scan devclasses and/or drivers to * determine what else is currently loaded/available. */ static int sysctl_bus_info(SYSCTL_HANDLER_ARGS) { struct u_businfo ubus; ubus.ub_version = BUS_USER_VERSION; ubus.ub_generation = bus_data_generation; return (SYSCTL_OUT(req, &ubus, sizeof(ubus))); } SYSCTL_PROC(_hw_bus, OID_AUTO, info, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_bus_info, "S,u_businfo", "bus-related data"); static int sysctl_devices(SYSCTL_HANDLER_ARGS) { struct sbuf sb; int *name = (int *)arg1; u_int namelen = arg2; int index; device_t dev; struct u_device *udev; int error; if (namelen != 2) return (EINVAL); if (bus_data_generation_check(name[0])) return (EINVAL); index = name[1]; /* * Scan the list of devices, looking for the requested index. */ TAILQ_FOREACH(dev, &bus_data_devices, devlink) { if (index-- == 0) break; } if (dev == NULL) return (ENOENT); /* * Populate the return item, careful not to overflow the buffer. 
*/ udev = malloc(sizeof(*udev), M_BUS, M_WAITOK | M_ZERO); if (udev == NULL) return (ENOMEM); udev->dv_handle = (uintptr_t)dev; udev->dv_parent = (uintptr_t)dev->parent; udev->dv_devflags = dev->devflags; udev->dv_flags = dev->flags; udev->dv_state = dev->state; sbuf_new(&sb, udev->dv_fields, sizeof(udev->dv_fields), SBUF_FIXEDLEN); if (dev->nameunit != NULL) sbuf_cat(&sb, dev->nameunit); sbuf_putc(&sb, '\0'); if (dev->desc != NULL) sbuf_cat(&sb, dev->desc); sbuf_putc(&sb, '\0'); if (dev->driver != NULL) sbuf_cat(&sb, dev->driver->name); sbuf_putc(&sb, '\0'); bus_child_pnpinfo(dev, &sb); sbuf_putc(&sb, '\0'); bus_child_location(dev, &sb); sbuf_putc(&sb, '\0'); error = sbuf_finish(&sb); if (error == 0) error = SYSCTL_OUT(req, udev, sizeof(*udev)); sbuf_delete(&sb); free(udev, M_BUS); return (error); } SYSCTL_NODE(_hw_bus, OID_AUTO, devices, CTLFLAG_RD | CTLFLAG_NEEDGIANT, sysctl_devices, "system device tree"); int bus_data_generation_check(int generation) { if (generation != bus_data_generation) return (1); /* XXX generate optimised lists here? */ return (0); } void bus_data_generation_update(void) { atomic_add_int(&bus_data_generation, 1); } int bus_free_resource(device_t dev, int type, struct resource *r) { if (r == NULL) return (0); return (bus_release_resource(dev, type, rman_get_rid(r), r)); } device_t device_lookup_by_name(const char *name) { device_t dev; TAILQ_FOREACH(dev, &bus_data_devices, devlink) { if (dev->nameunit != NULL && strcmp(dev->nameunit, name) == 0) return (dev); } return (NULL); } /* * /dev/devctl2 implementation. The existing /dev/devctl device has * implicit semantics on open, so it could not be reused for this. * Another option would be to call this /dev/bus? */ static int find_device(struct devreq *req, device_t *devp) { device_t dev; /* * First, ensure that the name is nul terminated. */ if (memchr(req->dr_name, '\0', sizeof(req->dr_name)) == NULL) return (EINVAL); /* * Second, try to find an attached device whose name matches * 'name'. */ dev = device_lookup_by_name(req->dr_name); if (dev != NULL) { *devp = dev; return (0); } /* Finally, give device enumerators a chance. */ dev = NULL; EVENTHANDLER_DIRECT_INVOKE(dev_lookup, req->dr_name, &dev); if (dev == NULL) return (ENOENT); *devp = dev; return (0); } static bool driver_exists(device_t bus, const char *driver) { devclass_t dc; for (dc = bus->devclass; dc != NULL; dc = dc->parent) { if (devclass_find_driver_internal(dc, driver) != NULL) return (true); } return (false); } static void device_gen_nomatch(device_t dev) { device_t child; if (dev->flags & DF_NEEDNOMATCH && dev->state == DS_NOTPRESENT) { device_handle_nomatch(dev); } dev->flags &= ~DF_NEEDNOMATCH; TAILQ_FOREACH(child, &dev->children, link) { device_gen_nomatch(child); } } static void device_do_deferred_actions(void) { devclass_t dc; driverlink_t dl; /* * Walk through the devclasses to find all the drivers we've tagged as * deferred during the freeze and call the driver added routines. They * have already been added to the lists in the background, so the driver * added routines that trigger a probe will have all the right bidders * for the probe auction. */ TAILQ_FOREACH(dc, &devclasses, link) { TAILQ_FOREACH(dl, &dc->drivers, link) { if (dl->flags & DL_DEFERRED_PROBE) { devclass_driver_added(dc, dl->driver); dl->flags &= ~DL_DEFERRED_PROBE; } } } /* * We also defer no-match events during a freeze. Walk the tree and * generate all the pent-up events that are still relevant. 
*/ device_gen_nomatch(root_bus); bus_data_generation_update(); } static int device_get_path(device_t dev, const char *locator, struct sbuf *sb) { device_t parent; int error; KASSERT(sb != NULL, ("sb is NULL")); parent = device_get_parent(dev); if (parent == NULL) { - error = sbuf_printf(sb, "/"); + error = sbuf_putc(sb, '/'); } else { error = BUS_GET_DEVICE_PATH(parent, dev, locator, sb); if (error == 0) { error = sbuf_error(sb); if (error == 0 && sbuf_len(sb) <= 1) error = EIO; } } sbuf_finish(sb); return (error); } static int devctl2_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct devreq *req; device_t dev; int error, old; /* Locate the device to control. */ bus_topo_lock(); req = (struct devreq *)data; switch (cmd) { case DEV_ATTACH: case DEV_DETACH: case DEV_ENABLE: case DEV_DISABLE: case DEV_SUSPEND: case DEV_RESUME: case DEV_SET_DRIVER: case DEV_CLEAR_DRIVER: case DEV_RESCAN: case DEV_DELETE: case DEV_RESET: error = priv_check(td, PRIV_DRIVER); if (error == 0) error = find_device(req, &dev); break; case DEV_FREEZE: case DEV_THAW: error = priv_check(td, PRIV_DRIVER); break; case DEV_GET_PATH: error = find_device(req, &dev); break; default: error = ENOTTY; break; } if (error) { bus_topo_unlock(); return (error); } /* Perform the requested operation. */ switch (cmd) { case DEV_ATTACH: if (device_is_attached(dev)) error = EBUSY; else if (!device_is_enabled(dev)) error = ENXIO; else error = device_probe_and_attach(dev); break; case DEV_DETACH: if (!device_is_attached(dev)) { error = ENXIO; break; } if (!(req->dr_flags & DEVF_FORCE_DETACH)) { error = device_quiesce(dev); if (error) break; } error = device_detach(dev); break; case DEV_ENABLE: if (device_is_enabled(dev)) { error = EBUSY; break; } /* * If the device has been probed but not attached (e.g. * when it has been disabled by a loader hint), just * attach the device rather than doing a full probe. */ device_enable(dev); if (device_is_alive(dev)) { /* * If the device was disabled via a hint, clear * the hint. */ if (resource_disabled(dev->driver->name, dev->unit)) resource_unset_value(dev->driver->name, dev->unit, "disabled"); error = device_attach(dev); } else error = device_probe_and_attach(dev); break; case DEV_DISABLE: if (!device_is_enabled(dev)) { error = ENXIO; break; } if (!(req->dr_flags & DEVF_FORCE_DETACH)) { error = device_quiesce(dev); if (error) break; } /* * Force DF_FIXEDCLASS on around detach to preserve * the existing name. */ old = dev->flags; dev->flags |= DF_FIXEDCLASS; error = device_detach(dev); if (!(old & DF_FIXEDCLASS)) dev->flags &= ~DF_FIXEDCLASS; if (error == 0) device_disable(dev); break; case DEV_SUSPEND: if (device_is_suspended(dev)) { error = EBUSY; break; } if (device_get_parent(dev) == NULL) { error = EINVAL; break; } error = BUS_SUSPEND_CHILD(device_get_parent(dev), dev); break; case DEV_RESUME: if (!device_is_suspended(dev)) { error = EINVAL; break; } if (device_get_parent(dev) == NULL) { error = EINVAL; break; } error = BUS_RESUME_CHILD(device_get_parent(dev), dev); break; case DEV_SET_DRIVER: { devclass_t dc; char driver[128]; error = copyinstr(req->dr_data, driver, sizeof(driver), NULL); if (error) break; if (driver[0] == '\0') { error = EINVAL; break; } if (dev->devclass != NULL && strcmp(driver, dev->devclass->name) == 0) /* XXX: Could possibly force DF_FIXEDCLASS on? */ break; /* * Scan drivers for this device's bus looking for at * least one matching driver. 
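		 *
		 * Illustrative userland sketch of the request that reaches this
		 * case (roughly what devctl(8) "set driver" issues; the device
		 * and driver names are made up, and fd is an open descriptor
		 * for /dev/devctl2):
		 *
		 *	char new_driver[] = "bar";
		 *	struct devreq req;
		 *
		 *	memset(&req, 0, sizeof(req));
		 *	strlcpy(req.dr_name, "foo0", sizeof(req.dr_name));
		 *	req.dr_data = new_driver;
		 *	req.dr_flags = DEVF_SET_DRIVER_DETACH;
		 *	ioctl(fd, DEV_SET_DRIVER, &req);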
*/ if (dev->parent == NULL) { error = EINVAL; break; } if (!driver_exists(dev->parent, driver)) { error = ENOENT; break; } dc = devclass_create(driver); if (dc == NULL) { error = ENOMEM; break; } /* Detach device if necessary. */ if (device_is_attached(dev)) { if (req->dr_flags & DEVF_SET_DRIVER_DETACH) error = device_detach(dev); else error = EBUSY; if (error) break; } /* Clear any previously-fixed device class and unit. */ if (dev->flags & DF_FIXEDCLASS) devclass_delete_device(dev->devclass, dev); dev->flags |= DF_WILDCARD; dev->unit = -1; /* Force the new device class. */ error = devclass_add_device(dc, dev); if (error) break; dev->flags |= DF_FIXEDCLASS; error = device_probe_and_attach(dev); break; } case DEV_CLEAR_DRIVER: if (!(dev->flags & DF_FIXEDCLASS)) { error = 0; break; } if (device_is_attached(dev)) { if (req->dr_flags & DEVF_CLEAR_DRIVER_DETACH) error = device_detach(dev); else error = EBUSY; if (error) break; } dev->flags &= ~DF_FIXEDCLASS; dev->flags |= DF_WILDCARD; devclass_delete_device(dev->devclass, dev); error = device_probe_and_attach(dev); break; case DEV_RESCAN: if (!device_is_attached(dev)) { error = ENXIO; break; } error = BUS_RESCAN(dev); break; case DEV_DELETE: { device_t parent; parent = device_get_parent(dev); if (parent == NULL) { error = EINVAL; break; } if (!(req->dr_flags & DEVF_FORCE_DELETE)) { if (bus_child_present(dev) != 0) { error = EBUSY; break; } } error = device_delete_child(parent, dev); break; } case DEV_FREEZE: if (device_frozen) error = EBUSY; else device_frozen = true; break; case DEV_THAW: if (!device_frozen) error = EBUSY; else { device_do_deferred_actions(); device_frozen = false; } break; case DEV_RESET: if ((req->dr_flags & ~(DEVF_RESET_DETACH)) != 0) { error = EINVAL; break; } error = BUS_RESET_CHILD(device_get_parent(dev), dev, req->dr_flags); break; case DEV_GET_PATH: { struct sbuf *sb; char locator[64]; ssize_t len; error = copyinstr(req->dr_buffer.buffer, locator, sizeof(locator), NULL); if (error != 0) break; sb = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND | SBUF_INCLUDENUL /* | SBUF_WAITOK */); error = device_get_path(dev, locator, sb); if (error == 0) { len = sbuf_len(sb); if (req->dr_buffer.length < len) { error = ENAMETOOLONG; } else { error = copyout(sbuf_data(sb), req->dr_buffer.buffer, len); } req->dr_buffer.length = len; } sbuf_delete(sb); break; } } bus_topo_unlock(); return (error); } static struct cdevsw devctl2_cdevsw = { .d_version = D_VERSION, .d_ioctl = devctl2_ioctl, .d_name = "devctl2", }; static void devctl2_init(void) { make_dev_credf(MAKEDEV_ETERNAL, &devctl2_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0644, "devctl2"); } /* * For maintaining device 'at' location info to avoid recomputing it */ struct device_location_node { const char *dln_locator; const char *dln_path; TAILQ_ENTRY(device_location_node) dln_link; }; typedef TAILQ_HEAD(device_location_list, device_location_node) device_location_list_t; struct device_location_cache { device_location_list_t dlc_list; }; /* * Location cache for wired devices. 
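 *
 * Illustrative example (hypothetical hint and path): a wiring hint such as
 *
 *	hint.foo.0.at="ACPI:\_SB_.PCI0.FOO_"
 *
 * is split at the ':' into a locator ("ACPI") and a path; the path that
 * device_get_path() computes for a candidate device is cached per locator
 * here and compared against the hint's path by dev_wired_cache_match().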
*/ device_location_cache_t * dev_wired_cache_init(void) { device_location_cache_t *dcp; dcp = malloc(sizeof(*dcp), M_BUS, M_WAITOK | M_ZERO); TAILQ_INIT(&dcp->dlc_list); return (dcp); } void dev_wired_cache_fini(device_location_cache_t *dcp) { struct device_location_node *dln, *tdln; TAILQ_FOREACH_SAFE(dln, &dcp->dlc_list, dln_link, tdln) { free(dln, M_BUS); } free(dcp, M_BUS); } static struct device_location_node * dev_wired_cache_lookup(device_location_cache_t *dcp, const char *locator) { struct device_location_node *dln; TAILQ_FOREACH(dln, &dcp->dlc_list, dln_link) { if (strcmp(locator, dln->dln_locator) == 0) return (dln); } return (NULL); } static struct device_location_node * dev_wired_cache_add(device_location_cache_t *dcp, const char *locator, const char *path) { struct device_location_node *dln; size_t loclen, pathlen; loclen = strlen(locator) + 1; pathlen = strlen(path) + 1; dln = malloc(sizeof(*dln) + loclen + pathlen, M_BUS, M_WAITOK | M_ZERO); dln->dln_locator = (char *)(dln + 1); memcpy(__DECONST(char *, dln->dln_locator), locator, loclen); dln->dln_path = dln->dln_locator + loclen; memcpy(__DECONST(char *, dln->dln_path), path, pathlen); TAILQ_INSERT_HEAD(&dcp->dlc_list, dln, dln_link); return (dln); } bool dev_wired_cache_match(device_location_cache_t *dcp, device_t dev, const char *at) { struct sbuf *sb; const char *cp; char locator[32]; int error, len; struct device_location_node *res; cp = strchr(at, ':'); if (cp == NULL) return (false); len = cp - at; if (len > sizeof(locator) - 1) /* Skip too long locator */ return (false); memcpy(locator, at, len); locator[len] = '\0'; cp++; error = 0; /* maybe cache this inside device_t and look that up, but not yet */ res = dev_wired_cache_lookup(dcp, locator); if (res == NULL) { sb = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND | SBUF_INCLUDENUL | SBUF_NOWAIT); if (sb != NULL) { error = device_get_path(dev, locator, sb); if (error == 0) { res = dev_wired_cache_add(dcp, locator, sbuf_data(sb)); } sbuf_delete(sb); } } if (error != 0 || res == NULL || res->dln_path == NULL) return (false); return (strcmp(res->dln_path, cp) == 0); } /* * APIs to manage deprecation and obsolescence. 
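 *
 * Minimal sketch of intended use (normally reached through thin wrapper
 * macros; the release number and messages below are made up):
 *
 *	_gone_in(16, "foo_compat ioctl interface");
 *	_gone_in_dev(dev, 16, "legacy foo register map");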
*/ static int obsolete_panic = 0; SYSCTL_INT(_debug, OID_AUTO, obsolete_panic, CTLFLAG_RWTUN, &obsolete_panic, 0, "Panic when obsolete features are used (0 = never, 1 = if obsolete, " "2 = if deprecated)"); static void gone_panic(int major, int running, const char *msg) { switch (obsolete_panic) { case 0: return; case 1: if (running < major) return; /* FALLTHROUGH */ default: panic("%s", msg); } } void _gone_in(int major, const char *msg) { gone_panic(major, P_OSREL_MAJOR(__FreeBSD_version), msg); if (P_OSREL_MAJOR(__FreeBSD_version) >= major) printf("Obsolete code will be removed soon: %s\n", msg); else printf("Deprecated code (to be removed in FreeBSD %d): %s\n", major, msg); } void _gone_in_dev(device_t dev, int major, const char *msg) { gone_panic(major, P_OSREL_MAJOR(__FreeBSD_version), msg); if (P_OSREL_MAJOR(__FreeBSD_version) >= major) device_printf(dev, "Obsolete code will be removed soon: %s\n", msg); else device_printf(dev, "Deprecated code (to be removed in FreeBSD %d): %s\n", major, msg); } #ifdef DDB DB_SHOW_COMMAND(device, db_show_device) { device_t dev; if (!have_addr) return; dev = (device_t)addr; db_printf("name: %s\n", device_get_nameunit(dev)); db_printf(" driver: %s\n", DRIVERNAME(dev->driver)); db_printf(" class: %s\n", DEVCLANAME(dev->devclass)); db_printf(" addr: %p\n", dev); db_printf(" parent: %p\n", dev->parent); db_printf(" softc: %p\n", dev->softc); db_printf(" ivars: %p\n", dev->ivars); } DB_SHOW_ALL_COMMAND(devices, db_show_all_devices) { device_t dev; TAILQ_FOREACH(dev, &bus_data_devices, devlink) { db_show_device((db_expr_t)dev, true, count, modif); } } #endif diff --git a/sys/kern/subr_prf.c b/sys/kern/subr_prf.c index 5c06bf8270f6..5136ece359e5 100644 --- a/sys/kern/subr_prf.c +++ b/sys/kern/subr_prf.c @@ -1,1339 +1,1339 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)subr_prf.c 8.3 (Berkeley) 1/21/94 */ #include #ifdef _KERNEL #include "opt_ddb.h" #include "opt_printf.h" #endif /* _KERNEL */ #include #ifdef _KERNEL #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #else /* !_KERNEL */ #include #endif #include #include #ifdef DDB #include #endif /* * Note that stdarg.h and the ANSI style va_start macro is used for both * ANSI and traditional C compilers. */ #ifdef _KERNEL #include #else #include #endif /* * This is needed for sbuf_putbuf() when compiled into userland. Due to the * shared nature of this file, it's the only place to put it. */ #ifndef _KERNEL #include #endif #ifdef _KERNEL #define TOCONS 0x01 #define TOTTY 0x02 #define TOLOG 0x04 /* Max number conversion buffer length: a u_quad_t in base 2, plus NUL byte. */ #define MAXNBUF (sizeof(intmax_t) * NBBY + 1) struct putchar_arg { int flags; int pri; struct tty *tty; char *p_bufr; size_t n_bufr; char *p_next; size_t remain; }; struct snprintf_arg { char *str; size_t remain; }; extern int log_open; static void msglogchar(int c, int pri); static void msglogstr(char *str, int pri, int filter_cr); static void prf_putbuf(char *bufr, int flags, int pri); static void putchar(int ch, void *arg); static char *ksprintn(char *nbuf, uintmax_t num, int base, int *len, int upper); static void snprintf_func(int ch, void *arg); static bool msgbufmapped; /* Set when safe to use msgbuf */ int msgbuftrigger; struct msgbuf *msgbufp; #ifndef BOOT_TAG_SZ #define BOOT_TAG_SZ 32 #endif #ifndef BOOT_TAG /* Tag used to mark the start of a boot in dmesg */ #define BOOT_TAG "---<>---" #endif static char current_boot_tag[BOOT_TAG_SZ + 1] = BOOT_TAG; SYSCTL_STRING(_kern, OID_AUTO, boot_tag, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, current_boot_tag, 0, "Tag added to dmesg at start of boot"); static int log_console_output = 1; SYSCTL_INT(_kern, OID_AUTO, log_console_output, CTLFLAG_RWTUN, &log_console_output, 0, "Duplicate console output to the syslog"); /* * See the comment in log_console() below for more explanation of this. */ static int log_console_add_linefeed; SYSCTL_INT(_kern, OID_AUTO, log_console_add_linefeed, CTLFLAG_RWTUN, &log_console_add_linefeed, 0, "log_console() adds extra newlines"); static int always_console_output; SYSCTL_INT(_kern, OID_AUTO, always_console_output, CTLFLAG_RWTUN, &always_console_output, 0, "Always output to console despite TIOCCONS"); /* * Warn that a system table is full. */ void tablefull(const char *tab) { log(LOG_ERR, "%s: table is full\n", tab); } /* * Uprintf prints to the controlling terminal for the current process. */ int uprintf(const char *fmt, ...) 
{ va_list ap; struct putchar_arg pca; struct proc *p; struct thread *td; int retval; td = curthread; if (TD_IS_IDLETHREAD(td)) return (0); if (td->td_proc == initproc) { /* Produce output when we fail to load /sbin/init: */ va_start(ap, fmt); retval = vprintf(fmt, ap); va_end(ap); return (retval); } sx_slock(&proctree_lock); p = td->td_proc; PROC_LOCK(p); if ((p->p_flag & P_CONTROLT) == 0) { PROC_UNLOCK(p); sx_sunlock(&proctree_lock); return (0); } SESS_LOCK(p->p_session); pca.tty = p->p_session->s_ttyp; SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); if (pca.tty == NULL) { sx_sunlock(&proctree_lock); return (0); } pca.flags = TOTTY; pca.p_bufr = NULL; va_start(ap, fmt); tty_lock(pca.tty); sx_sunlock(&proctree_lock); retval = kvprintf(fmt, putchar, &pca, 10, ap); tty_unlock(pca.tty); va_end(ap); return (retval); } /* * tprintf and vtprintf print on the controlling terminal associated with the * given session, possibly to the log as well. */ void tprintf(struct proc *p, int pri, const char *fmt, ...) { va_list ap; va_start(ap, fmt); vtprintf(p, pri, fmt, ap); va_end(ap); } void vtprintf(struct proc *p, int pri, const char *fmt, va_list ap) { struct tty *tp = NULL; int flags = 0; struct putchar_arg pca; struct session *sess = NULL; sx_slock(&proctree_lock); if (pri != -1) flags |= TOLOG; if (p != NULL) { PROC_LOCK(p); if (p->p_flag & P_CONTROLT && p->p_session->s_ttyvp) { sess = p->p_session; sess_hold(sess); PROC_UNLOCK(p); tp = sess->s_ttyp; if (tp != NULL && tty_checkoutq(tp)) flags |= TOTTY; else tp = NULL; } else PROC_UNLOCK(p); } pca.pri = pri; pca.tty = tp; pca.flags = flags; pca.p_bufr = NULL; if (pca.tty != NULL) tty_lock(pca.tty); sx_sunlock(&proctree_lock); kvprintf(fmt, putchar, &pca, 10, ap); if (pca.tty != NULL) tty_unlock(pca.tty); if (sess != NULL) sess_release(sess); msgbuftrigger = 1; } static int _vprintf(int level, int flags, const char *fmt, va_list ap) { struct putchar_arg pca; int retval; #ifdef PRINTF_BUFR_SIZE char bufr[PRINTF_BUFR_SIZE]; #endif TSENTER(); pca.tty = NULL; pca.pri = level; pca.flags = flags; #ifdef PRINTF_BUFR_SIZE pca.p_bufr = bufr; pca.p_next = pca.p_bufr; pca.n_bufr = sizeof(bufr); pca.remain = sizeof(bufr); *pca.p_next = '\0'; #else /* Don't buffer console output. */ pca.p_bufr = NULL; #endif retval = kvprintf(fmt, putchar, &pca, 10, ap); #ifdef PRINTF_BUFR_SIZE /* Write any buffered console/log output: */ if (*pca.p_bufr != '\0') prf_putbuf(pca.p_bufr, flags, level); #endif TSEXIT(); return (retval); } /* * Log writes to the log buffer, and guarantees not to sleep (so can be * called by interrupt routines). If there is no process reading the * log yet, it writes to the console also. */ void log(int level, const char *fmt, ...) { va_list ap; va_start(ap, fmt); vlog(level, fmt, ap); va_end(ap); } void vlog(int level, const char *fmt, va_list ap) { (void)_vprintf(level, log_open ? 
TOLOG : TOCONS | TOLOG, fmt, ap); msgbuftrigger = 1; } #define CONSCHUNK 128 void log_console(struct uio *uio) { int c, error, nl; char *consbuffer; int pri; if (!log_console_output) return; pri = LOG_INFO | LOG_CONSOLE; uio = cloneuio(uio); consbuffer = malloc(CONSCHUNK, M_TEMP, M_WAITOK); nl = 0; while (uio->uio_resid > 0) { c = imin(uio->uio_resid, CONSCHUNK - 1); error = uiomove(consbuffer, c, uio); if (error != 0) break; /* Make sure we're NUL-terminated */ consbuffer[c] = '\0'; if (consbuffer[c - 1] == '\n') nl = 1; else nl = 0; msglogstr(consbuffer, pri, /*filter_cr*/ 1); } /* * The previous behavior in log_console() is preserved when * log_console_add_linefeed is non-zero. For that behavior, if an * individual console write came in that was not terminated with a * line feed, it would add a line feed. * * This results in different data in the message buffer than * appears on the system console (which doesn't add extra line feed * characters). * * A number of programs and rc scripts write a line feed, or a period * and a line feed when they have completed their operation. On * the console, this looks seamless, but when displayed with * 'dmesg -a', you wind up with output that looks like this: * * Updating motd: * . * * On the console, it looks like this: * Updating motd:. * * We could add logic to detect that situation, or just not insert * the extra newlines. Set the kern.log_console_add_linefeed * sysctl/tunable variable to get the old behavior. */ if (!nl && log_console_add_linefeed) { consbuffer[0] = '\n'; consbuffer[1] = '\0'; msglogstr(consbuffer, pri, /*filter_cr*/ 1); } msgbuftrigger = 1; free(uio, M_IOV); free(consbuffer, M_TEMP); } int printf(const char *fmt, ...) { va_list ap; int retval; va_start(ap, fmt); retval = vprintf(fmt, ap); va_end(ap); return (retval); } int vprintf(const char *fmt, va_list ap) { int retval; retval = _vprintf(-1, TOCONS | TOLOG, fmt, ap); if (!KERNEL_PANICKED()) msgbuftrigger = 1; return (retval); } static void prf_putchar(int c, int flags, int pri) { if (flags & TOLOG) { msglogchar(c, pri); msgbuftrigger = 1; } if (flags & TOCONS) { if ((!KERNEL_PANICKED()) && (constty != NULL)) msgbuf_addchar(&consmsgbuf, c); if ((constty == NULL) || always_console_output) cnputc(c); } } static void prf_putbuf(char *bufr, int flags, int pri) { if (flags & TOLOG) { msglogstr(bufr, pri, /*filter_cr*/1); msgbuftrigger = 1; } if (flags & TOCONS) { if ((!KERNEL_PANICKED()) && (constty != NULL)) msgbuf_addstr(&consmsgbuf, -1, bufr, /*filter_cr*/ 0); if ((constty == NULL) || always_console_output) cnputs(bufr); } } static void putbuf(int c, struct putchar_arg *ap) { /* Check if no console output buffer was provided. */ if (ap->p_bufr == NULL) { prf_putchar(c, ap->flags, ap->pri); } else { /* Buffer the character: */ *ap->p_next++ = c; ap->remain--; /* Always leave the buffer zero terminated. */ *ap->p_next = '\0'; /* Check if the buffer needs to be flushed. */ if (ap->remain == 2 || c == '\n') { prf_putbuf(ap->p_bufr, ap->flags, ap->pri); ap->p_next = ap->p_bufr; ap->remain = ap->n_bufr; *ap->p_next = '\0'; } /* * Since we fill the buffer up one character at a time, * this should not happen. We should always catch it when * ap->remain == 2 (if not sooner due to a newline), flush * the buffer and move on. One way this could happen is * if someone sets PRINTF_BUFR_SIZE to 1 or something * similarly silly. */ KASSERT(ap->remain > 2, ("Bad buffer logic, remain = %zd", ap->remain)); } } /* * Print a character on console or users terminal. 
If destination is * the console then the last bunch of characters are saved in msgbuf for * inspection later. */ static void putchar(int c, void *arg) { struct putchar_arg *ap = (struct putchar_arg*) arg; struct tty *tp = ap->tty; int flags = ap->flags; /* Don't use the tty code after a panic or while in ddb. */ if (kdb_active) { if (c != '\0') cnputc(c); return; } if ((flags & TOTTY) && tp != NULL && !KERNEL_PANICKED()) tty_putchar(tp, c); if ((flags & (TOCONS | TOLOG)) && c != '\0') putbuf(c, ap); } /* * Scaled down version of sprintf(3). */ int sprintf(char *buf, const char *cfmt, ...) { int retval; va_list ap; va_start(ap, cfmt); retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap); buf[retval] = '\0'; va_end(ap); return (retval); } /* * Scaled down version of vsprintf(3). */ int vsprintf(char *buf, const char *cfmt, va_list ap) { int retval; retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap); buf[retval] = '\0'; return (retval); } /* * Scaled down version of snprintf(3). */ int snprintf(char *str, size_t size, const char *format, ...) { int retval; va_list ap; va_start(ap, format); retval = vsnprintf(str, size, format, ap); va_end(ap); return(retval); } /* * Scaled down version of vsnprintf(3). */ int vsnprintf(char *str, size_t size, const char *format, va_list ap) { struct snprintf_arg info; int retval; info.str = str; info.remain = size; retval = kvprintf(format, snprintf_func, &info, 10, ap); if (info.remain >= 1) *info.str++ = '\0'; return (retval); } /* * Kernel version which takes radix argument vsnprintf(3). */ int vsnrprintf(char *str, size_t size, int radix, const char *format, va_list ap) { struct snprintf_arg info; int retval; info.str = str; info.remain = size; retval = kvprintf(format, snprintf_func, &info, radix, ap); if (info.remain >= 1) *info.str++ = '\0'; return (retval); } static void snprintf_func(int ch, void *arg) { struct snprintf_arg *const info = arg; if (info->remain >= 2) { *info->str++ = ch; info->remain--; } } /* * Put a NUL-terminated ASCII number (base <= 36) in a buffer in reverse * order; return an optional length and a pointer to the last character * written in the buffer (i.e., the first character of the string). * The buffer pointed to by `nbuf' must have length >= MAXNBUF. */ static char * ksprintn(char *nbuf, uintmax_t num, int base, int *lenp, int upper) { char *p, c; p = nbuf; *p = '\0'; do { c = hex2ascii(num % base); *++p = upper ? toupper(c) : c; } while (num /= base); if (lenp) *lenp = p - nbuf; return (p); } /* * Scaled down version of printf(3). * * Two additional formats: * * The format %b is supported to decode error registers. * Its usage is: * * printf("reg=%b\n", regval, "<base><arg>*"); * * where <base> is the output base expressed as a control character, e.g. * \10 gives octal; \20 gives hex. Each arg is a sequence of characters, * the first of which gives the bit number to be inspected (origin 1), and * the next characters (up to a control character, i.e. a character <= 32), * give the name of the register. Thus: * * kvprintf("reg=%b\n", 3, "\10\2BITTWO\1BITONE"); * * would produce output: * * reg=3<BITTWO,BITONE> * * XXX: %D -- Hexdump, takes pointer and separator string: * ("%6D", ptr, ":") -> XX:XX:XX:XX:XX:XX * ("%*D", len, ptr, " ") -> XX XX XX XX ...
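 *
 * A typical driver use of the %b conversion described above (the register
 * layout here is made up):
 *
 *		printf("status=%b\n", status, "\20\3RXDONE\2TXDONE\1LINKUP");
 *
 * prints, e.g., "status=5<RXDONE,LINKUP>" when bits 1 and 3 are set.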
*/ int kvprintf(char const *fmt, void (*func)(int, void*), void *arg, int radix, va_list ap) { #define PCHAR(c) {int cc=(c); if (func) (*func)(cc,arg); else *d++ = cc; retval++; } char nbuf[MAXNBUF]; char *d; const char *p, *percent, *q; u_char *up; int ch, n; uintmax_t num; int base, lflag, qflag, tmp, width, ladjust, sharpflag, neg, sign, dot; int cflag, hflag, jflag, tflag, zflag; int bconv, dwidth, upper; char padc; int stop = 0, retval = 0; num = 0; q = NULL; if (!func) d = (char *) arg; else d = NULL; if (fmt == NULL) fmt = "(fmt null)\n"; if (radix < 2 || radix > 36) radix = 10; for (;;) { padc = ' '; width = 0; while ((ch = (u_char)*fmt++) != '%' || stop) { if (ch == '\0') return (retval); PCHAR(ch); } percent = fmt - 1; qflag = 0; lflag = 0; ladjust = 0; sharpflag = 0; neg = 0; sign = 0; dot = 0; bconv = 0; dwidth = 0; upper = 0; cflag = 0; hflag = 0; jflag = 0; tflag = 0; zflag = 0; reswitch: switch (ch = (u_char)*fmt++) { case '.': dot = 1; goto reswitch; case '#': sharpflag = 1; goto reswitch; case '+': sign = 1; goto reswitch; case '-': ladjust = 1; goto reswitch; case '%': PCHAR(ch); break; case '*': if (!dot) { width = va_arg(ap, int); if (width < 0) { ladjust = !ladjust; width = -width; } } else { dwidth = va_arg(ap, int); } goto reswitch; case '0': if (!dot) { padc = '0'; goto reswitch; } /* FALLTHROUGH */ case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': for (n = 0;; ++fmt) { n = n * 10 + ch - '0'; ch = *fmt; if (ch < '0' || ch > '9') break; } if (dot) dwidth = n; else width = n; goto reswitch; case 'b': ladjust = 1; bconv = 1; goto handle_nosign; case 'c': width -= 1; if (!ladjust && width > 0) while (width--) PCHAR(padc); PCHAR(va_arg(ap, int)); if (ladjust && width > 0) while (width--) PCHAR(padc); break; case 'D': up = va_arg(ap, u_char *); p = va_arg(ap, char *); if (!width) width = 16; while(width--) { PCHAR(hex2ascii(*up >> 4)); PCHAR(hex2ascii(*up & 0x0f)); up++; if (width) for (q=p;*q;q++) PCHAR(*q); } break; case 'd': case 'i': base = 10; sign = 1; goto handle_sign; case 'h': if (hflag) { hflag = 0; cflag = 1; } else hflag = 1; goto reswitch; case 'j': jflag = 1; goto reswitch; case 'l': if (lflag) { lflag = 0; qflag = 1; } else lflag = 1; goto reswitch; case 'n': /* * We do not support %n in kernel, but consume the * argument. 
*/ if (jflag) (void)va_arg(ap, intmax_t *); else if (qflag) (void)va_arg(ap, quad_t *); else if (lflag) (void)va_arg(ap, long *); else if (zflag) (void)va_arg(ap, size_t *); else if (hflag) (void)va_arg(ap, short *); else if (cflag) (void)va_arg(ap, char *); else (void)va_arg(ap, int *); break; case 'o': base = 8; goto handle_nosign; case 'p': base = 16; sharpflag = (width == 0); sign = 0; num = (uintptr_t)va_arg(ap, void *); goto number; case 'q': qflag = 1; goto reswitch; case 'r': base = radix; if (sign) goto handle_sign; goto handle_nosign; case 's': p = va_arg(ap, char *); if (p == NULL) p = "(null)"; if (!dot) n = strlen (p); else for (n = 0; n < dwidth && p[n]; n++) continue; width -= n; if (!ladjust && width > 0) while (width--) PCHAR(padc); while (n--) PCHAR(*p++); if (ladjust && width > 0) while (width--) PCHAR(padc); break; case 't': tflag = 1; goto reswitch; case 'u': base = 10; goto handle_nosign; case 'X': upper = 1; /* FALLTHROUGH */ case 'x': base = 16; goto handle_nosign; case 'y': base = 16; sign = 1; goto handle_sign; case 'z': zflag = 1; goto reswitch; handle_nosign: sign = 0; if (jflag) num = va_arg(ap, uintmax_t); else if (qflag) num = va_arg(ap, u_quad_t); else if (tflag) num = va_arg(ap, ptrdiff_t); else if (lflag) num = va_arg(ap, u_long); else if (zflag) num = va_arg(ap, size_t); else if (hflag) num = (u_short)va_arg(ap, int); else if (cflag) num = (u_char)va_arg(ap, int); else num = va_arg(ap, u_int); if (bconv) { q = va_arg(ap, char *); base = *q++; } goto number; handle_sign: if (jflag) num = va_arg(ap, intmax_t); else if (qflag) num = va_arg(ap, quad_t); else if (tflag) num = va_arg(ap, ptrdiff_t); else if (lflag) num = va_arg(ap, long); else if (zflag) num = va_arg(ap, ssize_t); else if (hflag) num = (short)va_arg(ap, int); else if (cflag) num = (char)va_arg(ap, int); else num = va_arg(ap, int); number: if (sign && (intmax_t)num < 0) { neg = 1; num = -(intmax_t)num; } p = ksprintn(nbuf, num, base, &n, upper); tmp = 0; if (sharpflag && num != 0) { if (base == 8) tmp++; else if (base == 16) tmp += 2; } if (neg) tmp++; if (!ladjust && padc == '0') dwidth = width - tmp; width -= tmp + imax(dwidth, n); dwidth -= n; if (!ladjust) while (width-- > 0) PCHAR(' '); if (neg) PCHAR('-'); if (sharpflag && num != 0) { if (base == 8) { PCHAR('0'); } else if (base == 16) { PCHAR('0'); PCHAR('x'); } } while (dwidth-- > 0) PCHAR('0'); while (*p) PCHAR(*p--); if (bconv && num != 0) { /* %b conversion flag format. */ tmp = retval; while (*q) { n = *q++; if (num & (1 << (n - 1))) { PCHAR(retval != tmp ? ',' : '<'); for (; (n = *q) > ' '; ++q) PCHAR(n); } else for (; *q > ' '; ++q) continue; } if (retval != tmp) { PCHAR('>'); width -= retval - tmp; } } if (ladjust) while (width-- > 0) PCHAR(' '); break; default: while (percent < fmt) PCHAR(*percent++); /* * Since we ignore a formatting argument it is no * longer safe to obey the remaining formatting * arguments as the arguments will no longer match * the format specs. */ stop = 1; break; } } #undef PCHAR } /* * Put character in log buffer with a particular priority. 
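 *
 * Illustration (hypothetical message): a line logged with log(LOG_ERR, ...)
 * is stored with its priority encoded in front of it, e.g.
 *
 *	<3>foo0: device timeout
 *
 * which syslogd(8) later strips and uses to route the message.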
*/ static void msglogchar(int c, int pri) { static int lastpri = -1; static int dangling; char nbuf[MAXNBUF]; char *p; if (!msgbufmapped) return; if (c == '\0' || c == '\r') return; if (pri != -1 && pri != lastpri) { if (dangling) { msgbuf_addchar(msgbufp, '\n'); dangling = 0; } msgbuf_addchar(msgbufp, '<'); for (p = ksprintn(nbuf, (uintmax_t)pri, 10, NULL, 0); *p;) msgbuf_addchar(msgbufp, *p--); msgbuf_addchar(msgbufp, '>'); lastpri = pri; } msgbuf_addchar(msgbufp, c); if (c == '\n') { dangling = 0; lastpri = -1; } else { dangling = 1; } } static void msglogstr(char *str, int pri, int filter_cr) { if (!msgbufmapped) return; msgbuf_addstr(msgbufp, pri, str, filter_cr); } void msgbufinit(void *ptr, int size) { char *cp; static struct msgbuf *oldp = NULL; bool print_boot_tag; TSENTER(); size -= sizeof(*msgbufp); cp = (char *)ptr; print_boot_tag = !msgbufmapped; /* Attempt to fetch kern.boot_tag tunable on first mapping */ if (!msgbufmapped) TUNABLE_STR_FETCH("kern.boot_tag", current_boot_tag, sizeof(current_boot_tag)); msgbufp = (struct msgbuf *)(cp + size); msgbuf_reinit(msgbufp, cp, size); if (msgbufmapped && oldp != msgbufp) msgbuf_copy(oldp, msgbufp); msgbufmapped = true; if (print_boot_tag && *current_boot_tag != '\0') printf("%s\n", current_boot_tag); oldp = msgbufp; TSEXIT(); } /* Sysctls for accessing/clearing the msgbuf */ static int sysctl_kern_msgbuf(SYSCTL_HANDLER_ARGS) { char buf[128], *bp; u_int seq; int error, len; bool wrap; error = priv_check(req->td, PRIV_MSGBUF); if (error) return (error); /* Read the whole buffer, one chunk at a time. */ mtx_lock(&msgbuf_lock); msgbuf_peekbytes(msgbufp, NULL, 0, &seq); wrap = (seq != 0); for (;;) { len = msgbuf_peekbytes(msgbufp, buf, sizeof(buf), &seq); mtx_unlock(&msgbuf_lock); if (len == 0) return (SYSCTL_OUT(req, "", 1)); /* add nulterm */ if (wrap) { /* Skip the first line, as it is probably incomplete. 
*/ bp = memchr(buf, '\n', len); if (bp == NULL) { mtx_lock(&msgbuf_lock); continue; } wrap = false; bp++; len -= bp - buf; if (len == 0) { mtx_lock(&msgbuf_lock); continue; } } else bp = buf; error = sysctl_handle_opaque(oidp, bp, len, req); if (error) return (error); mtx_lock(&msgbuf_lock); } } SYSCTL_PROC(_kern, OID_AUTO, msgbuf, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_msgbuf, "A", "Contents of kernel message buffer"); static int msgbuf_clearflag; static int sysctl_kern_msgbuf_clear(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) { mtx_lock(&msgbuf_lock); msgbuf_clear(msgbufp); mtx_unlock(&msgbuf_lock); msgbuf_clearflag = 0; } return (error); } SYSCTL_PROC(_kern, OID_AUTO, msgbuf_clear, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE | CTLFLAG_MPSAFE, &msgbuf_clearflag, 0, sysctl_kern_msgbuf_clear, "I", "Clear kernel message buffer"); #ifdef DDB DB_SHOW_COMMAND_FLAGS(msgbuf, db_show_msgbuf, DB_CMD_MEMSAFE) { int i, j; if (!msgbufmapped) { db_printf("msgbuf not mapped yet\n"); return; } db_printf("msgbufp = %p\n", msgbufp); db_printf("magic = %x, size = %d, r= %u, w = %u, ptr = %p, cksum= %u\n", msgbufp->msg_magic, msgbufp->msg_size, msgbufp->msg_rseq, msgbufp->msg_wseq, msgbufp->msg_ptr, msgbufp->msg_cksum); for (i = 0; i < msgbufp->msg_size && !db_pager_quit; i++) { j = MSGBUF_SEQ_TO_POS(msgbufp, i + msgbufp->msg_rseq); db_printf("%c", msgbufp->msg_ptr[j]); } db_printf("\n"); } #endif /* DDB */ void hexdump(const void *ptr, int length, const char *hdr, int flags) { int i, j, k; int cols; const unsigned char *cp; char delim; if ((flags & HD_DELIM_MASK) != 0) delim = (flags & HD_DELIM_MASK) >> 8; else delim = ' '; if ((flags & HD_COLUMN_MASK) != 0) cols = flags & HD_COLUMN_MASK; else cols = 16; cp = ptr; for (i = 0; i < length; i+= cols) { if (hdr != NULL) printf("%s", hdr); if ((flags & HD_OMIT_COUNT) == 0) printf("%04x ", i); if ((flags & HD_OMIT_HEX) == 0) { for (j = 0; j < cols; j++) { k = i + j; if (k < length) printf("%c%02x", delim, cp[k]); else printf(" "); } } if ((flags & HD_OMIT_CHARS) == 0) { printf(" |"); for (j = 0; j < cols; j++) { k = i + j; if (k >= length) printf(" "); else if (cp[k] >= ' ' && cp[k] <= '~') printf("%c", cp[k]); else printf("."); } printf("|"); } printf("\n"); } } #endif /* _KERNEL */ void sbuf_hexdump(struct sbuf *sb, const void *ptr, int length, const char *hdr, int flags) { int i, j, k; int cols; const unsigned char *cp; char delim; if ((flags & HD_DELIM_MASK) != 0) delim = (flags & HD_DELIM_MASK) >> 8; else delim = ' '; if ((flags & HD_COLUMN_MASK) != 0) cols = flags & HD_COLUMN_MASK; else cols = 16; cp = ptr; for (i = 0; i < length; i+= cols) { if (hdr != NULL) sbuf_printf(sb, "%s", hdr); if ((flags & HD_OMIT_COUNT) == 0) sbuf_printf(sb, "%04x ", i); if ((flags & HD_OMIT_HEX) == 0) { for (j = 0; j < cols; j++) { k = i + j; if (k < length) sbuf_printf(sb, "%c%02x", delim, cp[k]); else - sbuf_printf(sb, " "); + sbuf_cat(sb, " "); } } if ((flags & HD_OMIT_CHARS) == 0) { - sbuf_printf(sb, " |"); + sbuf_cat(sb, " |"); for (j = 0; j < cols; j++) { k = i + j; if (k >= length) - sbuf_printf(sb, " "); + sbuf_putc(sb, ' '); else if (cp[k] >= ' ' && cp[k] <= '~') - sbuf_printf(sb, "%c", cp[k]); + sbuf_putc(sb, cp[k]); else - sbuf_printf(sb, "."); + sbuf_putc(sb, '.'); } - sbuf_printf(sb, "|"); + sbuf_putc(sb, '|'); } - sbuf_printf(sb, "\n"); + sbuf_putc(sb, '\n'); } } #ifdef _KERNEL void counted_warning(unsigned *counter, const char *msg) { struct thread *td; 
unsigned c; for (;;) { c = *counter; if (c == 0) break; if (atomic_cmpset_int(counter, c, c - 1)) { td = curthread; log(LOG_INFO, "pid %d (%s) %s%s\n", td->td_proc->p_pid, td->td_name, msg, c > 1 ? "" : " - not logging anymore"); break; } } } #endif #ifdef _KERNEL void sbuf_putbuf(struct sbuf *sb) { prf_putbuf(sbuf_data(sb), TOLOG | TOCONS, -1); } #else void sbuf_putbuf(struct sbuf *sb) { printf("%s", sbuf_data(sb)); } #endif int sbuf_printf_drain(void *arg, const char *data, int len) { size_t *retvalptr; int r; #ifdef _KERNEL char *dataptr; char oldchr; /* * This is allowed as an extra byte is always resvered for * terminating NUL byte. Save and restore the byte because * we might be flushing a record, and there may be valid * data after the buffer. */ oldchr = data[len]; dataptr = __DECONST(char *, data); dataptr[len] = '\0'; prf_putbuf(dataptr, TOLOG | TOCONS, -1); r = len; dataptr[len] = oldchr; #else /* !_KERNEL */ r = printf("%.*s", len, data); if (r < 0) return (-errno); #endif retvalptr = arg; if (retvalptr != NULL) *retvalptr += r; return (r); } diff --git a/sys/kern/subr_sleepqueue.c b/sys/kern/subr_sleepqueue.c index bbbf753e3df0..15dbf396c557 100644 --- a/sys/kern/subr_sleepqueue.c +++ b/sys/kern/subr_sleepqueue.c @@ -1,1528 +1,1528 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2004 John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Implementation of sleep queues used to hold queue of threads blocked on * a wait channel. Sleep queues are different from turnstiles in that wait * channels are not owned by anyone, so there is no priority propagation. * Sleep queues can also provide a timeout and can also be interrupted by * signals. That said, there are several similarities between the turnstile * and sleep queue implementations. (Note: turnstiles were implemented * first.) For example, both use a hash table of the same size where each * bucket is referred to as a "chain" that contains both a spin lock and * a linked list of queues. An individual queue is located by using a hash * to pick a chain, locking the chain, and then walking the chain searching * for the queue. This means that a wait channel object does not need to * embed its queue head just as locks do not embed their turnstile queue * head. 
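 * (For example, sleepq_lock(wchan) maps the wait channel through SC_HASH()
 * to a sleepq_chain and takes that chain's spin lock, after which
 * sleepq_lookup(wchan) walks that chain looking for a matching queue.)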
Threads also carry around a sleep queue that they lend to the * wait channel when blocking. Just as in turnstiles, the queue includes * a free list of the sleep queues of other threads blocked on the same * wait channel in the case of multiple waiters. * * Some additional functionality provided by sleep queues include the * ability to set a timeout. The timeout is managed using a per-thread * callout that resumes a thread if it is asleep. A thread may also * catch signals while it is asleep (aka an interruptible sleep). The * signal code uses sleepq_abort() to interrupt a sleeping thread. Finally, * sleep queues also provide some extra assertions. One is not allowed to * mix the sleep/wakeup and cv APIs for a given wait channel. Also, one * must consistently use the same lock to synchronize with a wait channel, * though this check is currently only a warning for sleep/wakeup due to * pre-existing abuse of that API. The same lock must also be held when * awakening threads, though that is currently only enforced for condition * variables. */ #include #include "opt_sleepqueue_profiling.h" #include "opt_ddb.h" #include "opt_sched.h" #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef EPOCH_TRACE #include #endif #include #include #ifdef DDB #include #endif /* * Constants for the hash table of sleep queue chains. * SC_TABLESIZE must be a power of two for SC_MASK to work properly. */ #ifndef SC_TABLESIZE #define SC_TABLESIZE 256 #endif CTASSERT(powerof2(SC_TABLESIZE)); #define SC_MASK (SC_TABLESIZE - 1) #define SC_SHIFT 8 #define SC_HASH(wc) ((((uintptr_t)(wc) >> SC_SHIFT) ^ (uintptr_t)(wc)) & \ SC_MASK) #define SC_LOOKUP(wc) &sleepq_chains[SC_HASH(wc)] #define NR_SLEEPQS 2 /* * There are two different lists of sleep queues. Both lists are connected * via the sq_hash entries. The first list is the sleep queue chain list * that a sleep queue is on when it is attached to a wait channel. The * second list is the free list hung off of a sleep queue that is attached * to a wait channel. * * Each sleep queue also contains the wait channel it is attached to, the * list of threads blocked on that wait channel, flags specific to the * wait channel, and the lock used to synchronize with a wait channel. * The flags are used to catch mismatches between the various consumers * of the sleep queue API (e.g. sleep/wakeup and condition variables). * The lock pointer is only used when invariants are enabled for various * debugging checks. * * Locking key: * c - sleep queue chain lock */ struct sleepqueue { struct threadqueue sq_blocked[NR_SLEEPQS]; /* (c) Blocked threads. */ u_int sq_blockedcnt[NR_SLEEPQS]; /* (c) N. of blocked threads. */ LIST_ENTRY(sleepqueue) sq_hash; /* (c) Chain and free list. */ LIST_HEAD(, sleepqueue) sq_free; /* (c) Free queues. */ const void *sq_wchan; /* (c) Wait channel. */ int sq_type; /* (c) Queue type. */ #ifdef INVARIANTS struct lock_object *sq_lock; /* (c) Associated lock. */ #endif }; struct sleepqueue_chain { LIST_HEAD(, sleepqueue) sc_queues; /* List of sleep queues. */ struct mtx sc_lock; /* Spin lock for this chain. */ #ifdef SLEEPQUEUE_PROFILING u_int sc_depth; /* Length of sc_queues. */ u_int sc_max_depth; /* Max length of sc_queues. 
*/ #endif } __aligned(CACHE_LINE_SIZE); #ifdef SLEEPQUEUE_PROFILING static SYSCTL_NODE(_debug, OID_AUTO, sleepq, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "sleepq profiling"); static SYSCTL_NODE(_debug_sleepq, OID_AUTO, chains, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "sleepq chain stats"); static u_int sleepq_max_depth; SYSCTL_UINT(_debug_sleepq, OID_AUTO, max_depth, CTLFLAG_RD, &sleepq_max_depth, 0, "maxmimum depth achieved of a single chain"); static void sleepq_profile(const char *wmesg); static int prof_enabled; #endif static struct sleepqueue_chain sleepq_chains[SC_TABLESIZE]; static uma_zone_t sleepq_zone; /* * Prototypes for non-exported routines. */ static int sleepq_catch_signals(const void *wchan, int pri); static inline int sleepq_check_signals(void); static inline int sleepq_check_timeout(void); #ifdef INVARIANTS static void sleepq_dtor(void *mem, int size, void *arg); #endif static int sleepq_init(void *mem, int size, int flags); static int sleepq_resume_thread(struct sleepqueue *sq, struct thread *td, int pri, int srqflags); static void sleepq_remove_thread(struct sleepqueue *sq, struct thread *td); static void sleepq_switch(const void *wchan, int pri); static void sleepq_timeout(void *arg); SDT_PROBE_DECLARE(sched, , , sleep); SDT_PROBE_DECLARE(sched, , , wakeup); /* * Initialize SLEEPQUEUE_PROFILING specific sysctl nodes. * Note that it must happen after sleepinit() has been fully executed, so * it must happen after SI_SUB_KMEM SYSINIT() subsystem setup. */ #ifdef SLEEPQUEUE_PROFILING static void init_sleepqueue_profiling(void) { char chain_name[10]; struct sysctl_oid *chain_oid; u_int i; for (i = 0; i < SC_TABLESIZE; i++) { snprintf(chain_name, sizeof(chain_name), "%u", i); chain_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_debug_sleepq_chains), OID_AUTO, chain_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "sleepq chain stats"); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "depth", CTLFLAG_RD, &sleepq_chains[i].sc_depth, 0, NULL); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "max_depth", CTLFLAG_RD, &sleepq_chains[i].sc_max_depth, 0, NULL); } } SYSINIT(sleepqueue_profiling, SI_SUB_LOCK, SI_ORDER_ANY, init_sleepqueue_profiling, NULL); #endif /* * Early initialization of sleep queues that is called from the sleepinit() * SYSINIT. */ void init_sleepqueues(void) { int i; for (i = 0; i < SC_TABLESIZE; i++) { LIST_INIT(&sleepq_chains[i].sc_queues); mtx_init(&sleepq_chains[i].sc_lock, "sleepq chain", NULL, MTX_SPIN); } sleepq_zone = uma_zcreate("SLEEPQUEUE", sizeof(struct sleepqueue), #ifdef INVARIANTS NULL, sleepq_dtor, sleepq_init, NULL, UMA_ALIGN_CACHE, 0); #else NULL, NULL, sleepq_init, NULL, UMA_ALIGN_CACHE, 0); #endif thread0.td_sleepqueue = sleepq_alloc(); } /* * Get a sleep queue for a new thread. */ struct sleepqueue * sleepq_alloc(void) { return (uma_zalloc(sleepq_zone, M_WAITOK)); } /* * Free a sleep queue when a thread is destroyed. */ void sleepq_free(struct sleepqueue *sq) { uma_zfree(sleepq_zone, sq); } /* * Lock the sleep queue chain associated with the specified wait channel. */ void sleepq_lock(const void *wchan) { struct sleepqueue_chain *sc; sc = SC_LOOKUP(wchan); mtx_lock_spin(&sc->sc_lock); } /* * Look up the sleep queue associated with a given wait channel in the hash * table locking the associated sleep queue chain. If no queue is found in * the table, NULL is returned. 
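 * The caller must already hold the chain lock for wchan, normally via a
 * prior sleepq_lock(wchan); the mtx_assert() below enforces this.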
*/ struct sleepqueue * sleepq_lookup(const void *wchan) { struct sleepqueue_chain *sc; struct sleepqueue *sq; KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); LIST_FOREACH(sq, &sc->sc_queues, sq_hash) if (sq->sq_wchan == wchan) return (sq); return (NULL); } /* * Unlock the sleep queue chain associated with a given wait channel. */ void sleepq_release(const void *wchan) { struct sleepqueue_chain *sc; sc = SC_LOOKUP(wchan); mtx_unlock_spin(&sc->sc_lock); } /* * Places the current thread on the sleep queue for the specified wait * channel. If INVARIANTS is enabled, then it associates the passed in * lock with the sleepq to make sure it is held when that sleep queue is * woken up. */ void sleepq_add(const void *wchan, struct lock_object *lock, const char *wmesg, int flags, int queue) { struct sleepqueue_chain *sc; struct sleepqueue *sq; struct thread *td; td = curthread; sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); MPASS(td->td_sleepqueue != NULL); MPASS(wchan != NULL); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); /* If this thread is not allowed to sleep, die a horrible death. */ if (__predict_false(!THREAD_CAN_SLEEP())) { #ifdef EPOCH_TRACE epoch_trace_list(curthread); #endif KASSERT(0, ("%s: td %p to sleep on wchan %p with sleeping prohibited", __func__, td, wchan)); } /* Look up the sleep queue associated with the wait channel 'wchan'. */ sq = sleepq_lookup(wchan); /* * If the wait channel does not already have a sleep queue, use * this thread's sleep queue. Otherwise, insert the current thread * into the sleep queue already in use by this wait channel. */ if (sq == NULL) { #ifdef INVARIANTS int i; sq = td->td_sleepqueue; for (i = 0; i < NR_SLEEPQS; i++) { KASSERT(TAILQ_EMPTY(&sq->sq_blocked[i]), ("thread's sleep queue %d is not empty", i)); KASSERT(sq->sq_blockedcnt[i] == 0, ("thread's sleep queue %d count mismatches", i)); } KASSERT(LIST_EMPTY(&sq->sq_free), ("thread's sleep queue has a non-empty free list")); KASSERT(sq->sq_wchan == NULL, ("stale sq_wchan pointer")); sq->sq_lock = lock; #endif #ifdef SLEEPQUEUE_PROFILING sc->sc_depth++; if (sc->sc_depth > sc->sc_max_depth) { sc->sc_max_depth = sc->sc_depth; if (sc->sc_max_depth > sleepq_max_depth) sleepq_max_depth = sc->sc_max_depth; } #endif sq = td->td_sleepqueue; LIST_INSERT_HEAD(&sc->sc_queues, sq, sq_hash); sq->sq_wchan = wchan; sq->sq_type = flags & SLEEPQ_TYPE; } else { MPASS(wchan == sq->sq_wchan); MPASS(lock == sq->sq_lock); MPASS((flags & SLEEPQ_TYPE) == sq->sq_type); LIST_INSERT_HEAD(&sq->sq_free, td->td_sleepqueue, sq_hash); } thread_lock(td); TAILQ_INSERT_TAIL(&sq->sq_blocked[queue], td, td_slpq); sq->sq_blockedcnt[queue]++; td->td_sleepqueue = NULL; td->td_sqqueue = queue; td->td_wchan = wchan; td->td_wmesg = wmesg; if (flags & SLEEPQ_INTERRUPTIBLE) { td->td_intrval = 0; td->td_flags |= TDF_SINTR; } td->td_flags &= ~TDF_TIMEOUT; thread_unlock(td); } /* * Sets a timeout that will remove the current thread from the * specified sleep queue at the specified time if the thread has not * already been awakened. Flags are from C_* (callout) namespace. 
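 *
 * An illustrative, simplified timed-sleep sequence (flag and queue values
 * are placeholders; real consumers such as _sleep() do additional state
 * handling):
 *
 *	sleepq_lock(wchan);
 *	sleepq_add(wchan, lock, wmesg, flags, 0);
 *	sleepq_set_timeout_sbt(wchan, sbt, pr, cflags);
 *	error = sleepq_timedwait(wchan, pri);	(0 or EWOULDBLOCK)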
*/ void sleepq_set_timeout_sbt(const void *wchan, sbintime_t sbt, sbintime_t pr, int flags) { struct sleepqueue_chain *sc __unused; struct thread *td; sbintime_t pr1; td = curthread; sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); MPASS(TD_ON_SLEEPQ(td)); MPASS(td->td_sleepqueue == NULL); MPASS(wchan != NULL); if (cold && td == &thread0) panic("timed sleep before timers are working"); KASSERT(td->td_sleeptimo == 0, ("td %d %p td_sleeptimo %jx", td->td_tid, td, (uintmax_t)td->td_sleeptimo)); thread_lock(td); callout_when(sbt, pr, flags, &td->td_sleeptimo, &pr1); thread_unlock(td); callout_reset_sbt_on(&td->td_slpcallout, td->td_sleeptimo, pr1, sleepq_timeout, td, PCPU_GET(cpuid), flags | C_PRECALC | C_DIRECT_EXEC); } /* * Return the number of actual sleepers for the specified queue. */ u_int sleepq_sleepcnt(const void *wchan, int queue) { struct sleepqueue *sq; KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); sq = sleepq_lookup(wchan); if (sq == NULL) return (0); return (sq->sq_blockedcnt[queue]); } static int sleepq_check_ast_sc_locked(struct thread *td, struct sleepqueue_chain *sc) { struct proc *p; int ret; mtx_assert(&sc->sc_lock, MA_OWNED); if ((td->td_pflags & TDP_WAKEUP) != 0) { td->td_pflags &= ~TDP_WAKEUP; thread_lock(td); return (EINTR); } /* * See if there are any pending signals or suspension requests for this * thread. If not, we can switch immediately. */ thread_lock(td); if (!td_ast_pending(td, TDA_SIG) && !td_ast_pending(td, TDA_SUSPEND)) return (0); thread_unlock(td); mtx_unlock_spin(&sc->sc_lock); p = td->td_proc; CTR3(KTR_PROC, "sleepq catching signals: thread %p (pid %ld, %s)", (void *)td, (long)p->p_pid, td->td_name); PROC_LOCK(p); /* * Check for suspension first. Checking for signals and then * suspending could result in a missed signal, since a signal * can be delivered while this thread is suspended. */ ret = sig_ast_checksusp(td); if (ret != 0) { PROC_UNLOCK(p); mtx_lock_spin(&sc->sc_lock); thread_lock(td); return (ret); } ret = sig_ast_needsigchk(td); /* * Lock the per-process spinlock prior to dropping the * PROC_LOCK to avoid a signal delivery race. * PROC_LOCK, PROC_SLOCK, and thread_lock() are * currently held in tdsendsignal() and thread_single(). */ PROC_SLOCK(p); mtx_lock_spin(&sc->sc_lock); PROC_UNLOCK(p); thread_lock(td); PROC_SUNLOCK(p); return (ret); } /* * Marks the pending sleep of the current thread as interruptible and * makes an initial check for pending signals before putting a thread * to sleep. Enters and exits with the thread lock held. Thread lock * may have transitioned from the sleepq lock to a run lock. */ static int sleepq_catch_signals(const void *wchan, int pri) { struct thread *td; struct sleepqueue_chain *sc; struct sleepqueue *sq; int ret; sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); MPASS(wchan != NULL); td = curthread; ret = sleepq_check_ast_sc_locked(td, sc); THREAD_LOCK_ASSERT(td, MA_OWNED); mtx_assert(&sc->sc_lock, MA_OWNED); if (ret == 0) { /* * No pending signals and no suspension requests found. * Switch the thread off the cpu. */ sleepq_switch(wchan, pri); } else { /* * There were pending signals and this thread is still * on the sleep queue, remove it from the sleep queue. */ if (TD_ON_SLEEPQ(td)) { sq = sleepq_lookup(wchan); sleepq_remove_thread(sq, td); } MPASS(td->td_lock != &sc->sc_lock); mtx_unlock_spin(&sc->sc_lock); thread_unlock(td); } return (ret); } /* * Switches to another thread if we are still asleep on a sleep queue. 
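 * If td_sleepqueue is already non-NULL here, the thread has been woken in
 * the meantime and the function returns without switching.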
* Returns with thread lock. */ static void sleepq_switch(const void *wchan, int pri) { struct sleepqueue_chain *sc; struct sleepqueue *sq; struct thread *td; bool rtc_changed; td = curthread; sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); /* * If we have a sleep queue, then we've already been woken up, so * just return. */ if (td->td_sleepqueue != NULL) { mtx_unlock_spin(&sc->sc_lock); thread_unlock(td); return; } /* * If TDF_TIMEOUT is set, then our sleep has been timed out * already but we are still on the sleep queue, so dequeue the * thread and return. * * Do the same if the real-time clock has been adjusted since this * thread calculated its timeout based on that clock. This handles * the following race: * - The Ts thread needs to sleep until an absolute real-clock time. * It copies the global rtc_generation into curthread->td_rtcgen, * reads the RTC, and calculates a sleep duration based on that time. * See umtxq_sleep() for an example. * - The Tc thread adjusts the RTC, bumps rtc_generation, and wakes * threads that are sleeping until an absolute real-clock time. * See tc_setclock() and the POSIX specification of clock_settime(). * - Ts reaches the code below. It holds the sleepqueue chain lock, * so Tc has finished waking, so this thread must test td_rtcgen. * (The declaration of td_rtcgen refers to this comment.) */ rtc_changed = td->td_rtcgen != 0 && td->td_rtcgen != rtc_generation; if ((td->td_flags & TDF_TIMEOUT) || rtc_changed) { if (rtc_changed) { td->td_rtcgen = 0; } MPASS(TD_ON_SLEEPQ(td)); sq = sleepq_lookup(wchan); sleepq_remove_thread(sq, td); mtx_unlock_spin(&sc->sc_lock); thread_unlock(td); return; } #ifdef SLEEPQUEUE_PROFILING if (prof_enabled) sleepq_profile(td->td_wmesg); #endif MPASS(td->td_sleepqueue == NULL); sched_sleep(td, pri); thread_lock_set(td, &sc->sc_lock); SDT_PROBE0(sched, , , sleep); TD_SET_SLEEPING(td); mi_switch(SW_VOL | SWT_SLEEPQ); KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING")); CTR3(KTR_PROC, "sleepq resume: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name); } /* * Check to see if we timed out. */ static inline int sleepq_check_timeout(void) { struct thread *td; int res; res = 0; td = curthread; if (td->td_sleeptimo != 0) { if (td->td_sleeptimo <= sbinuptime()) res = EWOULDBLOCK; td->td_sleeptimo = 0; } return (res); } /* * Check to see if we were awoken by a signal. */ static inline int sleepq_check_signals(void) { struct thread *td; td = curthread; KASSERT((td->td_flags & TDF_SINTR) == 0, ("thread %p still in interruptible sleep?", td)); return (td->td_intrval); } /* * Block the current thread until it is awakened from its sleep queue. */ void sleepq_wait(const void *wchan, int pri) { struct thread *td; td = curthread; MPASS(!(td->td_flags & TDF_SINTR)); thread_lock(td); sleepq_switch(wchan, pri); } /* * Block the current thread until it is awakened from its sleep queue * or it is interrupted by a signal. */ int sleepq_wait_sig(const void *wchan, int pri) { int rcatch; rcatch = sleepq_catch_signals(wchan, pri); if (rcatch) return (rcatch); return (sleepq_check_signals()); } /* * Block the current thread until it is awakened from its sleep queue * or it times out while waiting. 
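 * Returns 0 if the thread was awakened normally, or EWOULDBLOCK if the
 * timeout armed by sleepq_set_timeout_sbt() fired first (see
 * sleepq_check_timeout()).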
*/ int sleepq_timedwait(const void *wchan, int pri) { struct thread *td; td = curthread; MPASS(!(td->td_flags & TDF_SINTR)); thread_lock(td); sleepq_switch(wchan, pri); return (sleepq_check_timeout()); } /* * Block the current thread until it is awakened from its sleep queue, * it is interrupted by a signal, or it times out waiting to be awakened. */ int sleepq_timedwait_sig(const void *wchan, int pri) { int rcatch, rvalt, rvals; rcatch = sleepq_catch_signals(wchan, pri); /* We must always call check_timeout() to clear sleeptimo. */ rvalt = sleepq_check_timeout(); rvals = sleepq_check_signals(); if (rcatch) return (rcatch); if (rvals) return (rvals); return (rvalt); } /* * Returns the type of sleepqueue given a waitchannel. */ int sleepq_type(const void *wchan) { struct sleepqueue *sq; int type; MPASS(wchan != NULL); sq = sleepq_lookup(wchan); if (sq == NULL) return (-1); type = sq->sq_type; return (type); } /* * Removes a thread from a sleep queue and makes it * runnable. * * Requires the sc chain locked on entry. If SRQ_HOLD is specified it will * be locked on return. Returns without the thread lock held. */ static int sleepq_resume_thread(struct sleepqueue *sq, struct thread *td, int pri, int srqflags) { struct sleepqueue_chain *sc; bool drop; MPASS(td != NULL); MPASS(sq->sq_wchan != NULL); MPASS(td->td_wchan == sq->sq_wchan); sc = SC_LOOKUP(sq->sq_wchan); mtx_assert(&sc->sc_lock, MA_OWNED); /* * Avoid recursing on the chain lock. If the locks don't match we * need to acquire the thread lock which setrunnable will drop for * us. In this case we need to drop the chain lock afterwards. * * There is no race that will make td_lock equal to sc_lock because * we hold sc_lock. */ drop = false; if (!TD_IS_SLEEPING(td)) { thread_lock(td); drop = true; } else thread_lock_block_wait(td); /* Remove thread from the sleepq. */ sleepq_remove_thread(sq, td); /* If we're done with the sleepqueue release it. */ if ((srqflags & SRQ_HOLD) == 0 && drop) mtx_unlock_spin(&sc->sc_lock); /* Adjust priority if requested. */ MPASS(pri == 0 || (pri >= PRI_MIN && pri <= PRI_MAX)); if (pri != 0 && td->td_priority > pri && PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) sched_prio(td, pri); /* * Note that thread td might not be sleeping if it is running * sleepq_catch_signals() on another CPU or is blocked on its * proc lock to check signals. There's no need to mark the * thread runnable in that case. */ if (TD_IS_SLEEPING(td)) { MPASS(!drop); TD_CLR_SLEEPING(td); return (setrunnable(td, srqflags)); } MPASS(drop); thread_unlock(td); return (0); } static void sleepq_remove_thread(struct sleepqueue *sq, struct thread *td) { struct sleepqueue_chain *sc __unused; MPASS(td != NULL); MPASS(sq->sq_wchan != NULL); MPASS(td->td_wchan == sq->sq_wchan); MPASS(td->td_sqqueue < NR_SLEEPQS && td->td_sqqueue >= 0); THREAD_LOCK_ASSERT(td, MA_OWNED); sc = SC_LOOKUP(sq->sq_wchan); mtx_assert(&sc->sc_lock, MA_OWNED); SDT_PROBE2(sched, , , wakeup, td, td->td_proc); /* Remove the thread from the queue. */ sq->sq_blockedcnt[td->td_sqqueue]--; TAILQ_REMOVE(&sq->sq_blocked[td->td_sqqueue], td, td_slpq); /* * Get a sleep queue for this thread. If this is the last waiter, * use the queue itself and take it out of the chain, otherwise, * remove a queue from the free list. 
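 * Under INVARIANTS the reclaimed queue's sq_wchan is cleared so that a
 * stale pointer trips the "stale sq_wchan pointer" assertion in
 * sleepq_add().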
*/ if (LIST_EMPTY(&sq->sq_free)) { td->td_sleepqueue = sq; #ifdef INVARIANTS sq->sq_wchan = NULL; #endif #ifdef SLEEPQUEUE_PROFILING sc->sc_depth--; #endif } else td->td_sleepqueue = LIST_FIRST(&sq->sq_free); LIST_REMOVE(td->td_sleepqueue, sq_hash); if ((td->td_flags & TDF_TIMEOUT) == 0 && td->td_sleeptimo != 0 && td->td_lock == &sc->sc_lock) { /* * We ignore the situation where timeout subsystem was * unable to stop our callout. The struct thread is * type-stable, the callout will use the correct * memory when running. The checks of the * td_sleeptimo value in this function and in * sleepq_timeout() ensure that the thread does not * get spurious wakeups, even if the callout was reset * or thread reused. * * We also cannot safely stop the callout if a scheduler * lock is held since softclock_thread() forces a lock * order of callout lock -> scheduler lock. The thread * lock will be a scheduler lock only if the thread is * preparing to go to sleep, so this is hopefully a rare * scenario. */ callout_stop(&td->td_slpcallout); } td->td_wmesg = NULL; td->td_wchan = NULL; td->td_flags &= ~(TDF_SINTR | TDF_TIMEOUT); CTR3(KTR_PROC, "sleepq_wakeup: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, td->td_name); } void sleepq_remove_nested(struct thread *td) { struct sleepqueue_chain *sc; struct sleepqueue *sq; const void *wchan; MPASS(TD_ON_SLEEPQ(td)); wchan = td->td_wchan; sc = SC_LOOKUP(wchan); mtx_lock_spin(&sc->sc_lock); sq = sleepq_lookup(wchan); MPASS(sq != NULL); thread_lock(td); sleepq_remove_thread(sq, td); mtx_unlock_spin(&sc->sc_lock); /* Returns with the thread lock owned. */ } #ifdef INVARIANTS /* * UMA zone item deallocator. */ static void sleepq_dtor(void *mem, int size, void *arg) { struct sleepqueue *sq; int i; sq = mem; for (i = 0; i < NR_SLEEPQS; i++) { MPASS(TAILQ_EMPTY(&sq->sq_blocked[i])); MPASS(sq->sq_blockedcnt[i] == 0); } } #endif /* * UMA zone item initializer. */ static int sleepq_init(void *mem, int size, int flags) { struct sleepqueue *sq; int i; bzero(mem, size); sq = mem; for (i = 0; i < NR_SLEEPQS; i++) { TAILQ_INIT(&sq->sq_blocked[i]); sq->sq_blockedcnt[i] = 0; } LIST_INIT(&sq->sq_free); return (0); } /* * Find thread sleeping on a wait channel and resume it. */ int sleepq_signal(const void *wchan, int flags, int pri, int queue) { struct sleepqueue_chain *sc; struct sleepqueue *sq; struct threadqueue *head; struct thread *td, *besttd; int wakeup_swapper; CTR2(KTR_PROC, "sleepq_signal(%p, %d)", wchan, flags); KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); sq = sleepq_lookup(wchan); if (sq == NULL) { if (flags & SLEEPQ_DROP) sleepq_release(wchan); return (0); } KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE), ("%s: mismatch between sleep/wakeup and cv_*", __func__)); head = &sq->sq_blocked[queue]; if (flags & SLEEPQ_UNFAIR) { /* * Find the most recently sleeping thread, but try to * skip threads still in process of context switch to * avoid spinning on the thread lock. */ sc = SC_LOOKUP(wchan); besttd = TAILQ_LAST_FAST(head, thread, td_slpq); while (besttd->td_lock != &sc->sc_lock) { td = TAILQ_PREV_FAST(besttd, head, thread, td_slpq); if (td == NULL) break; besttd = td; } } else { /* * Find the highest priority thread on the queue. If there * is a tie, use the thread that first appears in the queue * as it has been sleeping the longest since threads are * always added to the tail of sleep queues. 
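 * This is a simple linear scan; the chosen thread is then handed to
 * sleepq_resume_thread(), with the chain lock retained (SRQ_HOLD) unless
 * SLEEPQ_DROP was passed.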
*/ besttd = td = TAILQ_FIRST(head); while ((td = TAILQ_NEXT(td, td_slpq)) != NULL) { if (td->td_priority < besttd->td_priority) besttd = td; } } MPASS(besttd != NULL); wakeup_swapper = sleepq_resume_thread(sq, besttd, pri, (flags & SLEEPQ_DROP) ? 0 : SRQ_HOLD); return (wakeup_swapper); } static bool match_any(struct thread *td __unused) { return (true); } /* * Resume all threads sleeping on a specified wait channel. */ int sleepq_broadcast(const void *wchan, int flags, int pri, int queue) { struct sleepqueue *sq; CTR2(KTR_PROC, "sleepq_broadcast(%p, %d)", wchan, flags); KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); sq = sleepq_lookup(wchan); if (sq == NULL) return (0); KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE), ("%s: mismatch between sleep/wakeup and cv_*", __func__)); return (sleepq_remove_matching(sq, queue, match_any, pri)); } /* * Resume threads on the sleep queue that match the given predicate. */ int sleepq_remove_matching(struct sleepqueue *sq, int queue, bool (*matches)(struct thread *), int pri) { struct thread *td, *tdn; int wakeup_swapper; /* * The last thread will be given ownership of sq and may * re-enqueue itself before sleepq_resume_thread() returns, * so we must cache the "next" queue item at the beginning * of the final iteration. */ wakeup_swapper = 0; TAILQ_FOREACH_SAFE(td, &sq->sq_blocked[queue], td_slpq, tdn) { if (matches(td)) wakeup_swapper |= sleepq_resume_thread(sq, td, pri, SRQ_HOLD); } return (wakeup_swapper); } /* * Time sleeping threads out. When the timeout expires, the thread is * removed from the sleep queue and made runnable if it is still asleep. */ static void sleepq_timeout(void *arg) { struct sleepqueue_chain *sc __unused; struct sleepqueue *sq; struct thread *td; const void *wchan; int wakeup_swapper; td = arg; CTR3(KTR_PROC, "sleepq_timeout: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name); thread_lock(td); if (td->td_sleeptimo == 0 || td->td_sleeptimo > td->td_slpcallout.c_time) { /* * The thread does not want a timeout (yet). */ } else if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) { /* * See if the thread is asleep and get the wait * channel if it is. */ wchan = td->td_wchan; sc = SC_LOOKUP(wchan); THREAD_LOCKPTR_ASSERT(td, &sc->sc_lock); sq = sleepq_lookup(wchan); MPASS(sq != NULL); td->td_flags |= TDF_TIMEOUT; wakeup_swapper = sleepq_resume_thread(sq, td, 0, 0); if (wakeup_swapper) kick_proc0(); return; } else if (TD_ON_SLEEPQ(td)) { /* * If the thread is on the SLEEPQ but isn't sleeping * yet, it can either be on another CPU in between * sleepq_add() and one of the sleepq_*wait*() * routines or it can be in sleepq_catch_signals(). */ td->td_flags |= TDF_TIMEOUT; } thread_unlock(td); } /* * Resumes a specific thread from the sleep queue associated with a specific * wait channel if it is on that queue. */ void sleepq_remove(struct thread *td, const void *wchan) { struct sleepqueue_chain *sc; struct sleepqueue *sq; int wakeup_swapper; /* * Look up the sleep queue for this wait channel, then re-check * that the thread is asleep on that channel, if it is not, then * bail. */ MPASS(wchan != NULL); sc = SC_LOOKUP(wchan); mtx_lock_spin(&sc->sc_lock); /* * We can not lock the thread here as it may be sleeping on a * different sleepq. However, holding the sleepq lock for this * wchan can guarantee that we do not miss a wakeup for this * channel. The asserts below will catch any false positives. 
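 * (sleepq_resume_thread() takes the thread lock itself once the thread is
 * confirmed to be asleep on this wchan.)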
*/ if (!TD_ON_SLEEPQ(td) || td->td_wchan != wchan) { mtx_unlock_spin(&sc->sc_lock); return; } /* Thread is asleep on sleep queue sq, so wake it up. */ sq = sleepq_lookup(wchan); MPASS(sq != NULL); MPASS(td->td_wchan == wchan); wakeup_swapper = sleepq_resume_thread(sq, td, 0, 0); if (wakeup_swapper) kick_proc0(); } /* * Abort a thread as if an interrupt had occurred. Only abort * interruptible waits (unfortunately it isn't safe to abort others). * * Requires thread lock on entry, releases on return. */ int sleepq_abort(struct thread *td, int intrval) { struct sleepqueue *sq; const void *wchan; THREAD_LOCK_ASSERT(td, MA_OWNED); MPASS(TD_ON_SLEEPQ(td)); MPASS(td->td_flags & TDF_SINTR); MPASS((intrval == 0 && (td->td_flags & TDF_SIGWAIT) != 0) || intrval == EINTR || intrval == ERESTART); /* * If the TDF_TIMEOUT flag is set, just leave. A * timeout is scheduled anyhow. */ if (td->td_flags & TDF_TIMEOUT) { thread_unlock(td); return (0); } CTR3(KTR_PROC, "sleepq_abort: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name); td->td_intrval = intrval; /* * If the thread has not slept yet it will find the signal in * sleepq_catch_signals() and call sleepq_resume_thread. Otherwise * we have to do it here. */ if (!TD_IS_SLEEPING(td)) { thread_unlock(td); return (0); } wchan = td->td_wchan; MPASS(wchan != NULL); sq = sleepq_lookup(wchan); MPASS(sq != NULL); /* Thread is asleep on sleep queue sq, so wake it up. */ return (sleepq_resume_thread(sq, td, 0, 0)); } void sleepq_chains_remove_matching(bool (*matches)(struct thread *)) { struct sleepqueue_chain *sc; struct sleepqueue *sq, *sq1; int i, wakeup_swapper; wakeup_swapper = 0; for (sc = &sleepq_chains[0]; sc < sleepq_chains + SC_TABLESIZE; ++sc) { if (LIST_EMPTY(&sc->sc_queues)) { continue; } mtx_lock_spin(&sc->sc_lock); LIST_FOREACH_SAFE(sq, &sc->sc_queues, sq_hash, sq1) { for (i = 0; i < NR_SLEEPQS; ++i) { wakeup_swapper |= sleepq_remove_matching(sq, i, matches, 0); } } mtx_unlock_spin(&sc->sc_lock); } if (wakeup_swapper) { kick_proc0(); } } /* * Prints the stacks of all threads presently sleeping on wchan/queue to * the sbuf sb. Sets count_stacks_printed to the number of stacks actually * printed. Typically, this will equal the number of threads sleeping on the * queue, but may be less if sb overflowed before all stacks were printed. */ #ifdef STACK int sleepq_sbuf_print_stacks(struct sbuf *sb, const void *wchan, int queue, int *count_stacks_printed) { struct thread *td, *td_next; struct sleepqueue *sq; struct stack **st; struct sbuf **td_infos; int i, stack_idx, error, stacks_to_allocate; bool finished; error = 0; finished = false; KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); stacks_to_allocate = 10; for (i = 0; i < 3 && !finished ; i++) { /* We cannot malloc while holding the queue's spinlock, so * we do our mallocs now, and hope it is enough. If it * isn't, we will free these, drop the lock, malloc more, * and try again, up to a point. After that point we will * give up and report ENOMEM. We also cannot write to sb * during this time since the client may have set the * SBUF_AUTOEXTEND flag on their sbuf, which could cause a * malloc as we print to it. So we defer actually printing * to sb until after we drop the spinlock. */ /* Where we will store the stacks. 
*/ st = malloc(sizeof(struct stack *) * stacks_to_allocate, M_TEMP, M_WAITOK); for (stack_idx = 0; stack_idx < stacks_to_allocate; stack_idx++) st[stack_idx] = stack_create(M_WAITOK); /* Where we will store the td name, tid, etc. */ td_infos = malloc(sizeof(struct sbuf *) * stacks_to_allocate, M_TEMP, M_WAITOK); for (stack_idx = 0; stack_idx < stacks_to_allocate; stack_idx++) td_infos[stack_idx] = sbuf_new(NULL, NULL, MAXCOMLEN + sizeof(struct thread *) * 2 + 40, SBUF_FIXEDLEN); sleepq_lock(wchan); sq = sleepq_lookup(wchan); if (sq == NULL) { /* This sleepq does not exist; exit and return ENOENT. */ error = ENOENT; finished = true; sleepq_release(wchan); goto loop_end; } stack_idx = 0; /* Save thread info */ TAILQ_FOREACH_SAFE(td, &sq->sq_blocked[queue], td_slpq, td_next) { if (stack_idx >= stacks_to_allocate) goto loop_end; /* Note the td_lock is equal to the sleepq_lock here. */ (void)stack_save_td(st[stack_idx], td); sbuf_printf(td_infos[stack_idx], "%d: %s %p", td->td_tid, td->td_name, td); ++stack_idx; } finished = true; sleepq_release(wchan); /* Print the stacks */ for (i = 0; i < stack_idx; i++) { sbuf_finish(td_infos[i]); sbuf_printf(sb, "--- thread %s: ---\n", sbuf_data(td_infos[i])); stack_sbuf_print(sb, st[i]); - sbuf_printf(sb, "\n"); + sbuf_putc(sb, '\n'); error = sbuf_error(sb); if (error == 0) *count_stacks_printed = stack_idx; } loop_end: if (!finished) sleepq_release(wchan); for (stack_idx = 0; stack_idx < stacks_to_allocate; stack_idx++) stack_destroy(st[stack_idx]); for (stack_idx = 0; stack_idx < stacks_to_allocate; stack_idx++) sbuf_delete(td_infos[stack_idx]); free(st, M_TEMP); free(td_infos, M_TEMP); stacks_to_allocate *= 10; } if (!finished && error == 0) error = ENOMEM; return (error); } #endif #ifdef SLEEPQUEUE_PROFILING #define SLEEPQ_PROF_LOCATIONS 1024 #define SLEEPQ_SBUFSIZE 512 struct sleepq_prof { LIST_ENTRY(sleepq_prof) sp_link; const char *sp_wmesg; long sp_count; }; LIST_HEAD(sqphead, sleepq_prof); struct sqphead sleepq_prof_free; struct sqphead sleepq_hash[SC_TABLESIZE]; static struct sleepq_prof sleepq_profent[SLEEPQ_PROF_LOCATIONS]; static struct mtx sleepq_prof_lock; MTX_SYSINIT(sleepq_prof_lock, &sleepq_prof_lock, "sleepq_prof", MTX_SPIN); static void sleepq_profile(const char *wmesg) { struct sleepq_prof *sp; mtx_lock_spin(&sleepq_prof_lock); if (prof_enabled == 0) goto unlock; LIST_FOREACH(sp, &sleepq_hash[SC_HASH(wmesg)], sp_link) if (sp->sp_wmesg == wmesg) goto done; sp = LIST_FIRST(&sleepq_prof_free); if (sp == NULL) goto unlock; sp->sp_wmesg = wmesg; LIST_REMOVE(sp, sp_link); LIST_INSERT_HEAD(&sleepq_hash[SC_HASH(wmesg)], sp, sp_link); done: sp->sp_count++; unlock: mtx_unlock_spin(&sleepq_prof_lock); return; } static void sleepq_prof_reset(void) { struct sleepq_prof *sp; int enabled; int i; mtx_lock_spin(&sleepq_prof_lock); enabled = prof_enabled; prof_enabled = 0; for (i = 0; i < SC_TABLESIZE; i++) LIST_INIT(&sleepq_hash[i]); LIST_INIT(&sleepq_prof_free); for (i = 0; i < SLEEPQ_PROF_LOCATIONS; i++) { sp = &sleepq_profent[i]; sp->sp_wmesg = NULL; sp->sp_count = 0; LIST_INSERT_HEAD(&sleepq_prof_free, sp, sp_link); } prof_enabled = enabled; mtx_unlock_spin(&sleepq_prof_lock); } static int enable_sleepq_prof(SYSCTL_HANDLER_ARGS) { int error, v; v = prof_enabled; error = sysctl_handle_int(oidp, &v, v, req); if (error) return (error); if (req->newptr == NULL) return (error); if (v == prof_enabled) return (0); if (v == 1) sleepq_prof_reset(); mtx_lock_spin(&sleepq_prof_lock); prof_enabled = !!v; mtx_unlock_spin(&sleepq_prof_lock); return (0); } 
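/*
 * Illustrative use of the profiling knobs on a kernel built with
 * SLEEPQUEUE_PROFILING (names follow the SYSCTL_PROC definitions below):
 *
 *	sysctl debug.sleepq.enable=1	# enable profiling (1 also resets)
 *	sysctl debug.sleepq.stats	# dump the wmesg/count table
 *	sysctl debug.sleepq.reset=1	# clear the accumulated counts
 */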
static int reset_sleepq_prof_stats(SYSCTL_HANDLER_ARGS) { int error, v; v = 0; error = sysctl_handle_int(oidp, &v, 0, req); if (error) return (error); if (req->newptr == NULL) return (error); if (v == 0) return (0); sleepq_prof_reset(); return (0); } static int dump_sleepq_prof_stats(SYSCTL_HANDLER_ARGS) { struct sleepq_prof *sp; struct sbuf *sb; int enabled; int error; int i; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, SLEEPQ_SBUFSIZE, req); - sbuf_printf(sb, "\nwmesg\tcount\n"); + sbuf_cat(sb, "\nwmesg\tcount\n"); enabled = prof_enabled; mtx_lock_spin(&sleepq_prof_lock); prof_enabled = 0; mtx_unlock_spin(&sleepq_prof_lock); for (i = 0; i < SC_TABLESIZE; i++) { LIST_FOREACH(sp, &sleepq_hash[i], sp_link) { sbuf_printf(sb, "%s\t%ld\n", sp->sp_wmesg, sp->sp_count); } } mtx_lock_spin(&sleepq_prof_lock); prof_enabled = enabled; mtx_unlock_spin(&sleepq_prof_lock); error = sbuf_finish(sb); sbuf_delete(sb); return (error); } SYSCTL_PROC(_debug_sleepq, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, NULL, 0, dump_sleepq_prof_stats, "A", "Sleepqueue profiling statistics"); SYSCTL_PROC(_debug_sleepq, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, reset_sleepq_prof_stats, "I", "Reset sleepqueue profiling statistics"); SYSCTL_PROC(_debug_sleepq, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, enable_sleepq_prof, "I", "Enable sleepqueue profiling"); #endif #ifdef DDB DB_SHOW_COMMAND(sleepq, db_show_sleepqueue) { struct sleepqueue_chain *sc; struct sleepqueue *sq; #ifdef INVARIANTS struct lock_object *lock; #endif struct thread *td; void *wchan; int i; if (!have_addr) return; /* * First, see if there is an active sleep queue for the wait channel * indicated by the address. */ wchan = (void *)addr; sc = SC_LOOKUP(wchan); LIST_FOREACH(sq, &sc->sc_queues, sq_hash) if (sq->sq_wchan == wchan) goto found; /* * Second, see if there is an active sleep queue at the address * indicated. */ for (i = 0; i < SC_TABLESIZE; i++) LIST_FOREACH(sq, &sleepq_chains[i].sc_queues, sq_hash) { if (sq == (struct sleepqueue *)addr) goto found; } db_printf("Unable to locate a sleep queue via %p\n", (void *)addr); return; found: db_printf("Wait channel: %p\n", sq->sq_wchan); db_printf("Queue type: %d\n", sq->sq_type); #ifdef INVARIANTS if (sq->sq_lock) { lock = sq->sq_lock; db_printf("Associated Interlock: %p - (%s) %s\n", lock, LOCK_CLASS(lock)->lc_name, lock->lo_name); } #endif db_printf("Blocked threads:\n"); for (i = 0; i < NR_SLEEPQS; i++) { db_printf("\nQueue[%d]:\n", i); if (TAILQ_EMPTY(&sq->sq_blocked[i])) db_printf("\tempty\n"); else TAILQ_FOREACH(td, &sq->sq_blocked[i], td_slpq) { db_printf("\t%p (tid %d, pid %d, \"%s\")\n", td, td->td_tid, td->td_proc->p_pid, td->td_name); } db_printf("(expected: %u)\n", sq->sq_blockedcnt[i]); } } /* Alias 'show sleepqueue' to 'show sleepq'. */ DB_SHOW_ALIAS(sleepqueue, db_show_sleepqueue); #endif diff --git a/sys/kern/subr_stats.c b/sys/kern/subr_stats.c index 0e7d2fad5f68..6e8ec44681e7 100644 --- a/sys/kern/subr_stats.c +++ b/sys/kern/subr_stats.c @@ -1,3954 +1,3954 @@ /*- * Copyright (c) 2014-2018 Netflix, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Author: Lawrence Stewart */ #include #include #include #include #include #include #include #include #include #include #if defined(DIAGNOSTIC) #include #endif #include /* Must come after qmath.h and arb.h */ #include #include #include #ifdef _KERNEL #include #include #include #include #include #else /* ! _KERNEL */ #include #include #include #include #include #endif /* _KERNEL */ struct voistatdata_voistate { /* Previous VOI value for diff calculation. */ struct voistatdata_numeric prev; }; #define VS_VSDVALID 0x0001 /* Stat's voistatdata updated at least once. */ struct voistat { int8_t stype; /* Type of stat e.g. VS_STYPE_SUM. */ enum vsd_dtype dtype : 8; /* Data type of this stat's data. */ uint16_t data_off; /* Blob offset for this stat's data. */ uint16_t dsz; /* Size of stat's data. */ #define VS_EBITS 8 uint16_t errs : VS_EBITS;/* Non-wrapping error count. */ uint16_t flags : 16 - VS_EBITS; }; /* The voistat error count is capped to avoid wrapping. */ #define VS_INCERRS(vs) do { \ if ((vs)->errs < (1U << VS_EBITS) - 1) \ (vs)->errs++; \ } while (0) /* * Ideas for flags: * - Global or entity specific (global would imply use of counter(9)?) * - Whether to reset stats on read or not * - Signal an overflow? * - Compressed voistat array */ #define VOI_REQSTATE 0x0001 /* VOI requires VS_STYPE_VOISTATE. */ struct voi { int16_t id; /* VOI id. */ enum vsd_dtype dtype : 8; /* Data type of the VOI itself. */ int8_t voistatmaxid; /* Largest allocated voistat index. */ uint16_t stats_off; /* Blob offset for this VOIs stats. */ uint16_t flags; }; /* * Memory for the entire blob is allocated as a slab and then offsets are * maintained to carve up the slab into sections holding different data types. * * Ideas for flags: * - Compressed voi array (trade off memory usage vs search time) * - Units of offsets (default bytes, flag for e.g. vm_page/KiB/Mib) */ struct statsblobv1 { uint8_t abi; uint8_t endian; uint16_t flags; uint16_t maxsz; uint16_t cursz; /* Fields from here down are opaque to consumers. */ uint32_t tplhash; /* Base template hash ID. */ uint16_t stats_off; /* voistat array blob offset. */ uint16_t statsdata_off; /* voistatdata array blob offset. */ sbintime_t created; /* Blob creation time. */ sbintime_t lastrst; /* Time of last reset. */ struct voi vois[]; /* Array indexed by [voi_id]. 
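 * The same slab continues with the voistat array at stats_off and the
 * voistatdata region at statsdata_off; all internal references are byte
 * offsets resolved via BLOB_OFFSET().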
*/ } __aligned(sizeof(void *)); _Static_assert(offsetof(struct statsblobv1, cursz) + SIZEOF_MEMBER(struct statsblobv1, cursz) == offsetof(struct statsblob, opaque), "statsblobv1 ABI mismatch"); struct statsblobv1_tpl { struct metablob *mb; struct statsblobv1 *sb; }; /* Context passed to iterator callbacks. */ struct sb_iter_ctx { void *usrctx; /* Caller supplied context. */ uint32_t flags; /* Flags for current iteration. */ int16_t vslot; /* struct voi slot index. */ int8_t vsslot; /* struct voistat slot index. */ }; struct sb_tostrcb_ctx { struct sbuf *buf; struct statsblob_tpl *tpl; enum sb_str_fmt fmt; uint32_t flags; }; struct sb_visitcb_ctx { stats_blob_visitcb_t cb; void *usrctx; }; /* Stats blob iterator callback. */ typedef int (*stats_v1_blob_itercb_t)(struct statsblobv1 *sb, struct voi *v, struct voistat *vs, struct sb_iter_ctx *ctx); #ifdef _KERNEL static struct rwlock tpllistlock; RW_SYSINIT(stats_tpl_list, &tpllistlock, "Stat template list lock"); #define TPL_LIST_RLOCK() rw_rlock(&tpllistlock) #define TPL_LIST_RUNLOCK() rw_runlock(&tpllistlock) #define TPL_LIST_WLOCK() rw_wlock(&tpllistlock) #define TPL_LIST_WUNLOCK() rw_wunlock(&tpllistlock) #define TPL_LIST_LOCK_ASSERT() rw_assert(&tpllistlock, RA_LOCKED) #define TPL_LIST_RLOCK_ASSERT() rw_assert(&tpllistlock, RA_RLOCKED) #define TPL_LIST_WLOCK_ASSERT() rw_assert(&tpllistlock, RA_WLOCKED) MALLOC_DEFINE(M_STATS, "stats(9) related memory", "stats(9) related memory"); #define stats_free(ptr) free((ptr), M_STATS) #else /* ! _KERNEL */ static void stats_constructor(void); static void stats_destructor(void); static pthread_rwlock_t tpllistlock; #define TPL_LIST_UNLOCK() pthread_rwlock_unlock(&tpllistlock) #define TPL_LIST_RLOCK() pthread_rwlock_rdlock(&tpllistlock) #define TPL_LIST_RUNLOCK() TPL_LIST_UNLOCK() #define TPL_LIST_WLOCK() pthread_rwlock_wrlock(&tpllistlock) #define TPL_LIST_WUNLOCK() TPL_LIST_UNLOCK() #define TPL_LIST_LOCK_ASSERT() do { } while (0) #define TPL_LIST_RLOCK_ASSERT() do { } while (0) #define TPL_LIST_WLOCK_ASSERT() do { } while (0) #ifdef NDEBUG #define KASSERT(cond, msg) do {} while (0) #define stats_abort() do {} while (0) #else /* ! NDEBUG */ #define KASSERT(cond, msg) do { \ if (!(cond)) { \ panic msg; \ } \ } while (0) #define stats_abort() abort() #endif /* NDEBUG */ #define stats_free(ptr) free(ptr) #define panic(fmt, ...) do { \ fprintf(stderr, (fmt), ##__VA_ARGS__); \ stats_abort(); \ } while (0) #endif /* _KERNEL */ #define SB_V1_MAXSZ 65535 /* Obtain a blob offset pointer. */ #define BLOB_OFFSET(sb, off) ((void *)(((uint8_t *)(sb)) + (off))) /* * Number of VOIs in the blob's vois[] array. By virtue of struct voi being a * power of 2 size, we can shift instead of divide. The shift amount must be * updated if sizeof(struct voi) ever changes, which the assert should catch. */ #define NVOIS(sb) ((int32_t)((((struct statsblobv1 *)(sb))->stats_off - \ sizeof(struct statsblobv1)) >> 3)) _Static_assert(sizeof(struct voi) == 8, "statsblobv1 voi ABI mismatch"); /* Try restrict names to alphanumeric and underscore to simplify JSON compat. 
*/ const char *vs_stype2name[VS_NUM_STYPES] = { [VS_STYPE_VOISTATE] = "VOISTATE", [VS_STYPE_SUM] = "SUM", [VS_STYPE_MAX] = "MAX", [VS_STYPE_MIN] = "MIN", [VS_STYPE_HIST] = "HIST", [VS_STYPE_TDGST] = "TDGST", }; const char *vs_stype2desc[VS_NUM_STYPES] = { [VS_STYPE_VOISTATE] = "VOI related state data (not a real stat)", [VS_STYPE_SUM] = "Simple arithmetic accumulator", [VS_STYPE_MAX] = "Maximum observed VOI value", [VS_STYPE_MIN] = "Minimum observed VOI value", [VS_STYPE_HIST] = "Histogram of observed VOI values", [VS_STYPE_TDGST] = "t-digest of observed VOI values", }; const char *vsd_dtype2name[VSD_NUM_DTYPES] = { [VSD_DTYPE_VOISTATE] = "VOISTATE", [VSD_DTYPE_INT_S32] = "INT_S32", [VSD_DTYPE_INT_U32] = "INT_U32", [VSD_DTYPE_INT_S64] = "INT_S64", [VSD_DTYPE_INT_U64] = "INT_U64", [VSD_DTYPE_INT_SLONG] = "INT_SLONG", [VSD_DTYPE_INT_ULONG] = "INT_ULONG", [VSD_DTYPE_Q_S32] = "Q_S32", [VSD_DTYPE_Q_U32] = "Q_U32", [VSD_DTYPE_Q_S64] = "Q_S64", [VSD_DTYPE_Q_U64] = "Q_U64", [VSD_DTYPE_CRHIST32] = "CRHIST32", [VSD_DTYPE_DRHIST32] = "DRHIST32", [VSD_DTYPE_DVHIST32] = "DVHIST32", [VSD_DTYPE_CRHIST64] = "CRHIST64", [VSD_DTYPE_DRHIST64] = "DRHIST64", [VSD_DTYPE_DVHIST64] = "DVHIST64", [VSD_DTYPE_TDGSTCLUST32] = "TDGSTCLUST32", [VSD_DTYPE_TDGSTCLUST64] = "TDGSTCLUST64", }; const size_t vsd_dtype2size[VSD_NUM_DTYPES] = { [VSD_DTYPE_VOISTATE] = sizeof(struct voistatdata_voistate), [VSD_DTYPE_INT_S32] = sizeof(struct voistatdata_int32), [VSD_DTYPE_INT_U32] = sizeof(struct voistatdata_int32), [VSD_DTYPE_INT_S64] = sizeof(struct voistatdata_int64), [VSD_DTYPE_INT_U64] = sizeof(struct voistatdata_int64), [VSD_DTYPE_INT_SLONG] = sizeof(struct voistatdata_intlong), [VSD_DTYPE_INT_ULONG] = sizeof(struct voistatdata_intlong), [VSD_DTYPE_Q_S32] = sizeof(struct voistatdata_q32), [VSD_DTYPE_Q_U32] = sizeof(struct voistatdata_q32), [VSD_DTYPE_Q_S64] = sizeof(struct voistatdata_q64), [VSD_DTYPE_Q_U64] = sizeof(struct voistatdata_q64), [VSD_DTYPE_CRHIST32] = sizeof(struct voistatdata_crhist32), [VSD_DTYPE_DRHIST32] = sizeof(struct voistatdata_drhist32), [VSD_DTYPE_DVHIST32] = sizeof(struct voistatdata_dvhist32), [VSD_DTYPE_CRHIST64] = sizeof(struct voistatdata_crhist64), [VSD_DTYPE_DRHIST64] = sizeof(struct voistatdata_drhist64), [VSD_DTYPE_DVHIST64] = sizeof(struct voistatdata_dvhist64), [VSD_DTYPE_TDGSTCLUST32] = sizeof(struct voistatdata_tdgstclust32), [VSD_DTYPE_TDGSTCLUST64] = sizeof(struct voistatdata_tdgstclust64), }; static const bool vsd_compoundtype[VSD_NUM_DTYPES] = { [VSD_DTYPE_VOISTATE] = true, [VSD_DTYPE_INT_S32] = false, [VSD_DTYPE_INT_U32] = false, [VSD_DTYPE_INT_S64] = false, [VSD_DTYPE_INT_U64] = false, [VSD_DTYPE_INT_SLONG] = false, [VSD_DTYPE_INT_ULONG] = false, [VSD_DTYPE_Q_S32] = false, [VSD_DTYPE_Q_U32] = false, [VSD_DTYPE_Q_S64] = false, [VSD_DTYPE_Q_U64] = false, [VSD_DTYPE_CRHIST32] = true, [VSD_DTYPE_DRHIST32] = true, [VSD_DTYPE_DVHIST32] = true, [VSD_DTYPE_CRHIST64] = true, [VSD_DTYPE_DRHIST64] = true, [VSD_DTYPE_DVHIST64] = true, [VSD_DTYPE_TDGSTCLUST32] = true, [VSD_DTYPE_TDGSTCLUST64] = true, }; const struct voistatdata_numeric numeric_limits[2][VSD_DTYPE_Q_U64 + 1] = { [LIM_MIN] = { [VSD_DTYPE_VOISTATE] = {0}, [VSD_DTYPE_INT_S32] = {.int32 = {.s32 = INT32_MIN}}, [VSD_DTYPE_INT_U32] = {.int32 = {.u32 = 0}}, [VSD_DTYPE_INT_S64] = {.int64 = {.s64 = INT64_MIN}}, [VSD_DTYPE_INT_U64] = {.int64 = {.u64 = 0}}, [VSD_DTYPE_INT_SLONG] = {.intlong = {.slong = LONG_MIN}}, [VSD_DTYPE_INT_ULONG] = {.intlong = {.ulong = 0}}, [VSD_DTYPE_Q_S32] = {.q32 = {.sq32 = Q_IFMINVAL(INT32_MIN)}}, [VSD_DTYPE_Q_U32] 
= {.q32 = {.uq32 = 0}}, [VSD_DTYPE_Q_S64] = {.q64 = {.sq64 = Q_IFMINVAL(INT64_MIN)}}, [VSD_DTYPE_Q_U64] = {.q64 = {.uq64 = 0}}, }, [LIM_MAX] = { [VSD_DTYPE_VOISTATE] = {0}, [VSD_DTYPE_INT_S32] = {.int32 = {.s32 = INT32_MAX}}, [VSD_DTYPE_INT_U32] = {.int32 = {.u32 = UINT32_MAX}}, [VSD_DTYPE_INT_S64] = {.int64 = {.s64 = INT64_MAX}}, [VSD_DTYPE_INT_U64] = {.int64 = {.u64 = UINT64_MAX}}, [VSD_DTYPE_INT_SLONG] = {.intlong = {.slong = LONG_MAX}}, [VSD_DTYPE_INT_ULONG] = {.intlong = {.ulong = ULONG_MAX}}, [VSD_DTYPE_Q_S32] = {.q32 = {.sq32 = Q_IFMAXVAL(INT32_MAX)}}, [VSD_DTYPE_Q_U32] = {.q32 = {.uq32 = Q_IFMAXVAL(UINT32_MAX)}}, [VSD_DTYPE_Q_S64] = {.q64 = {.sq64 = Q_IFMAXVAL(INT64_MAX)}}, [VSD_DTYPE_Q_U64] = {.q64 = {.uq64 = Q_IFMAXVAL(UINT64_MAX)}}, } }; /* tpllistlock protects tpllist and ntpl */ static uint32_t ntpl; static struct statsblob_tpl **tpllist; static inline void * stats_realloc(void *ptr, size_t oldsz, size_t newsz, int flags); //static void stats_v1_blob_finalise(struct statsblobv1 *sb); static int stats_v1_blob_init_locked(struct statsblobv1 *sb, uint32_t tpl_id, uint32_t flags); static int stats_v1_blob_expand(struct statsblobv1 **sbpp, int newvoibytes, int newvoistatbytes, int newvoistatdatabytes); static void stats_v1_blob_iter(struct statsblobv1 *sb, stats_v1_blob_itercb_t icb, void *usrctx, uint32_t flags); static inline int stats_v1_vsd_tdgst_add(enum vsd_dtype vs_dtype, struct voistatdata_tdgst *tdgst, s64q_t x, uint64_t weight, int attempt); static inline int ctd32cmp(const struct voistatdata_tdgstctd32 *c1, const struct voistatdata_tdgstctd32 *c2) { KASSERT(Q_PRECEQ(c1->mu, c2->mu), ("%s: Q_RELPREC(c1->mu,c2->mu)=%d", __func__, Q_RELPREC(c1->mu, c2->mu))); return (Q_QLTQ(c1->mu, c2->mu) ? -1 : 1); } ARB_GENERATE_STATIC(ctdth32, voistatdata_tdgstctd32, ctdlnk, ctd32cmp); static inline int ctd64cmp(const struct voistatdata_tdgstctd64 *c1, const struct voistatdata_tdgstctd64 *c2) { KASSERT(Q_PRECEQ(c1->mu, c2->mu), ("%s: Q_RELPREC(c1->mu,c2->mu)=%d", __func__, Q_RELPREC(c1->mu, c2->mu))); return (Q_QLTQ(c1->mu, c2->mu) ? -1 : 1); } ARB_GENERATE_STATIC(ctdth64, voistatdata_tdgstctd64, ctdlnk, ctd64cmp); #ifdef DIAGNOSTIC RB_GENERATE_STATIC(rbctdth32, voistatdata_tdgstctd32, rblnk, ctd32cmp); RB_GENERATE_STATIC(rbctdth64, voistatdata_tdgstctd64, rblnk, ctd64cmp); #endif static inline sbintime_t stats_sbinuptime(void) { sbintime_t sbt; #ifdef _KERNEL sbt = sbinuptime(); #else /* ! _KERNEL */ struct timespec tp; clock_gettime(CLOCK_MONOTONIC_FAST, &tp); sbt = tstosbt(tp); #endif /* _KERNEL */ return (sbt); } static inline void * stats_realloc(void *ptr, size_t oldsz, size_t newsz, int flags) { #ifdef _KERNEL /* Default to M_NOWAIT if neither M_NOWAIT or M_WAITOK are set. */ if (!(flags & (M_WAITOK | M_NOWAIT))) flags |= M_NOWAIT; ptr = realloc(ptr, newsz, M_STATS, flags); #else /* ! 
_KERNEL */ ptr = realloc(ptr, newsz); if ((flags & M_ZERO) && ptr != NULL) { if (oldsz == 0) memset(ptr, '\0', newsz); else if (newsz > oldsz) memset(BLOB_OFFSET(ptr, oldsz), '\0', newsz - oldsz); } #endif /* _KERNEL */ return (ptr); } static inline char * stats_strdup(const char *s, #ifdef _KERNEL int flags) { char *copy; size_t len; if (!(flags & (M_WAITOK | M_NOWAIT))) flags |= M_NOWAIT; len = strlen(s) + 1; if ((copy = malloc(len, M_STATS, flags)) != NULL) bcopy(s, copy, len); return (copy); #else int flags __unused) { return (strdup(s)); #endif } static inline void stats_tpl_update_hash(struct statsblob_tpl *tpl) { TPL_LIST_WLOCK_ASSERT(); tpl->mb->tplhash = hash32_str(tpl->mb->tplname, 0); for (int voi_id = 0; voi_id < NVOIS(tpl->sb); voi_id++) { if (tpl->mb->voi_meta[voi_id].name != NULL) tpl->mb->tplhash = hash32_str( tpl->mb->voi_meta[voi_id].name, tpl->mb->tplhash); } tpl->mb->tplhash = hash32_buf(tpl->sb, tpl->sb->cursz, tpl->mb->tplhash); } static inline uint64_t stats_pow_u64(uint64_t base, uint64_t exp) { uint64_t result = 1; while (exp) { if (exp & 1) result *= base; exp >>= 1; base *= base; } return (result); } static inline int stats_vss_hist_bkt_hlpr(struct vss_hist_hlpr_info *info, uint32_t curbkt, struct voistatdata_numeric *bkt_lb, struct voistatdata_numeric *bkt_ub) { uint64_t step = 0; int error = 0; switch (info->scheme) { case BKT_LIN: step = info->lin.stepinc; break; case BKT_EXP: step = stats_pow_u64(info->exp.stepbase, info->exp.stepexp + curbkt); break; case BKT_LINEXP: { uint64_t curstepexp = 1; switch (info->voi_dtype) { case VSD_DTYPE_INT_S32: while ((int32_t)stats_pow_u64(info->linexp.stepbase, curstepexp) <= bkt_lb->int32.s32) curstepexp++; break; case VSD_DTYPE_INT_U32: while ((uint32_t)stats_pow_u64(info->linexp.stepbase, curstepexp) <= bkt_lb->int32.u32) curstepexp++; break; case VSD_DTYPE_INT_S64: while ((int64_t)stats_pow_u64(info->linexp.stepbase, curstepexp) <= bkt_lb->int64.s64) curstepexp++; break; case VSD_DTYPE_INT_U64: while ((uint64_t)stats_pow_u64(info->linexp.stepbase, curstepexp) <= bkt_lb->int64.u64) curstepexp++; break; case VSD_DTYPE_INT_SLONG: while ((long)stats_pow_u64(info->linexp.stepbase, curstepexp) <= bkt_lb->intlong.slong) curstepexp++; break; case VSD_DTYPE_INT_ULONG: while ((unsigned long)stats_pow_u64(info->linexp.stepbase, curstepexp) <= bkt_lb->intlong.ulong) curstepexp++; break; case VSD_DTYPE_Q_S32: while ((s32q_t)stats_pow_u64(info->linexp.stepbase, curstepexp) <= Q_GIVAL(bkt_lb->q32.sq32)) break; case VSD_DTYPE_Q_U32: while ((u32q_t)stats_pow_u64(info->linexp.stepbase, curstepexp) <= Q_GIVAL(bkt_lb->q32.uq32)) break; case VSD_DTYPE_Q_S64: while ((s64q_t)stats_pow_u64(info->linexp.stepbase, curstepexp) <= Q_GIVAL(bkt_lb->q64.sq64)) curstepexp++; break; case VSD_DTYPE_Q_U64: while ((u64q_t)stats_pow_u64(info->linexp.stepbase, curstepexp) <= Q_GIVAL(bkt_lb->q64.uq64)) curstepexp++; break; default: break; } step = stats_pow_u64(info->linexp.stepbase, curstepexp) / info->linexp.linstepdiv; if (step == 0) step = 1; break; } default: break; } if (info->scheme == BKT_USR) { *bkt_lb = info->usr.bkts[curbkt].lb; *bkt_ub = info->usr.bkts[curbkt].ub; } else if (step != 0) { switch (info->voi_dtype) { case VSD_DTYPE_INT_S32: bkt_ub->int32.s32 += (int32_t)step; break; case VSD_DTYPE_INT_U32: bkt_ub->int32.u32 += (uint32_t)step; break; case VSD_DTYPE_INT_S64: bkt_ub->int64.s64 += (int64_t)step; break; case VSD_DTYPE_INT_U64: bkt_ub->int64.u64 += (uint64_t)step; break; case VSD_DTYPE_INT_SLONG: bkt_ub->intlong.slong += (long)step; 
break; case VSD_DTYPE_INT_ULONG: bkt_ub->intlong.ulong += (unsigned long)step; break; case VSD_DTYPE_Q_S32: error = Q_QADDI(&bkt_ub->q32.sq32, step); break; case VSD_DTYPE_Q_U32: error = Q_QADDI(&bkt_ub->q32.uq32, step); break; case VSD_DTYPE_Q_S64: error = Q_QADDI(&bkt_ub->q64.sq64, step); break; case VSD_DTYPE_Q_U64: error = Q_QADDI(&bkt_ub->q64.uq64, step); break; default: break; } } else { /* info->scheme != BKT_USR && step == 0 */ return (EINVAL); } return (error); } static uint32_t stats_vss_hist_nbkts_hlpr(struct vss_hist_hlpr_info *info) { struct voistatdata_numeric bkt_lb, bkt_ub; uint32_t nbkts; int done; if (info->scheme == BKT_USR) { /* XXXLAS: Setting info->{lb,ub} from macro is tricky. */ info->lb = info->usr.bkts[0].lb; info->ub = info->usr.bkts[info->usr.nbkts - 1].lb; } nbkts = 0; done = 0; bkt_ub = info->lb; do { bkt_lb = bkt_ub; if (stats_vss_hist_bkt_hlpr(info, nbkts++, &bkt_lb, &bkt_ub)) return (0); if (info->scheme == BKT_USR) done = (nbkts == info->usr.nbkts); else { switch (info->voi_dtype) { case VSD_DTYPE_INT_S32: done = (bkt_ub.int32.s32 > info->ub.int32.s32); break; case VSD_DTYPE_INT_U32: done = (bkt_ub.int32.u32 > info->ub.int32.u32); break; case VSD_DTYPE_INT_S64: done = (bkt_ub.int64.s64 > info->ub.int64.s64); break; case VSD_DTYPE_INT_U64: done = (bkt_ub.int64.u64 > info->ub.int64.u64); break; case VSD_DTYPE_INT_SLONG: done = (bkt_ub.intlong.slong > info->ub.intlong.slong); break; case VSD_DTYPE_INT_ULONG: done = (bkt_ub.intlong.ulong > info->ub.intlong.ulong); break; case VSD_DTYPE_Q_S32: done = Q_QGTQ(bkt_ub.q32.sq32, info->ub.q32.sq32); break; case VSD_DTYPE_Q_U32: done = Q_QGTQ(bkt_ub.q32.uq32, info->ub.q32.uq32); break; case VSD_DTYPE_Q_S64: done = Q_QGTQ(bkt_ub.q64.sq64, info->ub.q64.sq64); break; case VSD_DTYPE_Q_U64: done = Q_QGTQ(bkt_ub.q64.uq64, info->ub.q64.uq64); break; default: return (0); } } } while (!done); if (info->flags & VSD_HIST_LBOUND_INF) nbkts++; if (info->flags & VSD_HIST_UBOUND_INF) nbkts++; return (nbkts); } int stats_vss_hist_hlpr(enum vsd_dtype voi_dtype, struct voistatspec *vss, struct vss_hist_hlpr_info *info) { struct voistatdata_hist *hist; struct voistatdata_numeric bkt_lb, bkt_ub, *lbinfbktlb, *lbinfbktub, *ubinfbktlb, *ubinfbktub; uint32_t bkt, nbkts, nloop; if (vss == NULL || info == NULL || (info->flags & (VSD_HIST_LBOUND_INF|VSD_HIST_UBOUND_INF) && (info->hist_dtype == VSD_DTYPE_DVHIST32 || info->hist_dtype == VSD_DTYPE_DVHIST64))) return (EINVAL); info->voi_dtype = voi_dtype; if ((nbkts = stats_vss_hist_nbkts_hlpr(info)) == 0) return (EINVAL); switch (info->hist_dtype) { case VSD_DTYPE_CRHIST32: vss->vsdsz = HIST_NBKTS2VSDSZ(crhist32, nbkts); break; case VSD_DTYPE_DRHIST32: vss->vsdsz = HIST_NBKTS2VSDSZ(drhist32, nbkts); break; case VSD_DTYPE_DVHIST32: vss->vsdsz = HIST_NBKTS2VSDSZ(dvhist32, nbkts); break; case VSD_DTYPE_CRHIST64: vss->vsdsz = HIST_NBKTS2VSDSZ(crhist64, nbkts); break; case VSD_DTYPE_DRHIST64: vss->vsdsz = HIST_NBKTS2VSDSZ(drhist64, nbkts); break; case VSD_DTYPE_DVHIST64: vss->vsdsz = HIST_NBKTS2VSDSZ(dvhist64, nbkts); break; default: return (EINVAL); } vss->iv = stats_realloc(NULL, 0, vss->vsdsz, M_ZERO); if (vss->iv == NULL) return (ENOMEM); hist = (struct voistatdata_hist *)vss->iv; bkt_ub = info->lb; for (bkt = (info->flags & VSD_HIST_LBOUND_INF), nloop = 0; bkt < nbkts; bkt++, nloop++) { bkt_lb = bkt_ub; if (stats_vss_hist_bkt_hlpr(info, nloop, &bkt_lb, &bkt_ub)) return (EINVAL); switch (info->hist_dtype) { case VSD_DTYPE_CRHIST32: VSD(crhist32, hist)->bkts[bkt].lb = bkt_lb; break; case 
VSD_DTYPE_DRHIST32: VSD(drhist32, hist)->bkts[bkt].lb = bkt_lb; VSD(drhist32, hist)->bkts[bkt].ub = bkt_ub; break; case VSD_DTYPE_DVHIST32: VSD(dvhist32, hist)->bkts[bkt].val = bkt_lb; break; case VSD_DTYPE_CRHIST64: VSD(crhist64, hist)->bkts[bkt].lb = bkt_lb; break; case VSD_DTYPE_DRHIST64: VSD(drhist64, hist)->bkts[bkt].lb = bkt_lb; VSD(drhist64, hist)->bkts[bkt].ub = bkt_ub; break; case VSD_DTYPE_DVHIST64: VSD(dvhist64, hist)->bkts[bkt].val = bkt_lb; break; default: return (EINVAL); } } lbinfbktlb = lbinfbktub = ubinfbktlb = ubinfbktub = NULL; switch (info->hist_dtype) { case VSD_DTYPE_CRHIST32: lbinfbktlb = &VSD(crhist32, hist)->bkts[0].lb; ubinfbktlb = &VSD(crhist32, hist)->bkts[nbkts - 1].lb; break; case VSD_DTYPE_DRHIST32: lbinfbktlb = &VSD(drhist32, hist)->bkts[0].lb; lbinfbktub = &VSD(drhist32, hist)->bkts[0].ub; ubinfbktlb = &VSD(drhist32, hist)->bkts[nbkts - 1].lb; ubinfbktub = &VSD(drhist32, hist)->bkts[nbkts - 1].ub; break; case VSD_DTYPE_CRHIST64: lbinfbktlb = &VSD(crhist64, hist)->bkts[0].lb; ubinfbktlb = &VSD(crhist64, hist)->bkts[nbkts - 1].lb; break; case VSD_DTYPE_DRHIST64: lbinfbktlb = &VSD(drhist64, hist)->bkts[0].lb; lbinfbktub = &VSD(drhist64, hist)->bkts[0].ub; ubinfbktlb = &VSD(drhist64, hist)->bkts[nbkts - 1].lb; ubinfbktub = &VSD(drhist64, hist)->bkts[nbkts - 1].ub; break; case VSD_DTYPE_DVHIST32: case VSD_DTYPE_DVHIST64: break; default: return (EINVAL); } if ((info->flags & VSD_HIST_LBOUND_INF) && lbinfbktlb) { *lbinfbktlb = numeric_limits[LIM_MIN][info->voi_dtype]; /* * Assignment from numeric_limit array for Q types assigns max * possible integral/fractional value for underlying data type, * but we must set control bits for this specific histogram per * the user's choice of fractional bits, which we extract from * info->lb. */ if (info->voi_dtype == VSD_DTYPE_Q_S32 || info->voi_dtype == VSD_DTYPE_Q_U32) { /* Signedness doesn't matter for setting control bits. */ Q_SCVAL(lbinfbktlb->q32.sq32, Q_GCVAL(info->lb.q32.sq32)); } else if (info->voi_dtype == VSD_DTYPE_Q_S64 || info->voi_dtype == VSD_DTYPE_Q_U64) { /* Signedness doesn't matter for setting control bits. 
*/ Q_SCVAL(lbinfbktlb->q64.sq64, Q_GCVAL(info->lb.q64.sq64)); } if (lbinfbktub) *lbinfbktub = info->lb; } if ((info->flags & VSD_HIST_UBOUND_INF) && ubinfbktlb) { *ubinfbktlb = bkt_lb; if (ubinfbktub) { *ubinfbktub = numeric_limits[LIM_MAX][info->voi_dtype]; if (info->voi_dtype == VSD_DTYPE_Q_S32 || info->voi_dtype == VSD_DTYPE_Q_U32) { Q_SCVAL(ubinfbktub->q32.sq32, Q_GCVAL(info->lb.q32.sq32)); } else if (info->voi_dtype == VSD_DTYPE_Q_S64 || info->voi_dtype == VSD_DTYPE_Q_U64) { Q_SCVAL(ubinfbktub->q64.sq64, Q_GCVAL(info->lb.q64.sq64)); } } } return (0); } int stats_vss_tdgst_hlpr(enum vsd_dtype voi_dtype, struct voistatspec *vss, struct vss_tdgst_hlpr_info *info) { struct voistatdata_tdgst *tdgst; struct ctdth32 *ctd32tree; struct ctdth64 *ctd64tree; struct voistatdata_tdgstctd32 *ctd32; struct voistatdata_tdgstctd64 *ctd64; info->voi_dtype = voi_dtype; switch (info->tdgst_dtype) { case VSD_DTYPE_TDGSTCLUST32: vss->vsdsz = TDGST_NCTRS2VSDSZ(tdgstclust32, info->nctds); break; case VSD_DTYPE_TDGSTCLUST64: vss->vsdsz = TDGST_NCTRS2VSDSZ(tdgstclust64, info->nctds); break; default: return (EINVAL); } vss->iv = stats_realloc(NULL, 0, vss->vsdsz, M_ZERO); if (vss->iv == NULL) return (ENOMEM); tdgst = (struct voistatdata_tdgst *)vss->iv; switch (info->tdgst_dtype) { case VSD_DTYPE_TDGSTCLUST32: ctd32tree = &VSD(tdgstclust32, tdgst)->ctdtree; ARB_INIT(ctd32, ctdlnk, ctd32tree, info->nctds) { Q_INI(&ctd32->mu, 0, 0, info->prec); } break; case VSD_DTYPE_TDGSTCLUST64: ctd64tree = &VSD(tdgstclust64, tdgst)->ctdtree; ARB_INIT(ctd64, ctdlnk, ctd64tree, info->nctds) { Q_INI(&ctd64->mu, 0, 0, info->prec); } break; default: return (EINVAL); } return (0); } int stats_vss_numeric_hlpr(enum vsd_dtype voi_dtype, struct voistatspec *vss, struct vss_numeric_hlpr_info *info) { struct voistatdata_numeric iv; switch (vss->stype) { case VS_STYPE_SUM: iv = stats_ctor_vsd_numeric(0); break; case VS_STYPE_MIN: iv = numeric_limits[LIM_MAX][voi_dtype]; break; case VS_STYPE_MAX: iv = numeric_limits[LIM_MIN][voi_dtype]; break; default: return (EINVAL); } vss->iv = stats_realloc(NULL, 0, vsd_dtype2size[voi_dtype], 0); if (vss->iv == NULL) return (ENOMEM); vss->vs_dtype = voi_dtype; vss->vsdsz = vsd_dtype2size[voi_dtype]; switch (voi_dtype) { case VSD_DTYPE_INT_S32: *((int32_t *)vss->iv) = iv.int32.s32; break; case VSD_DTYPE_INT_U32: *((uint32_t *)vss->iv) = iv.int32.u32; break; case VSD_DTYPE_INT_S64: *((int64_t *)vss->iv) = iv.int64.s64; break; case VSD_DTYPE_INT_U64: *((uint64_t *)vss->iv) = iv.int64.u64; break; case VSD_DTYPE_INT_SLONG: *((long *)vss->iv) = iv.intlong.slong; break; case VSD_DTYPE_INT_ULONG: *((unsigned long *)vss->iv) = iv.intlong.ulong; break; case VSD_DTYPE_Q_S32: *((s32q_t *)vss->iv) = Q_SCVAL(iv.q32.sq32, Q_CTRLINI(info->prec)); break; case VSD_DTYPE_Q_U32: *((u32q_t *)vss->iv) = Q_SCVAL(iv.q32.uq32, Q_CTRLINI(info->prec)); break; case VSD_DTYPE_Q_S64: *((s64q_t *)vss->iv) = Q_SCVAL(iv.q64.sq64, Q_CTRLINI(info->prec)); break; case VSD_DTYPE_Q_U64: *((u64q_t *)vss->iv) = Q_SCVAL(iv.q64.uq64, Q_CTRLINI(info->prec)); break; default: break; } return (0); } int stats_vss_hlpr_init(enum vsd_dtype voi_dtype, uint32_t nvss, struct voistatspec *vss) { int i, ret; for (i = nvss - 1; i >= 0; i--) { if (vss[i].hlpr && (ret = vss[i].hlpr(voi_dtype, &vss[i], vss[i].hlprinfo)) != 0) return (ret); } return (0); } void stats_vss_hlpr_cleanup(uint32_t nvss, struct voistatspec *vss) { int i; for (i = nvss - 1; i >= 0; i--) { if (vss[i].hlpr) { stats_free((void *)vss[i].iv); vss[i].iv = NULL; } } } int 
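/*
 * Illustrative use of the vss helper KPIs above together with the template
 * KPIs defined later in this file. This is a sketch only: "mytpl", "myvoi"
 * and the contents of vss[] are hypothetical, and error handling plus the
 * per-voistat helper wiring (.hlpr/.hlprinfo or a pre-filled .iv) are
 * omitted:
 *
 *	struct voistatspec vss[2];	(e.g. VS_STYPE_SUM and VS_STYPE_MAX)
 *	int tpl_id;
 *
 *	tpl_id = stats_v1_tpl_alloc("mytpl", 0);
 *	stats_vss_hlpr_init(VSD_DTYPE_INT_U64, nitems(vss), vss);
 *	stats_v1_tpl_add_voistats(tpl_id, 0, "myvoi", VSD_DTYPE_INT_U64,
 *	    nitems(vss), vss, 0);
 *	stats_vss_hlpr_cleanup(nitems(vss), vss);
 */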
stats_tpl_fetch(int tpl_id, struct statsblob_tpl **tpl) { int error; error = 0; TPL_LIST_WLOCK(); if (tpl_id < 0 || tpl_id >= (int)ntpl) { error = ENOENT; } else { *tpl = tpllist[tpl_id]; /* XXXLAS: Acquire refcount on tpl. */ } TPL_LIST_WUNLOCK(); return (error); } int stats_tpl_fetch_allocid(const char *name, uint32_t hash) { int i, tpl_id; tpl_id = -ESRCH; TPL_LIST_RLOCK(); for (i = ntpl - 1; i >= 0; i--) { if (name != NULL) { if (strlen(name) == strlen(tpllist[i]->mb->tplname) && strncmp(name, tpllist[i]->mb->tplname, TPL_MAX_NAME_LEN) == 0 && (!hash || hash == tpllist[i]->mb->tplhash)) { tpl_id = i; break; } } else if (hash == tpllist[i]->mb->tplhash) { tpl_id = i; break; } } TPL_LIST_RUNLOCK(); return (tpl_id); } int stats_tpl_id2name(uint32_t tpl_id, char *buf, size_t len) { int error; error = 0; TPL_LIST_RLOCK(); if (tpl_id < ntpl) { if (buf != NULL && len > strlen(tpllist[tpl_id]->mb->tplname)) strlcpy(buf, tpllist[tpl_id]->mb->tplname, len); else error = EOVERFLOW; } else error = ENOENT; TPL_LIST_RUNLOCK(); return (error); } int stats_tpl_sample_rollthedice(struct stats_tpl_sample_rate *rates, int nrates, void *seed_bytes, size_t seed_len) { uint32_t cum_pct, rnd_pct; int i; cum_pct = 0; /* * Choose a pseudorandom or seeded number in range [0,100] and use * it to make a sampling decision and template selection where required. * If no seed is supplied, a PRNG is used to generate a pseudorandom * number so that every selection is independent. If a seed is supplied, * the caller desires random selection across different seeds, but * deterministic selection given the same seed. This is achieved by * hashing the seed and using the hash as the random number source. * * XXXLAS: Characterise hash function output distribution. */ if (seed_bytes == NULL) rnd_pct = random() / (INT32_MAX / 100); else rnd_pct = hash32_buf(seed_bytes, seed_len, 0) / (UINT32_MAX / 100U); /* * We map the randomly selected percentage on to the interval [0,100] * consisting of the cumulatively summed template sampling percentages. * The difference between the cumulative sum of all template sampling * percentages and 100 is treated as a NULL assignment i.e. no stats * template will be assigned, and -1 returned instead. */ for (i = 0; i < nrates; i++) { cum_pct += rates[i].tpl_sample_pct; KASSERT(cum_pct <= 100, ("%s cum_pct %u > 100", __func__, cum_pct)); if (rnd_pct > cum_pct || rates[i].tpl_sample_pct == 0) continue; return (rates[i].tpl_slot_id); } return (-1); } int stats_v1_blob_clone(struct statsblobv1 **dst, size_t dstmaxsz, struct statsblobv1 *src, uint32_t flags) { int error; error = 0; if (src == NULL || dst == NULL || src->cursz < sizeof(struct statsblob) || ((flags & SB_CLONE_ALLOCDST) && (flags & (SB_CLONE_USRDSTNOFAULT | SB_CLONE_USRDST)))) { error = EINVAL; } else if (flags & SB_CLONE_ALLOCDST) { *dst = stats_realloc(NULL, 0, src->cursz, 0); if (*dst) (*dst)->maxsz = dstmaxsz = src->cursz; else error = ENOMEM; } else if (*dst == NULL || dstmaxsz < sizeof(struct statsblob)) { error = EINVAL; } if (!error) { size_t postcurszlen; /* * Clone src into dst except for the maxsz field. If dst is too * small to hold all of src, only copy src's header and return * EOVERFLOW. 
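	 * With SB_CLONE_USRDST or SB_CLONE_USRDSTNOFAULT set, the copy is
	 * performed with copyout() or copyout_nofault() respectively into
	 * the caller-supplied user space destination.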
*/ #ifdef _KERNEL if (flags & SB_CLONE_USRDSTNOFAULT) copyout_nofault(src, *dst, offsetof(struct statsblob, maxsz)); else if (flags & SB_CLONE_USRDST) copyout(src, *dst, offsetof(struct statsblob, maxsz)); else #endif memcpy(*dst, src, offsetof(struct statsblob, maxsz)); if (dstmaxsz >= src->cursz) { postcurszlen = src->cursz - offsetof(struct statsblob, cursz); } else { error = EOVERFLOW; postcurszlen = sizeof(struct statsblob) - offsetof(struct statsblob, cursz); } #ifdef _KERNEL if (flags & SB_CLONE_USRDSTNOFAULT) copyout_nofault(&(src->cursz), &((*dst)->cursz), postcurszlen); else if (flags & SB_CLONE_USRDST) copyout(&(src->cursz), &((*dst)->cursz), postcurszlen); else #endif memcpy(&((*dst)->cursz), &(src->cursz), postcurszlen); } return (error); } int stats_v1_tpl_alloc(const char *name, uint32_t flags __unused) { struct statsblobv1_tpl *tpl, **newtpllist; struct statsblobv1 *tpl_sb; struct metablob *tpl_mb; int tpl_id; if (name != NULL && strlen(name) > TPL_MAX_NAME_LEN) return (-EINVAL); if (name != NULL && stats_tpl_fetch_allocid(name, 0) >= 0) return (-EEXIST); tpl = stats_realloc(NULL, 0, sizeof(struct statsblobv1_tpl), M_ZERO); tpl_mb = stats_realloc(NULL, 0, sizeof(struct metablob), M_ZERO); tpl_sb = stats_realloc(NULL, 0, sizeof(struct statsblobv1), M_ZERO); if (tpl_mb != NULL && name != NULL) tpl_mb->tplname = stats_strdup(name, 0); if (tpl == NULL || tpl_sb == NULL || tpl_mb == NULL || tpl_mb->tplname == NULL) { stats_free(tpl); stats_free(tpl_sb); if (tpl_mb != NULL) { stats_free(tpl_mb->tplname); stats_free(tpl_mb); } return (-ENOMEM); } tpl->mb = tpl_mb; tpl->sb = tpl_sb; tpl_sb->abi = STATS_ABI_V1; tpl_sb->endian = #if BYTE_ORDER == LITTLE_ENDIAN SB_LE; #elif BYTE_ORDER == BIG_ENDIAN SB_BE; #else SB_UE; #endif tpl_sb->cursz = tpl_sb->maxsz = sizeof(struct statsblobv1); tpl_sb->stats_off = tpl_sb->statsdata_off = sizeof(struct statsblobv1); TPL_LIST_WLOCK(); newtpllist = stats_realloc(tpllist, ntpl * sizeof(void *), (ntpl + 1) * sizeof(void *), 0); if (newtpllist != NULL) { tpl_id = ntpl++; tpllist = (struct statsblob_tpl **)newtpllist; tpllist[tpl_id] = (struct statsblob_tpl *)tpl; stats_tpl_update_hash(tpllist[tpl_id]); } else { stats_free(tpl); stats_free(tpl_sb); if (tpl_mb != NULL) { stats_free(tpl_mb->tplname); stats_free(tpl_mb); } tpl_id = -ENOMEM; } TPL_LIST_WUNLOCK(); return (tpl_id); } int stats_v1_tpl_add_voistats(uint32_t tpl_id, int32_t voi_id, const char *voi_name, enum vsd_dtype voi_dtype, uint32_t nvss, struct voistatspec *vss, uint32_t flags) { struct voi *voi; struct voistat *tmpstat; struct statsblobv1 *tpl_sb; struct metablob *tpl_mb; int error, i, newstatdataidx, newvoibytes, newvoistatbytes, newvoistatdatabytes, newvoistatmaxid; uint32_t nbytes; if (voi_id < 0 || voi_dtype == 0 || voi_dtype >= VSD_NUM_DTYPES || nvss == 0 || vss == NULL) return (EINVAL); error = nbytes = newvoibytes = newvoistatbytes = newvoistatdatabytes = 0; newvoistatmaxid = -1; /* Calculate the number of bytes required for the new voistats. */ for (i = nvss - 1; i >= 0; i--) { if (vss[i].stype == 0 || vss[i].stype >= VS_NUM_STYPES || vss[i].vs_dtype == 0 || vss[i].vs_dtype >= VSD_NUM_DTYPES || vss[i].iv == NULL || vss[i].vsdsz == 0) return (EINVAL); if ((int)vss[i].stype > newvoistatmaxid) newvoistatmaxid = vss[i].stype; newvoistatdatabytes += vss[i].vsdsz; } if (flags & SB_VOI_RELUPDATE) { /* XXXLAS: VOI state bytes may need to vary based on stat types. 
*/ newvoistatdatabytes += sizeof(struct voistatdata_voistate); } nbytes += newvoistatdatabytes; TPL_LIST_WLOCK(); if (tpl_id < ntpl) { tpl_sb = (struct statsblobv1 *)tpllist[tpl_id]->sb; tpl_mb = tpllist[tpl_id]->mb; if (voi_id >= NVOIS(tpl_sb) || tpl_sb->vois[voi_id].id == -1) { /* Adding a new VOI and associated stats. */ if (voi_id >= NVOIS(tpl_sb)) { /* We need to grow the tpl_sb->vois array. */ newvoibytes = (voi_id - (NVOIS(tpl_sb) - 1)) * sizeof(struct voi); nbytes += newvoibytes; } newvoistatbytes = (newvoistatmaxid + 1) * sizeof(struct voistat); } else { /* Adding stats to an existing VOI. */ if (newvoistatmaxid > tpl_sb->vois[voi_id].voistatmaxid) { newvoistatbytes = (newvoistatmaxid - tpl_sb->vois[voi_id].voistatmaxid) * sizeof(struct voistat); } /* XXXLAS: KPI does not yet support expanding VOIs. */ error = EOPNOTSUPP; } nbytes += newvoistatbytes; if (!error && newvoibytes > 0) { struct voi_meta *voi_meta = tpl_mb->voi_meta; voi_meta = stats_realloc(voi_meta, voi_meta == NULL ? 0 : NVOIS(tpl_sb) * sizeof(struct voi_meta), (1 + voi_id) * sizeof(struct voi_meta), M_ZERO); if (voi_meta == NULL) error = ENOMEM; else tpl_mb->voi_meta = voi_meta; } if (!error) { /* NB: Resizing can change where tpl_sb points. */ error = stats_v1_blob_expand(&tpl_sb, newvoibytes, newvoistatbytes, newvoistatdatabytes); } if (!error) { tpl_mb->voi_meta[voi_id].name = stats_strdup(voi_name, 0); if (tpl_mb->voi_meta[voi_id].name == NULL) error = ENOMEM; } if (!error) { /* Update the template list with the resized pointer. */ tpllist[tpl_id]->sb = (struct statsblob *)tpl_sb; /* Update the template. */ voi = &tpl_sb->vois[voi_id]; if (voi->id < 0) { /* VOI is new and needs to be initialised. */ voi->id = voi_id; voi->dtype = voi_dtype; voi->stats_off = tpl_sb->stats_off; if (flags & SB_VOI_RELUPDATE) voi->flags |= VOI_REQSTATE; } else { /* * XXXLAS: When this else block is written, the * "KPI does not yet support expanding VOIs" * error earlier in this function can be * removed. What is required here is to shuffle * the voistat array such that the new stats for * the voi are contiguous, which will displace * stats for other vois that reside after the * voi being updated. The other vois then need * to have their stats_off adjusted post * shuffle. */ } voi->voistatmaxid = newvoistatmaxid; newstatdataidx = 0; if (voi->flags & VOI_REQSTATE) { /* Initialise the voistate stat in slot 0. */ tmpstat = BLOB_OFFSET(tpl_sb, voi->stats_off); tmpstat->stype = VS_STYPE_VOISTATE; tmpstat->flags = 0; tmpstat->dtype = VSD_DTYPE_VOISTATE; newstatdataidx = tmpstat->dsz = sizeof(struct voistatdata_numeric); tmpstat->data_off = tpl_sb->statsdata_off; } for (i = 0; (uint32_t)i < nvss; i++) { tmpstat = BLOB_OFFSET(tpl_sb, voi->stats_off + (vss[i].stype * sizeof(struct voistat))); KASSERT(tmpstat->stype < 0, ("voistat %p " "already initialised", tmpstat)); tmpstat->stype = vss[i].stype; tmpstat->flags = vss[i].flags; tmpstat->dtype = vss[i].vs_dtype; tmpstat->dsz = vss[i].vsdsz; tmpstat->data_off = tpl_sb->statsdata_off + newstatdataidx; memcpy(BLOB_OFFSET(tpl_sb, tmpstat->data_off), vss[i].iv, vss[i].vsdsz); newstatdataidx += vss[i].vsdsz; } /* Update the template version hash. */ stats_tpl_update_hash(tpllist[tpl_id]); /* XXXLAS: Confirm tpl name/hash pair remains unique. 
*/ } } else error = EINVAL; TPL_LIST_WUNLOCK(); return (error); } struct statsblobv1 * stats_v1_blob_alloc(uint32_t tpl_id, uint32_t flags __unused) { struct statsblobv1 *sb; int error; sb = NULL; TPL_LIST_RLOCK(); if (tpl_id < ntpl) { sb = stats_realloc(NULL, 0, tpllist[tpl_id]->sb->maxsz, 0); if (sb != NULL) { sb->maxsz = tpllist[tpl_id]->sb->maxsz; error = stats_v1_blob_init_locked(sb, tpl_id, 0); } else error = ENOMEM; if (error) { stats_free(sb); sb = NULL; } } TPL_LIST_RUNLOCK(); return (sb); } void stats_v1_blob_destroy(struct statsblobv1 *sb) { stats_free(sb); } int stats_v1_voistat_fetch_dptr(struct statsblobv1 *sb, int32_t voi_id, enum voi_stype stype, enum vsd_dtype *retdtype, struct voistatdata **retvsd, size_t *retvsdsz) { struct voi *v; struct voistat *vs; if (retvsd == NULL || sb == NULL || sb->abi != STATS_ABI_V1 || voi_id >= NVOIS(sb)) return (EINVAL); v = &sb->vois[voi_id]; if ((__typeof(v->voistatmaxid))stype > v->voistatmaxid) return (EINVAL); vs = BLOB_OFFSET(sb, v->stats_off + (stype * sizeof(struct voistat))); *retvsd = BLOB_OFFSET(sb, vs->data_off); if (retdtype != NULL) *retdtype = vs->dtype; if (retvsdsz != NULL) *retvsdsz = vs->dsz; return (0); } int stats_v1_blob_init(struct statsblobv1 *sb, uint32_t tpl_id, uint32_t flags) { int error; error = 0; TPL_LIST_RLOCK(); if (sb == NULL || tpl_id >= ntpl) { error = EINVAL; } else { error = stats_v1_blob_init_locked(sb, tpl_id, flags); } TPL_LIST_RUNLOCK(); return (error); } static inline int stats_v1_blob_init_locked(struct statsblobv1 *sb, uint32_t tpl_id, uint32_t flags __unused) { int error; TPL_LIST_RLOCK_ASSERT(); error = (sb->maxsz >= tpllist[tpl_id]->sb->cursz) ? 0 : EOVERFLOW; KASSERT(!error, ("sb %d instead of %d bytes", sb->maxsz, tpllist[tpl_id]->sb->cursz)); if (!error) { memcpy(sb, tpllist[tpl_id]->sb, tpllist[tpl_id]->sb->cursz); sb->created = sb->lastrst = stats_sbinuptime(); sb->tplhash = tpllist[tpl_id]->mb->tplhash; } return (error); } static int stats_v1_blob_expand(struct statsblobv1 **sbpp, int newvoibytes, int newvoistatbytes, int newvoistatdatabytes) { struct statsblobv1 *sb; struct voi *tmpvoi; struct voistat *tmpvoistat, *voistat_array; int error, i, idxnewvois, idxnewvoistats, nbytes, nvoistats; KASSERT(newvoibytes % sizeof(struct voi) == 0, ("Bad newvoibytes %d", newvoibytes)); KASSERT(newvoistatbytes % sizeof(struct voistat) == 0, ("Bad newvoistatbytes %d", newvoistatbytes)); error = ((newvoibytes % sizeof(struct voi) == 0) && (newvoistatbytes % sizeof(struct voistat) == 0)) ? 0 : EINVAL; sb = *sbpp; nbytes = newvoibytes + newvoistatbytes + newvoistatdatabytes; /* * XXXLAS: Required until we gain support for flags which alter the * units of size/offset fields in key structs. */ if (!error && ((((int)sb->cursz) + nbytes) > SB_V1_MAXSZ)) error = EFBIG; if (!error && (sb->cursz + nbytes > sb->maxsz)) { /* Need to expand our blob. */ sb = stats_realloc(sb, sb->maxsz, sb->cursz + nbytes, M_ZERO); if (sb != NULL) { sb->maxsz = sb->cursz + nbytes; *sbpp = sb; } else error = ENOMEM; } if (!error) { /* * Shuffle memory within the expanded blob working from the end * backwards, leaving gaps for the new voistat and voistatdata * structs at the beginning of their respective blob regions, * and for the new voi structs at the end of their blob region. 
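		 * For example, adding one new voi and one new voistat moves
		 * stats_off up by sizeof(struct voi) and statsdata_off up by
		 * sizeof(struct voi) + sizeof(struct voistat), with the
		 * offsets stored in pre-existing voi/voistat entries bumped
		 * to match.
		 */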
*/ memmove(BLOB_OFFSET(sb, sb->statsdata_off + nbytes), BLOB_OFFSET(sb, sb->statsdata_off), sb->cursz - sb->statsdata_off); memmove(BLOB_OFFSET(sb, sb->stats_off + newvoibytes + newvoistatbytes), BLOB_OFFSET(sb, sb->stats_off), sb->statsdata_off - sb->stats_off); /* First index of new voi/voistat structs to be initialised. */ idxnewvois = NVOIS(sb); idxnewvoistats = (newvoistatbytes / sizeof(struct voistat)) - 1; /* Update housekeeping variables and offsets. */ sb->cursz += nbytes; sb->stats_off += newvoibytes; sb->statsdata_off += newvoibytes + newvoistatbytes; /* XXXLAS: Zeroing not strictly needed but aids debugging. */ memset(&sb->vois[idxnewvois], '\0', newvoibytes); memset(BLOB_OFFSET(sb, sb->stats_off), '\0', newvoistatbytes); memset(BLOB_OFFSET(sb, sb->statsdata_off), '\0', newvoistatdatabytes); /* Initialise new voi array members and update offsets. */ for (i = 0; i < NVOIS(sb); i++) { tmpvoi = &sb->vois[i]; if (i >= idxnewvois) { tmpvoi->id = tmpvoi->voistatmaxid = -1; } else if (tmpvoi->id > -1) { tmpvoi->stats_off += newvoibytes + newvoistatbytes; } } /* Initialise new voistat array members and update offsets. */ nvoistats = (sb->statsdata_off - sb->stats_off) / sizeof(struct voistat); voistat_array = BLOB_OFFSET(sb, sb->stats_off); for (i = 0; i < nvoistats; i++) { tmpvoistat = &voistat_array[i]; if (i <= idxnewvoistats) { tmpvoistat->stype = -1; } else if (tmpvoistat->stype > -1) { tmpvoistat->data_off += nbytes; } } } return (error); } static void stats_v1_blob_finalise(struct statsblobv1 *sb __unused) { /* XXXLAS: Fill this in. */ } static void stats_v1_blob_iter(struct statsblobv1 *sb, stats_v1_blob_itercb_t icb, void *usrctx, uint32_t flags) { struct voi *v; struct voistat *vs; struct sb_iter_ctx ctx; int i, j, firstvoi; ctx.usrctx = usrctx; ctx.flags = SB_IT_FIRST_CB; firstvoi = 1; for (i = 0; i < NVOIS(sb); i++) { v = &sb->vois[i]; ctx.vslot = i; ctx.vsslot = -1; ctx.flags |= SB_IT_FIRST_VOISTAT; if (firstvoi) ctx.flags |= SB_IT_FIRST_VOI; else if (i == (NVOIS(sb) - 1)) ctx.flags |= SB_IT_LAST_VOI | SB_IT_LAST_CB; if (v->id < 0 && (flags & SB_IT_NULLVOI)) { if (icb(sb, v, NULL, &ctx)) return; firstvoi = 0; ctx.flags &= ~SB_IT_FIRST_CB; } /* If NULL voi, v->voistatmaxid == -1 */ for (j = 0; j <= v->voistatmaxid; j++) { vs = &((struct voistat *)BLOB_OFFSET(sb, v->stats_off))[j]; if (vs->stype < 0 && !(flags & SB_IT_NULLVOISTAT)) continue; if (j == v->voistatmaxid) { ctx.flags |= SB_IT_LAST_VOISTAT; if (i == (NVOIS(sb) - 1)) ctx.flags |= SB_IT_LAST_CB; } else ctx.flags &= ~SB_IT_LAST_CB; ctx.vsslot = j; if (icb(sb, v, vs, &ctx)) return; ctx.flags &= ~(SB_IT_FIRST_CB | SB_IT_FIRST_VOISTAT | SB_IT_LAST_VOISTAT); } ctx.flags &= ~(SB_IT_FIRST_VOI | SB_IT_LAST_VOI); } } static inline void stats_voistatdata_tdgst_tostr(enum vsd_dtype voi_dtype __unused, const struct voistatdata_tdgst *tdgst, enum vsd_dtype tdgst_dtype, size_t tdgst_dsz __unused, enum sb_str_fmt fmt, struct sbuf *buf, int objdump) { const struct ctdth32 *ctd32tree; const struct ctdth64 *ctd64tree; const struct voistatdata_tdgstctd32 *ctd32; const struct voistatdata_tdgstctd64 *ctd64; const char *fmtstr; uint64_t smplcnt, compcnt; int is32bit, qmaxstrlen; uint16_t maxctds, curctds; switch (tdgst_dtype) { case VSD_DTYPE_TDGSTCLUST32: smplcnt = CONSTVSD(tdgstclust32, tdgst)->smplcnt; compcnt = CONSTVSD(tdgstclust32, tdgst)->compcnt; maxctds = ARB_MAXNODES(&CONSTVSD(tdgstclust32, tdgst)->ctdtree); curctds = ARB_CURNODES(&CONSTVSD(tdgstclust32, tdgst)->ctdtree); ctd32tree = &CONSTVSD(tdgstclust32, tdgst)->ctdtree; 
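		/*
		 * An object dump (objdump != 0) walks centroids in array
		 * slot order starting from slot 0; a normal render walks the
		 * ARB tree in ascending mu order starting from the minimum
		 * centroid.
		 */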
ctd32 = (objdump ? ARB_CNODE(ctd32tree, 0) : ARB_CMIN(ctdth32, ctd32tree)); qmaxstrlen = (ctd32 == NULL) ? 1 : Q_MAXSTRLEN(ctd32->mu, 10); is32bit = 1; ctd64tree = NULL; ctd64 = NULL; break; case VSD_DTYPE_TDGSTCLUST64: smplcnt = CONSTVSD(tdgstclust64, tdgst)->smplcnt; compcnt = CONSTVSD(tdgstclust64, tdgst)->compcnt; maxctds = ARB_MAXNODES(&CONSTVSD(tdgstclust64, tdgst)->ctdtree); curctds = ARB_CURNODES(&CONSTVSD(tdgstclust64, tdgst)->ctdtree); ctd64tree = &CONSTVSD(tdgstclust64, tdgst)->ctdtree; ctd64 = (objdump ? ARB_CNODE(ctd64tree, 0) : ARB_CMIN(ctdth64, ctd64tree)); qmaxstrlen = (ctd64 == NULL) ? 1 : Q_MAXSTRLEN(ctd64->mu, 10); is32bit = 0; ctd32tree = NULL; ctd32 = NULL; break; default: return; } switch (fmt) { case SB_STRFMT_FREEFORM: fmtstr = "smplcnt=%ju, compcnt=%ju, maxctds=%hu, nctds=%hu"; break; case SB_STRFMT_JSON: default: fmtstr = "\"smplcnt\":%ju,\"compcnt\":%ju,\"maxctds\":%hu," "\"nctds\":%hu,\"ctds\":["; break; } sbuf_printf(buf, fmtstr, (uintmax_t)smplcnt, (uintmax_t)compcnt, maxctds, curctds); while ((is32bit ? NULL != ctd32 : NULL != ctd64)) { char qstr[qmaxstrlen]; switch (fmt) { case SB_STRFMT_FREEFORM: fmtstr = "\n\t\t\t\t"; break; case SB_STRFMT_JSON: default: fmtstr = "{"; break; } sbuf_cat(buf, fmtstr); if (objdump) { switch (fmt) { case SB_STRFMT_FREEFORM: fmtstr = "ctd[%hu]."; break; case SB_STRFMT_JSON: default: fmtstr = "\"ctd\":%hu,"; break; } sbuf_printf(buf, fmtstr, is32bit ? ARB_SELFIDX(ctd32tree, ctd32) : ARB_SELFIDX(ctd64tree, ctd64)); } switch (fmt) { case SB_STRFMT_FREEFORM: fmtstr = "{mu="; break; case SB_STRFMT_JSON: default: fmtstr = "\"mu\":"; break; } sbuf_cat(buf, fmtstr); Q_TOSTR((is32bit ? ctd32->mu : ctd64->mu), -1, 10, qstr, sizeof(qstr)); sbuf_cat(buf, qstr); switch (fmt) { case SB_STRFMT_FREEFORM: fmtstr = is32bit ? ",cnt=%u}" : ",cnt=%ju}"; break; case SB_STRFMT_JSON: default: fmtstr = is32bit ? ",\"cnt\":%u}" : ",\"cnt\":%ju}"; break; } sbuf_printf(buf, fmtstr, is32bit ? ctd32->cnt : (uintmax_t)ctd64->cnt); if (is32bit) ctd32 = (objdump ? ARB_CNODE(ctd32tree, ARB_SELFIDX(ctd32tree, ctd32) + 1) : ARB_CNEXT(ctdth32, ctd32tree, ctd32)); else ctd64 = (objdump ? ARB_CNODE(ctd64tree, ARB_SELFIDX(ctd64tree, ctd64) + 1) : ARB_CNEXT(ctdth64, ctd64tree, ctd64)); if (fmt == SB_STRFMT_JSON && (is32bit ? NULL != ctd32 : NULL != ctd64)) sbuf_putc(buf, ','); } if (fmt == SB_STRFMT_JSON) sbuf_cat(buf, "]"); } static inline void stats_voistatdata_hist_tostr(enum vsd_dtype voi_dtype, const struct voistatdata_hist *hist, enum vsd_dtype hist_dtype, size_t hist_dsz, enum sb_str_fmt fmt, struct sbuf *buf, int objdump) { const struct voistatdata_numeric *bkt_lb, *bkt_ub; const char *fmtstr; int is32bit; uint16_t i, nbkts; switch (hist_dtype) { case VSD_DTYPE_CRHIST32: nbkts = HIST_VSDSZ2NBKTS(crhist32, hist_dsz); is32bit = 1; break; case VSD_DTYPE_DRHIST32: nbkts = HIST_VSDSZ2NBKTS(drhist32, hist_dsz); is32bit = 1; break; case VSD_DTYPE_DVHIST32: nbkts = HIST_VSDSZ2NBKTS(dvhist32, hist_dsz); is32bit = 1; break; case VSD_DTYPE_CRHIST64: nbkts = HIST_VSDSZ2NBKTS(crhist64, hist_dsz); is32bit = 0; break; case VSD_DTYPE_DRHIST64: nbkts = HIST_VSDSZ2NBKTS(drhist64, hist_dsz); is32bit = 0; break; case VSD_DTYPE_DVHIST64: nbkts = HIST_VSDSZ2NBKTS(dvhist64, hist_dsz); is32bit = 0; break; default: return; } switch (fmt) { case SB_STRFMT_FREEFORM: fmtstr = "nbkts=%hu, "; break; case SB_STRFMT_JSON: default: fmtstr = "\"nbkts\":%hu,"; break; } sbuf_printf(buf, fmtstr, nbkts); switch (fmt) { case SB_STRFMT_FREEFORM: fmtstr = (is32bit ? 
"oob=%u" : "oob=%ju"); break; case SB_STRFMT_JSON: default: fmtstr = (is32bit ? "\"oob\":%u,\"bkts\":[" : "\"oob\":%ju,\"bkts\":["); break; } sbuf_printf(buf, fmtstr, is32bit ? VSD_CONSTHIST_FIELDVAL(hist, hist_dtype, oob) : (uintmax_t)VSD_CONSTHIST_FIELDVAL(hist, hist_dtype, oob)); for (i = 0; i < nbkts; i++) { switch (hist_dtype) { case VSD_DTYPE_CRHIST32: case VSD_DTYPE_CRHIST64: bkt_lb = VSD_CONSTCRHIST_FIELDPTR(hist, hist_dtype, bkts[i].lb); if (i < nbkts - 1) bkt_ub = VSD_CONSTCRHIST_FIELDPTR(hist, hist_dtype, bkts[i + 1].lb); else bkt_ub = &numeric_limits[LIM_MAX][voi_dtype]; break; case VSD_DTYPE_DRHIST32: case VSD_DTYPE_DRHIST64: bkt_lb = VSD_CONSTDRHIST_FIELDPTR(hist, hist_dtype, bkts[i].lb); bkt_ub = VSD_CONSTDRHIST_FIELDPTR(hist, hist_dtype, bkts[i].ub); break; case VSD_DTYPE_DVHIST32: case VSD_DTYPE_DVHIST64: bkt_lb = bkt_ub = VSD_CONSTDVHIST_FIELDPTR(hist, hist_dtype, bkts[i].val); break; default: break; } switch (fmt) { case SB_STRFMT_FREEFORM: fmtstr = "\n\t\t\t\t"; break; case SB_STRFMT_JSON: default: fmtstr = "{"; break; } sbuf_cat(buf, fmtstr); if (objdump) { switch (fmt) { case SB_STRFMT_FREEFORM: fmtstr = "bkt[%hu]."; break; case SB_STRFMT_JSON: default: fmtstr = "\"bkt\":%hu,"; break; } sbuf_printf(buf, fmtstr, i); } switch (fmt) { case SB_STRFMT_FREEFORM: fmtstr = "{lb="; break; case SB_STRFMT_JSON: default: fmtstr = "\"lb\":"; break; } sbuf_cat(buf, fmtstr); stats_voistatdata_tostr((const struct voistatdata *)bkt_lb, voi_dtype, voi_dtype, sizeof(struct voistatdata_numeric), fmt, buf, objdump); switch (fmt) { case SB_STRFMT_FREEFORM: fmtstr = ",ub="; break; case SB_STRFMT_JSON: default: fmtstr = ",\"ub\":"; break; } sbuf_cat(buf, fmtstr); stats_voistatdata_tostr((const struct voistatdata *)bkt_ub, voi_dtype, voi_dtype, sizeof(struct voistatdata_numeric), fmt, buf, objdump); switch (fmt) { case SB_STRFMT_FREEFORM: fmtstr = is32bit ? ",cnt=%u}" : ",cnt=%ju}"; break; case SB_STRFMT_JSON: default: fmtstr = is32bit ? ",\"cnt\":%u}" : ",\"cnt\":%ju}"; break; } sbuf_printf(buf, fmtstr, is32bit ? VSD_CONSTHIST_FIELDVAL(hist, hist_dtype, bkts[i].cnt) : (uintmax_t)VSD_CONSTHIST_FIELDVAL(hist, hist_dtype, bkts[i].cnt)); if (fmt == SB_STRFMT_JSON && i < nbkts - 1) sbuf_putc(buf, ','); } if (fmt == SB_STRFMT_JSON) sbuf_cat(buf, "]"); } int stats_voistatdata_tostr(const struct voistatdata *vsd, enum vsd_dtype voi_dtype, enum vsd_dtype vsd_dtype, size_t vsd_sz, enum sb_str_fmt fmt, struct sbuf *buf, int objdump) { const char *fmtstr; if (vsd == NULL || buf == NULL || voi_dtype >= VSD_NUM_DTYPES || vsd_dtype >= VSD_NUM_DTYPES || fmt >= SB_STRFMT_NUM_FMTS) return (EINVAL); switch (vsd_dtype) { case VSD_DTYPE_VOISTATE: switch (fmt) { case SB_STRFMT_FREEFORM: fmtstr = "prev="; break; case SB_STRFMT_JSON: default: fmtstr = "\"prev\":"; break; } sbuf_cat(buf, fmtstr); /* * Render prev by passing it as *vsd and voi_dtype as vsd_dtype. 
*/ stats_voistatdata_tostr( (const struct voistatdata *)&CONSTVSD(voistate, vsd)->prev, voi_dtype, voi_dtype, vsd_sz, fmt, buf, objdump); break; case VSD_DTYPE_INT_S32: sbuf_printf(buf, "%d", vsd->int32.s32); break; case VSD_DTYPE_INT_U32: sbuf_printf(buf, "%u", vsd->int32.u32); break; case VSD_DTYPE_INT_S64: sbuf_printf(buf, "%jd", (intmax_t)vsd->int64.s64); break; case VSD_DTYPE_INT_U64: sbuf_printf(buf, "%ju", (uintmax_t)vsd->int64.u64); break; case VSD_DTYPE_INT_SLONG: sbuf_printf(buf, "%ld", vsd->intlong.slong); break; case VSD_DTYPE_INT_ULONG: sbuf_printf(buf, "%lu", vsd->intlong.ulong); break; case VSD_DTYPE_Q_S32: { char qstr[Q_MAXSTRLEN(vsd->q32.sq32, 10)]; Q_TOSTR((s32q_t)vsd->q32.sq32, -1, 10, qstr, sizeof(qstr)); sbuf_cat(buf, qstr); } break; case VSD_DTYPE_Q_U32: { char qstr[Q_MAXSTRLEN(vsd->q32.uq32, 10)]; Q_TOSTR((u32q_t)vsd->q32.uq32, -1, 10, qstr, sizeof(qstr)); sbuf_cat(buf, qstr); } break; case VSD_DTYPE_Q_S64: { char qstr[Q_MAXSTRLEN(vsd->q64.sq64, 10)]; Q_TOSTR((s64q_t)vsd->q64.sq64, -1, 10, qstr, sizeof(qstr)); sbuf_cat(buf, qstr); } break; case VSD_DTYPE_Q_U64: { char qstr[Q_MAXSTRLEN(vsd->q64.uq64, 10)]; Q_TOSTR((u64q_t)vsd->q64.uq64, -1, 10, qstr, sizeof(qstr)); sbuf_cat(buf, qstr); } break; case VSD_DTYPE_CRHIST32: case VSD_DTYPE_DRHIST32: case VSD_DTYPE_DVHIST32: case VSD_DTYPE_CRHIST64: case VSD_DTYPE_DRHIST64: case VSD_DTYPE_DVHIST64: stats_voistatdata_hist_tostr(voi_dtype, CONSTVSD(hist, vsd), vsd_dtype, vsd_sz, fmt, buf, objdump); break; case VSD_DTYPE_TDGSTCLUST32: case VSD_DTYPE_TDGSTCLUST64: stats_voistatdata_tdgst_tostr(voi_dtype, CONSTVSD(tdgst, vsd), vsd_dtype, vsd_sz, fmt, buf, objdump); break; default: break; } return (sbuf_error(buf)); } static void stats_v1_itercb_tostr_freeform(struct statsblobv1 *sb, struct voi *v, struct voistat *vs, struct sb_iter_ctx *ctx) { struct sb_tostrcb_ctx *sctx; struct metablob *tpl_mb; struct sbuf *buf; void *vsd; uint8_t dump; sctx = ctx->usrctx; buf = sctx->buf; tpl_mb = sctx->tpl ? sctx->tpl->mb : NULL; dump = ((sctx->flags & SB_TOSTR_OBJDUMP) != 0); if (ctx->flags & SB_IT_FIRST_CB) { sbuf_printf(buf, "struct statsblobv1@%p", sb); if (dump) { sbuf_printf(buf, ", abi=%hhu, endian=%hhu, maxsz=%hu, " "cursz=%hu, created=%jd, lastrst=%jd, flags=0x%04hx, " "stats_off=%hu, statsdata_off=%hu", sb->abi, sb->endian, sb->maxsz, sb->cursz, sb->created, sb->lastrst, sb->flags, sb->stats_off, sb->statsdata_off); } sbuf_printf(buf, ", tplhash=%u", sb->tplhash); } if (ctx->flags & SB_IT_FIRST_VOISTAT) { sbuf_printf(buf, "\n\tvois[%hd]: id=%hd", ctx->vslot, v->id); if (v->id < 0) return; sbuf_printf(buf, ", name=\"%s\"", (tpl_mb == NULL) ? 
"" : tpl_mb->voi_meta[v->id].name); if (dump) sbuf_printf(buf, ", flags=0x%04hx, dtype=%s, " "voistatmaxid=%hhd, stats_off=%hu", v->flags, vsd_dtype2name[v->dtype], v->voistatmaxid, v->stats_off); } if (!dump && vs->stype <= 0) return; sbuf_printf(buf, "\n\t\tvois[%hd]stat[%hhd]: stype=", v->id, ctx->vsslot); if (vs->stype < 0) { sbuf_printf(buf, "%hhd", vs->stype); return; } else sbuf_printf(buf, "%s, errs=%hu", vs_stype2name[vs->stype], vs->errs); vsd = BLOB_OFFSET(sb, vs->data_off); if (dump) sbuf_printf(buf, ", flags=0x%04x, dtype=%s, dsz=%hu, " "data_off=%hu", vs->flags, vsd_dtype2name[vs->dtype], vs->dsz, vs->data_off); - sbuf_printf(buf, "\n\t\t\tvoistatdata: "); + sbuf_cat(buf, "\n\t\t\tvoistatdata: "); stats_voistatdata_tostr(vsd, v->dtype, vs->dtype, vs->dsz, sctx->fmt, buf, dump); } static void stats_v1_itercb_tostr_json(struct statsblobv1 *sb, struct voi *v, struct voistat *vs, struct sb_iter_ctx *ctx) { struct sb_tostrcb_ctx *sctx; struct metablob *tpl_mb; struct sbuf *buf; const char *fmtstr; void *vsd; uint8_t dump; sctx = ctx->usrctx; buf = sctx->buf; tpl_mb = sctx->tpl ? sctx->tpl->mb : NULL; dump = ((sctx->flags & SB_TOSTR_OBJDUMP) != 0); if (ctx->flags & SB_IT_FIRST_CB) { sbuf_putc(buf, '{'); if (dump) { sbuf_printf(buf, "\"abi\":%hhu,\"endian\":%hhu," "\"maxsz\":%hu,\"cursz\":%hu,\"created\":%jd," "\"lastrst\":%jd,\"flags\":%hu,\"stats_off\":%hu," "\"statsdata_off\":%hu,", sb->abi, sb->endian, sb->maxsz, sb->cursz, sb->created, sb->lastrst, sb->flags, sb->stats_off, sb->statsdata_off); } if (tpl_mb == NULL) fmtstr = "\"tplname\":%s,\"tplhash\":%u,\"vois\":{"; else fmtstr = "\"tplname\":\"%s\",\"tplhash\":%u,\"vois\":{"; sbuf_printf(buf, fmtstr, tpl_mb ? tpl_mb->tplname : "null", sb->tplhash); } if (ctx->flags & SB_IT_FIRST_VOISTAT) { if (dump) { sbuf_printf(buf, "\"[%d]\":{\"id\":%d", ctx->vslot, v->id); if (v->id < 0) { - sbuf_printf(buf, "},"); + sbuf_cat(buf, "},"); return; } if (tpl_mb == NULL) fmtstr = ",\"name\":%s,\"flags\":%hu," "\"dtype\":\"%s\",\"voistatmaxid\":%hhd," "\"stats_off\":%hu,"; else fmtstr = ",\"name\":\"%s\",\"flags\":%hu," "\"dtype\":\"%s\",\"voistatmaxid\":%hhd," "\"stats_off\":%hu,"; sbuf_printf(buf, fmtstr, tpl_mb ? tpl_mb->voi_meta[v->id].name : "null", v->flags, vsd_dtype2name[v->dtype], v->voistatmaxid, v->stats_off); } else { if (tpl_mb == NULL) { sbuf_printf(buf, "\"[%hd]\":{", v->id); } else { sbuf_printf(buf, "\"%s\":{", tpl_mb->voi_meta[v->id].name); } } sbuf_cat(buf, "\"stats\":{"); } vsd = BLOB_OFFSET(sb, vs->data_off); if (dump) { sbuf_printf(buf, "\"[%hhd]\":", ctx->vsslot); if (vs->stype < 0) { - sbuf_printf(buf, "{\"stype\":-1},"); + sbuf_cat(buf, "{\"stype\":-1},"); return; } sbuf_printf(buf, "{\"stype\":\"%s\",\"errs\":%hu,\"flags\":%hu," "\"dtype\":\"%s\",\"data_off\":%hu,\"voistatdata\":{", vs_stype2name[vs->stype], vs->errs, vs->flags, vsd_dtype2name[vs->dtype], vs->data_off); } else if (vs->stype > 0) { if (tpl_mb == NULL) sbuf_printf(buf, "\"[%hhd]\":", vs->stype); else sbuf_printf(buf, "\"%s\":", vs_stype2name[vs->stype]); } else return; if ((vs->flags & VS_VSDVALID) || dump) { if (!dump) sbuf_printf(buf, "{\"errs\":%hu,", vs->errs); /* Simple non-compound VSD types need a key. */ if (!vsd_compoundtype[vs->dtype]) sbuf_cat(buf, "\"val\":"); stats_voistatdata_tostr(vsd, v->dtype, vs->dtype, vs->dsz, sctx->fmt, buf, dump); sbuf_cat(buf, dump ? "}}" : "}"); } else sbuf_cat(buf, dump ? 
"null}" : "null"); if (ctx->flags & SB_IT_LAST_VOISTAT) sbuf_cat(buf, "}}"); if (ctx->flags & SB_IT_LAST_CB) sbuf_cat(buf, "}}"); else sbuf_putc(buf, ','); } static int stats_v1_itercb_tostr(struct statsblobv1 *sb, struct voi *v, struct voistat *vs, struct sb_iter_ctx *ctx) { struct sb_tostrcb_ctx *sctx; sctx = ctx->usrctx; switch (sctx->fmt) { case SB_STRFMT_FREEFORM: stats_v1_itercb_tostr_freeform(sb, v, vs, ctx); break; case SB_STRFMT_JSON: stats_v1_itercb_tostr_json(sb, v, vs, ctx); break; default: break; } return (sbuf_error(sctx->buf)); } int stats_v1_blob_tostr(struct statsblobv1 *sb, struct sbuf *buf, enum sb_str_fmt fmt, uint32_t flags) { struct sb_tostrcb_ctx sctx; uint32_t iflags; if (sb == NULL || sb->abi != STATS_ABI_V1 || buf == NULL || fmt >= SB_STRFMT_NUM_FMTS) return (EINVAL); sctx.buf = buf; sctx.fmt = fmt; sctx.flags = flags; if (flags & SB_TOSTR_META) { if (stats_tpl_fetch(stats_tpl_fetch_allocid(NULL, sb->tplhash), &sctx.tpl)) return (EINVAL); } else sctx.tpl = NULL; iflags = 0; if (flags & SB_TOSTR_OBJDUMP) iflags |= (SB_IT_NULLVOI | SB_IT_NULLVOISTAT); stats_v1_blob_iter(sb, stats_v1_itercb_tostr, &sctx, iflags); return (sbuf_error(buf)); } static int stats_v1_itercb_visit(struct statsblobv1 *sb, struct voi *v, struct voistat *vs, struct sb_iter_ctx *ctx) { struct sb_visitcb_ctx *vctx; struct sb_visit sbv; vctx = ctx->usrctx; sbv.tplhash = sb->tplhash; sbv.voi_id = v->id; sbv.voi_dtype = v->dtype; sbv.vs_stype = vs->stype; sbv.vs_dtype = vs->dtype; sbv.vs_dsz = vs->dsz; sbv.vs_data = BLOB_OFFSET(sb, vs->data_off); sbv.vs_errs = vs->errs; sbv.flags = ctx->flags & (SB_IT_FIRST_CB | SB_IT_LAST_CB | SB_IT_FIRST_VOI | SB_IT_LAST_VOI | SB_IT_FIRST_VOISTAT | SB_IT_LAST_VOISTAT); return (vctx->cb(&sbv, vctx->usrctx)); } int stats_v1_blob_visit(struct statsblobv1 *sb, stats_blob_visitcb_t func, void *usrctx) { struct sb_visitcb_ctx vctx; if (sb == NULL || sb->abi != STATS_ABI_V1 || func == NULL) return (EINVAL); vctx.cb = func; vctx.usrctx = usrctx; stats_v1_blob_iter(sb, stats_v1_itercb_visit, &vctx, 0); return (0); } static int stats_v1_icb_reset_voistat(struct statsblobv1 *sb, struct voi *v __unused, struct voistat *vs, struct sb_iter_ctx *ctx __unused) { void *vsd; if (vs->stype == VS_STYPE_VOISTATE) return (0); vsd = BLOB_OFFSET(sb, vs->data_off); /* Perform the stat type's default reset action. 
*/ switch (vs->stype) { case VS_STYPE_SUM: switch (vs->dtype) { case VSD_DTYPE_Q_S32: Q_SIFVAL(VSD(q32, vsd)->sq32, 0); break; case VSD_DTYPE_Q_U32: Q_SIFVAL(VSD(q32, vsd)->uq32, 0); break; case VSD_DTYPE_Q_S64: Q_SIFVAL(VSD(q64, vsd)->sq64, 0); break; case VSD_DTYPE_Q_U64: Q_SIFVAL(VSD(q64, vsd)->uq64, 0); break; default: bzero(vsd, vs->dsz); break; } break; case VS_STYPE_MAX: switch (vs->dtype) { case VSD_DTYPE_Q_S32: Q_SIFVAL(VSD(q32, vsd)->sq32, Q_IFMINVAL(VSD(q32, vsd)->sq32)); break; case VSD_DTYPE_Q_U32: Q_SIFVAL(VSD(q32, vsd)->uq32, Q_IFMINVAL(VSD(q32, vsd)->uq32)); break; case VSD_DTYPE_Q_S64: Q_SIFVAL(VSD(q64, vsd)->sq64, Q_IFMINVAL(VSD(q64, vsd)->sq64)); break; case VSD_DTYPE_Q_U64: Q_SIFVAL(VSD(q64, vsd)->uq64, Q_IFMINVAL(VSD(q64, vsd)->uq64)); break; default: memcpy(vsd, &numeric_limits[LIM_MIN][vs->dtype], vs->dsz); break; } break; case VS_STYPE_MIN: switch (vs->dtype) { case VSD_DTYPE_Q_S32: Q_SIFVAL(VSD(q32, vsd)->sq32, Q_IFMAXVAL(VSD(q32, vsd)->sq32)); break; case VSD_DTYPE_Q_U32: Q_SIFVAL(VSD(q32, vsd)->uq32, Q_IFMAXVAL(VSD(q32, vsd)->uq32)); break; case VSD_DTYPE_Q_S64: Q_SIFVAL(VSD(q64, vsd)->sq64, Q_IFMAXVAL(VSD(q64, vsd)->sq64)); break; case VSD_DTYPE_Q_U64: Q_SIFVAL(VSD(q64, vsd)->uq64, Q_IFMAXVAL(VSD(q64, vsd)->uq64)); break; default: memcpy(vsd, &numeric_limits[LIM_MAX][vs->dtype], vs->dsz); break; } break; case VS_STYPE_HIST: { /* Reset bucket counts. */ struct voistatdata_hist *hist; int i, is32bit; uint16_t nbkts; hist = VSD(hist, vsd); switch (vs->dtype) { case VSD_DTYPE_CRHIST32: nbkts = HIST_VSDSZ2NBKTS(crhist32, vs->dsz); is32bit = 1; break; case VSD_DTYPE_DRHIST32: nbkts = HIST_VSDSZ2NBKTS(drhist32, vs->dsz); is32bit = 1; break; case VSD_DTYPE_DVHIST32: nbkts = HIST_VSDSZ2NBKTS(dvhist32, vs->dsz); is32bit = 1; break; case VSD_DTYPE_CRHIST64: nbkts = HIST_VSDSZ2NBKTS(crhist64, vs->dsz); is32bit = 0; break; case VSD_DTYPE_DRHIST64: nbkts = HIST_VSDSZ2NBKTS(drhist64, vs->dsz); is32bit = 0; break; case VSD_DTYPE_DVHIST64: nbkts = HIST_VSDSZ2NBKTS(dvhist64, vs->dsz); is32bit = 0; break; default: return (0); } bzero(VSD_HIST_FIELDPTR(hist, vs->dtype, oob), is32bit ? sizeof(uint32_t) : sizeof(uint64_t)); for (i = nbkts - 1; i >= 0; i--) { bzero(VSD_HIST_FIELDPTR(hist, vs->dtype, bkts[i].cnt), is32bit ? sizeof(uint32_t) : sizeof(uint64_t)); } break; } case VS_STYPE_TDGST: { /* Reset sample count centroids array/tree. 
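		 * The centroid ARB is re-initialised with every centroid
		 * back on the free list and each centroid's cnt and mu
		 * cleared; Q_SIFVAL() zeroes mu while preserving its control
		 * (precision) bits, so the digest keeps the precision
		 * configured at template creation time.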
*/ struct voistatdata_tdgst *tdgst; struct ctdth32 *ctd32tree; struct ctdth64 *ctd64tree; struct voistatdata_tdgstctd32 *ctd32; struct voistatdata_tdgstctd64 *ctd64; tdgst = VSD(tdgst, vsd); switch (vs->dtype) { case VSD_DTYPE_TDGSTCLUST32: VSD(tdgstclust32, tdgst)->smplcnt = 0; VSD(tdgstclust32, tdgst)->compcnt = 0; ctd32tree = &VSD(tdgstclust32, tdgst)->ctdtree; ARB_INIT(ctd32, ctdlnk, ctd32tree, ARB_MAXNODES(ctd32tree)) { ctd32->cnt = 0; Q_SIFVAL(ctd32->mu, 0); } #ifdef DIAGNOSTIC RB_INIT(&VSD(tdgstclust32, tdgst)->rbctdtree); #endif break; case VSD_DTYPE_TDGSTCLUST64: VSD(tdgstclust64, tdgst)->smplcnt = 0; VSD(tdgstclust64, tdgst)->compcnt = 0; ctd64tree = &VSD(tdgstclust64, tdgst)->ctdtree; ARB_INIT(ctd64, ctdlnk, ctd64tree, ARB_MAXNODES(ctd64tree)) { ctd64->cnt = 0; Q_SIFVAL(ctd64->mu, 0); } #ifdef DIAGNOSTIC RB_INIT(&VSD(tdgstclust64, tdgst)->rbctdtree); #endif break; default: return (0); } break; } default: KASSERT(0, ("Unknown VOI stat type %d", vs->stype)); break; } vs->errs = 0; vs->flags &= ~VS_VSDVALID; return (0); } int stats_v1_blob_snapshot(struct statsblobv1 **dst, size_t dstmaxsz, struct statsblobv1 *src, uint32_t flags) { int error; if (src != NULL && src->abi == STATS_ABI_V1) { error = stats_v1_blob_clone(dst, dstmaxsz, src, flags); if (!error) { if (flags & SB_CLONE_RSTSRC) { stats_v1_blob_iter(src, stats_v1_icb_reset_voistat, NULL, 0); src->lastrst = stats_sbinuptime(); } stats_v1_blob_finalise(*dst); } } else error = EINVAL; return (error); } static inline int stats_v1_voi_update_max(enum vsd_dtype voi_dtype __unused, struct voistatdata *voival, struct voistat *vs, void *vsd) { int error; KASSERT(vs->dtype < VSD_NUM_DTYPES, ("Unknown VSD dtype %d", vs->dtype)); error = 0; switch (vs->dtype) { case VSD_DTYPE_INT_S32: if (VSD(int32, vsd)->s32 < voival->int32.s32) { VSD(int32, vsd)->s32 = voival->int32.s32; vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_INT_U32: if (VSD(int32, vsd)->u32 < voival->int32.u32) { VSD(int32, vsd)->u32 = voival->int32.u32; vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_INT_S64: if (VSD(int64, vsd)->s64 < voival->int64.s64) { VSD(int64, vsd)->s64 = voival->int64.s64; vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_INT_U64: if (VSD(int64, vsd)->u64 < voival->int64.u64) { VSD(int64, vsd)->u64 = voival->int64.u64; vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_INT_SLONG: if (VSD(intlong, vsd)->slong < voival->intlong.slong) { VSD(intlong, vsd)->slong = voival->intlong.slong; vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_INT_ULONG: if (VSD(intlong, vsd)->ulong < voival->intlong.ulong) { VSD(intlong, vsd)->ulong = voival->intlong.ulong; vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_Q_S32: if (Q_QLTQ(VSD(q32, vsd)->sq32, voival->q32.sq32) && (0 == (error = Q_QCPYVALQ(&VSD(q32, vsd)->sq32, voival->q32.sq32)))) { vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_Q_U32: if (Q_QLTQ(VSD(q32, vsd)->uq32, voival->q32.uq32) && (0 == (error = Q_QCPYVALQ(&VSD(q32, vsd)->uq32, voival->q32.uq32)))) { vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_Q_S64: if (Q_QLTQ(VSD(q64, vsd)->sq64, voival->q64.sq64) && (0 == (error = Q_QCPYVALQ(&VSD(q64, vsd)->sq64, voival->q64.sq64)))) { vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_Q_U64: if (Q_QLTQ(VSD(q64, vsd)->uq64, voival->q64.uq64) && (0 == (error = Q_QCPYVALQ(&VSD(q64, vsd)->uq64, voival->q64.uq64)))) { vs->flags |= VS_VSDVALID; } break; default: error = EINVAL; break; } return (error); } static inline int stats_v1_voi_update_min(enum vsd_dtype voi_dtype __unused, struct voistatdata 
*voival, struct voistat *vs, void *vsd) { int error; KASSERT(vs->dtype < VSD_NUM_DTYPES, ("Unknown VSD dtype %d", vs->dtype)); error = 0; switch (vs->dtype) { case VSD_DTYPE_INT_S32: if (VSD(int32, vsd)->s32 > voival->int32.s32) { VSD(int32, vsd)->s32 = voival->int32.s32; vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_INT_U32: if (VSD(int32, vsd)->u32 > voival->int32.u32) { VSD(int32, vsd)->u32 = voival->int32.u32; vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_INT_S64: if (VSD(int64, vsd)->s64 > voival->int64.s64) { VSD(int64, vsd)->s64 = voival->int64.s64; vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_INT_U64: if (VSD(int64, vsd)->u64 > voival->int64.u64) { VSD(int64, vsd)->u64 = voival->int64.u64; vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_INT_SLONG: if (VSD(intlong, vsd)->slong > voival->intlong.slong) { VSD(intlong, vsd)->slong = voival->intlong.slong; vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_INT_ULONG: if (VSD(intlong, vsd)->ulong > voival->intlong.ulong) { VSD(intlong, vsd)->ulong = voival->intlong.ulong; vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_Q_S32: if (Q_QGTQ(VSD(q32, vsd)->sq32, voival->q32.sq32) && (0 == (error = Q_QCPYVALQ(&VSD(q32, vsd)->sq32, voival->q32.sq32)))) { vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_Q_U32: if (Q_QGTQ(VSD(q32, vsd)->uq32, voival->q32.uq32) && (0 == (error = Q_QCPYVALQ(&VSD(q32, vsd)->uq32, voival->q32.uq32)))) { vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_Q_S64: if (Q_QGTQ(VSD(q64, vsd)->sq64, voival->q64.sq64) && (0 == (error = Q_QCPYVALQ(&VSD(q64, vsd)->sq64, voival->q64.sq64)))) { vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_Q_U64: if (Q_QGTQ(VSD(q64, vsd)->uq64, voival->q64.uq64) && (0 == (error = Q_QCPYVALQ(&VSD(q64, vsd)->uq64, voival->q64.uq64)))) { vs->flags |= VS_VSDVALID; } break; default: error = EINVAL; break; } return (error); } static inline int stats_v1_voi_update_sum(enum vsd_dtype voi_dtype __unused, struct voistatdata *voival, struct voistat *vs, void *vsd) { int error; KASSERT(vs->dtype < VSD_NUM_DTYPES, ("Unknown VSD dtype %d", vs->dtype)); error = 0; switch (vs->dtype) { case VSD_DTYPE_INT_S32: VSD(int32, vsd)->s32 += voival->int32.s32; break; case VSD_DTYPE_INT_U32: VSD(int32, vsd)->u32 += voival->int32.u32; break; case VSD_DTYPE_INT_S64: VSD(int64, vsd)->s64 += voival->int64.s64; break; case VSD_DTYPE_INT_U64: VSD(int64, vsd)->u64 += voival->int64.u64; break; case VSD_DTYPE_INT_SLONG: VSD(intlong, vsd)->slong += voival->intlong.slong; break; case VSD_DTYPE_INT_ULONG: VSD(intlong, vsd)->ulong += voival->intlong.ulong; break; case VSD_DTYPE_Q_S32: error = Q_QADDQ(&VSD(q32, vsd)->sq32, voival->q32.sq32); break; case VSD_DTYPE_Q_U32: error = Q_QADDQ(&VSD(q32, vsd)->uq32, voival->q32.uq32); break; case VSD_DTYPE_Q_S64: error = Q_QADDQ(&VSD(q64, vsd)->sq64, voival->q64.sq64); break; case VSD_DTYPE_Q_U64: error = Q_QADDQ(&VSD(q64, vsd)->uq64, voival->q64.uq64); break; default: error = EINVAL; break; } if (!error) vs->flags |= VS_VSDVALID; return (error); } static inline int stats_v1_voi_update_hist(enum vsd_dtype voi_dtype, struct voistatdata *voival, struct voistat *vs, struct voistatdata_hist *hist) { struct voistatdata_numeric *bkt_lb, *bkt_ub; uint64_t *oob64, *cnt64; uint32_t *oob32, *cnt32; int error, i, found, is32bit, has_ub, eq_only; error = 0; switch (vs->dtype) { case VSD_DTYPE_CRHIST32: i = HIST_VSDSZ2NBKTS(crhist32, vs->dsz); is32bit = 1; has_ub = eq_only = 0; oob32 = &VSD(crhist32, hist)->oob; break; case VSD_DTYPE_DRHIST32: i = HIST_VSDSZ2NBKTS(drhist32, vs->dsz); 
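		/*
		 * Bucket matching flags: has_ub means buckets carry an
		 * explicit upper bound (DRHIST*), eq_only means a bucket
		 * matches one discrete value (DVHIST*); CRHIST* buckets have
		 * a lower bound only, so the top-down search below places a
		 * value in the highest bucket whose lower bound it meets or
		 * exceeds.
		 */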
is32bit = has_ub = 1; eq_only = 0; oob32 = &VSD(drhist32, hist)->oob; break; case VSD_DTYPE_DVHIST32: i = HIST_VSDSZ2NBKTS(dvhist32, vs->dsz); is32bit = eq_only = 1; has_ub = 0; oob32 = &VSD(dvhist32, hist)->oob; break; case VSD_DTYPE_CRHIST64: i = HIST_VSDSZ2NBKTS(crhist64, vs->dsz); is32bit = has_ub = eq_only = 0; oob64 = &VSD(crhist64, hist)->oob; break; case VSD_DTYPE_DRHIST64: i = HIST_VSDSZ2NBKTS(drhist64, vs->dsz); is32bit = eq_only = 0; has_ub = 1; oob64 = &VSD(drhist64, hist)->oob; break; case VSD_DTYPE_DVHIST64: i = HIST_VSDSZ2NBKTS(dvhist64, vs->dsz); is32bit = has_ub = 0; eq_only = 1; oob64 = &VSD(dvhist64, hist)->oob; break; default: return (EINVAL); } i--; /* Adjust for 0-based array index. */ /* XXXLAS: Should probably use a better bucket search algorithm. ARB? */ for (found = 0; i >= 0 && !found; i--) { switch (vs->dtype) { case VSD_DTYPE_CRHIST32: bkt_lb = &VSD(crhist32, hist)->bkts[i].lb; cnt32 = &VSD(crhist32, hist)->bkts[i].cnt; break; case VSD_DTYPE_DRHIST32: bkt_lb = &VSD(drhist32, hist)->bkts[i].lb; bkt_ub = &VSD(drhist32, hist)->bkts[i].ub; cnt32 = &VSD(drhist32, hist)->bkts[i].cnt; break; case VSD_DTYPE_DVHIST32: bkt_lb = &VSD(dvhist32, hist)->bkts[i].val; cnt32 = &VSD(dvhist32, hist)->bkts[i].cnt; break; case VSD_DTYPE_CRHIST64: bkt_lb = &VSD(crhist64, hist)->bkts[i].lb; cnt64 = &VSD(crhist64, hist)->bkts[i].cnt; break; case VSD_DTYPE_DRHIST64: bkt_lb = &VSD(drhist64, hist)->bkts[i].lb; bkt_ub = &VSD(drhist64, hist)->bkts[i].ub; cnt64 = &VSD(drhist64, hist)->bkts[i].cnt; break; case VSD_DTYPE_DVHIST64: bkt_lb = &VSD(dvhist64, hist)->bkts[i].val; cnt64 = &VSD(dvhist64, hist)->bkts[i].cnt; break; default: return (EINVAL); } switch (voi_dtype) { case VSD_DTYPE_INT_S32: if (voival->int32.s32 >= bkt_lb->int32.s32) { if ((eq_only && voival->int32.s32 == bkt_lb->int32.s32) || (!eq_only && (!has_ub || voival->int32.s32 < bkt_ub->int32.s32))) found = 1; } break; case VSD_DTYPE_INT_U32: if (voival->int32.u32 >= bkt_lb->int32.u32) { if ((eq_only && voival->int32.u32 == bkt_lb->int32.u32) || (!eq_only && (!has_ub || voival->int32.u32 < bkt_ub->int32.u32))) found = 1; } break; case VSD_DTYPE_INT_S64: if (voival->int64.s64 >= bkt_lb->int64.s64) if ((eq_only && voival->int64.s64 == bkt_lb->int64.s64) || (!eq_only && (!has_ub || voival->int64.s64 < bkt_ub->int64.s64))) found = 1; break; case VSD_DTYPE_INT_U64: if (voival->int64.u64 >= bkt_lb->int64.u64) if ((eq_only && voival->int64.u64 == bkt_lb->int64.u64) || (!eq_only && (!has_ub || voival->int64.u64 < bkt_ub->int64.u64))) found = 1; break; case VSD_DTYPE_INT_SLONG: if (voival->intlong.slong >= bkt_lb->intlong.slong) if ((eq_only && voival->intlong.slong == bkt_lb->intlong.slong) || (!eq_only && (!has_ub || voival->intlong.slong < bkt_ub->intlong.slong))) found = 1; break; case VSD_DTYPE_INT_ULONG: if (voival->intlong.ulong >= bkt_lb->intlong.ulong) if ((eq_only && voival->intlong.ulong == bkt_lb->intlong.ulong) || (!eq_only && (!has_ub || voival->intlong.ulong < bkt_ub->intlong.ulong))) found = 1; break; case VSD_DTYPE_Q_S32: if (Q_QGEQ(voival->q32.sq32, bkt_lb->q32.sq32)) if ((eq_only && Q_QEQ(voival->q32.sq32, bkt_lb->q32.sq32)) || (!eq_only && (!has_ub || Q_QLTQ(voival->q32.sq32, bkt_ub->q32.sq32)))) found = 1; break; case VSD_DTYPE_Q_U32: if (Q_QGEQ(voival->q32.uq32, bkt_lb->q32.uq32)) if ((eq_only && Q_QEQ(voival->q32.uq32, bkt_lb->q32.uq32)) || (!eq_only && (!has_ub || Q_QLTQ(voival->q32.uq32, bkt_ub->q32.uq32)))) found = 1; break; case VSD_DTYPE_Q_S64: if (Q_QGEQ(voival->q64.sq64, bkt_lb->q64.sq64)) if ((eq_only && 
Q_QEQ(voival->q64.sq64, bkt_lb->q64.sq64)) || (!eq_only && (!has_ub || Q_QLTQ(voival->q64.sq64, bkt_ub->q64.sq64)))) found = 1; break; case VSD_DTYPE_Q_U64: if (Q_QGEQ(voival->q64.uq64, bkt_lb->q64.uq64)) if ((eq_only && Q_QEQ(voival->q64.uq64, bkt_lb->q64.uq64)) || (!eq_only && (!has_ub || Q_QLTQ(voival->q64.uq64, bkt_ub->q64.uq64)))) found = 1; break; default: break; } } if (found) { if (is32bit) *cnt32 += 1; else *cnt64 += 1; } else { if (is32bit) *oob32 += 1; else *oob64 += 1; } vs->flags |= VS_VSDVALID; return (error); } static inline int stats_v1_vsd_tdgst_compress(enum vsd_dtype vs_dtype, struct voistatdata_tdgst *tdgst, int attempt) { struct ctdth32 *ctd32tree; struct ctdth64 *ctd64tree; struct voistatdata_tdgstctd32 *ctd32; struct voistatdata_tdgstctd64 *ctd64; uint64_t ebits, idxmask; uint32_t bitsperidx, nebits; int error, idx, is32bit, maxctds, remctds, tmperr; error = 0; switch (vs_dtype) { case VSD_DTYPE_TDGSTCLUST32: ctd32tree = &VSD(tdgstclust32, tdgst)->ctdtree; if (!ARB_FULL(ctd32tree)) return (0); VSD(tdgstclust32, tdgst)->compcnt++; maxctds = remctds = ARB_MAXNODES(ctd32tree); ARB_RESET_TREE(ctd32tree, ctdth32, maxctds); VSD(tdgstclust32, tdgst)->smplcnt = 0; is32bit = 1; ctd64tree = NULL; ctd64 = NULL; #ifdef DIAGNOSTIC RB_INIT(&VSD(tdgstclust32, tdgst)->rbctdtree); #endif break; case VSD_DTYPE_TDGSTCLUST64: ctd64tree = &VSD(tdgstclust64, tdgst)->ctdtree; if (!ARB_FULL(ctd64tree)) return (0); VSD(tdgstclust64, tdgst)->compcnt++; maxctds = remctds = ARB_MAXNODES(ctd64tree); ARB_RESET_TREE(ctd64tree, ctdth64, maxctds); VSD(tdgstclust64, tdgst)->smplcnt = 0; is32bit = 0; ctd32tree = NULL; ctd32 = NULL; #ifdef DIAGNOSTIC RB_INIT(&VSD(tdgstclust64, tdgst)->rbctdtree); #endif break; default: return (EINVAL); } /* * Rebuild the t-digest ARB by pseudorandomly selecting centroids and * re-inserting the mu/cnt of each as a value and corresponding weight. */ /* * XXXCEM: random(9) is currently rand(3), not random(3). rand(3) * RAND_MAX happens to be approximately 31 bits (range [0, * 0x7ffffffd]), so the math kinda works out. When/if this portion of * the code is compiled in userspace, it gets the random(3) behavior, * which has expected range [0, 0x7fffffff]. */ #define bitsperrand 31 ebits = 0; nebits = 0; bitsperidx = fls(maxctds); KASSERT(bitsperidx <= sizeof(ebits) << 3, ("%s: bitsperidx=%d, ebits=%d", __func__, bitsperidx, (int)(sizeof(ebits) << 3))); idxmask = (UINT64_C(1) << bitsperidx) - 1; /* Initialise the free list with randomised centroid indices. */ for (; remctds > 0; remctds--) { while (nebits < bitsperidx) { ebits |= ((uint64_t)random()) << nebits; nebits += bitsperrand; if (nebits > (sizeof(ebits) << 3)) nebits = sizeof(ebits) << 3; } idx = ebits & idxmask; nebits -= bitsperidx; ebits >>= bitsperidx; /* * Select the next centroid to put on the ARB free list. We * start with the centroid at our randomly selected array index, * and work our way forwards until finding one (the latter * aspect reduces re-insertion randomness, but is good enough). */ do { if (idx >= maxctds) idx %= maxctds; if (is32bit) ctd32 = ARB_NODE(ctd32tree, idx); else ctd64 = ARB_NODE(ctd64tree, idx); } while ((is32bit ? ARB_ISFREE(ctd32, ctdlnk) : ARB_ISFREE(ctd64, ctdlnk)) && ++idx); /* Put the centroid on the ARB free list. */ if (is32bit) ARB_RETURNFREE(ctd32tree, ctd32, ctdlnk); else ARB_RETURNFREE(ctd64tree, ctd64, ctdlnk); } /* * The free list now contains the randomised indices of every centroid. * Walk the free list from start to end, re-inserting each centroid's * mu/cnt. 
The tdgst_add() call may or may not consume the free centroid * we re-insert values from during each loop iteration, so we must latch * the index of the next free list centroid before the re-insertion * call. The previous loop above should have left the centroid pointer * pointing to the element at the head of the free list. */ KASSERT((is32bit ? ARB_FREEIDX(ctd32tree) == ARB_SELFIDX(ctd32tree, ctd32) : ARB_FREEIDX(ctd64tree) == ARB_SELFIDX(ctd64tree, ctd64)), ("%s: t-digest ARB@%p free list bug", __func__, (is32bit ? (void *)ctd32tree : (void *)ctd64tree))); remctds = maxctds; while ((is32bit ? ctd32 != NULL : ctd64 != NULL)) { tmperr = 0; if (is32bit) { s64q_t x; idx = ARB_NEXTFREEIDX(ctd32, ctdlnk); /* Cloning a s32q_t into a s64q_t should never fail. */ tmperr = Q_QCLONEQ(&x, ctd32->mu); tmperr = tmperr ? tmperr : stats_v1_vsd_tdgst_add( vs_dtype, tdgst, x, ctd32->cnt, attempt); ctd32 = ARB_NODE(ctd32tree, idx); KASSERT(ctd32 == NULL || ARB_ISFREE(ctd32, ctdlnk), ("%s: t-digest ARB@%p free list bug", __func__, ctd32tree)); } else { idx = ARB_NEXTFREEIDX(ctd64, ctdlnk); tmperr = stats_v1_vsd_tdgst_add(vs_dtype, tdgst, ctd64->mu, ctd64->cnt, attempt); ctd64 = ARB_NODE(ctd64tree, idx); KASSERT(ctd64 == NULL || ARB_ISFREE(ctd64, ctdlnk), ("%s: t-digest ARB@%p free list bug", __func__, ctd64tree)); } /* * This process should not produce errors, bugs notwithstanding. * Just in case, latch any errors and attempt all re-insertions. */ error = tmperr ? tmperr : error; remctds--; } KASSERT(remctds == 0, ("%s: t-digest ARB@%p free list bug", __func__, (is32bit ? (void *)ctd32tree : (void *)ctd64tree))); return (error); } static inline int stats_v1_vsd_tdgst_add(enum vsd_dtype vs_dtype, struct voistatdata_tdgst *tdgst, s64q_t x, uint64_t weight, int attempt) { #ifdef DIAGNOSTIC char qstr[Q_MAXSTRLEN(x, 10)]; #endif struct ctdth32 *ctd32tree; struct ctdth64 *ctd64tree; void *closest, *cur, *lb, *ub; struct voistatdata_tdgstctd32 *ctd32; struct voistatdata_tdgstctd64 *ctd64; uint64_t cnt, smplcnt, sum, tmpsum; s64q_t k, minz, q, z; int error, is32bit, n; error = 0; minz = Q_INI(&z, 0, 0, Q_NFBITS(x)); switch (vs_dtype) { case VSD_DTYPE_TDGSTCLUST32: if ((UINT32_MAX - weight) < VSD(tdgstclust32, tdgst)->smplcnt) error = EOVERFLOW; smplcnt = VSD(tdgstclust32, tdgst)->smplcnt; ctd32tree = &VSD(tdgstclust32, tdgst)->ctdtree; is32bit = 1; ctd64tree = NULL; ctd64 = NULL; break; case VSD_DTYPE_TDGSTCLUST64: if ((UINT64_MAX - weight) < VSD(tdgstclust64, tdgst)->smplcnt) error = EOVERFLOW; smplcnt = VSD(tdgstclust64, tdgst)->smplcnt; ctd64tree = &VSD(tdgstclust64, tdgst)->ctdtree; is32bit = 0; ctd32tree = NULL; ctd32 = NULL; break; default: error = EINVAL; break; } if (error) return (error); /* * Inspired by Ted Dunning's AVLTreeDigest.java */ do { #if defined(DIAGNOSTIC) KASSERT(attempt < 5, ("%s: Too many attempts", __func__)); #endif if (attempt >= 5) return (EAGAIN); Q_SIFVAL(minz, Q_IFMAXVAL(minz)); closest = ub = NULL; sum = tmpsum = 0; if (is32bit) lb = cur = (void *)(ctd32 = ARB_MIN(ctdth32, ctd32tree)); else lb = cur = (void *)(ctd64 = ARB_MIN(ctdth64, ctd64tree)); if (lb == NULL) /* Empty tree. */ lb = (is32bit ? (void *)ARB_ROOT(ctd32tree) : (void *)ARB_ROOT(ctd64tree)); /* * Find the set of centroids with minimum distance to x and * compute the sum of counts for all centroids with mean less * than the first centroid in the set. */ for (; cur != NULL; cur = (is32bit ? 
(void *)(ctd32 = ARB_NEXT(ctdth32, ctd32tree, ctd32)) : (void *)(ctd64 = ARB_NEXT(ctdth64, ctd64tree, ctd64)))) { if (is32bit) { cnt = ctd32->cnt; KASSERT(Q_PRECEQ(ctd32->mu, x), ("%s: Q_RELPREC(mu,x)=%d", __func__, Q_RELPREC(ctd32->mu, x))); /* Ok to assign as both have same precision. */ z = ctd32->mu; } else { cnt = ctd64->cnt; KASSERT(Q_PRECEQ(ctd64->mu, x), ("%s: Q_RELPREC(mu,x)=%d", __func__, Q_RELPREC(ctd64->mu, x))); /* Ok to assign as both have same precision. */ z = ctd64->mu; } error = Q_QSUBQ(&z, x); #if defined(DIAGNOSTIC) KASSERT(!error, ("%s: unexpected error %d", __func__, error)); #endif if (error) return (error); z = Q_QABS(z); if (Q_QLTQ(z, minz)) { minz = z; lb = cur; sum = tmpsum; tmpsum += cnt; } else if (Q_QGTQ(z, minz)) { ub = cur; break; } } cur = (is32bit ? (void *)(ctd32 = (struct voistatdata_tdgstctd32 *)lb) : (void *)(ctd64 = (struct voistatdata_tdgstctd64 *)lb)); for (n = 0; cur != ub; cur = (is32bit ? (void *)(ctd32 = ARB_NEXT(ctdth32, ctd32tree, ctd32)) : (void *)(ctd64 = ARB_NEXT(ctdth64, ctd64tree, ctd64)))) { if (is32bit) cnt = ctd32->cnt; else cnt = ctd64->cnt; q = Q_CTRLINI(16); if (smplcnt == 1) error = Q_QFRACI(&q, 1, 2); else /* [ sum + ((cnt - 1) / 2) ] / (smplcnt - 1) */ error = Q_QFRACI(&q, (sum << 1) + cnt - 1, (smplcnt - 1) << 1); k = q; /* k = q x 4 x samplcnt x attempt */ error |= Q_QMULI(&k, 4 * smplcnt * attempt); /* k = k x (1 - q) */ error |= Q_QSUBI(&q, 1); q = Q_QABS(q); error |= Q_QMULQ(&k, q); #if defined(DIAGNOSTIC) #if !defined(_KERNEL) double q_dbl, k_dbl, q2d, k2d; q2d = Q_Q2D(q); k2d = Q_Q2D(k); q_dbl = smplcnt == 1 ? 0.5 : (sum + ((cnt - 1) / 2.0)) / (double)(smplcnt - 1); k_dbl = 4 * smplcnt * q_dbl * (1.0 - q_dbl) * attempt; /* * If the difference between q and q_dbl is greater than * the fractional precision of q, something is off. * NB: q is holding the value of 1 - q */ q_dbl = 1.0 - q_dbl; KASSERT((q_dbl > q2d ? q_dbl - q2d : q2d - q_dbl) < (1.05 * ((double)1 / (double)(1ULL << Q_NFBITS(q)))), ("Q-type q bad precision")); KASSERT((k_dbl > k2d ? k_dbl - k2d : k2d - k_dbl) < 1.0 + (0.01 * smplcnt), ("Q-type k bad precision")); #endif /* !_KERNEL */ KASSERT(!error, ("%s: unexpected error %d", __func__, error)); #endif /* DIAGNOSTIC */ if (error) return (error); if ((is32bit && ((ctd32->cnt + weight) <= (uint64_t)Q_GIVAL(k))) || (!is32bit && ((ctd64->cnt + weight) <= (uint64_t)Q_GIVAL(k)))) { n++; /* random() produces 31 bits. */ if (random() < (INT32_MAX / n)) closest = cur; } sum += cnt; } } while (closest == NULL && (is32bit ? ARB_FULL(ctd32tree) : ARB_FULL(ctd64tree)) && (error = stats_v1_vsd_tdgst_compress(vs_dtype, tdgst, attempt++)) == 0); if (error) return (error); if (closest != NULL) { /* Merge with an existing centroid. */ if (is32bit) { ctd32 = (struct voistatdata_tdgstctd32 *)closest; error = Q_QSUBQ(&x, ctd32->mu); /* * The following calculation "x / (cnt + weight)" * computes the amount by which to adjust the centroid's * mu value in order to merge in the VOI sample. * * It can underflow (Q_QDIVI() returns ERANGE) when the * user centroids' fractional precision (which is * inherited by 'x') is too low to represent the result. * * A sophisticated approach to dealing with this issue * would minimise accumulation of error by tracking * underflow per centroid and making an adjustment when * a LSB's worth of underflow has accumulated. * * A simpler approach is to let the result underflow * i.e. 
merge the VOI sample into the centroid without * adjusting the centroid's mu, and rely on the user to * specify their t-digest with sufficient centroid * fractional precision such that the accumulation of * error from multiple underflows is of no material * consequence to the centroid's final value of mu. * * For the moment, the latter approach is employed by * simply ignoring ERANGE here. * * XXXLAS: Per-centroid underflow tracking is likely too * onerous, but it probably makes sense to accumulate a * single underflow error variable across all centroids * and report it as part of the digest to provide * additional visibility into the digest's fidelity. */ error = error ? error : Q_QDIVI(&x, ctd32->cnt + weight); if ((error && error != ERANGE) || (error = Q_QADDQ(&ctd32->mu, x))) { #ifdef DIAGNOSTIC KASSERT(!error, ("%s: unexpected error %d", __func__, error)); #endif return (error); } ctd32->cnt += weight; error = ARB_REINSERT(ctdth32, ctd32tree, ctd32) == NULL ? 0 : EALREADY; #ifdef DIAGNOSTIC RB_REINSERT(rbctdth32, &VSD(tdgstclust32, tdgst)->rbctdtree, ctd32); #endif } else { ctd64 = (struct voistatdata_tdgstctd64 *)closest; error = Q_QSUBQ(&x, ctd64->mu); error = error ? error : Q_QDIVI(&x, ctd64->cnt + weight); /* Refer to is32bit ERANGE discussion above. */ if ((error && error != ERANGE) || (error = Q_QADDQ(&ctd64->mu, x))) { KASSERT(!error, ("%s: unexpected error %d", __func__, error)); return (error); } ctd64->cnt += weight; error = ARB_REINSERT(ctdth64, ctd64tree, ctd64) == NULL ? 0 : EALREADY; #ifdef DIAGNOSTIC RB_REINSERT(rbctdth64, &VSD(tdgstclust64, tdgst)->rbctdtree, ctd64); #endif } } else { /* * Add a new centroid. If digest compression is working * correctly, there should always be at least one free. */ if (is32bit) { ctd32 = ARB_GETFREE(ctd32tree, ctdlnk); #ifdef DIAGNOSTIC KASSERT(ctd32 != NULL, ("%s: t-digest@%p has no free centroids", __func__, tdgst)); #endif if (ctd32 == NULL) return (EAGAIN); if ((error = Q_QCPYVALQ(&ctd32->mu, x))) return (error); ctd32->cnt = weight; error = ARB_INSERT(ctdth32, ctd32tree, ctd32) == NULL ? 0 : EALREADY; #ifdef DIAGNOSTIC RB_INSERT(rbctdth32, &VSD(tdgstclust32, tdgst)->rbctdtree, ctd32); #endif } else { ctd64 = ARB_GETFREE(ctd64tree, ctdlnk); #ifdef DIAGNOSTIC KASSERT(ctd64 != NULL, ("%s: t-digest@%p has no free centroids", __func__, tdgst)); #endif if (ctd64 == NULL) /* Should not happen. */ return (EAGAIN); /* Direct assignment ok as both have same type/prec. */ ctd64->mu = x; ctd64->cnt = weight; error = ARB_INSERT(ctdth64, ctd64tree, ctd64) == NULL ? 0 : EALREADY; #ifdef DIAGNOSTIC RB_INSERT(rbctdth64, &VSD(tdgstclust64, tdgst)->rbctdtree, ctd64); #endif } } if (is32bit) VSD(tdgstclust32, tdgst)->smplcnt += weight; else { VSD(tdgstclust64, tdgst)->smplcnt += weight; #ifdef DIAGNOSTIC struct rbctdth64 *rbctdtree = &VSD(tdgstclust64, tdgst)->rbctdtree; struct voistatdata_tdgstctd64 *rbctd64; int i = 0; ARB_FOREACH(ctd64, ctdth64, ctd64tree) { rbctd64 = (i == 0 ? 
RB_MIN(rbctdth64, rbctdtree) : RB_NEXT(rbctdth64, rbctdtree, rbctd64)); if (i >= ARB_CURNODES(ctd64tree) || ctd64 != rbctd64 || ARB_MIN(ctdth64, ctd64tree) != RB_MIN(rbctdth64, rbctdtree) || ARB_MAX(ctdth64, ctd64tree) != RB_MAX(rbctdth64, rbctdtree) || ARB_LEFTIDX(ctd64, ctdlnk) != ARB_SELFIDX(ctd64tree, RB_LEFT(rbctd64, rblnk)) || ARB_RIGHTIDX(ctd64, ctdlnk) != ARB_SELFIDX(ctd64tree, RB_RIGHT(rbctd64, rblnk)) || ARB_PARENTIDX(ctd64, ctdlnk) != ARB_SELFIDX(ctd64tree, RB_PARENT(rbctd64, rblnk))) { Q_TOSTR(ctd64->mu, -1, 10, qstr, sizeof(qstr)); printf("ARB ctd=%3d p=%3d l=%3d r=%3d c=%2d " "mu=%s\n", (int)ARB_SELFIDX(ctd64tree, ctd64), ARB_PARENTIDX(ctd64, ctdlnk), ARB_LEFTIDX(ctd64, ctdlnk), ARB_RIGHTIDX(ctd64, ctdlnk), ARB_COLOR(ctd64, ctdlnk), qstr); Q_TOSTR(rbctd64->mu, -1, 10, qstr, sizeof(qstr)); struct voistatdata_tdgstctd64 *parent; parent = RB_PARENT(rbctd64, rblnk); int rb_color = parent == NULL ? 0 : RB_LEFT(parent, rblnk) == rbctd64 ? (_RB_BITSUP(parent, rblnk) & _RB_L) != 0 : (_RB_BITSUP(parent, rblnk) & _RB_R) != 0; printf(" RB ctd=%3d p=%3d l=%3d r=%3d c=%2d " "mu=%s\n", (int)ARB_SELFIDX(ctd64tree, rbctd64), (int)ARB_SELFIDX(ctd64tree, RB_PARENT(rbctd64, rblnk)), (int)ARB_SELFIDX(ctd64tree, RB_LEFT(rbctd64, rblnk)), (int)ARB_SELFIDX(ctd64tree, RB_RIGHT(rbctd64, rblnk)), rb_color, qstr); panic("RB@%p and ARB@%p trees differ\n", rbctdtree, ctd64tree); } i++; } #endif /* DIAGNOSTIC */ } return (error); } static inline int stats_v1_voi_update_tdgst(enum vsd_dtype voi_dtype, struct voistatdata *voival, struct voistat *vs, struct voistatdata_tdgst *tdgst) { s64q_t x; int error; error = 0; switch (vs->dtype) { case VSD_DTYPE_TDGSTCLUST32: /* Use same precision as the user's centroids. */ Q_INI(&x, 0, 0, Q_NFBITS( ARB_CNODE(&VSD(tdgstclust32, tdgst)->ctdtree, 0)->mu)); break; case VSD_DTYPE_TDGSTCLUST64: /* Use same precision as the user's centroids. */ Q_INI(&x, 0, 0, Q_NFBITS( ARB_CNODE(&VSD(tdgstclust64, tdgst)->ctdtree, 0)->mu)); break; default: KASSERT(vs->dtype == VSD_DTYPE_TDGSTCLUST32 || vs->dtype == VSD_DTYPE_TDGSTCLUST64, ("%s: vs->dtype(%d) != VSD_DTYPE_TDGSTCLUST<32|64>", __func__, vs->dtype)); return (EINVAL); } /* * XXXLAS: Should have both a signed and unsigned 'x' variable to avoid * returning EOVERFLOW if the voival would have fit in a u64q_t. 
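 * Concretely, a u64 (or ulong) sample that exceeds the signed integer
 * range of 'x' makes the copy below fail with EOVERFLOW, even when an
 * unsigned Q type of the same fractional precision could have held it.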
*/ switch (voi_dtype) { case VSD_DTYPE_INT_S32: error = Q_QCPYVALI(&x, voival->int32.s32); break; case VSD_DTYPE_INT_U32: error = Q_QCPYVALI(&x, voival->int32.u32); break; case VSD_DTYPE_INT_S64: error = Q_QCPYVALI(&x, voival->int64.s64); break; case VSD_DTYPE_INT_U64: error = Q_QCPYVALI(&x, voival->int64.u64); break; case VSD_DTYPE_INT_SLONG: error = Q_QCPYVALI(&x, voival->intlong.slong); break; case VSD_DTYPE_INT_ULONG: error = Q_QCPYVALI(&x, voival->intlong.ulong); break; case VSD_DTYPE_Q_S32: error = Q_QCPYVALQ(&x, voival->q32.sq32); break; case VSD_DTYPE_Q_U32: error = Q_QCPYVALQ(&x, voival->q32.uq32); break; case VSD_DTYPE_Q_S64: error = Q_QCPYVALQ(&x, voival->q64.sq64); break; case VSD_DTYPE_Q_U64: error = Q_QCPYVALQ(&x, voival->q64.uq64); break; default: error = EINVAL; break; } if (error || (error = stats_v1_vsd_tdgst_add(vs->dtype, tdgst, x, 1, 1))) return (error); vs->flags |= VS_VSDVALID; return (0); } int stats_v1_voi_update(struct statsblobv1 *sb, int32_t voi_id, enum vsd_dtype voi_dtype, struct voistatdata *voival, uint32_t flags) { struct voi *v; struct voistat *vs; void *statevsd, *vsd; int error, i, tmperr; error = 0; if (sb == NULL || sb->abi != STATS_ABI_V1 || voi_id >= NVOIS(sb) || voi_dtype == 0 || voi_dtype >= VSD_NUM_DTYPES || voival == NULL) return (EINVAL); v = &sb->vois[voi_id]; if (voi_dtype != v->dtype || v->id < 0 || ((flags & SB_VOI_RELUPDATE) && !(v->flags & VOI_REQSTATE))) return (EINVAL); vs = BLOB_OFFSET(sb, v->stats_off); if (v->flags & VOI_REQSTATE) statevsd = BLOB_OFFSET(sb, vs->data_off); else statevsd = NULL; if (flags & SB_VOI_RELUPDATE) { switch (voi_dtype) { case VSD_DTYPE_INT_S32: voival->int32.s32 += VSD(voistate, statevsd)->prev.int32.s32; break; case VSD_DTYPE_INT_U32: voival->int32.u32 += VSD(voistate, statevsd)->prev.int32.u32; break; case VSD_DTYPE_INT_S64: voival->int64.s64 += VSD(voistate, statevsd)->prev.int64.s64; break; case VSD_DTYPE_INT_U64: voival->int64.u64 += VSD(voistate, statevsd)->prev.int64.u64; break; case VSD_DTYPE_INT_SLONG: voival->intlong.slong += VSD(voistate, statevsd)->prev.intlong.slong; break; case VSD_DTYPE_INT_ULONG: voival->intlong.ulong += VSD(voistate, statevsd)->prev.intlong.ulong; break; case VSD_DTYPE_Q_S32: error = Q_QADDQ(&voival->q32.sq32, VSD(voistate, statevsd)->prev.q32.sq32); break; case VSD_DTYPE_Q_U32: error = Q_QADDQ(&voival->q32.uq32, VSD(voistate, statevsd)->prev.q32.uq32); break; case VSD_DTYPE_Q_S64: error = Q_QADDQ(&voival->q64.sq64, VSD(voistate, statevsd)->prev.q64.sq64); break; case VSD_DTYPE_Q_U64: error = Q_QADDQ(&voival->q64.uq64, VSD(voistate, statevsd)->prev.q64.uq64); break; default: KASSERT(0, ("Unknown VOI data type %d", voi_dtype)); break; } } if (error) return (error); for (i = v->voistatmaxid; i > 0; i--) { vs = &((struct voistat *)BLOB_OFFSET(sb, v->stats_off))[i]; if (vs->stype < 0) continue; vsd = BLOB_OFFSET(sb, vs->data_off); switch (vs->stype) { case VS_STYPE_MAX: tmperr = stats_v1_voi_update_max(voi_dtype, voival, vs, vsd); break; case VS_STYPE_MIN: tmperr = stats_v1_voi_update_min(voi_dtype, voival, vs, vsd); break; case VS_STYPE_SUM: tmperr = stats_v1_voi_update_sum(voi_dtype, voival, vs, vsd); break; case VS_STYPE_HIST: tmperr = stats_v1_voi_update_hist(voi_dtype, voival, vs, vsd); break; case VS_STYPE_TDGST: tmperr = stats_v1_voi_update_tdgst(voi_dtype, voival, vs, vsd); break; default: KASSERT(0, ("Unknown VOI stat type %d", vs->stype)); break; } if (tmperr) { error = tmperr; VS_INCERRS(vs); } } if (statevsd) { switch (voi_dtype) { case VSD_DTYPE_INT_S32: VSD(voistate, 
statevsd)->prev.int32.s32 = voival->int32.s32; break; case VSD_DTYPE_INT_U32: VSD(voistate, statevsd)->prev.int32.u32 = voival->int32.u32; break; case VSD_DTYPE_INT_S64: VSD(voistate, statevsd)->prev.int64.s64 = voival->int64.s64; break; case VSD_DTYPE_INT_U64: VSD(voistate, statevsd)->prev.int64.u64 = voival->int64.u64; break; case VSD_DTYPE_INT_SLONG: VSD(voistate, statevsd)->prev.intlong.slong = voival->intlong.slong; break; case VSD_DTYPE_INT_ULONG: VSD(voistate, statevsd)->prev.intlong.ulong = voival->intlong.ulong; break; case VSD_DTYPE_Q_S32: error = Q_QCPYVALQ( &VSD(voistate, statevsd)->prev.q32.sq32, voival->q32.sq32); break; case VSD_DTYPE_Q_U32: error = Q_QCPYVALQ( &VSD(voistate, statevsd)->prev.q32.uq32, voival->q32.uq32); break; case VSD_DTYPE_Q_S64: error = Q_QCPYVALQ( &VSD(voistate, statevsd)->prev.q64.sq64, voival->q64.sq64); break; case VSD_DTYPE_Q_U64: error = Q_QCPYVALQ( &VSD(voistate, statevsd)->prev.q64.uq64, voival->q64.uq64); break; default: KASSERT(0, ("Unknown VOI data type %d", voi_dtype)); break; } } return (error); } #ifdef _KERNEL static void stats_init(void *arg) { } SYSINIT(stats, SI_SUB_KDTRACE, SI_ORDER_FIRST, stats_init, NULL); /* * Sysctl handler to display the list of available stats templates. */ static int stats_tpl_list_available(SYSCTL_HANDLER_ARGS) { struct sbuf *s; int err, i; err = 0; /* We can tolerate ntpl being stale, so do not take the lock. */ s = sbuf_new(NULL, NULL, /* +1 per tpl for , */ ntpl * (STATS_TPL_MAX_STR_SPEC_LEN + 1), SBUF_FIXEDLEN); if (s == NULL) return (ENOMEM); TPL_LIST_RLOCK(); for (i = 0; i < ntpl; i++) { err = sbuf_printf(s, "%s\"%s\":%u", i ? "," : "", tpllist[i]->mb->tplname, tpllist[i]->mb->tplhash); if (err) { /* Sbuf overflow condition. */ err = EOVERFLOW; break; } } TPL_LIST_RUNLOCK(); if (!err) { sbuf_finish(s); err = sysctl_handle_string(oidp, sbuf_data(s), 0, req); } sbuf_delete(s); return (err); } /* * Called by subsystem-specific sysctls to report and/or parse the list of * templates being sampled and their sampling rates. A stats_tpl_sr_cb_t * conformant function pointer must be passed in as arg1, which is used to * interact with the subsystem's stats template sample rates list. If arg2 > 0, * a zero-initialised allocation of arg2-sized contextual memory is * heap-allocated and passed in to all subsystem callbacks made during the * operation of stats_tpl_sample_rates(). * * XXXLAS: Assumes templates are never removed, which is currently true but may * need to be reworked in future if dynamic template management becomes a * requirement e.g. to support kernel module based templates. */ int stats_tpl_sample_rates(SYSCTL_HANDLER_ARGS) { char kvpair_fmt[16], tplspec_fmt[16]; char tpl_spec[STATS_TPL_MAX_STR_SPEC_LEN]; char tpl_name[TPL_MAX_NAME_LEN + 2]; /* +2 for "" */ stats_tpl_sr_cb_t subsys_cb; void *subsys_ctx; char *buf, *new_rates_usr_str, *tpl_name_p; struct stats_tpl_sample_rate *rates; struct sbuf *s, _s; uint32_t cum_pct, pct, tpl_hash; int err, i, off, len, newlen, nrates; buf = NULL; rates = NULL; err = nrates = 0; subsys_cb = (stats_tpl_sr_cb_t)arg1; KASSERT(subsys_cb != NULL, ("%s: subsys_cb == arg1 == NULL", __func__)); if (arg2 > 0) subsys_ctx = malloc(arg2, M_TEMP, M_WAITOK | M_ZERO); else subsys_ctx = NULL; /* Grab current count of subsystem rates. 
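 * This is the first of several callback ops used here: the unlocked count
 * fetched below sizes the output buffer, TPL_SR_RLOCKED_GET/TPL_SR_RUNLOCK
 * bracket rendering of the current rates list, and TPL_SR_PUT hands a newly
 * parsed list back to the subsystem.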
*/ err = subsys_cb(TPL_SR_UNLOCKED_GET, NULL, &nrates, subsys_ctx); if (err) goto done; /* +1 to ensure we can append '\0' post copyin, +5 per rate for =nnn, */ len = max(req->newlen + 1, nrates * (STATS_TPL_MAX_STR_SPEC_LEN + 5)); if (req->oldptr != NULL || req->newptr != NULL) buf = malloc(len, M_TEMP, M_WAITOK); if (req->oldptr != NULL) { if (nrates == 0) { /* No rates, so return an empty string via oldptr. */ err = SYSCTL_OUT(req, "", 1); if (err) goto done; goto process_new; } s = sbuf_new(&_s, buf, len, SBUF_FIXEDLEN | SBUF_INCLUDENUL); /* Grab locked count of, and ptr to, subsystem rates. */ err = subsys_cb(TPL_SR_RLOCKED_GET, &rates, &nrates, subsys_ctx); if (err) goto done; TPL_LIST_RLOCK(); for (i = 0; i < nrates && !err; i++) { err = sbuf_printf(s, "%s\"%s\":%u=%u", i ? "," : "", tpllist[rates[i].tpl_slot_id]->mb->tplname, tpllist[rates[i].tpl_slot_id]->mb->tplhash, rates[i].tpl_sample_pct); } TPL_LIST_RUNLOCK(); /* Tell subsystem that we're done with its rates list. */ err = subsys_cb(TPL_SR_RUNLOCK, &rates, &nrates, subsys_ctx); if (err) goto done; err = sbuf_finish(s); if (err) goto done; /* We lost a race for buf to be too small. */ /* Return the rendered string data via oldptr. */ err = SYSCTL_OUT(req, sbuf_data(s), sbuf_len(s)); } else { /* Return the upper bound size for buffer sizing requests. */ err = SYSCTL_OUT(req, NULL, len); } process_new: if (err || req->newptr == NULL) goto done; newlen = req->newlen - req->newidx; err = SYSCTL_IN(req, buf, newlen); if (err) goto done; /* * Initialise format strings at run time. * * Write the max template spec string length into the * template_spec=percent key-value pair parsing format string as: * " %[^=]=%u %n" * * Write the max template name string length into the tplname:tplhash * parsing format string as: * "%[^:]:%u" * * Subtract 1 for \0 appended by sscanf(). */ sprintf(kvpair_fmt, " %%%zu[^=]=%%u %%n", sizeof(tpl_spec) - 1); sprintf(tplspec_fmt, "%%%zu[^:]:%%u", sizeof(tpl_name) - 1); /* * Parse each CSV key-value pair specifying a template and its sample * percentage. Whitespace either side of a key-value pair is ignored. * Templates can be specified by name, hash, or name and hash per the * following formats (chars in [] are optional): * ["]["]= * :hash=pct * ["]["]:hash= */ cum_pct = nrates = 0; rates = NULL; buf[newlen] = '\0'; /* buf is at least newlen+1 in size. */ new_rates_usr_str = buf; while (isspace(*new_rates_usr_str)) new_rates_usr_str++; /* Skip leading whitespace. */ while (*new_rates_usr_str != '\0') { tpl_name_p = tpl_name; tpl_name[0] = '\0'; tpl_hash = 0; off = 0; /* * Parse key-value pair which must perform 2 conversions, then * parse the template spec to extract either name, hash, or name * and hash depending on the three possible spec formats. The * tplspec_fmt format specifier parses name or name and hash * template specs, while the ":%u" format specifier parses * hash-only template specs. If parsing is successfull, ensure * the cumulative sampling percentage does not exceed 100. */ err = EINVAL; if (2 != sscanf(new_rates_usr_str, kvpair_fmt, tpl_spec, &pct, &off)) break; if ((1 > sscanf(tpl_spec, tplspec_fmt, tpl_name, &tpl_hash)) && (1 != sscanf(tpl_spec, ":%u", &tpl_hash))) break; if ((cum_pct += pct) > 100) break; err = 0; /* Strip surrounding "" from template name if present. */ len = strlen(tpl_name); if (len > 0) { if (tpl_name[len - 1] == '"') tpl_name[--len] = '\0'; if (tpl_name[0] == '"') { tpl_name_p++; len--; } } rates = stats_realloc(rates, 0, /* oldsz is unused in kernel. 
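 * For example (template names and hashes purely illustrative), an input of
 *   mytpl=50,"othertpl":1234567890=25,:987654321=25
 * uses the name-only, name:hash and hash-only spec formats described above
 * with a cumulative sampling percentage of 100.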
*/ (nrates + 1) * sizeof(*rates), M_WAITOK); rates[nrates].tpl_slot_id = stats_tpl_fetch_allocid(len ? tpl_name_p : NULL, tpl_hash); if (rates[nrates].tpl_slot_id < 0) { err = -rates[nrates].tpl_slot_id; break; } rates[nrates].tpl_sample_pct = pct; nrates++; new_rates_usr_str += off; if (*new_rates_usr_str != ',') break; /* End-of-input or malformed. */ new_rates_usr_str++; /* Move past comma to next pair. */ } if (!err) { if ((new_rates_usr_str - buf) < newlen) { /* Entire input has not been consumed. */ err = EINVAL; } else { /* * Give subsystem the new rates. They'll return the * appropriate rates pointer for us to garbage collect. */ err = subsys_cb(TPL_SR_PUT, &rates, &nrates, subsys_ctx); } } stats_free(rates); done: free(buf, M_TEMP); free(subsys_ctx, M_TEMP); return (err); } SYSCTL_NODE(_kern, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "stats(9) MIB"); SYSCTL_PROC(_kern_stats, OID_AUTO, templates, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, stats_tpl_list_available, "A", "list the name/hash of all available stats(9) templates"); #else /* ! _KERNEL */ static void __attribute__ ((constructor)) stats_constructor(void) { pthread_rwlock_init(&tpllistlock, NULL); } static void __attribute__ ((destructor)) stats_destructor(void) { pthread_rwlock_destroy(&tpllistlock); } #endif /* _KERNEL */ diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c index 5b9f8afd9565..aa189e8cd057 100644 --- a/sys/kern/subr_witness.c +++ b/sys/kern/subr_witness.c @@ -1,3183 +1,3183 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2008 Isilon Systems, Inc. * Copyright (c) 2008 Ilya Maykov * Copyright (c) 1998 Berkeley Software Design, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ */ /* * Implementation of the `witness' lock verifier. Originally implemented for * mutexes in BSD/OS. Extended to handle generic lock objects and lock * classes in FreeBSD. 
*/ /* * Main Entry: witness * Pronunciation: 'wit-n&s * Function: noun * Etymology: Middle English witnesse, from Old English witnes knowledge, * testimony, witness, from 2wit * Date: before 12th century * 1 : attestation of a fact or event : TESTIMONY * 2 : one that gives evidence; specifically : one who testifies in * a cause or before a judicial tribunal * 3 : one asked to be present at a transaction so as to be able to * testify to its having taken place * 4 : one who has personal knowledge of something * 5 a : something serving as evidence or proof : SIGN * b : public affirmation by word or example of usually * religious faith or conviction * 6 capitalized : a member of the Jehovah's Witnesses */ /* * Special rules concerning Giant and lock orders: * * 1) Giant must be acquired before any other mutexes. Stated another way, * no other mutex may be held when Giant is acquired. * * 2) Giant must be released when blocking on a sleepable lock. * * This rule is less obvious, but is a result of Giant providing the same * semantics as spl(). Basically, when a thread sleeps, it must release * Giant. When a thread blocks on a sleepable lock, it sleeps. Hence rule * 2). * * 3) Giant may be acquired before or after sleepable locks. * * This rule is also not quite as obvious. Giant may be acquired after * a sleepable lock because it is a non-sleepable lock and non-sleepable * locks may always be acquired while holding a sleepable lock. The second * case, Giant before a sleepable lock, follows from rule 2) above. Suppose * you have two threads T1 and T2 and a sleepable lock X. Suppose that T1 * acquires X and blocks on Giant. Then suppose that T2 acquires Giant and * blocks on X. When T2 blocks on X, T2 will release Giant allowing T1 to * execute. Thus, acquiring Giant both before and after a sleepable lock * will not result in a lock order reversal. */ #include #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_stack.h" #include "opt_witness.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #if !defined(DDB) && !defined(STACK) #error "DDB or STACK options are required for WITNESS" #endif /* Note that these traces do not work with KTR_ALQ. */ #if 0 #define KTR_WITNESS KTR_SUBSYS #else #define KTR_WITNESS 0 #endif #define LI_RECURSEMASK 0x0000ffff /* Recursion depth of lock instance. */ #define LI_EXCLUSIVE 0x00010000 /* Exclusive lock instance. */ #define LI_NORELEASE 0x00020000 /* Lock not allowed to be released. */ #define LI_SLEEPABLE 0x00040000 /* Lock may be held while sleeping. */ #ifndef WITNESS_COUNT #define WITNESS_COUNT 1536 #endif #define WITNESS_HASH_SIZE 251 /* Prime, gives load factor < 2 */ #define WITNESS_PENDLIST (512 + (MAXCPU * 4)) /* Allocate 256 KB of stack data space */ #define WITNESS_LO_DATA_COUNT 2048 /* Prime, gives load factor of ~2 at full load */ #define WITNESS_LO_HASH_SIZE 1021 /* * XXX: This is somewhat bogus, as we assume here that at most 2048 threads * will hold LOCK_NCHILDREN locks. We handle failure ok, and we should * probably be safe for the most part, but it's still a SWAG. */ #define LOCK_NCHILDREN 5 #define LOCK_CHILDCOUNT 2048 #define MAX_W_NAME 64 #define FULLGRAPH_SBUF_SIZE 512 /* * These flags go in the witness relationship matrix and describe the * relationship between any two struct witness objects. */ #define WITNESS_UNRELATED 0x00 /* No lock order relation. 
*/ #define WITNESS_PARENT 0x01 /* Parent, aka direct ancestor. */ #define WITNESS_ANCESTOR 0x02 /* Direct or indirect ancestor. */ #define WITNESS_CHILD 0x04 /* Child, aka direct descendant. */ #define WITNESS_DESCENDANT 0x08 /* Direct or indirect descendant. */ #define WITNESS_ANCESTOR_MASK (WITNESS_PARENT | WITNESS_ANCESTOR) #define WITNESS_DESCENDANT_MASK (WITNESS_CHILD | WITNESS_DESCENDANT) #define WITNESS_RELATED_MASK \ (WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK) #define WITNESS_REVERSAL 0x10 /* A lock order reversal has been * observed. */ #define WITNESS_RESERVED1 0x20 /* Unused flag, reserved. */ #define WITNESS_RESERVED2 0x40 /* Unused flag, reserved. */ #define WITNESS_LOCK_ORDER_KNOWN 0x80 /* This lock order is known. */ /* Descendant to ancestor flags */ #define WITNESS_DTOA(x) (((x) & WITNESS_RELATED_MASK) >> 2) /* Ancestor to descendant flags */ #define WITNESS_ATOD(x) (((x) & WITNESS_RELATED_MASK) << 2) #define WITNESS_INDEX_ASSERT(i) \ MPASS((i) > 0 && (i) <= w_max_used_index && (i) < witness_count) static MALLOC_DEFINE(M_WITNESS, "Witness", "Witness"); /* * Lock instances. A lock instance is the data associated with a lock while * it is held by witness. For example, a lock instance will hold the * recursion count of a lock. Lock instances are held in lists. Spin locks * are held in a per-cpu list while sleep locks are held in per-thread list. */ struct lock_instance { struct lock_object *li_lock; const char *li_file; int li_line; u_int li_flags; }; /* * A simple list type used to build the list of locks held by a thread * or CPU. We can't simply embed the list in struct lock_object since a * lock may be held by more than one thread if it is a shared lock. Locks * are added to the head of the list, so we fill up each list entry from * "the back" logically. To ease some of the arithmetic, we actually fill * in each list entry the normal way (children[0] then children[1], etc.) but * when we traverse the list we read children[count-1] as the first entry * down to children[0] as the final entry. */ struct lock_list_entry { struct lock_list_entry *ll_next; struct lock_instance ll_children[LOCK_NCHILDREN]; u_int ll_count; }; /* * The main witness structure. One of these per named lock type in the system * (for example, "vnode interlock"). */ struct witness { char w_name[MAX_W_NAME]; uint32_t w_index; /* Index in the relationship matrix */ struct lock_class *w_class; STAILQ_ENTRY(witness) w_list; /* List of all witnesses. */ STAILQ_ENTRY(witness) w_typelist; /* Witnesses of a type. */ struct witness *w_hash_next; /* Linked list in hash buckets. */ const char *w_file; /* File where last acquired */ uint32_t w_line; /* Line where last acquired */ uint32_t w_refcount; uint16_t w_num_ancestors; /* direct/indirect * ancestor count */ uint16_t w_num_descendants; /* direct/indirect * descendant count */ int16_t w_ddb_level; unsigned w_displayed:1; unsigned w_reversed:1; }; STAILQ_HEAD(witness_list, witness); /* * The witness hash table. Keys are witness names (const char *), elements are * witness objects (struct witness *). */ struct witness_hash { struct witness *wh_array[WITNESS_HASH_SIZE]; uint32_t wh_size; uint32_t wh_count; }; /* * Key type for the lock order data hash table. */ struct witness_lock_order_key { uint16_t from; uint16_t to; }; struct witness_lock_order_data { struct stack wlod_stack; struct witness_lock_order_key wlod_key; struct witness_lock_order_data *wlod_next; }; /* * The witness lock order data hash table. 
Keys are witness index tuples * (struct witness_lock_order_key), elements are lock order data objects * (struct witness_lock_order_data). */ struct witness_lock_order_hash { struct witness_lock_order_data *wloh_array[WITNESS_LO_HASH_SIZE]; u_int wloh_size; u_int wloh_count; }; struct witness_blessed { const char *b_lock1; const char *b_lock2; }; struct witness_pendhelp { const char *wh_type; struct lock_object *wh_lock; }; struct witness_order_list_entry { const char *w_name; struct lock_class *w_class; }; /* * Returns 0 if one of the locks is a spin lock and the other is not. * Returns 1 otherwise. */ static __inline int witness_lock_type_equal(struct witness *w1, struct witness *w2) { return ((w1->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) == (w2->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK))); } static __inline int witness_lock_order_key_equal(const struct witness_lock_order_key *a, const struct witness_lock_order_key *b) { return (a->from == b->from && a->to == b->to); } static int _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname); static void adopt(struct witness *parent, struct witness *child); static int blessed(struct witness *, struct witness *); static void depart(struct witness *w); static struct witness *enroll(const char *description, struct lock_class *lock_class); static struct lock_instance *find_instance(struct lock_list_entry *list, const struct lock_object *lock); static int isitmychild(struct witness *parent, struct witness *child); static int isitmydescendant(struct witness *parent, struct witness *child); static void itismychild(struct witness *parent, struct witness *child); static int sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS); static int sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS); static int sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS); static int sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS); static void witness_add_fullgraph(struct sbuf *sb, struct witness *parent); #ifdef DDB static void witness_ddb_compute_levels(void); static void witness_ddb_display(int(*)(const char *fmt, ...)); static void witness_ddb_display_descendants(int(*)(const char *fmt, ...), struct witness *, int indent); static void witness_ddb_display_list(int(*prnt)(const char *fmt, ...), struct witness_list *list); static void witness_ddb_level_descendants(struct witness *parent, int l); static void witness_ddb_list(struct thread *td); #endif static void witness_enter_debugger(const char *msg); static void witness_debugger(int cond, const char *msg); static void witness_free(struct witness *m); static struct witness *witness_get(void); static uint32_t witness_hash_djb2(const uint8_t *key, uint32_t size); static struct witness *witness_hash_get(const char *key); static void witness_hash_put(struct witness *w); static void witness_init_hash_tables(void); static void witness_increment_graph_generation(void); static void witness_lock_list_free(struct lock_list_entry *lle); static struct lock_list_entry *witness_lock_list_get(void); static int witness_lock_order_add(struct witness *parent, struct witness *child); static int witness_lock_order_check(struct witness *parent, struct witness *child); static struct witness_lock_order_data *witness_lock_order_get( struct witness *parent, struct witness *child); static void witness_list_lock(struct lock_instance *instance, int (*prnt)(const char *fmt, ...)); static int witness_output(const char *fmt, ...) 
__printflike(1, 2); static int witness_output_drain(void *arg __unused, const char *data, int len); static int witness_voutput(const char *fmt, va_list ap) __printflike(1, 0); static void witness_setflag(struct lock_object *lock, int flag, int set); FEATURE(witness, "kernel has witness(9) support"); static SYSCTL_NODE(_debug, OID_AUTO, witness, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Witness Locking"); /* * If set to 0, lock order checking is disabled. If set to -1, * witness is completely disabled. Otherwise witness performs full * lock order checking for all locks. At runtime, lock order checking * may be toggled. However, witness cannot be reenabled once it is * completely disabled. */ static int witness_watch = 1; SYSCTL_PROC(_debug_witness, OID_AUTO, watch, CTLFLAG_RWTUN | CTLTYPE_INT | CTLFLAG_MPSAFE, NULL, 0, sysctl_debug_witness_watch, "I", "witness is watching lock operations"); #ifdef KDB /* * When KDB is enabled and witness_kdb is 1, it will cause the system * to drop into kdebug() when: * - a lock hierarchy violation occurs * - locks are held when going to sleep. */ #ifdef WITNESS_KDB int witness_kdb = 1; #else int witness_kdb = 0; #endif SYSCTL_INT(_debug_witness, OID_AUTO, kdb, CTLFLAG_RWTUN, &witness_kdb, 0, ""); #endif /* KDB */ #if defined(DDB) || defined(KDB) /* * When DDB or KDB is enabled and witness_trace is 1, it will cause the system * to print a stack trace: * - a lock hierarchy violation occurs * - locks are held when going to sleep. */ int witness_trace = 1; SYSCTL_INT(_debug_witness, OID_AUTO, trace, CTLFLAG_RWTUN, &witness_trace, 0, ""); #endif /* DDB || KDB */ #ifdef WITNESS_SKIPSPIN int witness_skipspin = 1; #else int witness_skipspin = 0; #endif SYSCTL_INT(_debug_witness, OID_AUTO, skipspin, CTLFLAG_RDTUN, &witness_skipspin, 0, ""); int badstack_sbuf_size; int witness_count = WITNESS_COUNT; SYSCTL_INT(_debug_witness, OID_AUTO, witness_count, CTLFLAG_RDTUN, &witness_count, 0, ""); /* * Output channel for witness messages. By default we print to the console. */ enum witness_channel { WITNESS_CONSOLE, WITNESS_LOG, WITNESS_NONE, }; static enum witness_channel witness_channel = WITNESS_CONSOLE; SYSCTL_PROC(_debug_witness, OID_AUTO, output_channel, CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 0, sysctl_debug_witness_channel, "A", "Output channel for warnings"); /* * Call this to print out the relations between locks. */ SYSCTL_PROC(_debug_witness, OID_AUTO, fullgraph, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_debug_witness_fullgraph, "A", "Show locks relation graphs"); /* * Call this to print out the witness faulty stacks. 
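 * Reached via sysctl(8), e.g. `sysctl debug.witness.badstacks' (and
 * `sysctl debug.witness.fullgraph' for the relation graph above).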
*/ SYSCTL_PROC(_debug_witness, OID_AUTO, badstacks, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_debug_witness_badstacks, "A", "Show bad witness stacks"); static struct mtx w_mtx; /* w_list */ static struct witness_list w_free = STAILQ_HEAD_INITIALIZER(w_free); static struct witness_list w_all = STAILQ_HEAD_INITIALIZER(w_all); /* w_typelist */ static struct witness_list w_spin = STAILQ_HEAD_INITIALIZER(w_spin); static struct witness_list w_sleep = STAILQ_HEAD_INITIALIZER(w_sleep); /* lock list */ static struct lock_list_entry *w_lock_list_free = NULL; static struct witness_pendhelp pending_locks[WITNESS_PENDLIST]; static u_int pending_cnt; static int w_free_cnt, w_spin_cnt, w_sleep_cnt; SYSCTL_INT(_debug_witness, OID_AUTO, free_cnt, CTLFLAG_RD, &w_free_cnt, 0, ""); SYSCTL_INT(_debug_witness, OID_AUTO, spin_cnt, CTLFLAG_RD, &w_spin_cnt, 0, ""); SYSCTL_INT(_debug_witness, OID_AUTO, sleep_cnt, CTLFLAG_RD, &w_sleep_cnt, 0, ""); static struct witness *w_data; static uint8_t **w_rmatrix; static struct lock_list_entry w_locklistdata[LOCK_CHILDCOUNT]; static struct witness_hash w_hash; /* The witness hash table. */ /* The lock order data hash */ static struct witness_lock_order_data w_lodata[WITNESS_LO_DATA_COUNT]; static struct witness_lock_order_data *w_lofree = NULL; static struct witness_lock_order_hash w_lohash; static int w_max_used_index = 0; static unsigned int w_generation = 0; static const char w_notrunning[] = "Witness not running\n"; static const char w_stillcold[] = "Witness is still cold\n"; #ifdef __i386__ static const char w_notallowed[] = "The sysctl is disabled on the arch\n"; #endif static struct witness_order_list_entry order_lists[] = { /* * sx locks */ { "proctree", &lock_class_sx }, { "allproc", &lock_class_sx }, { "allprison", &lock_class_sx }, { NULL, NULL }, /* * Various mutexes */ { "Giant", &lock_class_mtx_sleep }, { "pipe mutex", &lock_class_mtx_sleep }, { "sigio lock", &lock_class_mtx_sleep }, { "process group", &lock_class_mtx_sleep }, #ifdef HWPMC_HOOKS { "pmc-sleep", &lock_class_mtx_sleep }, #endif { "process lock", &lock_class_mtx_sleep }, { "session", &lock_class_mtx_sleep }, { "uidinfo hash", &lock_class_rw }, { "time lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * umtx */ { "umtx lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * Sockets */ { "accept", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { "so_rcv", &lock_class_mtx_sleep }, { "sellck", &lock_class_mtx_sleep }, { NULL, NULL }, /* * Routing */ { "so_rcv", &lock_class_mtx_sleep }, { "radix node head", &lock_class_rm }, { "ifaddr", &lock_class_mtx_sleep }, { NULL, NULL }, /* * IPv4 multicast: * protocol locks before interface locks, after UDP locks. */ { "in_multi_sx", &lock_class_sx }, { "udpinp", &lock_class_rw }, { "in_multi_list_mtx", &lock_class_mtx_sleep }, { "igmp_mtx", &lock_class_mtx_sleep }, { "if_addr_lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * IPv6 multicast: * protocol locks before interface locks, after UDP locks. 
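 * As with every group in this table, the entries below are listed in
 * required acquisition order: witness_startup() enrolls each entry as a
 * child of the one before it.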
*/ { "in6_multi_sx", &lock_class_sx }, { "udpinp", &lock_class_rw }, { "in6_multi_list_mtx", &lock_class_mtx_sleep }, { "mld_mtx", &lock_class_mtx_sleep }, { "if_addr_lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * UNIX Domain Sockets */ { "unp_link_rwlock", &lock_class_rw }, { "unp_list_lock", &lock_class_mtx_sleep }, { "unp", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * UDP/IP */ { "udpinp", &lock_class_rw }, { "udp", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * TCP/IP */ { "tcpinp", &lock_class_rw }, { "tcp", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * BPF */ { "bpf global lock", &lock_class_sx }, { "bpf cdev lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * NFS server */ { "nfsd_mtx", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * IEEE 802.11 */ { "802.11 com lock", &lock_class_mtx_sleep}, { NULL, NULL }, /* * Network drivers */ { "network driver", &lock_class_mtx_sleep}, { NULL, NULL }, /* * Netgraph */ { "ng_node", &lock_class_mtx_sleep }, { "ng_worklist", &lock_class_mtx_sleep }, { NULL, NULL }, /* * CDEV */ { "vm map (system)", &lock_class_mtx_sleep }, { "vnode interlock", &lock_class_mtx_sleep }, { "cdev", &lock_class_mtx_sleep }, { "devthrd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * VM */ { "vm map (user)", &lock_class_sx }, { "vm object", &lock_class_rw }, { "vm page", &lock_class_mtx_sleep }, { "pmap pv global", &lock_class_rw }, { "pmap", &lock_class_mtx_sleep }, { "pmap pv list", &lock_class_rw }, { "vm page free queue", &lock_class_mtx_sleep }, { "vm pagequeue", &lock_class_mtx_sleep }, { NULL, NULL }, /* * kqueue/VFS interaction */ { "kqueue", &lock_class_mtx_sleep }, { "struct mount mtx", &lock_class_mtx_sleep }, { "vnode interlock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * VFS namecache */ { "ncvn", &lock_class_mtx_sleep }, { "ncbuc", &lock_class_mtx_sleep }, { "vnode interlock", &lock_class_mtx_sleep }, { "ncneg", &lock_class_mtx_sleep }, { NULL, NULL }, /* * ZFS locking */ { "dn->dn_mtx", &lock_class_sx }, { "dr->dt.di.dr_mtx", &lock_class_sx }, { "db->db_mtx", &lock_class_sx }, { NULL, NULL }, /* * TCP log locks */ { "TCP ID tree", &lock_class_rw }, { "tcp log id bucket", &lock_class_mtx_sleep }, { "tcpinp", &lock_class_rw }, { "TCP log expireq", &lock_class_mtx_sleep }, { NULL, NULL }, /* * spin locks */ #ifdef SMP { "ap boot", &lock_class_mtx_spin }, #endif { "rm.mutex_mtx", &lock_class_mtx_spin }, #ifdef __i386__ { "cy", &lock_class_mtx_spin }, #endif { "scc_hwmtx", &lock_class_mtx_spin }, { "uart_hwmtx", &lock_class_mtx_spin }, { "fast_taskqueue", &lock_class_mtx_spin }, { "intr table", &lock_class_mtx_spin }, { "process slock", &lock_class_mtx_spin }, { "syscons video lock", &lock_class_mtx_spin }, { "sleepq chain", &lock_class_mtx_spin }, { "rm_spinlock", &lock_class_mtx_spin }, { "turnstile chain", &lock_class_mtx_spin }, { "turnstile lock", &lock_class_mtx_spin }, { "sched lock", &lock_class_mtx_spin }, { "td_contested", &lock_class_mtx_spin }, { "callout", &lock_class_mtx_spin }, { "entropy harvest mutex", &lock_class_mtx_spin }, #ifdef SMP { "smp rendezvous", &lock_class_mtx_spin }, #endif #ifdef __powerpc__ { "tlb0", &lock_class_mtx_spin }, #endif { NULL, NULL }, { "sched lock", &lock_class_mtx_spin }, #ifdef HWPMC_HOOKS { "pmc-per-proc", &lock_class_mtx_spin }, #endif { NULL, NULL }, /* * leaf locks */ { "intrcnt", &lock_class_mtx_spin }, { "icu", &lock_class_mtx_spin }, #ifdef __i386__ { 
"allpmaps", &lock_class_mtx_spin }, { "descriptor tables", &lock_class_mtx_spin }, #endif { "clk", &lock_class_mtx_spin }, { "cpuset", &lock_class_mtx_spin }, { "mprof lock", &lock_class_mtx_spin }, { "zombie lock", &lock_class_mtx_spin }, { "ALD Queue", &lock_class_mtx_spin }, #if defined(__i386__) || defined(__amd64__) { "pcicfg", &lock_class_mtx_spin }, { "NDIS thread lock", &lock_class_mtx_spin }, #endif { "tw_osl_io_lock", &lock_class_mtx_spin }, { "tw_osl_q_lock", &lock_class_mtx_spin }, { "tw_cl_io_lock", &lock_class_mtx_spin }, { "tw_cl_intr_lock", &lock_class_mtx_spin }, { "tw_cl_gen_lock", &lock_class_mtx_spin }, #ifdef HWPMC_HOOKS { "pmc-leaf", &lock_class_mtx_spin }, #endif { "blocked lock", &lock_class_mtx_spin }, { NULL, NULL }, { NULL, NULL } }; /* * Pairs of locks which have been blessed. Witness does not complain about * order problems with blessed lock pairs. Please do not add an entry to the * table without an explanatory comment. */ static struct witness_blessed blessed_list[] = { /* * See the comment in ufs_dirhash.c. Basically, a vnode lock serializes * both lock orders, so a deadlock cannot happen as a result of this * LOR. */ { "dirhash", "bufwait" }, /* * A UFS vnode may be locked in vget() while a buffer belonging to the * parent directory vnode is locked. */ { "ufs", "bufwait" }, /* * The tarfs decompression stream vnode may be locked while a * buffer belonging to a tarfs data vnode is locked. */ { "tarfs", "bufwait" }, }; /* * This global is set to 0 once it becomes safe to use the witness code. */ static int witness_cold = 1; /* * This global is set to 1 once the static lock orders have been enrolled * so that a warning can be issued for any spin locks enrolled later. */ static int witness_spin_warn = 0; /* Trim useless garbage from filenames. */ static const char * fixup_filename(const char *file) { if (file == NULL) return (NULL); while (strncmp(file, "../", 3) == 0) file += 3; return (file); } /* * Calculate the size of early witness structures. */ int witness_startup_count(void) { int sz; sz = sizeof(struct witness) * witness_count; sz += sizeof(*w_rmatrix) * (witness_count + 1); sz += sizeof(*w_rmatrix[0]) * (witness_count + 1) * (witness_count + 1); return (sz); } /* * The WITNESS-enabled diagnostic code. Note that the witness code does * assume that the early boot is single-threaded at least until after this * routine is completed. */ void witness_startup(void *mem) { struct lock_object *lock; struct witness_order_list_entry *order; struct witness *w, *w1; uintptr_t p; int i; p = (uintptr_t)mem; w_data = (void *)p; p += sizeof(struct witness) * witness_count; w_rmatrix = (void *)p; p += sizeof(*w_rmatrix) * (witness_count + 1); for (i = 0; i < witness_count + 1; i++) { w_rmatrix[i] = (void *)p; p += sizeof(*w_rmatrix[i]) * (witness_count + 1); } badstack_sbuf_size = witness_count * 256; /* * We have to release Giant before initializing its witness * structure so that WITNESS doesn't get confused. */ mtx_unlock(&Giant); mtx_assert(&Giant, MA_NOTOWNED); CTR1(KTR_WITNESS, "%s: initializing witness", __func__); mtx_init(&w_mtx, "witness lock", NULL, MTX_SPIN | MTX_QUIET | MTX_NOWITNESS | MTX_NOPROFILE); for (i = witness_count - 1; i >= 0; i--) { w = &w_data[i]; memset(w, 0, sizeof(*w)); w_data[i].w_index = i; /* Witness index never changes. */ witness_free(w); } KASSERT(STAILQ_FIRST(&w_free)->w_index == 0, ("%s: Invalid list of free witness objects", __func__)); /* Witness with index 0 is not used to aid in debugging. 
*/ STAILQ_REMOVE_HEAD(&w_free, w_list); w_free_cnt--; for (i = 0; i < witness_count; i++) { memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) * (witness_count + 1)); } for (i = 0; i < LOCK_CHILDCOUNT; i++) witness_lock_list_free(&w_locklistdata[i]); witness_init_hash_tables(); /* First add in all the specified order lists. */ for (order = order_lists; order->w_name != NULL; order++) { w = enroll(order->w_name, order->w_class); if (w == NULL) continue; w->w_file = "order list"; for (order++; order->w_name != NULL; order++) { w1 = enroll(order->w_name, order->w_class); if (w1 == NULL) continue; w1->w_file = "order list"; itismychild(w, w1); w = w1; } } witness_spin_warn = 1; /* Iterate through all locks and add them to witness. */ for (i = 0; pending_locks[i].wh_lock != NULL; i++) { lock = pending_locks[i].wh_lock; KASSERT(lock->lo_flags & LO_WITNESS, ("%s: lock %s is on pending list but not LO_WITNESS", __func__, lock->lo_name)); lock->lo_witness = enroll(pending_locks[i].wh_type, LOCK_CLASS(lock)); } /* Mark the witness code as being ready for use. */ witness_cold = 0; mtx_lock(&Giant); } void witness_init(struct lock_object *lock, const char *type) { struct lock_class *class; /* Various sanity checks. */ class = LOCK_CLASS(lock); if ((lock->lo_flags & LO_RECURSABLE) != 0 && (class->lc_flags & LC_RECURSABLE) == 0) kassert_panic("%s: lock (%s) %s can not be recursable", __func__, class->lc_name, lock->lo_name); if ((lock->lo_flags & LO_SLEEPABLE) != 0 && (class->lc_flags & LC_SLEEPABLE) == 0) kassert_panic("%s: lock (%s) %s can not be sleepable", __func__, class->lc_name, lock->lo_name); if ((lock->lo_flags & LO_UPGRADABLE) != 0 && (class->lc_flags & LC_UPGRADABLE) == 0) kassert_panic("%s: lock (%s) %s can not be upgradable", __func__, class->lc_name, lock->lo_name); /* * If we shouldn't watch this lock, then just clear lo_witness. * Otherwise, if witness_cold is set, then it is too early to * enroll this lock, so defer it to witness_initialize() by adding * it to the pending_locks list. If it is not too early, then enroll * the lock now. */ if (witness_watch < 1 || KERNEL_PANICKED() || (lock->lo_flags & LO_WITNESS) == 0) lock->lo_witness = NULL; else if (witness_cold) { pending_locks[pending_cnt].wh_lock = lock; pending_locks[pending_cnt++].wh_type = type; if (pending_cnt > WITNESS_PENDLIST) panic("%s: pending locks list is too small, " "increase WITNESS_PENDLIST\n", __func__); } else lock->lo_witness = enroll(type, class); } void witness_destroy(struct lock_object *lock) { struct lock_class *class; struct witness *w; class = LOCK_CLASS(lock); if (witness_cold) panic("lock (%s) %s destroyed while witness_cold", class->lc_name, lock->lo_name); /* XXX: need to verify that no one holds the lock */ if ((lock->lo_flags & LO_WITNESS) == 0 || lock->lo_witness == NULL) return; w = lock->lo_witness; mtx_lock_spin(&w_mtx); MPASS(w->w_refcount > 0); w->w_refcount--; if (w->w_refcount == 0) depart(w); mtx_unlock_spin(&w_mtx); } #ifdef DDB static void witness_ddb_compute_levels(void) { struct witness *w; /* * First clear all levels. */ STAILQ_FOREACH(w, &w_all, w_list) w->w_ddb_level = -1; /* * Look for locks with no parents and level all their descendants. */ STAILQ_FOREACH(w, &w_all, w_list) { /* If the witness has ancestors (is not a root), skip it. 
*/ if (w->w_num_ancestors > 0) continue; witness_ddb_level_descendants(w, 0); } } static void witness_ddb_level_descendants(struct witness *w, int l) { int i; if (w->w_ddb_level >= l) return; w->w_ddb_level = l; l++; for (i = 1; i <= w_max_used_index; i++) { if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) witness_ddb_level_descendants(&w_data[i], l); } } static void witness_ddb_display_descendants(int(*prnt)(const char *fmt, ...), struct witness *w, int indent) { int i; for (i = 0; i < indent; i++) prnt(" "); prnt("%s (type: %s, depth: %d, active refs: %d)", w->w_name, w->w_class->lc_name, w->w_ddb_level, w->w_refcount); if (w->w_displayed) { prnt(" -- (already displayed)\n"); return; } w->w_displayed = 1; if (w->w_file != NULL && w->w_line != 0) prnt(" -- last acquired @ %s:%d\n", fixup_filename(w->w_file), w->w_line); else prnt(" -- never acquired\n"); indent++; WITNESS_INDEX_ASSERT(w->w_index); for (i = 1; i <= w_max_used_index; i++) { if (db_pager_quit) return; if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) witness_ddb_display_descendants(prnt, &w_data[i], indent); } } static void witness_ddb_display_list(int(*prnt)(const char *fmt, ...), struct witness_list *list) { struct witness *w; STAILQ_FOREACH(w, list, w_typelist) { if (w->w_file == NULL || w->w_ddb_level > 0) continue; /* This lock has no anscestors - display its descendants. */ witness_ddb_display_descendants(prnt, w, 0); if (db_pager_quit) return; } } static void witness_ddb_display(int(*prnt)(const char *fmt, ...)) { struct witness *w; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); witness_ddb_compute_levels(); /* Clear all the displayed flags. */ STAILQ_FOREACH(w, &w_all, w_list) w->w_displayed = 0; /* * First, handle sleep locks which have been acquired at least * once. */ prnt("Sleep locks:\n"); witness_ddb_display_list(prnt, &w_sleep); if (db_pager_quit) return; /* * Now do spin locks which have been acquired at least once. */ prnt("\nSpin locks:\n"); witness_ddb_display_list(prnt, &w_spin); if (db_pager_quit) return; /* * Finally, any locks which have not been acquired yet. */ prnt("\nLocks which were never acquired:\n"); STAILQ_FOREACH(w, &w_all, w_list) { if (w->w_file != NULL || w->w_refcount == 0) continue; prnt("%s (type: %s, depth: %d)\n", w->w_name, w->w_class->lc_name, w->w_ddb_level); if (db_pager_quit) return; } } #endif /* DDB */ int witness_defineorder(struct lock_object *lock1, struct lock_object *lock2) { if (witness_watch == -1 || KERNEL_PANICKED()) return (0); /* Require locks that witness knows about. */ if (lock1 == NULL || lock1->lo_witness == NULL || lock2 == NULL || lock2->lo_witness == NULL) return (EINVAL); mtx_assert(&w_mtx, MA_NOTOWNED); mtx_lock_spin(&w_mtx); /* * If we already have either an explicit or implied lock order that * is the other way around, then return an error. */ if (witness_watch && isitmydescendant(lock2->lo_witness, lock1->lo_witness)) { mtx_unlock_spin(&w_mtx); return (EDOOFUS); } /* Try to add the new order. 
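 * For example (lock names illustrative), witness_defineorder(&a->lock_object,
 * &b->lock_object) records that 'a' must always be acquired before 'b'.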
*/ CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__, lock2->lo_witness->w_name, lock1->lo_witness->w_name); itismychild(lock1->lo_witness, lock2->lo_witness); mtx_unlock_spin(&w_mtx); return (0); } void witness_checkorder(struct lock_object *lock, int flags, const char *file, int line, struct lock_object *interlock) { struct lock_list_entry *lock_list, *lle; struct lock_instance *lock1, *lock2, *plock; struct lock_class *class, *iclass; struct witness *w, *w1; struct thread *td; int i, j; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL || KERNEL_PANICKED()) return; w = lock->lo_witness; class = LOCK_CLASS(lock); td = curthread; if (class->lc_flags & LC_SLEEPLOCK) { /* * Since spin locks include a critical section, this check * implicitly enforces a lock order of all sleep locks before * all spin locks. */ if (td->td_critnest != 0 && !kdb_active) kassert_panic("acquiring blockable sleep lock with " "spinlock or critical section held (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); /* * If this is the first lock acquired then just return as * no order checking is needed. */ lock_list = td->td_sleeplocks; if (lock_list == NULL || lock_list->ll_count == 0) return; } else { /* * If this is the first lock, just return as no order * checking is needed. Avoid problems with thread * migration pinning the thread while checking if * spinlocks are held. If at least one spinlock is held * the thread is in a safe path and it is allowed to * unpin it. */ sched_pin(); lock_list = PCPU_GET(spinlocks); if (lock_list == NULL || lock_list->ll_count == 0) { sched_unpin(); return; } sched_unpin(); } /* * Check to see if we are recursing on a lock we already own. If * so, make sure that we don't mismatch exclusive and shared lock * acquires. */ lock1 = find_instance(lock_list, lock); if (lock1 != NULL) { if ((lock1->li_flags & LI_EXCLUSIVE) != 0 && (flags & LOP_EXCLUSIVE) == 0) { witness_output("shared lock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while exclusively locked from %s:%d\n", fixup_filename(lock1->li_file), lock1->li_line); kassert_panic("excl->share"); } if ((lock1->li_flags & LI_EXCLUSIVE) == 0 && (flags & LOP_EXCLUSIVE) != 0) { witness_output("exclusive lock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while share locked from %s:%d\n", fixup_filename(lock1->li_file), lock1->li_line); kassert_panic("share->excl"); } return; } /* Warn if the interlock is not locked exactly once. */ if (interlock != NULL) { iclass = LOCK_CLASS(interlock); lock1 = find_instance(lock_list, interlock); if (lock1 == NULL) kassert_panic("interlock (%s) %s not locked @ %s:%d", iclass->lc_name, interlock->lo_name, fixup_filename(file), line); else if ((lock1->li_flags & LI_RECURSEMASK) != 0) kassert_panic("interlock (%s) %s recursed @ %s:%d", iclass->lc_name, interlock->lo_name, fixup_filename(file), line); } /* * Find the previously acquired lock, but ignore interlocks. */ plock = &lock_list->ll_children[lock_list->ll_count - 1]; if (interlock != NULL && plock->li_lock == interlock) { if (lock_list->ll_count > 1) plock = &lock_list->ll_children[lock_list->ll_count - 2]; else { lle = lock_list->ll_next; /* * The interlock is the only lock we hold, so * simply return. */ if (lle == NULL) return; plock = &lle->ll_children[lle->ll_count - 1]; } } /* * Try to perform most checks without a lock. 
If this succeeds we * can skip acquiring the lock and return success. Otherwise we redo * the check with the lock held to handle races with concurrent updates. */ w1 = plock->li_lock->lo_witness; if (witness_lock_order_check(w1, w)) return; mtx_lock_spin(&w_mtx); if (witness_lock_order_check(w1, w)) { mtx_unlock_spin(&w_mtx); return; } witness_lock_order_add(w1, w); /* * Check for duplicate locks of the same type. Note that we only * have to check for this on the last lock we just acquired. Any * other cases will be caught as lock order violations. */ if (w1 == w) { i = w->w_index; if (!(lock->lo_flags & LO_DUPOK) && !(flags & LOP_DUPOK) && !(w_rmatrix[i][i] & WITNESS_REVERSAL)) { w_rmatrix[i][i] |= WITNESS_REVERSAL; w->w_reversed = 1; mtx_unlock_spin(&w_mtx); witness_output( "acquiring duplicate lock of same type: \"%s\"\n", w->w_name); witness_output(" 1st %s @ %s:%d\n", plock->li_lock->lo_name, fixup_filename(plock->li_file), plock->li_line); witness_output(" 2nd %s @ %s:%d\n", lock->lo_name, fixup_filename(file), line); witness_debugger(1, __func__); } else mtx_unlock_spin(&w_mtx); return; } mtx_assert(&w_mtx, MA_OWNED); /* * If we know that the lock we are acquiring comes after * the lock we most recently acquired in the lock order tree, * then there is no need for any further checks. */ if (isitmychild(w1, w)) goto out; for (j = 0, lle = lock_list; lle != NULL; lle = lle->ll_next) { for (i = lle->ll_count - 1; i >= 0; i--, j++) { struct stack pstack; bool pstackv, trace; MPASS(j < LOCK_CHILDCOUNT * LOCK_NCHILDREN); lock1 = &lle->ll_children[i]; /* * Ignore the interlock. */ if (interlock == lock1->li_lock) continue; /* * If this lock doesn't undergo witness checking, * then skip it. */ w1 = lock1->li_lock->lo_witness; if (w1 == NULL) { KASSERT((lock1->li_lock->lo_flags & LO_WITNESS) == 0, ("lock missing witness structure")); continue; } /* * If we are locking Giant and this is a sleepable * lock, then skip it. */ if ((lock1->li_flags & LI_SLEEPABLE) != 0 && lock == &Giant.lock_object) continue; /* * If we are locking a sleepable lock and this lock * is Giant, then skip it. */ if ((lock->lo_flags & LO_SLEEPABLE) != 0 && (flags & LOP_NOSLEEP) == 0 && lock1->li_lock == &Giant.lock_object) continue; /* * If we are locking a sleepable lock and this lock * isn't sleepable, we want to treat it as a lock * order violation to enfore a general lock order of * sleepable locks before non-sleepable locks. */ if ((lock->lo_flags & LO_SLEEPABLE) != 0 && (flags & LOP_NOSLEEP) == 0 && (lock1->li_flags & LI_SLEEPABLE) == 0) goto reversal; /* * If we are locking Giant and this is a non-sleepable * lock, then treat it as a reversal. */ if ((lock1->li_flags & LI_SLEEPABLE) == 0 && lock == &Giant.lock_object) goto reversal; /* * Check the lock order hierarchy for a reveresal. */ if (!isitmydescendant(w, w1)) continue; reversal: /* * We have a lock order violation, check to see if it * is allowed or has already been yelled about. */ /* Bail if this violation is known */ if (w_rmatrix[w1->w_index][w->w_index] & WITNESS_REVERSAL) goto out; /* Record this as a violation */ w_rmatrix[w1->w_index][w->w_index] |= WITNESS_REVERSAL; w_rmatrix[w->w_index][w1->w_index] |= WITNESS_REVERSAL; w->w_reversed = w1->w_reversed = 1; witness_increment_graph_generation(); /* * If the lock order is blessed, bail before logging * anything. We don't look for other lock order * violations though, which may be a bug. 
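/*
 * Editor's note: an illustrative userland sketch of the pattern described
 * at the top of this block: try a cheap check without the lock, and only
 * if it fails take the lock, re-check (another thread may have recorded
 * the order meanwhile), and then do the expensive work.  The cache, its
 * size, and check_order() are invented for the example; they are not
 * WITNESS interfaces.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

#define CACHE_SIZE 64

static pthread_mutex_t cache_mtx = PTHREAD_MUTEX_INITIALIZER;
static atomic_bool cache_known[CACHE_SIZE];

/* Lockless read; a stale "false" is harmless, it only costs the slow path. */
static bool
order_is_known(unsigned key)
{
	return (atomic_load(&cache_known[key % CACHE_SIZE]));
}

void
check_order(unsigned key)
{
	if (order_is_known(key))		/* fast path, no lock taken */
		return;
	pthread_mutex_lock(&cache_mtx);
	if (order_is_known(key)) {		/* re-check under the lock */
		pthread_mutex_unlock(&cache_mtx);
		return;
	}
	/* Slow path: run the full checks, then record the result. */
	atomic_store(&cache_known[key % CACHE_SIZE], true);
	pthread_mutex_unlock(&cache_mtx);
}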
*/ if (blessed(w, w1)) goto out; trace = atomic_load_int(&witness_trace); if (trace) { struct witness_lock_order_data *data; pstackv = false; data = witness_lock_order_get(w, w1); if (data != NULL) { stack_copy(&data->wlod_stack, &pstack); pstackv = true; } } mtx_unlock_spin(&w_mtx); #ifdef WITNESS_NO_VNODE /* * There are known LORs between VNODE locks. They are * not an indication of a bug. VNODE locks are flagged * as such (LO_IS_VNODE) and we don't yell if the LOR * is between 2 VNODE locks. */ if ((lock->lo_flags & LO_IS_VNODE) != 0 && (lock1->li_lock->lo_flags & LO_IS_VNODE) != 0) return; #endif /* * Ok, yell about it. */ if ((lock->lo_flags & LO_SLEEPABLE) != 0 && (flags & LOP_NOSLEEP) == 0 && (lock1->li_flags & LI_SLEEPABLE) == 0) witness_output( "lock order reversal: (sleepable after non-sleepable)\n"); else if ((lock1->li_flags & LI_SLEEPABLE) == 0 && lock == &Giant.lock_object) witness_output( "lock order reversal: (Giant after non-sleepable)\n"); else witness_output("lock order reversal:\n"); /* * Try to locate an earlier lock with * witness w in our list. */ do { lock2 = &lle->ll_children[i]; MPASS(lock2->li_lock != NULL); if (lock2->li_lock->lo_witness == w) break; if (i == 0 && lle->ll_next != NULL) { lle = lle->ll_next; i = lle->ll_count - 1; MPASS(i >= 0 && i < LOCK_NCHILDREN); } else i--; } while (i >= 0); if (i < 0) { witness_output(" 1st %p %s (%s, %s) @ %s:%d\n", lock1->li_lock, lock1->li_lock->lo_name, w1->w_name, w1->w_class->lc_name, fixup_filename(lock1->li_file), lock1->li_line); witness_output(" 2nd %p %s (%s, %s) @ %s:%d\n", lock, lock->lo_name, w->w_name, w->w_class->lc_name, fixup_filename(file), line); } else { struct witness *w2 = lock2->li_lock->lo_witness; witness_output(" 1st %p %s (%s, %s) @ %s:%d\n", lock2->li_lock, lock2->li_lock->lo_name, w2->w_name, w2->w_class->lc_name, fixup_filename(lock2->li_file), lock2->li_line); witness_output(" 2nd %p %s (%s, %s) @ %s:%d\n", lock1->li_lock, lock1->li_lock->lo_name, w1->w_name, w1->w_class->lc_name, fixup_filename(lock1->li_file), lock1->li_line); witness_output(" 3rd %p %s (%s, %s) @ %s:%d\n", lock, lock->lo_name, w->w_name, w->w_class->lc_name, fixup_filename(file), line); } if (trace) { char buf[64]; struct sbuf sb; sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); sbuf_set_drain(&sb, witness_output_drain, NULL); if (pstackv) { sbuf_printf(&sb, "lock order %s -> %s established at:\n", w->w_name, w1->w_name); stack_sbuf_print_flags(&sb, &pstack, M_NOWAIT, STACK_SBUF_FMT_LONG); } sbuf_printf(&sb, "lock order %s -> %s attempted at:\n", w1->w_name, w->w_name); stack_save(&pstack); stack_sbuf_print_flags(&sb, &pstack, M_NOWAIT, STACK_SBUF_FMT_LONG); sbuf_finish(&sb); sbuf_delete(&sb); } witness_enter_debugger(__func__); return; } } /* * If requested, build a new lock order. However, don't build a new * relationship between a sleepable lock and Giant if it is in the * wrong direction. The correct lock order is that sleepable locks * always come before Giant. 
*/ if (flags & LOP_NEWORDER && !(plock->li_lock == &Giant.lock_object && (lock->lo_flags & LO_SLEEPABLE) != 0 && (flags & LOP_NOSLEEP) == 0)) { CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__, w->w_name, plock->li_lock->lo_witness->w_name); itismychild(plock->li_lock->lo_witness, w); } out: mtx_unlock_spin(&w_mtx); } void witness_lock(struct lock_object *lock, int flags, const char *file, int line) { struct lock_list_entry **lock_list, *lle; struct lock_instance *instance; struct witness *w; struct thread *td; if (witness_cold || witness_watch == -1 || lock->lo_witness == NULL || KERNEL_PANICKED()) return; w = lock->lo_witness; td = curthread; /* Determine lock list for this lock. */ if (LOCK_CLASS(lock)->lc_flags & LC_SLEEPLOCK) lock_list = &td->td_sleeplocks; else lock_list = PCPU_PTR(spinlocks); /* Check to see if we are recursing on a lock we already own. */ instance = find_instance(*lock_list, lock); if (instance != NULL) { instance->li_flags++; CTR4(KTR_WITNESS, "%s: pid %d recursed on %s r=%d", __func__, td->td_proc->p_pid, lock->lo_name, instance->li_flags & LI_RECURSEMASK); instance->li_file = file; instance->li_line = line; return; } /* Update per-witness last file and line acquire. */ w->w_file = file; w->w_line = line; /* Find the next open lock instance in the list and fill it. */ lle = *lock_list; if (lle == NULL || lle->ll_count == LOCK_NCHILDREN) { lle = witness_lock_list_get(); if (lle == NULL) return; lle->ll_next = *lock_list; CTR3(KTR_WITNESS, "%s: pid %d added lle %p", __func__, td->td_proc->p_pid, lle); *lock_list = lle; } instance = &lle->ll_children[lle->ll_count++]; instance->li_lock = lock; instance->li_line = line; instance->li_file = file; instance->li_flags = 0; if ((flags & LOP_EXCLUSIVE) != 0) instance->li_flags |= LI_EXCLUSIVE; if ((lock->lo_flags & LO_SLEEPABLE) != 0 && (flags & LOP_NOSLEEP) == 0) instance->li_flags |= LI_SLEEPABLE; CTR4(KTR_WITNESS, "%s: pid %d added %s as lle[%d]", __func__, td->td_proc->p_pid, lock->lo_name, lle->ll_count - 1); } void witness_upgrade(struct lock_object *lock, int flags, const char *file, int line) { struct lock_instance *instance; struct lock_class *class; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || KERNEL_PANICKED()) return; class = LOCK_CLASS(lock); if (witness_watch) { if ((lock->lo_flags & LO_UPGRADABLE) == 0) kassert_panic( "upgrade of non-upgradable lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((class->lc_flags & LC_SLEEPLOCK) == 0) kassert_panic( "upgrade of non-sleep lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); } instance = find_instance(curthread->td_sleeplocks, lock); if (instance == NULL) { kassert_panic("upgrade of unlocked lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); return; } if (witness_watch) { if ((instance->li_flags & LI_EXCLUSIVE) != 0) kassert_panic( "upgrade of exclusive lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((instance->li_flags & LI_RECURSEMASK) != 0) kassert_panic( "upgrade of recursed lock (%s) %s r=%d @ %s:%d", class->lc_name, lock->lo_name, instance->li_flags & LI_RECURSEMASK, fixup_filename(file), line); } instance->li_flags |= LI_EXCLUSIVE; } void witness_downgrade(struct lock_object *lock, int flags, const char *file, int line) { struct lock_instance *instance; struct lock_class *class; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if 
(lock->lo_witness == NULL || witness_watch == -1 || KERNEL_PANICKED()) return; class = LOCK_CLASS(lock); if (witness_watch) { if ((lock->lo_flags & LO_UPGRADABLE) == 0) kassert_panic( "downgrade of non-upgradable lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((class->lc_flags & LC_SLEEPLOCK) == 0) kassert_panic( "downgrade of non-sleep lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); } instance = find_instance(curthread->td_sleeplocks, lock); if (instance == NULL) { kassert_panic("downgrade of unlocked lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); return; } if (witness_watch) { if ((instance->li_flags & LI_EXCLUSIVE) == 0) kassert_panic( "downgrade of shared lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((instance->li_flags & LI_RECURSEMASK) != 0) kassert_panic( "downgrade of recursed lock (%s) %s r=%d @ %s:%d", class->lc_name, lock->lo_name, instance->li_flags & LI_RECURSEMASK, fixup_filename(file), line); } instance->li_flags &= ~LI_EXCLUSIVE; } void witness_unlock(struct lock_object *lock, int flags, const char *file, int line) { struct lock_list_entry **lock_list, *lle; struct lock_instance *instance; struct lock_class *class; struct thread *td; register_t s; int i, j; if (witness_cold || lock->lo_witness == NULL || KERNEL_PANICKED()) return; td = curthread; class = LOCK_CLASS(lock); /* Find lock instance associated with this lock. */ if (class->lc_flags & LC_SLEEPLOCK) lock_list = &td->td_sleeplocks; else lock_list = PCPU_PTR(spinlocks); lle = *lock_list; for (; *lock_list != NULL; lock_list = &(*lock_list)->ll_next) for (i = 0; i < (*lock_list)->ll_count; i++) { instance = &(*lock_list)->ll_children[i]; if (instance->li_lock == lock) goto found; } /* * When disabling WITNESS through witness_watch we could end up in * having registered locks in the td_sleeplocks queue. * We have to make sure we flush these queues, so just search for * eventual register locks and remove them. */ if (witness_watch > 0) { kassert_panic("lock (%s) %s not locked @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); return; } else { return; } found: /* First, check for shared/exclusive mismatches. */ if ((instance->li_flags & LI_EXCLUSIVE) != 0 && witness_watch > 0 && (flags & LOP_EXCLUSIVE) == 0) { witness_output("shared unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while exclusively locked from %s:%d\n", fixup_filename(instance->li_file), instance->li_line); kassert_panic("excl->ushare"); } if ((instance->li_flags & LI_EXCLUSIVE) == 0 && witness_watch > 0 && (flags & LOP_EXCLUSIVE) != 0) { witness_output("exclusive unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while share locked from %s:%d\n", fixup_filename(instance->li_file), instance->li_line); kassert_panic("share->uexcl"); } /* If we are recursed, unrecurse. 
*/ if ((instance->li_flags & LI_RECURSEMASK) > 0) { CTR4(KTR_WITNESS, "%s: pid %d unrecursed on %s r=%d", __func__, td->td_proc->p_pid, instance->li_lock->lo_name, instance->li_flags); instance->li_flags--; return; } /* The lock is now being dropped, check for NORELEASE flag */ if ((instance->li_flags & LI_NORELEASE) != 0 && witness_watch > 0) { witness_output("forbidden unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); kassert_panic("lock marked norelease"); } /* Otherwise, remove this item from the list. */ s = intr_disable(); CTR4(KTR_WITNESS, "%s: pid %d removed %s from lle[%d]", __func__, td->td_proc->p_pid, instance->li_lock->lo_name, (*lock_list)->ll_count - 1); for (j = i; j < (*lock_list)->ll_count - 1; j++) (*lock_list)->ll_children[j] = (*lock_list)->ll_children[j + 1]; (*lock_list)->ll_count--; intr_restore(s); /* * In order to reduce contention on w_mtx, we want to keep always an * head object into lists so that frequent allocation from the * free witness pool (and subsequent locking) is avoided. * In order to maintain the current code simple, when the head * object is totally unloaded it means also that we do not have * further objects in the list, so the list ownership needs to be * hand over to another object if the current head needs to be freed. */ if ((*lock_list)->ll_count == 0) { if (*lock_list == lle) { if (lle->ll_next == NULL) return; } else lle = *lock_list; *lock_list = lle->ll_next; CTR3(KTR_WITNESS, "%s: pid %d removed lle %p", __func__, td->td_proc->p_pid, lle); witness_lock_list_free(lle); } } void witness_thread_exit(struct thread *td) { struct lock_list_entry *lle; int i, n; lle = td->td_sleeplocks; if (lle == NULL || KERNEL_PANICKED()) return; if (lle->ll_count != 0) { for (n = 0; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { if (n == 0) witness_output( "Thread %p exiting with the following locks held:\n", td); n++; witness_list_lock(&lle->ll_children[i], witness_output); } kassert_panic( "Thread %p cannot exit while holding sleeplocks\n", td); } witness_lock_list_free(lle); } /* * Warn if any locks other than 'lock' are held. Flags can be passed in to * exempt Giant and sleepable locks from the checks as well. If any * non-exempt locks are held, then a supplied message is printed to the * output channel along with a list of the offending locks. If indicated in the * flags then a failure results in a panic as well. */ int witness_warn(int flags, struct lock_object *lock, const char *fmt, ...) { struct lock_list_entry *lock_list, *lle; struct lock_instance *lock1; struct thread *td; va_list ap; int i, n; if (witness_cold || witness_watch < 1 || KERNEL_PANICKED()) return (0); n = 0; td = curthread; for (lle = td->td_sleeplocks; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { lock1 = &lle->ll_children[i]; if (lock1->li_lock == lock) continue; if (flags & WARN_GIANTOK && lock1->li_lock == &Giant.lock_object) continue; if (flags & WARN_SLEEPOK && (lock1->li_flags & LI_SLEEPABLE) != 0) continue; if (n == 0) { va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf(" with the following %slocks held:\n", (flags & WARN_SLEEPOK) != 0 ? "non-sleepable " : ""); } n++; witness_list_lock(lock1, printf); } /* * Pin the thread in order to avoid problems with thread migration. * Once that all verifies are passed about spinlocks ownership, * the thread is in a safe path and it can be unpinned. 
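/*
 * Editor's note: a compact userland model of the per-thread lock list that
 * witness_lock()/witness_unlock() maintain above: push an instance on
 * acquire, look it up by lock pointer, and on release close the gap by
 * shifting the tail of the array down by one.  MAX_HELD, struct held and
 * the helper names are illustrative only.
 */
#include <stddef.h>
#include <assert.h>

#define MAX_HELD 32

struct held {
	const void *lock;	/* which lock object */
	int exclusive;		/* analogue of LI_EXCLUSIVE */
};

static struct held held[MAX_HELD];
static int nheld;

static void
lock_push(const void *lock, int exclusive)
{
	assert(nheld < MAX_HELD);
	held[nheld].lock = lock;
	held[nheld].exclusive = exclusive;
	nheld++;
}

static struct held *
lock_find(const void *lock)
{
	for (int i = nheld - 1; i >= 0; i--)
		if (held[i].lock == lock)
			return (&held[i]);
	return (NULL);
}

static void
lock_remove(const void *lock)
{
	struct held *h = lock_find(lock);

	assert(h != NULL);
	/* Shift the entries above the hole down by one, as witness does. */
	for (int i = (int)(h - held); i < nheld - 1; i++)
		held[i] = held[i + 1];
	nheld--;
}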
*/ sched_pin(); lock_list = PCPU_GET(spinlocks); if (lock_list != NULL && lock_list->ll_count != 0) { sched_unpin(); /* * We should only have one spinlock and as long as * the flags cannot match for this locks class, * check if the first spinlock is the one curthread * should hold. */ lock1 = &lock_list->ll_children[lock_list->ll_count - 1]; if (lock_list->ll_count == 1 && lock_list->ll_next == NULL && lock1->li_lock == lock && n == 0) return (0); va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf(" with the following %slocks held:\n", (flags & WARN_SLEEPOK) != 0 ? "non-sleepable " : ""); n += witness_list_locks(&lock_list, printf); } else sched_unpin(); if (flags & WARN_PANIC && n) kassert_panic("%s", __func__); else witness_debugger(n, __func__); return (n); } const char * witness_file(struct lock_object *lock) { struct witness *w; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL) return ("?"); w = lock->lo_witness; return (w->w_file); } int witness_line(struct lock_object *lock) { struct witness *w; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL) return (0); w = lock->lo_witness; return (w->w_line); } static struct witness * enroll(const char *description, struct lock_class *lock_class) { struct witness *w; MPASS(description != NULL); if (witness_watch == -1 || KERNEL_PANICKED()) return (NULL); if ((lock_class->lc_flags & LC_SPINLOCK)) { if (witness_skipspin) return (NULL); } else if ((lock_class->lc_flags & LC_SLEEPLOCK) == 0) { kassert_panic("lock class %s is not sleep or spin", lock_class->lc_name); return (NULL); } mtx_lock_spin(&w_mtx); w = witness_hash_get(description); if (w) goto found; if ((w = witness_get()) == NULL) return (NULL); MPASS(strlen(description) < MAX_W_NAME); strcpy(w->w_name, description); w->w_class = lock_class; w->w_refcount = 1; STAILQ_INSERT_HEAD(&w_all, w, w_list); if (lock_class->lc_flags & LC_SPINLOCK) { STAILQ_INSERT_HEAD(&w_spin, w, w_typelist); w_spin_cnt++; } else if (lock_class->lc_flags & LC_SLEEPLOCK) { STAILQ_INSERT_HEAD(&w_sleep, w, w_typelist); w_sleep_cnt++; } /* Insert new witness into the hash */ witness_hash_put(w); witness_increment_graph_generation(); mtx_unlock_spin(&w_mtx); return (w); found: w->w_refcount++; if (w->w_refcount == 1) w->w_class = lock_class; mtx_unlock_spin(&w_mtx); if (lock_class != w->w_class) kassert_panic( "lock (%s) %s does not match earlier (%s) lock", description, lock_class->lc_name, w->w_class->lc_name); return (w); } static void depart(struct witness *w) { MPASS(w->w_refcount == 0); if (w->w_class->lc_flags & LC_SLEEPLOCK) { w_sleep_cnt--; } else { w_spin_cnt--; } /* * Set file to NULL as it may point into a loadable module. */ w->w_file = NULL; w->w_line = 0; witness_increment_graph_generation(); } static void adopt(struct witness *parent, struct witness *child) { int pi, ci, i, j; if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); /* If the relationship is already known, there's no work to be done. */ if (isitmychild(parent, child)) return; /* When the structure of the graph changes, bump up the generation. */ witness_increment_graph_generation(); /* * The hard part ... create the direct relationship, then propagate all * indirect relationships. */ pi = parent->w_index; ci = child->w_index; WITNESS_INDEX_ASSERT(pi); WITNESS_INDEX_ASSERT(ci); MPASS(pi != ci); w_rmatrix[pi][ci] |= WITNESS_PARENT; w_rmatrix[ci][pi] |= WITNESS_CHILD; /* * If parent was not already an ancestor of child, * then we increment the descendant and ancestor counters. 
*/ if ((w_rmatrix[pi][ci] & WITNESS_ANCESTOR) == 0) { parent->w_num_descendants++; child->w_num_ancestors++; } /* * Find each ancestor of 'pi'. Note that 'pi' itself is counted as * an ancestor of 'pi' during this loop. */ for (i = 1; i <= w_max_used_index; i++) { if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 && (i != pi)) continue; /* Find each descendant of 'i' and mark it as a descendant. */ for (j = 1; j <= w_max_used_index; j++) { /* * Skip children that are already marked as * descendants of 'i'. */ if (w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) continue; /* * We are only interested in descendants of 'ci'. Note * that 'ci' itself is counted as a descendant of 'ci'. */ if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 && (j != ci)) continue; w_rmatrix[i][j] |= WITNESS_ANCESTOR; w_rmatrix[j][i] |= WITNESS_DESCENDANT; w_data[i].w_num_descendants++; w_data[j].w_num_ancestors++; /* * Make sure we aren't marking a node as both an * ancestor and descendant. We should have caught * this as a lock order reversal earlier. */ if ((w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) && (w_rmatrix[i][j] & WITNESS_DESCENDANT_MASK)) { printf("witness rmatrix paradox! [%d][%d]=%d " "both ancestor and descendant\n", i, j, w_rmatrix[i][j]); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; } if ((w_rmatrix[j][i] & WITNESS_ANCESTOR_MASK) && (w_rmatrix[j][i] & WITNESS_DESCENDANT_MASK)) { printf("witness rmatrix paradox! [%d][%d]=%d " "both ancestor and descendant\n", j, i, w_rmatrix[j][i]); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; } } } } static void itismychild(struct witness *parent, struct witness *child) { int unlocked; MPASS(child != NULL && parent != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); if (!witness_lock_type_equal(parent, child)) { if (witness_cold == 0) { unlocked = 1; mtx_unlock_spin(&w_mtx); } else { unlocked = 0; } kassert_panic( "%s: parent \"%s\" (%s) and child \"%s\" (%s) are not " "the same lock type", __func__, parent->w_name, parent->w_class->lc_name, child->w_name, child->w_class->lc_name); if (unlocked) mtx_lock_spin(&w_mtx); } adopt(parent, child); } /* * Generic code for the isitmy*() functions. The rmask parameter is the * expected relationship of w1 to w2. */ static int _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname) { unsigned char r1, r2; int i1, i2; i1 = w1->w_index; i2 = w2->w_index; WITNESS_INDEX_ASSERT(i1); WITNESS_INDEX_ASSERT(i2); r1 = w_rmatrix[i1][i2] & WITNESS_RELATED_MASK; r2 = w_rmatrix[i2][i1] & WITNESS_RELATED_MASK; /* The flags on one better be the inverse of the flags on the other */ if (!((WITNESS_ATOD(r1) == r2 && WITNESS_DTOA(r2) == r1) || (WITNESS_DTOA(r1) == r2 && WITNESS_ATOD(r2) == r1))) { /* Don't squawk if we're potentially racing with an update. */ if (!mtx_owned(&w_mtx)) return (0); printf("%s: rmatrix mismatch between %s (index %d) and %s " "(index %d): w_rmatrix[%d][%d] == %hhx but " "w_rmatrix[%d][%d] == %hhx\n", fname, w1->w_name, i1, w2->w_name, i2, i1, i2, r1, i2, i1, r2); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; } return (r1 & rmask); } /* * Checks if @child is a direct child of @parent. */ static int isitmychild(struct witness *parent, struct witness *child) { return (_isitmyx(parent, child, WITNESS_PARENT, __func__)); } /* * Checks if @descendant is a direct or indirect descendant of @ancestor.
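/*
 * Editor's note: a standalone sketch of the relationship-matrix
 * bookkeeping done by adopt()/_isitmyx() above, reduced to two bits per
 * ordered pair: when a new parent->child edge is added, every ancestor of
 * the parent also becomes an ancestor of every descendant of the child.
 * NWIT and the REL_* bits are invented stand-ins, not the kernel's flag
 * values, and no reversal detection is attempted here.
 */
#include <stdio.h>

#define NWIT		8
#define REL_ANCESTOR	0x01	/* row is acquired before column */
#define REL_DESCENDANT	0x02	/* row is acquired after column */

static unsigned char rel[NWIT][NWIT];

static int
is_ancestor(int a, int d)
{
	return ((rel[a][d] & REL_ANCESTOR) != 0);
}

static void
add_order(int parent, int child)
{
	/* Close the relation transitively, as adopt() does. */
	for (int i = 0; i < NWIT; i++) {
		if (i != parent && !is_ancestor(i, parent))
			continue;
		for (int j = 0; j < NWIT; j++) {
			if (j != child && !is_ancestor(child, j))
				continue;
			rel[i][j] |= REL_ANCESTOR;
			rel[j][i] |= REL_DESCENDANT;
		}
	}
}

int
main(void)
{
	add_order(1, 2);	/* lock 1 is taken before lock 2 */
	add_order(2, 3);	/* lock 2 is taken before lock 3 */
	/* A 3-before-1 acquisition would now show up as a reversal. */
	printf("1 before 3: %d, 3 before 1: %d\n",
	    is_ancestor(1, 3), is_ancestor(3, 1));
	return (0);
}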
*/ static int isitmydescendant(struct witness *ancestor, struct witness *descendant) { return (_isitmyx(ancestor, descendant, WITNESS_ANCESTOR_MASK, __func__)); } static int blessed(struct witness *w1, struct witness *w2) { int i; struct witness_blessed *b; for (i = 0; i < nitems(blessed_list); i++) { b = &blessed_list[i]; if (strcmp(w1->w_name, b->b_lock1) == 0) { if (strcmp(w2->w_name, b->b_lock2) == 0) return (1); continue; } if (strcmp(w1->w_name, b->b_lock2) == 0) if (strcmp(w2->w_name, b->b_lock1) == 0) return (1); } return (0); } static struct witness * witness_get(void) { struct witness *w; int index; if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); if (witness_watch == -1) { mtx_unlock_spin(&w_mtx); return (NULL); } if (STAILQ_EMPTY(&w_free)) { witness_watch = -1; mtx_unlock_spin(&w_mtx); printf("WITNESS: unable to allocate a new witness object\n"); return (NULL); } w = STAILQ_FIRST(&w_free); STAILQ_REMOVE_HEAD(&w_free, w_list); w_free_cnt--; index = w->w_index; MPASS(index > 0 && index == w_max_used_index+1 && index < witness_count); bzero(w, sizeof(*w)); w->w_index = index; if (index > w_max_used_index) w_max_used_index = index; return (w); } static void witness_free(struct witness *w) { STAILQ_INSERT_HEAD(&w_free, w, w_list); w_free_cnt++; } static struct lock_list_entry * witness_lock_list_get(void) { struct lock_list_entry *lle; if (witness_watch == -1) return (NULL); mtx_lock_spin(&w_mtx); lle = w_lock_list_free; if (lle == NULL) { witness_watch = -1; mtx_unlock_spin(&w_mtx); printf("%s: witness exhausted\n", __func__); return (NULL); } w_lock_list_free = lle->ll_next; mtx_unlock_spin(&w_mtx); bzero(lle, sizeof(*lle)); return (lle); } static void witness_lock_list_free(struct lock_list_entry *lle) { mtx_lock_spin(&w_mtx); lle->ll_next = w_lock_list_free; w_lock_list_free = lle; mtx_unlock_spin(&w_mtx); } static struct lock_instance * find_instance(struct lock_list_entry *list, const struct lock_object *lock) { struct lock_list_entry *lle; struct lock_instance *instance; int i; for (lle = list; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { instance = &lle->ll_children[i]; if (instance->li_lock == lock) return (instance); } return (NULL); } static void witness_list_lock(struct lock_instance *instance, int (*prnt)(const char *fmt, ...)) { struct lock_object *lock; lock = instance->li_lock; prnt("%s %s %s", (instance->li_flags & LI_EXCLUSIVE) != 0 ? "exclusive" : "shared", LOCK_CLASS(lock)->lc_name, lock->lo_name); if (lock->lo_witness->w_name != lock->lo_name) prnt(" (%s)", lock->lo_witness->w_name); prnt(" r = %d (%p) locked @ %s:%d\n", instance->li_flags & LI_RECURSEMASK, lock, fixup_filename(instance->li_file), instance->li_line); } static int witness_output(const char *fmt, ...) 
{ va_list ap; int ret; va_start(ap, fmt); ret = witness_voutput(fmt, ap); va_end(ap); return (ret); } static int witness_voutput(const char *fmt, va_list ap) { int ret; ret = 0; switch (witness_channel) { case WITNESS_CONSOLE: ret = vprintf(fmt, ap); break; case WITNESS_LOG: vlog(LOG_NOTICE, fmt, ap); break; case WITNESS_NONE: break; } return (ret); } #ifdef DDB static int witness_thread_has_locks(struct thread *td) { if (td->td_sleeplocks == NULL) return (0); return (td->td_sleeplocks->ll_count != 0); } static int witness_proc_has_locks(struct proc *p) { struct thread *td; FOREACH_THREAD_IN_PROC(p, td) { if (witness_thread_has_locks(td)) return (1); } return (0); } #endif int witness_list_locks(struct lock_list_entry **lock_list, int (*prnt)(const char *fmt, ...)) { struct lock_list_entry *lle; int i, nheld; nheld = 0; for (lle = *lock_list; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { witness_list_lock(&lle->ll_children[i], prnt); nheld++; } return (nheld); } /* * This is a bit risky at best. We call this function when we have timed * out acquiring a spin lock, and we assume that the other CPU is stuck * with this lock held. So, we go groveling around in the other CPU's * per-cpu data to try to find the lock instance for this spin lock to * see when it was last acquired. */ void witness_display_spinlock(struct lock_object *lock, struct thread *owner, int (*prnt)(const char *fmt, ...)) { struct lock_instance *instance; struct pcpu *pc; if (owner->td_critnest == 0 || owner->td_oncpu == NOCPU) return; pc = pcpu_find(owner->td_oncpu); instance = find_instance(pc->pc_spinlocks, lock); if (instance != NULL) witness_list_lock(instance, prnt); } void witness_save(struct lock_object *lock, const char **filep, int *linep) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; /* Initialize for KMSAN's benefit. */ *filep = NULL; *linep = 0; /* * This function is used independently in locking code to deal with * Giant, SCHEDULER_STOPPED() check can be removed here after Giant * is gone. */ if (SCHEDULER_STOPPED()) return; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || KERNEL_PANICKED()) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) { kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); return; } *filep = instance->li_file; *linep = instance->li_line; } void witness_restore(struct lock_object *lock, const char *file, int line) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; /* * This function is used independently in locking code to deal with * Giant, SCHEDULER_STOPPED() check can be removed here after Giant * is gone. 
*/ if (SCHEDULER_STOPPED()) return; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || KERNEL_PANICKED()) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); lock->lo_witness->w_file = file; lock->lo_witness->w_line = line; if (instance == NULL) return; instance->li_file = file; instance->li_line = line; } static bool witness_find_instance(const struct lock_object *lock, struct lock_instance **instance) { #ifdef INVARIANT_SUPPORT struct lock_class *class; if (lock->lo_witness == NULL || witness_watch < 1 || KERNEL_PANICKED()) return (false); class = LOCK_CLASS(lock); if ((class->lc_flags & LC_SLEEPLOCK) != 0) { *instance = find_instance(curthread->td_sleeplocks, lock); return (true); } else if ((class->lc_flags & LC_SPINLOCK) != 0) { *instance = find_instance(PCPU_GET(spinlocks), lock); return (true); } else { kassert_panic("Lock (%s) %s is not sleep or spin!", class->lc_name, lock->lo_name); return (false); } #else return (false); #endif } void witness_assert(const struct lock_object *lock, int flags, const char *file, int line) { #ifdef INVARIANT_SUPPORT struct lock_instance *instance; struct lock_class *class; if (!witness_find_instance(lock, &instance)) return; class = LOCK_CLASS(lock); switch (flags) { case LA_UNLOCKED: if (instance != NULL) kassert_panic("Lock (%s) %s locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; case LA_LOCKED: case LA_LOCKED | LA_RECURSED: case LA_LOCKED | LA_NOTRECURSED: case LA_SLOCKED: case LA_SLOCKED | LA_RECURSED: case LA_SLOCKED | LA_NOTRECURSED: case LA_XLOCKED: case LA_XLOCKED | LA_RECURSED: case LA_XLOCKED | LA_NOTRECURSED: if (instance == NULL) { kassert_panic("Lock (%s) %s not locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; } if ((flags & LA_XLOCKED) != 0 && (instance->li_flags & LI_EXCLUSIVE) == 0) kassert_panic( "Lock (%s) %s not exclusively locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_SLOCKED) != 0 && (instance->li_flags & LI_EXCLUSIVE) != 0) kassert_panic( "Lock (%s) %s exclusively locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_RECURSED) != 0 && (instance->li_flags & LI_RECURSEMASK) == 0) kassert_panic("Lock (%s) %s not recursed @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_NOTRECURSED) != 0 && (instance->li_flags & LI_RECURSEMASK) != 0) kassert_panic("Lock (%s) %s recursed @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; default: kassert_panic("Invalid lock assertion at %s:%d.", fixup_filename(file), line); } #endif /* INVARIANT_SUPPORT */ } /* * Checks the ownership of the lock by curthread, consulting the witness list. * Returns: * 0 if witness is disabled or did not work * -1 if not owned * 1 if owned */ int witness_is_owned(const struct lock_object *lock) { #ifdef INVARIANT_SUPPORT struct lock_instance *instance; if (!witness_find_instance(lock, &instance)) return (0); return (instance == NULL ? 
-1 : 1); #else return (0); #endif } static void witness_setflag(struct lock_object *lock, int flag, int set) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; if (lock->lo_witness == NULL || witness_watch == -1 || KERNEL_PANICKED()) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) { kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); return; } if (set) instance->li_flags |= flag; else instance->li_flags &= ~flag; } void witness_norelease(struct lock_object *lock) { witness_setflag(lock, LI_NORELEASE, 1); } void witness_releaseok(struct lock_object *lock) { witness_setflag(lock, LI_NORELEASE, 0); } #ifdef DDB static void witness_ddb_list(struct thread *td) { KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); KASSERT(kdb_active, ("%s: not in the debugger", __func__)); if (witness_watch < 1) return; witness_list_locks(&td->td_sleeplocks, db_printf); /* * We only handle spinlocks if td == curthread. This is somewhat broken * if td is currently executing on some other CPU and holds spin locks * as we won't display those locks. If we had a MI way of getting * the per-cpu data for a given cpu then we could use * td->td_oncpu to get the list of spinlocks for this thread * and "fix" this. * * That still wouldn't really fix this unless we locked the scheduler * lock or stopped the other CPU to make sure it wasn't changing the * list out from under us. It is probably best to just not try to * handle threads on other CPU's for now. */ if (td == curthread && PCPU_GET(spinlocks) != NULL) witness_list_locks(PCPU_PTR(spinlocks), db_printf); } DB_SHOW_COMMAND(locks, db_witness_list) { struct thread *td; if (have_addr) td = db_lookup_thread(addr, true); else td = kdb_thread; witness_ddb_list(td); } DB_SHOW_ALL_COMMAND(locks, db_witness_list_all) { struct thread *td; struct proc *p; /* * It would be nice to list only threads and processes that actually * held sleep locks, but that information is currently not exported * by WITNESS. */ FOREACH_PROC_IN_SYSTEM(p) { if (!witness_proc_has_locks(p)) continue; FOREACH_THREAD_IN_PROC(p, td) { if (!witness_thread_has_locks(td)) continue; db_printf("Process %d (%s) thread %p (%d)\n", p->p_pid, p->p_comm, td, td->td_tid); witness_ddb_list(td); if (db_pager_quit) return; } } } DB_SHOW_ALIAS_FLAGS(alllocks, db_witness_list_all, DB_CMD_MEMSAFE); DB_SHOW_COMMAND_FLAGS(witness, db_witness_display, DB_CMD_MEMSAFE) { witness_ddb_display(db_printf); } #endif static void sbuf_print_witness_badstacks(struct sbuf *sb, size_t *oldidx) { struct witness_lock_order_data *data1, *data2, *tmp_data1, *tmp_data2; struct witness *tmp_w1, *tmp_w2, *w1, *w2; int generation, i, j; tmp_data1 = NULL; tmp_data2 = NULL; tmp_w1 = NULL; tmp_w2 = NULL; /* Allocate and init temporary storage space. 
*/ tmp_w1 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO); tmp_w2 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO); tmp_data1 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, M_WAITOK | M_ZERO); tmp_data2 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, M_WAITOK | M_ZERO); stack_zero(&tmp_data1->wlod_stack); stack_zero(&tmp_data2->wlod_stack); restart: mtx_lock_spin(&w_mtx); generation = w_generation; mtx_unlock_spin(&w_mtx); sbuf_printf(sb, "Number of known direct relationships is %d\n", w_lohash.wloh_count); for (i = 1; i < w_max_used_index; i++) { mtx_lock_spin(&w_mtx); if (generation != w_generation) { mtx_unlock_spin(&w_mtx); /* The graph has changed, try again. */ *oldidx = 0; sbuf_clear(sb); goto restart; } w1 = &w_data[i]; if (w1->w_reversed == 0) { mtx_unlock_spin(&w_mtx); continue; } /* Copy w1 locally so we can release the spin lock. */ *tmp_w1 = *w1; mtx_unlock_spin(&w_mtx); if (tmp_w1->w_reversed == 0) continue; for (j = 1; j < w_max_used_index; j++) { if ((w_rmatrix[i][j] & WITNESS_REVERSAL) == 0 || i > j) continue; mtx_lock_spin(&w_mtx); if (generation != w_generation) { mtx_unlock_spin(&w_mtx); /* The graph has changed, try again. */ *oldidx = 0; sbuf_clear(sb); goto restart; } w2 = &w_data[j]; data1 = witness_lock_order_get(w1, w2); data2 = witness_lock_order_get(w2, w1); /* * Copy information locally so we can release the * spin lock. */ *tmp_w2 = *w2; if (data1) { stack_zero(&tmp_data1->wlod_stack); stack_copy(&data1->wlod_stack, &tmp_data1->wlod_stack); } if (data2 && data2 != data1) { stack_zero(&tmp_data2->wlod_stack); stack_copy(&data2->wlod_stack, &tmp_data2->wlod_stack); } mtx_unlock_spin(&w_mtx); if (blessed(tmp_w1, tmp_w2)) continue; sbuf_printf(sb, "\nLock order reversal between \"%s\"(%s) and \"%s\"(%s)!\n", tmp_w1->w_name, tmp_w1->w_class->lc_name, tmp_w2->w_name, tmp_w2->w_class->lc_name); if (data1) { sbuf_printf(sb, "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n", tmp_w1->w_name, tmp_w1->w_class->lc_name, tmp_w2->w_name, tmp_w2->w_class->lc_name); stack_sbuf_print(sb, &tmp_data1->wlod_stack); - sbuf_printf(sb, "\n"); + sbuf_putc(sb, '\n'); } if (data2 && data2 != data1) { sbuf_printf(sb, "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n", tmp_w2->w_name, tmp_w2->w_class->lc_name, tmp_w1->w_name, tmp_w1->w_class->lc_name); stack_sbuf_print(sb, &tmp_data2->wlod_stack); - sbuf_printf(sb, "\n"); + sbuf_putc(sb, '\n'); } } } mtx_lock_spin(&w_mtx); if (generation != w_generation) { mtx_unlock_spin(&w_mtx); /* * The graph changed while we were printing stack data, * try again. */ *oldidx = 0; sbuf_clear(sb); goto restart; } mtx_unlock_spin(&w_mtx); /* Free temporary storage space. 
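/*
 * Editor's note: a userland sketch of the retry discipline used by
 * sbuf_print_witness_badstacks() above: snapshot a generation counter,
 * drop the lock for the expensive formatting, and restart from scratch if
 * the generation changed in the meantime.  All of the names below are
 * invented for the example.
 */
#include <pthread.h>
#include <string.h>

static pthread_mutex_t g_mtx = PTHREAD_MUTEX_INITIALIZER;
static int g_generation;
static char g_data[128];

/* Writers bump the generation whenever they change g_data. */
void
update_data(const char *s)
{
	pthread_mutex_lock(&g_mtx);
	strncpy(g_data, s, sizeof(g_data) - 1);
	g_generation++;
	pthread_mutex_unlock(&g_mtx);
}

/* Readers build a consistent snapshot without holding the lock throughout. */
void
snapshot_data(char *out, size_t len)
{
	int gen;

restart:
	pthread_mutex_lock(&g_mtx);
	gen = g_generation;
	pthread_mutex_unlock(&g_mtx);

	/* ... expensive formatting would happen here, lock dropped ... */

	pthread_mutex_lock(&g_mtx);
	if (gen != g_generation) {
		pthread_mutex_unlock(&g_mtx);
		goto restart;		/* the data changed underneath us */
	}
	strncpy(out, g_data, len - 1);
	out[len - 1] = '\0';
	pthread_mutex_unlock(&g_mtx);
}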
*/ free(tmp_data1, M_TEMP); free(tmp_data2, M_TEMP); free(tmp_w1, M_TEMP); free(tmp_w2, M_TEMP); } static int sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS) { struct sbuf *sb; int error; if (witness_watch < 1) { error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning)); return (error); } if (witness_cold) { error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold)); return (error); } error = 0; sb = sbuf_new(NULL, NULL, badstack_sbuf_size, SBUF_AUTOEXTEND); if (sb == NULL) return (ENOMEM); sbuf_print_witness_badstacks(sb, &req->oldidx); sbuf_finish(sb); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); return (error); } #ifdef DDB static int sbuf_db_printf_drain(void *arg __unused, const char *data, int len) { return (db_printf("%.*s", len, data)); } DB_SHOW_COMMAND_FLAGS(badstacks, db_witness_badstacks, DB_CMD_MEMSAFE) { struct sbuf sb; char buffer[128]; size_t dummy; sbuf_new(&sb, buffer, sizeof(buffer), SBUF_FIXEDLEN); sbuf_set_drain(&sb, sbuf_db_printf_drain, NULL); sbuf_print_witness_badstacks(&sb, &dummy); sbuf_finish(&sb); } #endif static int sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS) { static const struct { enum witness_channel channel; const char *name; } channels[] = { { WITNESS_CONSOLE, "console" }, { WITNESS_LOG, "log" }, { WITNESS_NONE, "none" }, }; char buf[16]; u_int i; int error; buf[0] = '\0'; for (i = 0; i < nitems(channels); i++) if (witness_channel == channels[i].channel) { snprintf(buf, sizeof(buf), "%s", channels[i].name); break; } error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); error = EINVAL; for (i = 0; i < nitems(channels); i++) if (strcmp(channels[i].name, buf) == 0) { witness_channel = channels[i].channel; error = 0; break; } return (error); } static int sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS) { struct witness *w; struct sbuf *sb; int error; #ifdef __i386__ error = SYSCTL_OUT(req, w_notallowed, sizeof(w_notallowed)); return (error); #endif if (witness_watch < 1) { error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning)); return (error); } if (witness_cold) { error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold)); return (error); } error = 0; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, FULLGRAPH_SBUF_SIZE, req); if (sb == NULL) return (ENOMEM); - sbuf_printf(sb, "\n"); + sbuf_putc(sb, '\n'); mtx_lock_spin(&w_mtx); STAILQ_FOREACH(w, &w_all, w_list) w->w_displayed = 0; STAILQ_FOREACH(w, &w_all, w_list) witness_add_fullgraph(sb, w); mtx_unlock_spin(&w_mtx); /* * Close the sbuf and return to userland. */ error = sbuf_finish(sb); sbuf_delete(sb); return (error); } static int sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS) { int error, value; value = witness_watch; error = sysctl_handle_int(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (value > 1 || value < -1 || (witness_watch == -1 && value != witness_watch)) return (EINVAL); witness_watch = value; return (0); } static void witness_add_fullgraph(struct sbuf *sb, struct witness *w) { int i; if (w->w_displayed != 0 || (w->w_file == NULL && w->w_line == 0)) return; w->w_displayed = 1; WITNESS_INDEX_ASSERT(w->w_index); for (i = 1; i <= w_max_used_index; i++) { if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) { sbuf_printf(sb, "\"%s\",\"%s\"\n", w->w_name, w_data[i].w_name); witness_add_fullgraph(sb, &w_data[i]); } } } /* * A simple hash function. Takes a key pointer and a key size. 
If size == 0, * interprets the key as a string and reads until the null * terminator. Otherwise, reads the first size bytes. Returns an unsigned 32-bit * hash value computed from the key. */ static uint32_t witness_hash_djb2(const uint8_t *key, uint32_t size) { unsigned int hash = 5381; int i; /* hash = hash * 33 + key[i] */ if (size) for (i = 0; i < size; i++) hash = ((hash << 5) + hash) + (unsigned int)key[i]; else for (i = 0; key[i] != 0; i++) hash = ((hash << 5) + hash) + (unsigned int)key[i]; return (hash); } /* * Initializes the two witness hash tables. Called exactly once from * witness_initialize(). */ static void witness_init_hash_tables(void) { int i; MPASS(witness_cold); /* Initialize the hash tables. */ for (i = 0; i < WITNESS_HASH_SIZE; i++) w_hash.wh_array[i] = NULL; w_hash.wh_size = WITNESS_HASH_SIZE; w_hash.wh_count = 0; /* Initialize the lock order data hash. */ w_lofree = NULL; for (i = 0; i < WITNESS_LO_DATA_COUNT; i++) { memset(&w_lodata[i], 0, sizeof(w_lodata[i])); w_lodata[i].wlod_next = w_lofree; w_lofree = &w_lodata[i]; } w_lohash.wloh_size = WITNESS_LO_HASH_SIZE; w_lohash.wloh_count = 0; for (i = 0; i < WITNESS_LO_HASH_SIZE; i++) w_lohash.wloh_array[i] = NULL; } static struct witness * witness_hash_get(const char *key) { struct witness *w; uint32_t hash; MPASS(key != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); hash = witness_hash_djb2(key, 0) % w_hash.wh_size; w = w_hash.wh_array[hash]; while (w != NULL) { if (strcmp(w->w_name, key) == 0) goto out; w = w->w_hash_next; } out: return (w); } static void witness_hash_put(struct witness *w) { uint32_t hash; MPASS(w != NULL); MPASS(w->w_name != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); KASSERT(witness_hash_get(w->w_name) == NULL, ("%s: trying to add a hash entry that already exists!", __func__)); KASSERT(w->w_hash_next == NULL, ("%s: w->w_hash_next != NULL", __func__)); hash = witness_hash_djb2(w->w_name, 0) % w_hash.wh_size; w->w_hash_next = w_hash.wh_array[hash]; w_hash.wh_array[hash] = w; w_hash.wh_count++; } static struct witness_lock_order_data * witness_lock_order_get(struct witness *parent, struct witness *child) { struct witness_lock_order_data *data = NULL; struct witness_lock_order_key key; unsigned int hash; MPASS(parent != NULL && child != NULL); key.from = parent->w_index; key.to = child->w_index; WITNESS_INDEX_ASSERT(key.from); WITNESS_INDEX_ASSERT(key.to); if ((w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN) == 0) goto out; hash = witness_hash_djb2((const char*)&key, sizeof(key)) % w_lohash.wloh_size; data = w_lohash.wloh_array[hash]; while (data != NULL) { if (witness_lock_order_key_equal(&data->wlod_key, &key)) break; data = data->wlod_next; } out: return (data); } /* * Verify that parent and child have a known relationship, are not the same, * and child is actually a child of parent. This is done without w_mtx * to avoid contention in the common case. 
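/*
 * Editor's note: witness_hash_djb2() above is the classic djb2 string hash
 * (hash = hash * 33 + byte), applied both to witness names and to
 * fixed-size binary keys such as the (from, to) lock-order pair.  Below is
 * a self-contained userland analogue for experimentation; TABLE_SIZE and
 * struct order_key are arbitrary choices for the example, not kernel
 * definitions.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define TABLE_SIZE 251

static uint32_t
hash_djb2(const void *key, size_t size)
{
	const uint8_t *p = key;
	uint32_t hash = 5381;

	if (size == 0)
		size = strlen(key);		/* string mode: stop at NUL */
	for (size_t i = 0; i < size; i++)
		hash = ((hash << 5) + hash) + p[i];	/* hash * 33 + byte */
	return (hash);
}

struct order_key {			/* like a (from, to) index pair */
	int from;
	int to;
};

int
main(void)
{
	struct order_key k = { 3, 7 };

	printf("bucket for name:   %u\n",
	    hash_djb2("vnode interlock", 0) % TABLE_SIZE);
	printf("bucket for (3, 7): %u\n",
	    hash_djb2(&k, sizeof(k)) % TABLE_SIZE);
	return (0);
}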
*/ static int witness_lock_order_check(struct witness *parent, struct witness *child) { if (parent != child && w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN && isitmychild(parent, child)) return (1); return (0); } static int witness_lock_order_add(struct witness *parent, struct witness *child) { struct witness_lock_order_data *data = NULL; struct witness_lock_order_key key; unsigned int hash; MPASS(parent != NULL && child != NULL); key.from = parent->w_index; key.to = child->w_index; WITNESS_INDEX_ASSERT(key.from); WITNESS_INDEX_ASSERT(key.to); if (w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN) return (1); hash = witness_hash_djb2((const char*)&key, sizeof(key)) % w_lohash.wloh_size; w_rmatrix[parent->w_index][child->w_index] |= WITNESS_LOCK_ORDER_KNOWN; data = w_lofree; if (data == NULL) return (0); w_lofree = data->wlod_next; data->wlod_next = w_lohash.wloh_array[hash]; data->wlod_key = key; w_lohash.wloh_array[hash] = data; w_lohash.wloh_count++; stack_save(&data->wlod_stack); return (1); } /* Call this whenever the structure of the witness graph changes. */ static void witness_increment_graph_generation(void) { if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); w_generation++; } static int witness_output_drain(void *arg __unused, const char *data, int len) { witness_output("%.*s", len, data); return (len); } static void witness_debugger(int cond, const char *msg) { char buf[32]; struct sbuf sb; struct stack st; if (!cond) return; if (witness_trace) { sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); sbuf_set_drain(&sb, witness_output_drain, NULL); stack_save(&st); witness_output("stack backtrace:\n"); stack_sbuf_print_ddb(&sb, &st); sbuf_finish(&sb); } witness_enter_debugger(msg); } static void witness_enter_debugger(const char *msg) { #ifdef KDB if (witness_kdb) kdb_enter(KDB_WHY_WITNESS, msg); #endif } diff --git a/sys/kern/tty_info.c b/sys/kern/tty_info.c index 15ba5995cea9..f54fc3a30f5e 100644 --- a/sys/kern/tty_info.c +++ b/sys/kern/tty_info.c @@ -1,408 +1,408 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Copyright (c) 2002 Networks Associates Technologies, Inc. * All rights reserved. * * Portions of this software were developed for the FreeBSD Project by * ThinkSec AS and NAI Labs, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 * ("CBOSS"), as part of the DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Returns 1 if p2 is "better" than p1 * * The algorithm for picking the "interesting" process is thus: * * 1) Only foreground processes are eligible - implied. * 2) Runnable processes are favored over anything else. The runner * with the highest cpu utilization is picked (p_estcpu). Ties are * broken by picking the highest pid. * 3) The sleeper with the shortest sleep time is next. With ties, * we pick out just "short-term" sleepers (P_SINTR == 0). * 4) Further ties are broken by picking the highest pid. */ #define TESTAB(a, b) ((a)<<1 | (b)) #define ONLYA 2 #define ONLYB 1 #define BOTH 3 static int proc_sum(struct proc *p, fixpt_t *estcpup) { struct thread *td; int estcpu; int val; val = 0; estcpu = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_ON_RUNQ(td) || TD_IS_RUNNING(td)) val = 1; estcpu += sched_pctcpu(td); thread_unlock(td); } *estcpup = estcpu; return (val); } static int thread_compare(struct thread *td, struct thread *td2) { int runa, runb; int slpa, slpb; fixpt_t esta, estb; if (td == NULL) return (1); /* * Fetch running stats, pctcpu usage, and interruptable flag. */ thread_lock(td); runa = TD_IS_RUNNING(td) || TD_ON_RUNQ(td); slpa = td->td_flags & TDF_SINTR; esta = sched_pctcpu(td); thread_unlock(td); thread_lock(td2); runb = TD_IS_RUNNING(td2) || TD_ON_RUNQ(td2); estb = sched_pctcpu(td2); slpb = td2->td_flags & TDF_SINTR; thread_unlock(td2); /* * see if at least one of them is runnable */ switch (TESTAB(runa, runb)) { case ONLYA: return (0); case ONLYB: return (1); case BOTH: break; } /* * favor one with highest recent cpu utilization */ if (estb > esta) return (1); if (esta > estb) return (0); /* * favor one sleeping in a non-interruptible sleep */ switch (TESTAB(slpa, slpb)) { case ONLYA: return (0); case ONLYB: return (1); case BOTH: break; } return (td < td2); } static int proc_compare(struct proc *p1, struct proc *p2) { int runa, runb; fixpt_t esta, estb; if (p1 == NULL) return (1); /* * Fetch various stats about these processes. After we drop the * lock the information could be stale but the race is unimportant. 
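/*
 * Editor's note: the TESTAB()/ONLYA/ONLYB/BOTH idiom used by
 * thread_compare() and proc_compare() below packs two boolean test results
 * into a 2-bit value so a single switch can express "only the first",
 * "only the second", or "both/neither".  This toy comparison reuses the
 * same macros; prefer_b() and its parameters are invented for the example.
 */
#include <stdio.h>

#define TESTAB(a, b)	((a)<<1 | (b))
#define ONLYA	2
#define ONLYB	1
#define BOTH	3

/* Return 1 if candidate B should be preferred over candidate A. */
static int
prefer_b(int a_runnable, int b_runnable, int a_pid, int b_pid)
{
	switch (TESTAB(a_runnable != 0, b_runnable != 0)) {
	case ONLYA:
		return (0);
	case ONLYB:
		return (1);
	case BOTH:
	default:
		break;
	}
	return (b_pid > a_pid);		/* tie: pick the highest pid */
}

int
main(void)
{
	printf("%d\n", prefer_b(0, 1, 10, 20));	/* only B runnable -> 1 */
	printf("%d\n", prefer_b(1, 1, 10, 20));	/* both runnable -> pid wins -> 1 */
	return (0);
}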
*/ PROC_LOCK(p1); runa = proc_sum(p1, &esta); PROC_UNLOCK(p1); PROC_LOCK(p2); runb = proc_sum(p2, &estb); PROC_UNLOCK(p2); /* * see if at least one of them is runnable */ switch (TESTAB(runa, runb)) { case ONLYA: return (0); case ONLYB: return (1); case BOTH: break; } /* * favor one with highest recent cpu utilization */ if (estb > esta) return (1); if (esta > estb) return (0); /* * weed out zombies */ switch (TESTAB(p1->p_state == PRS_ZOMBIE, p2->p_state == PRS_ZOMBIE)) { case ONLYA: return (1); case ONLYB: return (0); case BOTH: break; } return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ } static int sbuf_tty_drain(void *a, const char *d, int len) { struct tty *tp; int rc; tp = a; if (kdb_active) { cnputsn(d, len); return (len); } if (tp != NULL && !KERNEL_PANICKED()) { rc = tty_putstrn(tp, d, len); if (rc != 0) return (-ENXIO); return (len); } return (-ENXIO); } #ifdef STACK #ifdef INVARIANTS static int tty_info_kstacks = STACK_SBUF_FMT_COMPACT; #else static int tty_info_kstacks = STACK_SBUF_FMT_NONE; #endif static int sysctl_tty_info_kstacks(SYSCTL_HANDLER_ARGS) { enum stack_sbuf_fmt val; int error; val = tty_info_kstacks; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); switch (val) { case STACK_SBUF_FMT_NONE: case STACK_SBUF_FMT_LONG: case STACK_SBUF_FMT_COMPACT: tty_info_kstacks = val; break; default: error = EINVAL; } return (error); } SYSCTL_PROC(_kern, OID_AUTO, tty_info_kstacks, CTLFLAG_RWTUN | CTLFLAG_MPSAFE | CTLTYPE_INT, NULL, 0, sysctl_tty_info_kstacks, "I", "Adjust format of kernel stack(9) traces on ^T (tty info): " "0 - disabled; 1 - long; 2 - compact"); #endif /* * Report on state of foreground process group. */ void tty_info(struct tty *tp) { struct timeval rtime, utime, stime; #ifdef STACK struct stack stack; int sterr, kstacks_val; bool print_kstacks; #endif struct proc *p, *ppick; struct thread *td, *tdpick; const char *stateprefix, *state; struct sbuf sb; long rss; int load, pctcpu; pid_t pid; char comm[MAXCOMLEN + 1]; struct rusage ru; tty_assert_locked(tp); if (tty_checkoutq(tp) == 0) return; (void)sbuf_new(&sb, tp->t_prbuf, tp->t_prbufsz, SBUF_FIXEDLEN); sbuf_set_drain(&sb, sbuf_tty_drain, tp); /* Print load average. */ load = ((int64_t)averunnable.ldavg[0] * 100 + FSCALE / 2) >> FSHIFT; sbuf_printf(&sb, "%sload: %d.%02d ", tp->t_column == 0 ? "" : "\n", load / 100, load % 100); if (tp->t_session == NULL) { - sbuf_printf(&sb, "not a controlling terminal\n"); + sbuf_cat(&sb, "not a controlling terminal\n"); goto out; } if (tp->t_pgrp == NULL) { - sbuf_printf(&sb, "no foreground process group\n"); + sbuf_cat(&sb, "no foreground process group\n"); goto out; } PGRP_LOCK(tp->t_pgrp); if (LIST_EMPTY(&tp->t_pgrp->pg_members)) { PGRP_UNLOCK(tp->t_pgrp); - sbuf_printf(&sb, "empty foreground process group\n"); + sbuf_cat(&sb, "empty foreground process group\n"); goto out; } /* * Pick the most interesting process and copy some of its * state for printing later. This operation could rely on stale * data as we can't hold the proc slock or thread locks over the * whole list. However, we're guaranteed not to reference an exited * thread or proc since we hold the tty locked. 
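/*
 * Editor's note: the load average and %CPU figures printed by tty_info()
 * are fixed-point values scaled by FSCALE (1 << FSHIFT).  The conversion
 * above multiplies first, adds FSCALE / 2 to round to nearest, and then
 * shifts the scale factor back out.  A standalone illustration, assuming
 * the common FSHIFT value of 11:
 */
#include <stdio.h>
#include <stdint.h>

#define FSHIFT	11
#define FSCALE	(1 << FSHIFT)

int
main(void)
{
	/* A load average of 1.37 stored as a fixed-point value. */
	uint32_t ldavg = (uint32_t)(1.37 * FSCALE);
	int load = (int)(((int64_t)ldavg * 100 + FSCALE / 2) >> FSHIFT);

	printf("load: %d.%02d\n", load / 100, load % 100);	/* "load: 1.37" */
	return (0);
}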
*/ p = NULL; LIST_FOREACH(ppick, &tp->t_pgrp->pg_members, p_pglist) if (proc_compare(p, ppick)) p = ppick; PROC_LOCK(p); PGRP_UNLOCK(tp->t_pgrp); td = NULL; FOREACH_THREAD_IN_PROC(p, tdpick) if (thread_compare(td, tdpick)) td = tdpick; stateprefix = ""; thread_lock(td); if (TD_IS_RUNNING(td)) state = "running"; else if (TD_ON_RUNQ(td) || TD_CAN_RUN(td)) state = "runnable"; else if (TD_IS_SLEEPING(td)) { /* XXX: If we're sleeping, are we ever not in a queue? */ if (TD_ON_SLEEPQ(td)) state = td->td_wmesg; else state = "sleeping without queue"; } else if (TD_ON_LOCK(td)) { state = td->td_lockname; stateprefix = "*"; } else if (TD_IS_SUSPENDED(td)) state = "suspended"; else if (TD_AWAITING_INTR(td)) state = "intrwait"; else if (p->p_state == PRS_ZOMBIE) state = "zombie"; else state = "unknown"; pctcpu = (sched_pctcpu(td) * 10000 + FSCALE / 2) >> FSHIFT; #ifdef STACK kstacks_val = atomic_load_int(&tty_info_kstacks); print_kstacks = (kstacks_val != STACK_SBUF_FMT_NONE); if (print_kstacks) { if (TD_IS_SWAPPED(td)) sterr = ENOENT; else sterr = stack_save_td(&stack, td); } #endif thread_unlock(td); if (p->p_state == PRS_NEW || p->p_state == PRS_ZOMBIE) rss = 0; else rss = pgtok(vmspace_resident_count(p->p_vmspace)); microuptime(&rtime); timevalsub(&rtime, &p->p_stats->p_start); rufetchcalc(p, &ru, &utime, &stime); pid = p->p_pid; strlcpy(comm, p->p_comm, sizeof comm); PROC_UNLOCK(p); /* Print command, pid, state, rtime, utime, stime, %cpu, and rss. */ sbuf_printf(&sb, " cmd: %s %d [%s%s] %ld.%02ldr %ld.%02ldu %ld.%02lds %d%% %ldk\n", comm, pid, stateprefix, state, (long)rtime.tv_sec, rtime.tv_usec / 10000, (long)utime.tv_sec, utime.tv_usec / 10000, (long)stime.tv_sec, stime.tv_usec / 10000, pctcpu / 100, rss); #ifdef STACK if (print_kstacks && sterr == 0) stack_sbuf_print_flags(&sb, &stack, M_NOWAIT, kstacks_val); #endif out: sbuf_finish(&sb); sbuf_delete(&sb); } diff --git a/sys/kern/vfs_mountroot.c b/sys/kern/vfs_mountroot.c index 7a429e6392b5..569f8560788c 100644 --- a/sys/kern/vfs_mountroot.c +++ b/sys/kern/vfs_mountroot.c @@ -1,1166 +1,1166 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2010 Marcel Moolenaar * Copyright (c) 1999-2004 Poul-Henning Kamp * Copyright (c) 1999 Michael Smith * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_rootdevname.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The root filesystem is detailed in the kernel environment variable * vfs.root.mountfrom, which is expected to be in the general format * * :[][ :[] ...] * vfsname := the name of a VFS known to the kernel and capable * of being mounted as root * path := disk device name or other data used by the filesystem * to locate its physical store * * If the environment variable vfs.root.mountfrom is a space separated list, * each list element is tried in turn and the root filesystem will be mounted * from the first one that succeeds. * * The environment variable vfs.root.mountfrom.options is a comma delimited * set of string mount options. These mount options must be parseable * by nmount() in the kernel. */ static int parse_mount(char **); static struct mntarg *parse_mountroot_options(struct mntarg *, const char *); static int sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS); static void vfs_mountroot_wait(void); static int vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev); /* * The vnode of the system's root (/ in the filesystem, without chroot * active.) */ struct vnode *rootvnode; /* * Mount of the system's /dev. */ struct mount *rootdevmp; char *rootdevnames[2] = {NULL, NULL}; struct mtx root_holds_mtx; MTX_SYSINIT(root_holds, &root_holds_mtx, "root_holds", MTX_DEF); static TAILQ_HEAD(, root_hold_token) root_holds = TAILQ_HEAD_INITIALIZER(root_holds); enum action { A_CONTINUE, A_PANIC, A_REBOOT, A_RETRY }; enum rh_flags { RH_FREE, RH_ALLOC, RH_ARG, }; static enum action root_mount_onfail = A_CONTINUE; static int root_mount_mddev; static int root_mount_complete; /* By default wait up to 3 seconds for devices to appear. 
*/ static int root_mount_timeout = 3; TUNABLE_INT("vfs.mountroot.timeout", &root_mount_timeout); static int root_mount_always_wait = 0; SYSCTL_INT(_vfs, OID_AUTO, root_mount_always_wait, CTLFLAG_RDTUN, &root_mount_always_wait, 0, "Wait for root mount holds even if the root device already exists"); SYSCTL_PROC(_vfs, OID_AUTO, root_mount_hold, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_root_mount_hold, "A", "List of root mount hold tokens"); static int sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct root_hold_token *h; int error; sbuf_new(&sb, NULL, 256, SBUF_AUTOEXTEND | SBUF_INCLUDENUL); mtx_lock(&root_holds_mtx); TAILQ_FOREACH(h, &root_holds, list) { if (h != TAILQ_FIRST(&root_holds)) sbuf_putc(&sb, ' '); sbuf_printf(&sb, "%s", h->who); } mtx_unlock(&root_holds_mtx); error = sbuf_finish(&sb); if (error == 0) error = SYSCTL_OUT(req, sbuf_data(&sb), sbuf_len(&sb)); sbuf_delete(&sb); return (error); } struct root_hold_token * root_mount_hold(const char *identifier) { struct root_hold_token *h; h = malloc(sizeof *h, M_DEVBUF, M_ZERO | M_WAITOK); h->flags = RH_ALLOC; h->who = identifier; mtx_lock(&root_holds_mtx); TSHOLD("root mount"); TAILQ_INSERT_TAIL(&root_holds, h, list); mtx_unlock(&root_holds_mtx); return (h); } void root_mount_hold_token(const char *identifier, struct root_hold_token *h) { #ifdef INVARIANTS struct root_hold_token *t; #endif h->flags = RH_ARG; h->who = identifier; mtx_lock(&root_holds_mtx); #ifdef INVARIANTS TAILQ_FOREACH(t, &root_holds, list) { if (t == h) { panic("Duplicate mount hold by '%s' on %p", identifier, h); } } #endif TSHOLD("root mount"); TAILQ_INSERT_TAIL(&root_holds, h, list); mtx_unlock(&root_holds_mtx); } void root_mount_rel(struct root_hold_token *h) { if (h == NULL || h->flags == RH_FREE) return; mtx_lock(&root_holds_mtx); TAILQ_REMOVE(&root_holds, h, list); TSRELEASE("root mount"); wakeup(&root_holds); mtx_unlock(&root_holds_mtx); if (h->flags == RH_ALLOC) { free(h, M_DEVBUF); } else h->flags = RH_FREE; } int root_mounted(void) { /* No mutex is acquired here because int stores are atomic. */ return (root_mount_complete); } static void set_rootvnode(void) { if (VFS_ROOT(TAILQ_FIRST(&mountlist), LK_EXCLUSIVE, &rootvnode)) panic("set_rootvnode: Cannot find root vnode"); VOP_UNLOCK(rootvnode); pwd_set_rootvnode(); } static int vfs_mountroot_devfs(struct thread *td, struct mount **mpp) { struct vfsoptlist *opts; struct vfsconf *vfsp; struct mount *mp; int error; *mpp = NULL; if (rootdevmp != NULL) { /* * Already have /dev; this happens during rerooting. 
*/ error = vfs_busy(rootdevmp, 0); if (error != 0) return (error); *mpp = rootdevmp; } else { vfsp = vfs_byname("devfs"); KASSERT(vfsp != NULL, ("Could not find devfs by name")); if (vfsp == NULL) return (ENOENT); mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td->td_ucred); error = VFS_MOUNT(mp); KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error)); if (error) return (error); error = VFS_STATFS(mp, &mp->mnt_stat); KASSERT(error == 0, ("VFS_STATFS(devfs) failed %d", error)); if (error) return (error); opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); TAILQ_INIT(opts); mp->mnt_opt = opts; mtx_lock(&mountlist_mtx); TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); *mpp = mp; rootdevmp = mp; vfs_op_exit(mp); } set_rootvnode(); error = kern_symlinkat(td, "/", AT_FDCWD, "dev", UIO_SYSSPACE); if (error) printf("kern_symlink /dev -> / returns %d\n", error); return (error); } static void vfs_mountroot_shuffle(struct thread *td, struct mount *mpdevfs) { struct nameidata nd; struct mount *mporoot, *mpnroot; struct vnode *vp, *vporoot, *vpdevfs; char *fspath; int error; mpnroot = TAILQ_NEXT(mpdevfs, mnt_list); /* Shuffle the mountlist. */ mtx_lock(&mountlist_mtx); mporoot = TAILQ_FIRST(&mountlist); TAILQ_REMOVE(&mountlist, mpdevfs, mnt_list); if (mporoot != mpdevfs) { TAILQ_REMOVE(&mountlist, mpnroot, mnt_list); TAILQ_INSERT_HEAD(&mountlist, mpnroot, mnt_list); } TAILQ_INSERT_TAIL(&mountlist, mpdevfs, mnt_list); mtx_unlock(&mountlist_mtx); cache_purgevfs(mporoot); if (mporoot != mpdevfs) cache_purgevfs(mpdevfs); if (VFS_ROOT(mporoot, LK_EXCLUSIVE, &vporoot)) panic("vfs_mountroot_shuffle: Cannot find root vnode"); VI_LOCK(vporoot); vporoot->v_iflag &= ~VI_MOUNT; vn_irflag_unset_locked(vporoot, VIRF_MOUNTPOINT); vporoot->v_mountedhere = NULL; VI_UNLOCK(vporoot); mporoot->mnt_flag &= ~MNT_ROOTFS; mporoot->mnt_vnodecovered = NULL; vput(vporoot); /* Set up the new rootvnode, and purge the cache */ mpnroot->mnt_vnodecovered = NULL; set_rootvnode(); cache_purgevfs(rootvnode->v_mount); if (mporoot != mpdevfs) { /* Remount old root under /.mount or /mnt */ fspath = "/.mount"; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath); error = namei(&nd); if (error) { fspath = "/mnt"; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath); error = namei(&nd); } if (!error) { NDFREE_PNBUF(&nd); vp = nd.ni_vp; error = (vp->v_type == VDIR) ? 0 : ENOTDIR; if (!error) error = vinvalbuf(vp, V_SAVE, 0, 0); if (!error) { cache_purge(vp); VI_LOCK(vp); mporoot->mnt_vnodecovered = vp; vn_irflag_set_locked(vp, VIRF_MOUNTPOINT); vp->v_mountedhere = mporoot; strlcpy(mporoot->mnt_stat.f_mntonname, fspath, MNAMELEN); VI_UNLOCK(vp); VOP_UNLOCK(vp); } else vput(vp); } if (error) printf("mountroot: unable to remount previous root " "under /.mount or /mnt (error %d)\n", error); } /* Remount devfs under /dev */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, "/dev"); error = namei(&nd); if (!error) { NDFREE_PNBUF(&nd); vp = nd.ni_vp; error = (vp->v_type == VDIR) ? 
0 : ENOTDIR; if (!error) error = vinvalbuf(vp, V_SAVE, 0, 0); if (!error) { vpdevfs = mpdevfs->mnt_vnodecovered; if (vpdevfs != NULL) { cache_purge(vpdevfs); VI_LOCK(vpdevfs); vn_irflag_unset_locked(vpdevfs, VIRF_MOUNTPOINT); vpdevfs->v_mountedhere = NULL; VI_UNLOCK(vpdevfs); vrele(vpdevfs); } VI_LOCK(vp); mpdevfs->mnt_vnodecovered = vp; vn_irflag_set_locked(vp, VIRF_MOUNTPOINT); vp->v_mountedhere = mpdevfs; VI_UNLOCK(vp); VOP_UNLOCK(vp); } else vput(vp); } if (error) printf("mountroot: unable to remount devfs under /dev " "(error %d)\n", error); if (mporoot == mpdevfs) { vfs_unbusy(mpdevfs); /* Unlink the no longer needed /dev/dev -> / symlink */ error = kern_funlinkat(td, AT_FDCWD, "/dev/dev", FD_NONE, UIO_SYSSPACE, 0, 0); if (error) printf("mountroot: unable to unlink /dev/dev " "(error %d)\n", error); } } /* * Configuration parser. */ /* Parser character classes. */ #define CC_WHITESPACE -1 #define CC_NONWHITESPACE -2 /* Parse errors. */ #define PE_EOF -1 #define PE_EOL -2 static __inline int parse_peek(char **conf) { return (**conf); } static __inline void parse_poke(char **conf, int c) { **conf = c; } static __inline void parse_advance(char **conf) { (*conf)++; } static int parse_skipto(char **conf, int mc) { int c, match; while (1) { c = parse_peek(conf); if (c == 0) return (PE_EOF); switch (mc) { case CC_WHITESPACE: match = (c == ' ' || c == '\t' || c == '\n') ? 1 : 0; break; case CC_NONWHITESPACE: if (c == '\n') return (PE_EOL); match = (c != ' ' && c != '\t') ? 1 : 0; break; default: match = (c == mc) ? 1 : 0; break; } if (match) break; parse_advance(conf); } return (0); } static int parse_token(char **conf, char **tok) { char *p; size_t len; int error; *tok = NULL; error = parse_skipto(conf, CC_NONWHITESPACE); if (error) return (error); p = *conf; error = parse_skipto(conf, CC_WHITESPACE); len = *conf - p; *tok = malloc(len + 1, M_TEMP, M_WAITOK | M_ZERO); bcopy(p, *tok, len); return (0); } static void parse_dir_ask_printenv(const char *var) { char *val; val = kern_getenv(var); if (val != NULL) { printf(" %s=%s\n", var, val); freeenv(val); } } static int parse_dir_ask(char **conf) { char name[80]; char *mnt; int error; vfs_mountroot_wait(); printf("\nLoader variables:\n"); parse_dir_ask_printenv("vfs.root.mountfrom"); parse_dir_ask_printenv("vfs.root.mountfrom.options"); printf("\nManual root filesystem specification:\n"); printf(" : [options]\n"); printf(" Mount using filesystem \n"); printf(" and with the specified (optional) option list.\n"); printf("\n"); printf(" eg. ufs:/dev/da0s1a\n"); printf(" zfs:zroot/ROOT/default\n"); printf(" cd9660:/dev/cd0 ro\n"); printf(" (which is equivalent to: "); printf("mount -t cd9660 -o ro /dev/cd0 /)\n"); printf("\n"); printf(" ? List valid disk boot devices\n"); printf(" . Yield 1 second (for background tasks)\n"); printf(" Abort manual input\n"); do { error = EINVAL; printf("\nmountroot> "); cngets(name, sizeof(name), GETS_ECHO); if (name[0] == '\0') break; if (name[0] == '?' && name[1] == '\0') { printf("\nList of GEOM managed disk devices:\n "); g_dev_print(); continue; } if (name[0] == '.' 
&& name[1] == '\0') { pause("rmask", hz); continue; } mnt = name; error = parse_mount(&mnt); if (error == -1) printf("Invalid file system specification.\n"); } while (error != 0); return (error); } static int parse_dir_md(char **conf) { struct stat sb; struct thread *td; struct md_ioctl *mdio; char *path, *tok; int error, fd, len; td = curthread; fd = -1; error = parse_token(conf, &tok); if (error) return (error); len = strlen(tok); mdio = malloc(sizeof(*mdio) + len + 1, M_TEMP, M_WAITOK | M_ZERO); path = (void *)(mdio + 1); bcopy(tok, path, len); free(tok, M_TEMP); /* Get file status. */ error = kern_statat(td, 0, AT_FDCWD, path, UIO_SYSSPACE, &sb); if (error) goto out; /* Open /dev/mdctl so that we can attach/detach. */ error = kern_openat(td, AT_FDCWD, "/dev/" MDCTL_NAME, UIO_SYSSPACE, O_RDWR, 0); if (error) goto out; fd = td->td_retval[0]; mdio->md_version = MDIOVERSION; mdio->md_type = MD_VNODE; if (root_mount_mddev != -1) { mdio->md_unit = root_mount_mddev; (void)kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio); /* Ignore errors. We don't care. */ root_mount_mddev = -1; } mdio->md_file = (void *)(mdio + 1); mdio->md_options = MD_AUTOUNIT | MD_READONLY; mdio->md_mediasize = sb.st_size; mdio->md_unit = 0; error = kern_ioctl(td, fd, MDIOCATTACH, (void *)mdio); if (error) goto out; if (mdio->md_unit > 9) { printf("rootmount: too many md units\n"); mdio->md_file = NULL; mdio->md_options = 0; mdio->md_mediasize = 0; error = kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio); /* Ignore errors. We don't care. */ error = ERANGE; goto out; } root_mount_mddev = mdio->md_unit; printf(MD_NAME "%u attached to %s\n", root_mount_mddev, mdio->md_file); out: if (fd >= 0) (void)kern_close(td, fd); free(mdio, M_TEMP); return (error); } static int parse_dir_onfail(char **conf) { char *action; int error; error = parse_token(conf, &action); if (error) return (error); if (!strcmp(action, "continue")) root_mount_onfail = A_CONTINUE; else if (!strcmp(action, "panic")) root_mount_onfail = A_PANIC; else if (!strcmp(action, "reboot")) root_mount_onfail = A_REBOOT; else if (!strcmp(action, "retry")) root_mount_onfail = A_RETRY; else { printf("rootmount: %s: unknown action\n", action); error = EINVAL; } free(action, M_TEMP); return (0); } static int parse_dir_timeout(char **conf) { char *tok, *endtok; long secs; int error; error = parse_token(conf, &tok); if (error) return (error); secs = strtol(tok, &endtok, 0); error = (secs < 0 || *endtok != '\0') ? EINVAL : 0; if (!error) root_mount_timeout = secs; free(tok, M_TEMP); return (error); } static int parse_directive(char **conf) { char *dir; int error; error = parse_token(conf, &dir); if (error) return (error); if (strcmp(dir, ".ask") == 0) error = parse_dir_ask(conf); else if (strcmp(dir, ".md") == 0) error = parse_dir_md(conf); else if (strcmp(dir, ".onfail") == 0) error = parse_dir_onfail(conf); else if (strcmp(dir, ".timeout") == 0) error = parse_dir_timeout(conf); else { printf("mountroot: invalid directive `%s'\n", dir); /* Ignore the rest of the line. 
*/ (void)parse_skipto(conf, '\n'); error = EINVAL; } free(dir, M_TEMP); return (error); } static bool parse_mount_dev_present(const char *dev) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, dev); error = namei(&nd); if (error != 0) return (false); vrele(nd.ni_vp); NDFREE_PNBUF(&nd); return (true); } #define ERRMSGL 255 static int parse_mount(char **conf) { char *errmsg; struct mntarg *ma; char *dev, *fs, *opts, *tok; int delay, error, timeout; error = parse_token(conf, &tok); if (error) return (error); fs = tok; error = parse_skipto(&tok, ':'); if (error) { free(fs, M_TEMP); return (error); } parse_poke(&tok, '\0'); parse_advance(&tok); dev = tok; if (root_mount_mddev != -1) { /* Handle substitution for the md unit number. */ tok = strstr(dev, "md#"); if (tok != NULL) tok[2] = '0' + root_mount_mddev; } /* Parse options. */ error = parse_token(conf, &tok); opts = (error == 0) ? tok : NULL; printf("Trying to mount root from %s:%s [%s]...\n", fs, dev, (opts != NULL) ? opts : ""); errmsg = malloc(ERRMSGL, M_TEMP, M_WAITOK | M_ZERO); if (vfs_byname(fs) == NULL) { strlcpy(errmsg, "unknown file system", ERRMSGL); error = ENOENT; goto out; } error = vfs_mountroot_wait_if_neccessary(fs, dev); if (error != 0) goto out; delay = hz / 10; timeout = root_mount_timeout * hz; for (;;) { ma = NULL; ma = mount_arg(ma, "fstype", fs, -1); ma = mount_arg(ma, "fspath", "/", -1); ma = mount_arg(ma, "from", dev, -1); ma = mount_arg(ma, "errmsg", errmsg, ERRMSGL); ma = mount_arg(ma, "ro", NULL, 0); ma = parse_mountroot_options(ma, opts); error = kernel_mount(ma, MNT_ROOTFS); if (error == 0 || error == EILSEQ || timeout <= 0) break; if (root_mount_timeout * hz == timeout || (bootverbose && timeout % hz == 0)) { printf("Mounting from %s:%s failed with error %d; " "retrying for %d more second%s\n", fs, dev, error, timeout / hz, (timeout / hz > 1) ? "s" : ""); } pause("rmretry", delay); timeout -= delay; } out: if (error) { printf("Mounting from %s:%s failed with error %d", fs, dev, error); if (errmsg[0] != '\0') printf(": %s", errmsg); printf(".\n"); } free(fs, M_TEMP); free(errmsg, M_TEMP); if (opts != NULL) free(opts, M_TEMP); /* kernel_mount can return -1 on error. */ return ((error < 0) ? EDOOFUS : error); } #undef ERRMSGL static int vfs_mountroot_parse(struct sbuf *sb, struct mount *mpdevfs) { struct mount *mp; char *conf; int error; root_mount_mddev = -1; retry: conf = sbuf_data(sb); mp = TAILQ_NEXT(mpdevfs, mnt_list); error = (mp == NULL) ? 0 : EDOOFUS; root_mount_onfail = A_CONTINUE; while (mp == NULL) { error = parse_skipto(&conf, CC_NONWHITESPACE); if (error == PE_EOL) { parse_advance(&conf); continue; } if (error < 0) break; switch (parse_peek(&conf)) { case '#': error = parse_skipto(&conf, '\n'); break; case '.': error = parse_directive(&conf); break; default: error = parse_mount(&conf); if (error == -1) { printf("mountroot: invalid file system " "specification.\n"); error = 0; } break; } if (error < 0) break; /* Ignore any trailing garbage on the line. */ if (parse_peek(&conf) != '\n') { printf("mountroot: advancing to next directive...\n"); (void)parse_skipto(&conf, '\n'); } mp = TAILQ_NEXT(mpdevfs, mnt_list); } if (mp != NULL) return (0); /* * We failed to mount (a new) root. 
*/ switch (root_mount_onfail) { case A_CONTINUE: break; case A_PANIC: panic("mountroot: unable to (re-)mount root."); /* NOTREACHED */ case A_RETRY: goto retry; case A_REBOOT: kern_reboot(RB_NOSYNC); /* NOTREACHED */ } return (error); } static void vfs_mountroot_conf0(struct sbuf *sb) { char *s, *tok, *mnt, *opt; int error; - sbuf_printf(sb, ".onfail panic\n"); + sbuf_cat(sb, ".onfail panic\n"); sbuf_printf(sb, ".timeout %d\n", root_mount_timeout); if (boothowto & RB_ASKNAME) - sbuf_printf(sb, ".ask\n"); + sbuf_cat(sb, ".ask\n"); #ifdef ROOTDEVNAME if (boothowto & RB_DFLTROOT) sbuf_printf(sb, "%s\n", ROOTDEVNAME); #endif if (boothowto & RB_CDROM) { - sbuf_printf(sb, "cd9660:/dev/cd0 ro\n"); - sbuf_printf(sb, ".timeout 0\n"); - sbuf_printf(sb, "cd9660:/dev/cd1 ro\n"); + sbuf_cat(sb, "cd9660:/dev/cd0 ro\n"); + sbuf_cat(sb, ".timeout 0\n"); + sbuf_cat(sb, "cd9660:/dev/cd1 ro\n"); sbuf_printf(sb, ".timeout %d\n", root_mount_timeout); } s = kern_getenv("vfs.root.mountfrom"); if (s != NULL) { opt = kern_getenv("vfs.root.mountfrom.options"); tok = s; error = parse_token(&tok, &mnt); while (!error) { sbuf_printf(sb, "%s %s\n", mnt, (opt != NULL) ? opt : ""); free(mnt, M_TEMP); error = parse_token(&tok, &mnt); } if (opt != NULL) freeenv(opt); freeenv(s); } if (rootdevnames[0] != NULL) sbuf_printf(sb, "%s\n", rootdevnames[0]); if (rootdevnames[1] != NULL) sbuf_printf(sb, "%s\n", rootdevnames[1]); #ifdef ROOTDEVNAME if (!(boothowto & RB_DFLTROOT)) sbuf_printf(sb, "%s\n", ROOTDEVNAME); #endif if (!(boothowto & RB_ASKNAME)) - sbuf_printf(sb, ".ask\n"); + sbuf_cat(sb, ".ask\n"); } static int vfs_mountroot_readconf(struct thread *td, struct sbuf *sb) { static char buf[128]; struct nameidata nd; off_t ofs; ssize_t resid; int error, flags, len; NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/.mount.conf"); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error) return (error); NDFREE_PNBUF(&nd); ofs = 0; len = sizeof(buf) - 1; while (1) { error = vn_rdwr(UIO_READ, nd.ni_vp, buf, len, ofs, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) break; if (resid == len) break; buf[len - resid] = 0; sbuf_printf(sb, "%s", buf); ofs += len - resid; } VOP_UNLOCK(nd.ni_vp); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); return (error); } static void vfs_mountroot_wait(void) { struct root_hold_token *h; struct thread *td; struct timeval lastfail; int curfail; TSENTER(); curfail = 0; lastfail.tv_sec = 0; eventratecheck(&lastfail, &curfail, 1); td = curthread; while (1) { g_waitidle(td); mtx_lock(&root_holds_mtx); if (TAILQ_EMPTY(&root_holds)) { mtx_unlock(&root_holds_mtx); break; } if (eventratecheck(&lastfail, &curfail, 1)) { printf("Root mount waiting for:"); TAILQ_FOREACH(h, &root_holds, list) printf(" %s", h->who); printf("\n"); } TSWAIT("root mount"); msleep(&root_holds, &root_holds_mtx, PZERO | PDROP, "roothold", hz); TSUNWAIT("root mount"); } g_waitidle(td); TSEXIT(); } static int vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev) { int delay, timeout; /* * In case of ZFS and NFS we don't have a way to wait for * specific device. Also do the wait if the user forced that * behaviour by setting vfs.root_mount_always_wait=1. */ if (strcmp(fs, "zfs") == 0 || strstr(fs, "nfs") != NULL || dev[0] == '\0' || root_mount_always_wait != 0) { vfs_mountroot_wait(); return (0); } /* * Otherwise, no point in waiting if the device is already there. * Note that we must wait for GEOM to finish reconfiguring itself, * eg for geom_part(4) to finish tasting. 
*/ g_waitidle(curthread); if (parse_mount_dev_present(dev)) return (0); /* * No luck. Let's wait. This code looks weird, but it's that way * to behave exactly as it used to work before. */ vfs_mountroot_wait(); if (parse_mount_dev_present(dev)) return (0); printf("mountroot: waiting for device %s...\n", dev); delay = hz / 10; timeout = root_mount_timeout * hz; do { pause("rmdev", delay); timeout -= delay; } while (timeout > 0 && !parse_mount_dev_present(dev)); if (timeout <= 0) return (ENODEV); return (0); } void vfs_mountroot(void) { struct mount *mp; struct sbuf *sb; struct thread *td; time_t timebase; int error; mtx_assert(&Giant, MA_NOTOWNED); TSENTER(); td = curthread; sb = sbuf_new_auto(); vfs_mountroot_conf0(sb); sbuf_finish(sb); error = vfs_mountroot_devfs(td, &mp); while (!error) { error = vfs_mountroot_parse(sb, mp); if (!error) { vfs_mountroot_shuffle(td, mp); sbuf_clear(sb); error = vfs_mountroot_readconf(td, sb); sbuf_finish(sb); } } sbuf_delete(sb); /* * Iterate over all currently mounted file systems and use * the time stamp found to check and/or initialize the RTC. * Call inittodr() only once and pass it the largest of the * timestamps we encounter. */ timebase = 0; mtx_lock(&mountlist_mtx); mp = TAILQ_FIRST(&mountlist); while (mp != NULL) { if (mp->mnt_time > timebase) timebase = mp->mnt_time; mp = TAILQ_NEXT(mp, mnt_list); } mtx_unlock(&mountlist_mtx); inittodr(timebase); /* Keep prison0's root in sync with the global rootvnode. */ mtx_lock(&prison0.pr_mtx); prison0.pr_root = rootvnode; vref(prison0.pr_root); mtx_unlock(&prison0.pr_mtx); mtx_lock(&root_holds_mtx); atomic_store_rel_int(&root_mount_complete, 1); wakeup(&root_mount_complete); mtx_unlock(&root_holds_mtx); EVENTHANDLER_INVOKE(mountroot); TSEXIT(); } static struct mntarg * parse_mountroot_options(struct mntarg *ma, const char *options) { char *p; char *name, *name_arg; char *val, *val_arg; char *opts; if (options == NULL || options[0] == '\0') return (ma); p = opts = strdup(options, M_MOUNT); if (opts == NULL) { return (ma); } while((name = strsep(&p, ",")) != NULL) { if (name[0] == '\0') break; val = strchr(name, '='); if (val != NULL) { *val = '\0'; ++val; } if (strcmp(name, "rw") == 0 || strcmp(name, "noro") == 0) { /* * The first time we mount the root file system, * we need to mount 'ro', so We need to ignore * 'rw' and 'noro' mount options. */ continue; } name_arg = strdup(name, M_MOUNT); val_arg = NULL; if (val != NULL) val_arg = strdup(val, M_MOUNT); ma = mount_arg(ma, name_arg, val_arg, (val_arg != NULL ? -1 : 0)); } free(opts, M_MOUNT); return (ma); }
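
The proc_compare()/thread_compare() tail at the top of this section picks between two candidates with a two-bit TESTAB()/ONLYA/ONLYB/BOTH encoding whose macro definitions sit outside this hunk. A minimal standalone sketch of that selection pattern, assuming the conventional values (and normalizing the inputs to 0/1), is:

#include <stdio.h>

#define	TESTAB(a, b)	((((a) != 0) << 1) | ((b) != 0))
#define	ONLYA	2
#define	ONLYB	1
#define	BOTH	3

int
main(void)
{
	int runa = 1, runb = 0;		/* e.g. "is this process runnable?" */

	switch (TESTAB(runa, runb)) {
	case ONLYA:
		printf("prefer A: only A passes this test\n");
		break;
	case ONLYB:
		printf("prefer B: only B passes this test\n");
		break;
	case BOTH:
		printf("both pass; fall through to the next criterion\n");
		break;
	default:
		printf("neither passes\n");
		break;
	}
	return (0);
}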
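
tty_info() above formats its report into a fixed-size sbuf and pushes overflow out through a drain callback (sbuf_tty_drain()), and the hunks in this diff switch constant strings from sbuf_printf() to sbuf_cat() so they skip the format-string pass. Below is a userspace sketch of the same pattern built against FreeBSD's libsbuf (compile with -lsbuf); stdout_drain() is an illustrative name, and its contract mirrors sbuf_tty_drain(): return the number of bytes consumed, or a negative value on error.

#include <sys/sbuf.h>
#include <stdio.h>
#include <unistd.h>

static int
stdout_drain(void *arg, const char *data, int len)
{
	ssize_t n;

	(void)arg;
	n = write(STDOUT_FILENO, data, len);
	return (n < 0 ? -1 : (int)n);
}

int
main(void)
{
	struct sbuf sb;
	char buf[64];
	int load = 123;			/* 1.23, scaled by 100 as in tty_info() */

	(void)sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
	sbuf_set_drain(&sb, stdout_drain, NULL);

	sbuf_cat(&sb, "load: ");	/* constant text: no formatting needed */
	sbuf_printf(&sb, "%d.%02d\n", load / 100, load % 100);

	sbuf_finish(&sb);		/* flushes whatever is still buffered */
	sbuf_delete(&sb);
	return (0);
}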
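
parse_mount() splits each element of vfs.root.mountfrom at the first ':' into a VFS name and a device/path, and hands any trailing space-separated option string to parse_mountroot_options(), which splits it on ','. The sketch below walks that same shape in userspace over the example specifications printed by parse_dir_ask(); split_spec() is a hypothetical helper, not kernel code.

#include <stdio.h>
#include <string.h>

static void
split_spec(char *spec)
{
	char *dev, *opt, *opts;

	/* "<vfsname>:<path>": everything before the first ':' names the VFS. */
	dev = strchr(spec, ':');
	if (dev == NULL) {
		printf("invalid specification: %s\n", spec);
		return;
	}
	*dev++ = '\0';

	/* Anything after whitespace is a comma-separated option list. */
	opts = strchr(dev, ' ');
	if (opts != NULL)
		*opts++ = '\0';

	printf("fstype=%s device=%s\n", spec, dev);
	while (opts != NULL && (opt = strsep(&opts, ",")) != NULL)
		if (*opt != '\0')
			printf("  option: %s\n", opt);
}

int
main(void)
{
	/* Same shape as the examples printed by parse_dir_ask(). */
	char spec1[] = "ufs:/dev/da0s1a";
	char spec2[] = "cd9660:/dev/cd0 ro";
	char spec3[] = "zfs:zroot/ROOT/default";

	split_spec(spec1);
	split_spec(spec2);
	split_spec(spec3);
	return (0);
}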
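
Both parse_mount() and vfs_mountroot_wait_if_neccessary() handle a slow root device by polling in hz/10 slices until vfs.mountroot.timeout (3 seconds by default) runs out. A rough userspace analogue of that loop, with access(2) standing in for parse_mount_dev_present() and wait_for_device() as an illustrative name only:

#include <stdio.h>
#include <time.h>
#include <unistd.h>

static int
wait_for_device(const char *dev, int timeout_sec)
{
	struct timespec slice = { 0, 100 * 1000 * 1000 };	/* ~hz/10 */
	int remaining = timeout_sec * 10;

	while (access(dev, F_OK) != 0) {
		if (remaining-- <= 0)
			return (-1);		/* analogue of ENODEV */
		nanosleep(&slice, NULL);
	}
	return (0);
}

int
main(void)
{
	/* 3 seconds mirrors the root_mount_timeout default / tunable. */
	if (wait_for_device("/dev/da0s1a", 3) != 0)
		printf("mountroot: waiting for device timed out\n");
	else
		printf("device present\n");
	return (0);
}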