Index: head/sys/kern/imgact_aout.c
===================================================================
--- head/sys/kern/imgact_aout.c	(revision 326270)
+++ head/sys/kern/imgact_aout.c	(revision 326271)
@@ -1,339 +1,341 @@
/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
 * Copyright (c) 1993, David Greenman
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#ifdef __amd64__
#include
#include
#include
#include
#include
#endif

static int	exec_aout_imgact(struct image_params *imgp);
static int	aout_fixup(register_t **stack_base, struct image_params *imgp);

#if defined(__i386__)

struct sysentvec aout_sysvec = {
	.sv_size	= SYS_MAXSYSCALL,
	.sv_table	= sysent,
	.sv_mask	= 0,
	.sv_errsize	= 0,
	.sv_errtbl	= NULL,
	.sv_transtrap	= NULL,
	.sv_fixup	= aout_fixup,
	.sv_sendsig	= sendsig,
	.sv_sigcode	= sigcode,
	.sv_szsigcode	= &szsigcode,
	.sv_name	= "FreeBSD a.out",
	.sv_coredump	= NULL,
	.sv_imgact_try	= NULL,
	.sv_minsigstksz	= MINSIGSTKSZ,
	.sv_pagesize	= PAGE_SIZE,
	.sv_minuser	= VM_MIN_ADDRESS,
	.sv_maxuser	= VM_MAXUSER_ADDRESS,
	.sv_usrstack	= USRSTACK,
	.sv_psstrings	= PS_STRINGS,
	.sv_stackprot	= VM_PROT_ALL,
	.sv_copyout_strings = exec_copyout_strings,
	.sv_setregs	= exec_setregs,
	.sv_fixlimit	= NULL,
	.sv_maxssiz	= NULL,
	.sv_flags	= SV_ABI_FREEBSD | SV_AOUT | SV_IA32 | SV_ILP32,
	.sv_set_syscall_retval = cpu_set_syscall_retval,
	.sv_fetch_syscall_args = cpu_fetch_syscall_args,
	.sv_syscallnames = syscallnames,
	.sv_schedtail	= NULL,
	.sv_thread_detach = NULL,
	.sv_trap	= NULL,
};

#elif defined(__amd64__)

#define	AOUT32_USRSTACK	0xbfc00000
#define	AOUT32_PS_STRINGS \
	(AOUT32_USRSTACK - sizeof(struct freebsd32_ps_strings))
#define	AOUT32_MINUSER	FREEBSD32_MINUSER

extern const char *freebsd32_syscallnames[];
extern u_long ia32_maxssiz;

struct sysentvec aout_sysvec = {
	.sv_size	= FREEBSD32_SYS_MAXSYSCALL,
	.sv_table	= freebsd32_sysent,
	.sv_mask	= 0,
	.sv_errsize	= 0,
	.sv_errtbl	= NULL,
	.sv_transtrap	= NULL,
	.sv_fixup	= aout_fixup,
	.sv_sendsig	= ia32_sendsig,
	.sv_sigcode	= ia32_sigcode,
	.sv_szsigcode	= &sz_ia32_sigcode,
	.sv_name	= "FreeBSD a.out",
	.sv_coredump	= NULL,
	.sv_imgact_try	= NULL,
	.sv_minsigstksz	= MINSIGSTKSZ,
	.sv_pagesize	= IA32_PAGE_SIZE,
	.sv_minuser	= AOUT32_MINUSER,
	.sv_maxuser	= AOUT32_USRSTACK,
	.sv_usrstack	= AOUT32_USRSTACK,
	.sv_psstrings	= AOUT32_PS_STRINGS,
	.sv_stackprot	= VM_PROT_ALL,
	.sv_copyout_strings = freebsd32_copyout_strings,
	.sv_setregs	= ia32_setregs,
	.sv_fixlimit	= ia32_fixlimit,
	.sv_maxssiz	= &ia32_maxssiz,
	.sv_flags	= SV_ABI_FREEBSD | SV_AOUT | SV_IA32 | SV_ILP32,
	.sv_set_syscall_retval = ia32_set_syscall_retval,
	.sv_fetch_syscall_args = ia32_fetch_syscall_args,
	.sv_syscallnames = freebsd32_syscallnames,
};

#else
#error "Port me"
#endif

static int
aout_fixup(register_t **stack_base, struct image_params *imgp)
{

	*(char **)stack_base -= sizeof(uint32_t);
	return (suword32(*stack_base, imgp->args->argc));
}

static int
exec_aout_imgact(struct image_params *imgp)
{
	const struct exec *a_out = (const struct exec *) imgp->image_header;
	struct vmspace *vmspace;
	vm_map_t map;
	vm_object_t object;
	vm_offset_t text_end, data_end;
	unsigned long virtual_offset;
	unsigned long file_offset;
	unsigned long bss_size;
	int error;

	/*
	 * Linux and *BSD binaries look very much alike,
	 * only the machine id is different:
	 * 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI.
	 * NetBSD is in network byte order.. ugh.
	 */
	if (((a_out->a_midmag >> 16) & 0xff) != 0x86 &&
	    ((a_out->a_midmag >> 16) & 0xff) != 0 &&
	    ((((int)ntohl(a_out->a_midmag)) >> 16) & 0xff) != 0x86)
		return -1;

	/*
	 * Set file/virtual offset based on a.out variant.
	 * We do two cases: host byte order and network byte order
	 * (for NetBSD compatibility).
	 */
	switch ((int)(a_out->a_midmag & 0xffff)) {
	case ZMAGIC:
		virtual_offset = 0;
		if (a_out->a_text) {
			file_offset = PAGE_SIZE;
		} else {
			/* Bill's "screwball mode" */
			file_offset = 0;
		}
		break;
	case QMAGIC:
		virtual_offset = PAGE_SIZE;
		file_offset = 0;
		/* Pass PS_STRINGS for BSD/OS binaries only. */
		if (N_GETMID(*a_out) == MID_ZERO)
			imgp->ps_strings = aout_sysvec.sv_psstrings;
		break;
	default:
		/* NetBSD compatibility */
		switch ((int)(ntohl(a_out->a_midmag) & 0xffff)) {
		case ZMAGIC:
		case QMAGIC:
			virtual_offset = PAGE_SIZE;
			file_offset = 0;
			break;
		default:
			return (-1);
		}
	}

	bss_size = roundup(a_out->a_bss, PAGE_SIZE);

	/*
	 * Check various fields in header for validity/bounds.
	 */
	if (/* entry point must lie within the text region */
	    a_out->a_entry < virtual_offset ||
	    a_out->a_entry >= virtual_offset + a_out->a_text ||

	    /* text and data size must each be page rounded */
	    a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK

#ifdef __amd64__
	    ||
	    /* overflows */
	    virtual_offset + a_out->a_text + a_out->a_data + bss_size > UINT_MAX
#endif
	    )
		return (-1);

	/* text + data can't exceed file size */
	if (a_out->a_data + a_out->a_text > imgp->attr->va_size)
		return (EFAULT);

	/*
	 * text/data/bss must not exceed limits
	 */
	PROC_LOCK(imgp->proc);
	if (/* text can't exceed maximum text size */
	    a_out->a_text > maxtsiz ||

	    /* data + bss can't exceed rlimit */
	    a_out->a_data + bss_size > lim_cur_proc(imgp->proc, RLIMIT_DATA) ||
	    racct_set(imgp->proc, RACCT_DATA, a_out->a_data + bss_size) != 0) {
		PROC_UNLOCK(imgp->proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(imgp->proc);

	/*
	 * Avoid a possible deadlock if the current address space is destroyed
	 * and that address space maps the locked vnode.  In the common case,
	 * the locked vnode's v_usecount is decremented but remains greater
	 * than zero.  Consequently, the vnode lock is not needed by vrele().
	 * However, in cases where the vnode lock is external, such as nullfs,
	 * v_usecount may become zero.
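	 * For that reason the vnode is unlocked across the call to
	 * exec_new_vmspace() below and relocked immediately afterwards.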
	 */
	VOP_UNLOCK(imgp->vp, 0);

	/*
	 * Destroy old process VM and create a new one (with a new stack)
	 */
	error = exec_new_vmspace(imgp, &aout_sysvec);
	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
	if (error)
		return (error);

	/*
	 * The vm space can be changed by exec_new_vmspace
	 */
	vmspace = imgp->proc->p_vmspace;

	object = imgp->object;
	map = &vmspace->vm_map;
	vm_map_lock(map);
	vm_object_reference(object);

	text_end = virtual_offset + a_out->a_text;
	error = vm_map_insert(map, object,
	    file_offset,
	    virtual_offset, text_end,
	    VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL,
	    MAP_COPY_ON_WRITE | MAP_PREFAULT);
	if (error) {
		vm_map_unlock(map);
		vm_object_deallocate(object);
		return (error);
	}
	data_end = text_end + a_out->a_data;
	if (a_out->a_data) {
		vm_object_reference(object);
		error = vm_map_insert(map, object,
		    file_offset + a_out->a_text,
		    text_end, data_end,
		    VM_PROT_ALL, VM_PROT_ALL,
		    MAP_COPY_ON_WRITE | MAP_PREFAULT);
		if (error) {
			vm_map_unlock(map);
			vm_object_deallocate(object);
			return (error);
		}
	}

	if (bss_size) {
		error = vm_map_insert(map, NULL, 0,
		    data_end, data_end + bss_size,
		    VM_PROT_ALL, VM_PROT_ALL, 0);
		if (error) {
			vm_map_unlock(map);
			return (error);
		}
	}
	vm_map_unlock(map);

	/* Fill in process VM information */
	vmspace->vm_tsize = a_out->a_text >> PAGE_SHIFT;
	vmspace->vm_dsize = (a_out->a_data + bss_size) >> PAGE_SHIFT;
	vmspace->vm_taddr = (caddr_t) (uintptr_t) virtual_offset;
	vmspace->vm_daddr = (caddr_t) (uintptr_t)
	    (virtual_offset + a_out->a_text);

	/* Fill in image_params */
	imgp->interpreted = 0;
	imgp->entry_addr = a_out->a_entry;

	imgp->proc->p_sysent = &aout_sysvec;

	return (0);
}

/*
 * Tell kern_execve.c about it, with a little help from the linker.
 */
static struct execsw aout_execsw = { exec_aout_imgact, "a.out" };
EXEC_SET(aout, aout_execsw);

Index: head/sys/kern/imgact_elf.c
===================================================================
--- head/sys/kern/imgact_elf.c	(revision 326270)
+++ head/sys/kern/imgact_elf.c	(revision 326271)
@@ -1,2481 +1,2483 @@
/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
 * Copyright (c) 2017 Dell EMC
 * Copyright (c) 2000 David O'Brien
 * Copyright (c) 1995-1996 Søren Schmidt
 * Copyright (c) 1996 Peter Wemm
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer
 *    in this position and unchanged.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_capsicum.h"
#include "opt_compat.h"
#include "opt_gzio.h"

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#define	ELF_NOTE_ROUNDSIZE	4
#define	OLD_EI_BRAND	8

static int __elfN(check_header)(const Elf_Ehdr *hdr);
static Elf_Brandinfo *__elfN(get_brandinfo)(struct image_params *imgp,
    const char *interp, int interp_name_len, int32_t *osrel);
static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr,
    u_long *entry, size_t pagesize);
static int __elfN(load_section)(struct image_params *imgp, vm_ooffset_t offset,
    caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot,
    size_t pagesize);
static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp);
static boolean_t __elfN(freebsd_trans_osrel)(const Elf_Note *note,
    int32_t *osrel);
static boolean_t kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel);
static boolean_t __elfN(check_note)(struct image_params *imgp,
    Elf_Brandnote *checknote, int32_t *osrel);
static vm_prot_t __elfN(trans_prot)(Elf_Word);
static Elf_Word __elfN(untrans_prot)(vm_prot_t);

SYSCTL_NODE(_kern, OID_AUTO, __CONCAT(elf, __ELF_WORD_SIZE), CTLFLAG_RW, 0,
    "");

#define	CORE_BUF_SIZE	(16 * 1024)

int __elfN(fallback_brand) = -1;
SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO,
    fallback_brand, CTLFLAG_RWTUN, &__elfN(fallback_brand), 0,
    __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) " brand of last resort");

static int elf_legacy_coredump = 0;
SYSCTL_INT(_debug, OID_AUTO, __elfN(legacy_coredump), CTLFLAG_RW,
    &elf_legacy_coredump, 0,
    "include all and only RW pages in core dumps");

int __elfN(nxstack) =
#if defined(__amd64__) || defined(__powerpc64__) /* both 64 and 32 bit */ || \
    (defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__)
	1;
#else
	0;
#endif
SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO,
    nxstack, CTLFLAG_RW, &__elfN(nxstack), 0,
    __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": enable non-executable stack");

#if __ELF_WORD_SIZE == 32
#if defined(__amd64__)
int i386_read_exec = 0;
SYSCTL_INT(_kern_elf32, OID_AUTO, read_exec, CTLFLAG_RW, &i386_read_exec, 0,
    "enable execution from readable segments");
#endif
#endif

static Elf_Brandinfo *elf_brand_list[MAX_BRANDS];

#define	trunc_page_ps(va, ps)	rounddown2(va, ps)
#define	round_page_ps(va, ps)	roundup2(va, ps)
#define	aligned(a, t)	(trunc_page_ps((u_long)(a), sizeof(t)) == (u_long)(a))

static const char FREEBSD_ABI_VENDOR[] = "FreeBSD";

Elf_Brandnote __elfN(freebsd_brandnote) = {
	.hdr.n_namesz	= sizeof(FREEBSD_ABI_VENDOR),
	.hdr.n_descsz	= sizeof(int32_t),
	.hdr.n_type	= NT_FREEBSD_ABI_TAG,
	.vendor		= FREEBSD_ABI_VENDOR,
	.flags		= BN_TRANSLATE_OSREL,
	.trans_osrel	= __elfN(freebsd_trans_osrel)
};
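/*
 * A sketch of the on-disk note this descriptor matches, with each field
 * padded to ELF_NOTE_ROUNDSIZE: n_namesz = 8, n_descsz = 4,
 * n_type = NT_FREEBSD_ABI_TAG, the name "FreeBSD\0", then a 32-bit
 * __FreeBSD_version value that the trans_osrel hook below extracts.
 */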
static boolean_t
__elfN(freebsd_trans_osrel)(const Elf_Note *note, int32_t *osrel)
{
	uintptr_t p;

	p = (uintptr_t)(note + 1);
	p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE);
	*osrel = *(const int32_t *)(p);

	return (TRUE);
}

static const char GNU_ABI_VENDOR[] = "GNU";
static int GNU_KFREEBSD_ABI_DESC = 3;

Elf_Brandnote __elfN(kfreebsd_brandnote) = {
	.hdr.n_namesz	= sizeof(GNU_ABI_VENDOR),
	.hdr.n_descsz	= 16,	/* XXX at least 16 */
	.hdr.n_type	= 1,
	.vendor		= GNU_ABI_VENDOR,
	.flags		= BN_TRANSLATE_OSREL,
	.trans_osrel	= kfreebsd_trans_osrel
};

static boolean_t
kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel)
{
	const Elf32_Word *desc;
	uintptr_t p;

	p = (uintptr_t)(note + 1);
	p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE);

	desc = (const Elf32_Word *)p;
	if (desc[0] != GNU_KFREEBSD_ABI_DESC)
		return (FALSE);

	/*
	 * Debian GNU/kFreeBSD embeds the earliest compatible kernel version
	 * (__FreeBSD_version: <major><two digit minor>Rxx) in the LSB way.
	 */
	*osrel = desc[1] * 100000 + desc[2] * 1000 + desc[3];

	return (TRUE);
}

int
__elfN(insert_brand_entry)(Elf_Brandinfo *entry)
{
	int i;

	for (i = 0; i < MAX_BRANDS; i++) {
		if (elf_brand_list[i] == NULL) {
			elf_brand_list[i] = entry;
			break;
		}
	}
	if (i == MAX_BRANDS) {
		printf("WARNING: %s: could not insert brandinfo entry: %p\n",
		    __func__, entry);
		return (-1);
	}
	return (0);
}

int
__elfN(remove_brand_entry)(Elf_Brandinfo *entry)
{
	int i;

	for (i = 0; i < MAX_BRANDS; i++) {
		if (elf_brand_list[i] == entry) {
			elf_brand_list[i] = NULL;
			break;
		}
	}
	if (i == MAX_BRANDS)
		return (-1);
	return (0);
}

int
__elfN(brand_inuse)(Elf_Brandinfo *entry)
{
	struct proc *p;
	int rval = FALSE;

	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		if (p->p_sysent == entry->sysvec) {
			rval = TRUE;
			break;
		}
	}
	sx_sunlock(&allproc_lock);

	return (rval);
}

static Elf_Brandinfo *
__elfN(get_brandinfo)(struct image_params *imgp, const char *interp,
    int interp_name_len, int32_t *osrel)
{
	const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header;
	Elf_Brandinfo *bi, *bi_m;
	boolean_t ret;
	int i;

	/*
	 * We support four types of branding -- (1) the ELF EI_OSABI field
	 * that SCO added to the ELF spec, (2) FreeBSD 3.x's traditional string
	 * branding within the ELF header, (3) path of the `interp_path'
	 * field, and (4) the ".note.ABI-tag" ELF section.
	 */

	/* Look for an ".note.ABI-tag" ELF section */
	bi_m = NULL;
	for (i = 0; i < MAX_BRANDS; i++) {
		bi = elf_brand_list[i];
		if (bi == NULL)
			continue;
		if (interp != NULL && (bi->flags & BI_BRAND_ONLY_STATIC) != 0)
			continue;
		if (hdr->e_machine == bi->machine && (bi->flags &
		    (BI_BRAND_NOTE|BI_BRAND_NOTE_MANDATORY)) != 0) {
			ret = __elfN(check_note)(imgp, bi->brand_note, osrel);
			/* Give brand a chance to veto check_note's guess */
			if (ret && bi->header_supported)
				ret = bi->header_supported(imgp);
			/*
			 * If the note checker claimed the binary, but the
			 * interpreter path in the image does not
			 * match the default one for the brand, try to
			 * search for other brands with the same
			 * interpreter.  Either there is a better brand
			 * with the right interpreter, or, failing
			 * this, we return the first brand which accepted
			 * our note and, optionally, header.
			 */
			if (ret && bi_m == NULL && interp != NULL &&
			    (bi->interp_path == NULL ||
			    (strlen(bi->interp_path) + 1 != interp_name_len ||
			    strncmp(interp, bi->interp_path,
			    interp_name_len) != 0))) {
				bi_m = bi;
				ret = 0;
			}
			if (ret)
				return (bi);
		}
	}
	if (bi_m != NULL)
		return (bi_m);

	/*
	 * If the executable has a brand, search for it in the brand list.
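	 * This covers both the modern EI_OSABI byte and FreeBSD 3.x's
	 * string branding stored at offset OLD_EI_BRAND in e_ident.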
	 */
	for (i = 0; i < MAX_BRANDS; i++) {
		bi = elf_brand_list[i];
		if (bi == NULL || (bi->flags & BI_BRAND_NOTE_MANDATORY) != 0 ||
		    (interp != NULL && (bi->flags & BI_BRAND_ONLY_STATIC) != 0))
			continue;
		if (hdr->e_machine == bi->machine &&
		    (hdr->e_ident[EI_OSABI] == bi->brand ||
		    (bi->compat_3_brand != NULL &&
		    strcmp((const char *)&hdr->e_ident[OLD_EI_BRAND],
		    bi->compat_3_brand) == 0))) {
			/* Looks good, but give brand a chance to veto */
			if (!bi->header_supported ||
			    bi->header_supported(imgp)) {
				/*
				 * Again, prefer strictly matching
				 * interpreter path.
				 */
				if (interp_name_len == 0 &&
				    bi->interp_path == NULL)
					return (bi);
				if (bi->interp_path != NULL &&
				    strlen(bi->interp_path) + 1 ==
				    interp_name_len && strncmp(interp,
				    bi->interp_path, interp_name_len) == 0)
					return (bi);
				if (bi_m == NULL)
					bi_m = bi;
			}
		}
	}
	if (bi_m != NULL)
		return (bi_m);

	/* No known brand, see if the header is recognized by any brand */
	for (i = 0; i < MAX_BRANDS; i++) {
		bi = elf_brand_list[i];
		if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY ||
		    bi->header_supported == NULL)
			continue;
		if (hdr->e_machine == bi->machine) {
			ret = bi->header_supported(imgp);
			if (ret)
				return (bi);
		}
	}

	/* Lacking a known brand, search for a recognized interpreter. */
	if (interp != NULL) {
		for (i = 0; i < MAX_BRANDS; i++) {
			bi = elf_brand_list[i];
			if (bi == NULL || (bi->flags &
			    (BI_BRAND_NOTE_MANDATORY | BI_BRAND_ONLY_STATIC))
			    != 0)
				continue;
			if (hdr->e_machine == bi->machine &&
			    bi->interp_path != NULL &&
			    /* ELF image p_filesz includes terminating zero */
			    strlen(bi->interp_path) + 1 == interp_name_len &&
			    strncmp(interp, bi->interp_path, interp_name_len)
			    == 0)
				return (bi);
		}
	}

	/* Lacking a recognized interpreter, try the default brand */
	for (i = 0; i < MAX_BRANDS; i++) {
		bi = elf_brand_list[i];
		if (bi == NULL || (bi->flags & BI_BRAND_NOTE_MANDATORY) != 0 ||
		    (interp != NULL && (bi->flags & BI_BRAND_ONLY_STATIC) != 0))
			continue;
		if (hdr->e_machine == bi->machine &&
		    __elfN(fallback_brand) == bi->brand)
			return (bi);
	}
	return (NULL);
}

static int
__elfN(check_header)(const Elf_Ehdr *hdr)
{
	Elf_Brandinfo *bi;
	int i;

	if (!IS_ELF(*hdr) ||
	    hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
	    hdr->e_ident[EI_DATA] != ELF_TARG_DATA ||
	    hdr->e_ident[EI_VERSION] != EV_CURRENT ||
	    hdr->e_phentsize != sizeof(Elf_Phdr) ||
	    hdr->e_version != ELF_TARG_VER)
		return (ENOEXEC);

	/*
	 * Make sure we have at least one brand for this machine.
	 */
	for (i = 0; i < MAX_BRANDS; i++) {
		bi = elf_brand_list[i];
		if (bi != NULL && bi->machine == hdr->e_machine)
			break;
	}
	if (i == MAX_BRANDS)
		return (ENOEXEC);

	return (0);
}

static int
__elfN(map_partial)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
    vm_offset_t start, vm_offset_t end, vm_prot_t prot)
{
	struct sf_buf *sf;
	int error;
	vm_offset_t off;

	/*
	 * Create the page if it doesn't exist yet.  Ignore errors.
	 */
	vm_map_fixed(map, NULL, 0, trunc_page(start), round_page(end) -
	    trunc_page(start), VM_PROT_ALL, VM_PROT_ALL, MAP_CHECK_EXCL);

	/*
	 * Find the page from the underlying object.
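	 * The data is copied out rather than mapped because this helper
	 * only handles ranges that do not start or end on a page boundary.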
	 */
	if (object != NULL) {
		sf = vm_imgact_map_page(object, offset);
		if (sf == NULL)
			return (KERN_FAILURE);
		off = offset - trunc_page(offset);
		error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start,
		    end - start);
		vm_imgact_unmap_page(sf);
		if (error != 0)
			return (KERN_FAILURE);
	}

	return (KERN_SUCCESS);
}

static int
__elfN(map_insert)(struct image_params *imgp, vm_map_t map, vm_object_t object,
    vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot,
    int cow)
{
	struct sf_buf *sf;
	vm_offset_t off;
	vm_size_t sz;
	int error, locked, rv;

	if (start != trunc_page(start)) {
		rv = __elfN(map_partial)(map, object, offset, start,
		    round_page(start), prot);
		if (rv != KERN_SUCCESS)
			return (rv);
		offset += round_page(start) - start;
		start = round_page(start);
	}
	if (end != round_page(end)) {
		rv = __elfN(map_partial)(map, object, offset +
		    trunc_page(end) - start, trunc_page(end), end, prot);
		if (rv != KERN_SUCCESS)
			return (rv);
		end = trunc_page(end);
	}
	if (start >= end)
		return (KERN_SUCCESS);
	if ((offset & PAGE_MASK) != 0) {
		/*
		 * The mapping is not page aligned.  This means that we have
		 * to copy the data.
		 */
		rv = vm_map_fixed(map, NULL, 0, start, end - start,
		    prot | VM_PROT_WRITE, VM_PROT_ALL, MAP_CHECK_EXCL);
		if (rv != KERN_SUCCESS)
			return (rv);
		if (object == NULL)
			return (KERN_SUCCESS);
		for (; start < end; start += sz) {
			sf = vm_imgact_map_page(object, offset);
			if (sf == NULL)
				return (KERN_FAILURE);
			off = offset - trunc_page(offset);
			sz = end - start;
			if (sz > PAGE_SIZE - off)
				sz = PAGE_SIZE - off;
			error = copyout((caddr_t)sf_buf_kva(sf) + off,
			    (caddr_t)start, sz);
			vm_imgact_unmap_page(sf);
			if (error != 0)
				return (KERN_FAILURE);
			offset += sz;
		}
	} else {
		vm_object_reference(object);
		rv = vm_map_fixed(map, object, offset, start, end - start,
		    prot, VM_PROT_ALL, cow | MAP_CHECK_EXCL);
		if (rv != KERN_SUCCESS) {
			locked = VOP_ISLOCKED(imgp->vp);
			VOP_UNLOCK(imgp->vp, 0);
			vm_object_deallocate(object);
			vn_lock(imgp->vp, locked | LK_RETRY);
			return (rv);
		}
	}
	return (KERN_SUCCESS);
}

static int
__elfN(load_section)(struct image_params *imgp, vm_ooffset_t offset,
    caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot,
    size_t pagesize)
{
	struct sf_buf *sf;
	size_t map_len;
	vm_map_t map;
	vm_object_t object;
	vm_offset_t off, map_addr;
	int error, rv, cow;
	size_t copy_len;
	vm_ooffset_t file_addr;

	/*
	 * It's necessary to fail if the filsz + offset taken from the
	 * header is greater than the actual file pager object's size.
	 * If we were to allow this, then the vm_map_find() below would
	 * walk right off the end of the file object and into the ether.
	 *
	 * While I'm here, might as well check for something else that
	 * is invalid: filsz cannot be greater than memsz.
	 */
	if ((filsz != 0 && (off_t)filsz + offset > imgp->attr->va_size) ||
	    filsz > memsz) {
		uprintf("elf_load_section: truncated ELF file\n");
		return (ENOEXEC);
	}

	object = imgp->object;
	map = &imgp->proc->p_vmspace->vm_map;
	map_addr = trunc_page_ps((vm_offset_t)vmaddr, pagesize);
	file_addr = trunc_page_ps(offset, pagesize);

	/*
	 * We have two choices.  We can either clear the data in the last page
	 * of an oversized mapping, or we can start the anon mapping a page
	 * early and copy the initialized data into that first page.  We
	 * choose the second.
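	 * For example, with 4 KB pages, offset = 0x1000, filsz = 0x1234 and
	 * memsz = 0x4000 at vmaddr 0x1000: the file backs [0x1000, 0x2000),
	 * anonymous memory backs [0x2000, 0x5000), and the trailing 0x234
	 * bytes of file data are copied into the first anonymous page.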
	 */
	if (filsz == 0)
		map_len = 0;
	else if (memsz > filsz)
		map_len = trunc_page_ps(offset + filsz, pagesize) - file_addr;
	else
		map_len = round_page_ps(offset + filsz, pagesize) - file_addr;

	if (map_len != 0) {
		/* cow flags: don't dump readonly sections in core */
		cow = MAP_COPY_ON_WRITE | MAP_PREFAULT |
		    (prot & VM_PROT_WRITE ? 0 : MAP_DISABLE_COREDUMP);

		rv = __elfN(map_insert)(imgp, map,
		    object,
		    file_addr,		/* file offset */
		    map_addr,		/* virtual start */
		    map_addr + map_len,	/* virtual end */
		    prot,
		    cow);
		if (rv != KERN_SUCCESS)
			return (EINVAL);

		/* we can stop now if we've covered it all */
		if (memsz == filsz)
			return (0);
	}

	/*
	 * We have to get the remaining bit of the file into the first part
	 * of the oversized map segment.  This is normally because the .data
	 * segment in the file is extended to provide bss.  It's a neat idea
	 * to try and save a page, but it's a pain in the behind to implement.
	 */
	copy_len = filsz == 0 ? 0 : (offset + filsz) -
	    trunc_page_ps(offset + filsz, pagesize);
	map_addr = trunc_page_ps((vm_offset_t)vmaddr + filsz, pagesize);
	map_len = round_page_ps((vm_offset_t)vmaddr + memsz, pagesize) -
	    map_addr;

	/* This had damn well better be true! */
	if (map_len != 0) {
		rv = __elfN(map_insert)(imgp, map, NULL, 0, map_addr,
		    map_addr + map_len, prot, 0);
		if (rv != KERN_SUCCESS)
			return (EINVAL);
	}

	if (copy_len != 0) {
		sf = vm_imgact_map_page(object, offset + filsz);
		if (sf == NULL)
			return (EIO);

		/* send the page fragment to user space */
		off = trunc_page_ps(offset + filsz, pagesize) -
		    trunc_page(offset + filsz);
		error = copyout((caddr_t)sf_buf_kva(sf) + off,
		    (caddr_t)map_addr, copy_len);
		vm_imgact_unmap_page(sf);
		if (error != 0)
			return (error);
	}

	/*
	 * Remove write access to the page if it was only granted by
	 * map_insert to allow copyout.
	 */
	if ((prot & VM_PROT_WRITE) == 0)
		vm_map_protect(map, trunc_page(map_addr), round_page(map_addr +
		    map_len), prot, FALSE);

	return (0);
}

/*
 * Load the file "file" into memory.  It may be either a shared object
 * or an executable.
 *
 * The "addr" reference parameter is in/out.  On entry, it specifies
 * the address where a shared object should be loaded.  If the file is
 * an executable, this value is ignored.  On exit, "addr" specifies
 * where the file was actually loaded.
 *
 * The "entry" reference parameter is out only.  On exit, it specifies
 * the entry point for the loaded file.
 */
static int
__elfN(load_file)(struct proc *p, const char *file, u_long *addr,
    u_long *entry, size_t pagesize)
{
	struct {
		struct nameidata nd;
		struct vattr attr;
		struct image_params image_params;
	} *tempdata;
	const Elf_Ehdr *hdr = NULL;
	const Elf_Phdr *phdr = NULL;
	struct nameidata *nd;
	struct vattr *attr;
	struct image_params *imgp;
	vm_prot_t prot;
	u_long rbase;
	u_long base_addr = 0;
	int error, i, numsegs;

#ifdef CAPABILITY_MODE
	/*
	 * XXXJA: This check can go away once we are sufficiently confident
	 * that the checks in namei() are correct.
	 */
	if (IN_CAPABILITY_MODE(curthread))
		return (ECAPMODE);
#endif

	tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK);
	nd = &tempdata->nd;
	attr = &tempdata->attr;
	imgp = &tempdata->image_params;

	/*
	 * Initialize part of the common data
	 */
	imgp->proc = p;
	imgp->attr = attr;
	imgp->firstpage = NULL;
	imgp->image_header = NULL;
	imgp->object = NULL;
	imgp->execlabel = NULL;

	NDINIT(nd, LOOKUP, LOCKLEAF | FOLLOW, UIO_SYSSPACE, file, curthread);
	if ((error = namei(nd)) != 0) {
		nd->ni_vp = NULL;
		goto fail;
	}
	NDFREE(nd, NDF_ONLY_PNBUF);
	imgp->vp = nd->ni_vp;

	/*
	 * Check permissions, modes, uid, etc on the file, and "open" it.
	 */
	error = exec_check_permissions(imgp);
	if (error)
		goto fail;

	error = exec_map_first_page(imgp);
	if (error)
		goto fail;

	/*
	 * Also make certain that the interpreter stays the same, so set
	 * its VV_TEXT flag, too.
	 */
	VOP_SET_TEXT(nd->ni_vp);

	imgp->object = nd->ni_vp->v_object;

	hdr = (const Elf_Ehdr *)imgp->image_header;
	if ((error = __elfN(check_header)(hdr)) != 0)
		goto fail;
	if (hdr->e_type == ET_DYN)
		rbase = *addr;
	else if (hdr->e_type == ET_EXEC)
		rbase = 0;
	else {
		error = ENOEXEC;
		goto fail;
	}

	/* Only support headers that fit within first page for now */
	if ((hdr->e_phoff > PAGE_SIZE) ||
	    (u_int)hdr->e_phentsize * hdr->e_phnum > PAGE_SIZE - hdr->e_phoff) {
		error = ENOEXEC;
		goto fail;
	}

	phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
	if (!aligned(phdr, Elf_Addr)) {
		error = ENOEXEC;
		goto fail;
	}

	for (i = 0, numsegs = 0; i < hdr->e_phnum; i++) {
		if (phdr[i].p_type == PT_LOAD && phdr[i].p_memsz != 0) {
			/* Loadable segment */
			prot = __elfN(trans_prot)(phdr[i].p_flags);
			error = __elfN(load_section)(imgp, phdr[i].p_offset,
			    (caddr_t)(uintptr_t)phdr[i].p_vaddr + rbase,
			    phdr[i].p_memsz, phdr[i].p_filesz, prot, pagesize);
			if (error != 0)
				goto fail;
			/*
			 * Establish the base address if this is the
			 * first segment.
			 */
			if (numsegs == 0)
				base_addr = trunc_page(phdr[i].p_vaddr +
				    rbase);
			numsegs++;
		}
	}
	*addr = base_addr;
	*entry = (unsigned long)hdr->e_entry + rbase;

fail:
	if (imgp->firstpage)
		exec_unmap_first_page(imgp);

	if (nd->ni_vp)
		vput(nd->ni_vp);

	free(tempdata, M_TEMP);

	return (error);
}

static int
__CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
{
	struct thread *td;
	const Elf_Ehdr *hdr;
	const Elf_Phdr *phdr;
	Elf_Auxargs *elf_auxargs;
	struct vmspace *vmspace;
	const char *err_str, *newinterp;
	char *interp, *interp_buf, *path;
	Elf_Brandinfo *brand_info;
	struct sysentvec *sv;
	vm_prot_t prot;
	u_long text_size, data_size, total_size, text_addr, data_addr;
	u_long seg_size, seg_addr, addr, baddr, et_dyn_addr, entry, proghdr;
	int32_t osrel;
	int error, i, n, interp_name_len, have_interp;

	hdr = (const Elf_Ehdr *)imgp->image_header;

	/*
	 * Do we have a valid ELF header ?
	 *
	 * Only allow ET_EXEC & ET_DYN here, reject ET_DYN later
	 * if particular brand doesn't support it.
	 */
	if (__elfN(check_header)(hdr) != 0 ||
	    (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN))
		return (-1);

	/*
	 * From here on down, we return an errno, not -1, as we've
	 * detected an ELF file.
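	 * Returning -1 lets kern_execve try the next image activator in
	 * execsw[], whereas an errno aborts the whole exec with that error.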
	 */
	if ((hdr->e_phoff > PAGE_SIZE) ||
	    (u_int)hdr->e_phentsize * hdr->e_phnum > PAGE_SIZE - hdr->e_phoff) {
		/* Only support headers in first page for now */
		uprintf("Program headers not in the first page\n");
		return (ENOEXEC);
	}
	phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
	if (!aligned(phdr, Elf_Addr)) {
		uprintf("Unaligned program headers\n");
		return (ENOEXEC);
	}

	n = error = 0;
	baddr = 0;
	osrel = 0;
	text_size = data_size = total_size = text_addr = data_addr = 0;
	entry = proghdr = 0;
	interp_name_len = 0;
	err_str = newinterp = NULL;
	interp = interp_buf = NULL;
	td = curthread;

	for (i = 0; i < hdr->e_phnum; i++) {
		switch (phdr[i].p_type) {
		case PT_LOAD:
			if (n == 0)
				baddr = phdr[i].p_vaddr;
			n++;
			break;
		case PT_INTERP:
			/* Path to interpreter */
			if (phdr[i].p_filesz > MAXPATHLEN) {
				uprintf("Invalid PT_INTERP\n");
				error = ENOEXEC;
				goto ret;
			}
			if (interp != NULL) {
				uprintf("Multiple PT_INTERP headers\n");
				error = ENOEXEC;
				goto ret;
			}
			interp_name_len = phdr[i].p_filesz;
			if (phdr[i].p_offset > PAGE_SIZE ||
			    interp_name_len > PAGE_SIZE - phdr[i].p_offset) {
				VOP_UNLOCK(imgp->vp, 0);
				interp_buf = malloc(interp_name_len + 1, M_TEMP,
				    M_WAITOK);
				vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
				error = vn_rdwr(UIO_READ, imgp->vp, interp_buf,
				    interp_name_len, phdr[i].p_offset,
				    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred,
				    NOCRED, NULL, td);
				if (error != 0) {
					uprintf("i/o error PT_INTERP\n");
					goto ret;
				}
				interp_buf[interp_name_len] = '\0';
				interp = interp_buf;
			} else {
				interp = __DECONST(char *,
				    imgp->image_header) + phdr[i].p_offset;
			}
			break;
		case PT_GNU_STACK:
			if (__elfN(nxstack))
				imgp->stack_prot =
				    __elfN(trans_prot)(phdr[i].p_flags);
			imgp->stack_sz = phdr[i].p_memsz;
			break;
		}
	}

	brand_info = __elfN(get_brandinfo)(imgp, interp, interp_name_len,
	    &osrel);
	if (brand_info == NULL) {
		uprintf("ELF binary type \"%u\" not known.\n",
		    hdr->e_ident[EI_OSABI]);
		error = ENOEXEC;
		goto ret;
	}
	et_dyn_addr = 0;
	if (hdr->e_type == ET_DYN) {
		if ((brand_info->flags & BI_CAN_EXEC_DYN) == 0) {
			uprintf("Cannot execute shared object\n");
			error = ENOEXEC;
			goto ret;
		}
		/*
		 * Honour the base load address from the dso if it is
		 * non-zero for some reason.
		 */
		if (baddr == 0)
			et_dyn_addr = ET_DYN_LOAD_ADDR;
	}
	sv = brand_info->sysvec;
	if (interp != NULL && brand_info->interp_newpath != NULL)
		newinterp = brand_info->interp_newpath;

	/*
	 * Avoid a possible deadlock if the current address space is destroyed
	 * and that address space maps the locked vnode.  In the common case,
	 * the locked vnode's v_usecount is decremented but remains greater
	 * than zero.  Consequently, the vnode lock is not needed by vrele().
	 * However, in cases where the vnode lock is external, such as nullfs,
	 * v_usecount may become zero.
	 *
	 * The VV_TEXT flag prevents modifications to the executable while
	 * the vnode is unlocked.
	 */
	VOP_UNLOCK(imgp->vp, 0);

	error = exec_new_vmspace(imgp, sv);
	imgp->proc->p_sysent = sv;

	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
	if (error != 0)
		goto ret;

	for (i = 0; i < hdr->e_phnum; i++) {
		switch (phdr[i].p_type) {
		case PT_LOAD:	/* Loadable segment */
			if (phdr[i].p_memsz == 0)
				break;
			prot = __elfN(trans_prot)(phdr[i].p_flags);
			error = __elfN(load_section)(imgp, phdr[i].p_offset,
			    (caddr_t)(uintptr_t)phdr[i].p_vaddr + et_dyn_addr,
			    phdr[i].p_memsz, phdr[i].p_filesz, prot,
			    sv->sv_pagesize);
			if (error != 0)
				goto ret;

			/*
			 * If this segment contains the program headers,
			 * remember their virtual address for the AT_PHDR
			 * aux entry.  Static binaries don't usually include
			 * a PT_PHDR entry.
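			 * In that case the headers live inside the first
			 * PT_LOAD segment, and their address is derived
			 * from its mapping below.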
			 */
			if (phdr[i].p_offset == 0 &&
			    hdr->e_phoff + hdr->e_phnum * hdr->e_phentsize
			    <= phdr[i].p_filesz)
				proghdr = phdr[i].p_vaddr + hdr->e_phoff +
				    et_dyn_addr;

			seg_addr = trunc_page(phdr[i].p_vaddr + et_dyn_addr);
			seg_size = round_page(phdr[i].p_memsz +
			    phdr[i].p_vaddr + et_dyn_addr - seg_addr);

			/*
			 * Make the largest executable segment the official
			 * text segment and all others data.
			 *
			 * Note that obreak() assumes that data_addr +
			 * data_size == end of data load area, and the ELF
			 * file format expects segments to be sorted by
			 * address.  If multiple data segments exist, the
			 * last one will be used.
			 */

			if (phdr[i].p_flags & PF_X && text_size < seg_size) {
				text_size = seg_size;
				text_addr = seg_addr;
			} else {
				data_size = seg_size;
				data_addr = seg_addr;
			}
			total_size += seg_size;
			break;
		case PT_PHDR:	/* Program header table info */
			proghdr = phdr[i].p_vaddr + et_dyn_addr;
			break;
		default:
			break;
		}
	}

	if (data_addr == 0 && data_size == 0) {
		data_addr = text_addr;
		data_size = text_size;
	}

	entry = (u_long)hdr->e_entry + et_dyn_addr;

	/*
	 * Check limits.  It should be safe to check the
	 * limits after loading the segments since we do
	 * not actually fault in all the segments pages.
	 */
	PROC_LOCK(imgp->proc);
	if (data_size > lim_cur_proc(imgp->proc, RLIMIT_DATA))
		err_str = "Data segment size exceeds process limit";
	else if (text_size > maxtsiz)
		err_str = "Text segment size exceeds system limit";
	else if (total_size > lim_cur_proc(imgp->proc, RLIMIT_VMEM))
		err_str = "Total segment size exceeds process limit";
	else if (racct_set(imgp->proc, RACCT_DATA, data_size) != 0)
		err_str = "Data segment size exceeds resource limit";
	else if (racct_set(imgp->proc, RACCT_VMEM, total_size) != 0)
		err_str = "Total segment size exceeds resource limit";
	if (err_str != NULL) {
		PROC_UNLOCK(imgp->proc);
		uprintf("%s\n", err_str);
		error = ENOMEM;
		goto ret;
	}

	vmspace = imgp->proc->p_vmspace;
	vmspace->vm_tsize = text_size >> PAGE_SHIFT;
	vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr;
	vmspace->vm_dsize = data_size >> PAGE_SHIFT;
	vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr;

	/*
	 * We load the dynamic linker where a userland call
	 * to mmap(0, ...) would put it.  The rationale behind this
	 * calculation is that it leaves room for the heap to grow to
	 * its maximum allowed size.
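	 * Concretely, the interpreter goes just above vm_daddr plus the
	 * hard RLIMIT_DATA limit, so brk() can consume its entire rlimit
	 * before running into the interpreter mapping.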
	 */
	addr = round_page((vm_offset_t)vmspace->vm_daddr +
	    lim_max(td, RLIMIT_DATA));
	PROC_UNLOCK(imgp->proc);

	imgp->entry_addr = entry;

	if (interp != NULL) {
		have_interp = FALSE;
		VOP_UNLOCK(imgp->vp, 0);
		if (brand_info->emul_path != NULL &&
		    brand_info->emul_path[0] != '\0') {
			path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
			snprintf(path, MAXPATHLEN, "%s%s",
			    brand_info->emul_path, interp);
			error = __elfN(load_file)(imgp->proc, path, &addr,
			    &imgp->entry_addr, sv->sv_pagesize);
			free(path, M_TEMP);
			if (error == 0)
				have_interp = TRUE;
		}
		if (!have_interp && newinterp != NULL &&
		    (brand_info->interp_path == NULL ||
		    strcmp(interp, brand_info->interp_path) == 0)) {
			error = __elfN(load_file)(imgp->proc, newinterp, &addr,
			    &imgp->entry_addr, sv->sv_pagesize);
			if (error == 0)
				have_interp = TRUE;
		}
		if (!have_interp) {
			error = __elfN(load_file)(imgp->proc, interp, &addr,
			    &imgp->entry_addr, sv->sv_pagesize);
		}
		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
		if (error != 0) {
			uprintf("ELF interpreter %s not found, error %d\n",
			    interp, error);
			goto ret;
		}
	} else
		addr = et_dyn_addr;

	/*
	 * Construct auxargs table (used by the fixup routine)
	 */
	elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK);
	elf_auxargs->execfd = -1;
	elf_auxargs->phdr = proghdr;
	elf_auxargs->phent = hdr->e_phentsize;
	elf_auxargs->phnum = hdr->e_phnum;
	elf_auxargs->pagesz = PAGE_SIZE;
	elf_auxargs->base = addr;
	elf_auxargs->flags = 0;
	elf_auxargs->entry = entry;
	elf_auxargs->hdr_eflags = hdr->e_flags;

	imgp->auxargs = elf_auxargs;
	imgp->interpreted = 0;
	imgp->reloc_base = addr;
	imgp->proc->p_osrel = osrel;
	imgp->proc->p_elf_machine = hdr->e_machine;
	imgp->proc->p_elf_flags = hdr->e_flags;

ret:
	free(interp_buf, M_TEMP);
	return (error);
}

#define	suword	__CONCAT(suword, __ELF_WORD_SIZE)

int
__elfN(freebsd_fixup)(register_t **stack_base, struct image_params *imgp)
{
	Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs;
	Elf_Addr *base;
	Elf_Addr *pos;

	base = (Elf_Addr *)*stack_base;
	pos = base + (imgp->args->argc + imgp->args->envc + 2);

	if (args->execfd != -1)
		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
	AUXARGS_ENTRY(pos, AT_BASE, args->base);
	AUXARGS_ENTRY(pos, AT_EHDRFLAGS, args->hdr_eflags);
	if (imgp->execpathp != 0)
		AUXARGS_ENTRY(pos, AT_EXECPATH, imgp->execpathp);
	AUXARGS_ENTRY(pos, AT_OSRELDATE,
	    imgp->proc->p_ucred->cr_prison->pr_osreldate);
	if (imgp->canary != 0) {
		AUXARGS_ENTRY(pos, AT_CANARY, imgp->canary);
		AUXARGS_ENTRY(pos, AT_CANARYLEN, imgp->canarylen);
	}
	AUXARGS_ENTRY(pos, AT_NCPUS, mp_ncpus);
	if (imgp->pagesizes != 0) {
		AUXARGS_ENTRY(pos, AT_PAGESIZES, imgp->pagesizes);
		AUXARGS_ENTRY(pos, AT_PAGESIZESLEN, imgp->pagesizeslen);
	}
	if (imgp->sysent->sv_timekeep_base != 0) {
		AUXARGS_ENTRY(pos, AT_TIMEKEEP,
		    imgp->sysent->sv_timekeep_base);
	}
	AUXARGS_ENTRY(pos, AT_STACKPROT, imgp->sysent->sv_shared_page_obj
	    != NULL && imgp->stack_prot != 0 ? imgp->stack_prot :
	    imgp->sysent->sv_stackprot);
	if (imgp->sysent->sv_hwcap != NULL)
		AUXARGS_ENTRY(pos, AT_HWCAP, *imgp->sysent->sv_hwcap);
	if (imgp->sysent->sv_hwcap2 != NULL)
		AUXARGS_ENTRY(pos, AT_HWCAP2, *imgp->sysent->sv_hwcap2);
	AUXARGS_ENTRY(pos, AT_NULL, 0);
	free(imgp->auxargs, M_TEMP);
	imgp->auxargs = NULL;

	base--;
	suword(base, (long)imgp->args->argc);
	*stack_base = (register_t *)base;
	return (0);
}

/*
 * Code for generating ELF core dumps.
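 *
 * The resulting file layout is: ELF header, program headers (a PT_NOTE
 * entry first, then one PT_LOAD per dumped segment), the notes themselves,
 * and finally the segment contents, starting at a page boundary.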
 */

typedef void (*segment_callback)(vm_map_entry_t, void *);

/* Closure for cb_put_phdr(). */
struct phdr_closure {
	Elf_Phdr *phdr;		/* Program header to fill in */
	Elf_Off offset;		/* Offset of segment in core file */
};

/* Closure for cb_size_segment(). */
struct sseg_closure {
	int count;		/* Count of writable segments. */
	size_t size;		/* Total size of all writable segments. */
};

typedef void (*outfunc_t)(void *, struct sbuf *, size_t *);

struct note_info {
	int		type;		/* Note type. */
	outfunc_t	outfunc;	/* Output function. */
	void		*outarg;	/* Argument for the output function. */
	size_t		outsize;	/* Output size. */
	TAILQ_ENTRY(note_info) link;	/* Link to the next note info. */
};

TAILQ_HEAD(note_info_list, note_info);

/* Coredump output parameters. */
struct coredump_params {
	off_t		offset;
	struct ucred	*active_cred;
	struct ucred	*file_cred;
	struct thread	*td;
	struct vnode	*vp;
	struct gzio_stream *gzs;
};

static void cb_put_phdr(vm_map_entry_t, void *);
static void cb_size_segment(vm_map_entry_t, void *);
static int core_write(struct coredump_params *, const void *, size_t, off_t,
    enum uio_seg);
static void each_dumpable_segment(struct thread *, segment_callback, void *);
static int __elfN(corehdr)(struct coredump_params *, int, void *, size_t,
    struct note_info_list *, size_t);
static void __elfN(prepare_notes)(struct thread *, struct note_info_list *,
    size_t *);
static void __elfN(puthdr)(struct thread *, void *, size_t, int, size_t);
static void __elfN(putnote)(struct note_info *, struct sbuf *);
static size_t register_note(struct note_info_list *, int, outfunc_t, void *);
static int sbuf_drain_core_output(void *, const char *, int);
static int sbuf_drain_count(void *arg, const char *data, int len);

static void __elfN(note_fpregset)(void *, struct sbuf *, size_t *);
static void __elfN(note_prpsinfo)(void *, struct sbuf *, size_t *);
static void __elfN(note_prstatus)(void *, struct sbuf *, size_t *);
static void __elfN(note_threadmd)(void *, struct sbuf *, size_t *);
static void __elfN(note_thrmisc)(void *, struct sbuf *, size_t *);
static void __elfN(note_ptlwpinfo)(void *, struct sbuf *, size_t *);
static void __elfN(note_procstat_auxv)(void *, struct sbuf *, size_t *);
static void __elfN(note_procstat_proc)(void *, struct sbuf *, size_t *);
static void __elfN(note_procstat_psstrings)(void *, struct sbuf *, size_t *);
static void note_procstat_files(void *, struct sbuf *, size_t *);
static void note_procstat_groups(void *, struct sbuf *, size_t *);
static void note_procstat_osrel(void *, struct sbuf *, size_t *);
static void note_procstat_rlimit(void *, struct sbuf *, size_t *);
static void note_procstat_umask(void *, struct sbuf *, size_t *);
static void note_procstat_vmmap(void *, struct sbuf *, size_t *);

#ifdef GZIO
extern int compress_user_cores_gzlevel;

/*
 * Write out a core segment to the compression stream.
 */
static int
compress_chunk(struct coredump_params *p, char *base, char *buf, u_int len)
{
	u_int chunk_len;
	int error;

	while (len > 0) {
		chunk_len = MIN(len, CORE_BUF_SIZE);

		/*
		 * We can get EFAULT error here.
		 * In that case zero out the current chunk of the segment.
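		 * The uncompressed path in core_output() handles the same
		 * situation by leaving the unreadable range implicitly
		 * zero-filled in the sparse core file.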
		 */
		error = copyin(base, buf, chunk_len);
		if (error != 0)
			bzero(buf, chunk_len);
		error = gzio_write(p->gzs, buf, chunk_len);
		if (error != 0)
			break;
		base += chunk_len;
		len -= chunk_len;
	}
	return (error);
}

static int
core_gz_write(void *base, size_t len, off_t offset, void *arg)
{

	return (core_write((struct coredump_params *)arg, base, len, offset,
	    UIO_SYSSPACE));
}
#endif /* GZIO */

static int
core_write(struct coredump_params *p, const void *base, size_t len,
    off_t offset, enum uio_seg seg)
{

	return (vn_rdwr_inchunks(UIO_WRITE, p->vp, __DECONST(void *, base),
	    len, offset, seg, IO_UNIT | IO_DIRECT | IO_RANGELOCKED,
	    p->active_cred, p->file_cred, NULL, p->td));
}

static int
core_output(void *base, size_t len, off_t offset, struct coredump_params *p,
    void *tmpbuf)
{
	int error;

#ifdef GZIO
	if (p->gzs != NULL)
		return (compress_chunk(p, base, tmpbuf, len));
#endif
	/*
	 * EFAULT is a non-fatal error that we can get, for example,
	 * if the segment is backed by a file but extends beyond its
	 * end.
	 */
	error = core_write(p, base, len, offset, UIO_USERSPACE);
	if (error == EFAULT) {
		log(LOG_WARNING, "Failed to fully fault in a core file segment "
		    "at VA %p with size 0x%zx to be written at offset 0x%jx "
		    "for process %s\n", base, len, offset, curproc->p_comm);

		/*
		 * Write a "real" zero byte at the end of the target region
		 * in the case this is the last segment.
		 * The intermediate space will be implicitly zero-filled.
		 */
		error = core_write(p, zero_region, 1, offset + len - 1,
		    UIO_SYSSPACE);
	}
	return (error);
}

/*
 * Drain into a core file.
 */
static int
sbuf_drain_core_output(void *arg, const char *data, int len)
{
	struct coredump_params *p;
	int error, locked;

	p = (struct coredump_params *)arg;

	/*
	 * Some kern_proc out routines that print to this sbuf may
	 * call us with the process lock held.  Draining with the
	 * non-sleepable lock held is unsafe.  The lock is needed for
	 * those routines when dumping a live process.  In our case we
	 * can safely release the lock before draining and acquire
	 * it again after.
	 */
	locked = PROC_LOCKED(p->td->td_proc);
	if (locked)
		PROC_UNLOCK(p->td->td_proc);
#ifdef GZIO
	if (p->gzs != NULL)
		error = gzio_write(p->gzs, __DECONST(char *, data), len);
	else
#endif
		error = core_write(p, __DECONST(void *, data), len,
		    p->offset, UIO_SYSSPACE);
	if (locked)
		PROC_LOCK(p->td->td_proc);
	if (error != 0)
		return (-error);
	p->offset += len;
	return (len);
}

/*
 * Drain into a counter.
 */
static int
sbuf_drain_count(void *arg, const char *data __unused, int len)
{
	size_t *sizep;

	sizep = (size_t *)arg;
	*sizep += len;
	return (len);
}

int
__elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags)
{
	struct ucred *cred = td->td_ucred;
	int error = 0;
	struct sseg_closure seginfo;
	struct note_info_list notelst;
	struct coredump_params params;
	struct note_info *ninfo;
	void *hdr, *tmpbuf;
	size_t hdrsize, notesz, coresize;
#ifdef GZIO
	boolean_t compress;

	compress = (flags & IMGACT_CORE_COMPRESS) != 0;
#endif
	hdr = NULL;
	tmpbuf = NULL;
	TAILQ_INIT(&notelst);

	/* Size the program segments. */
	seginfo.count = 0;
	seginfo.size = 0;
	each_dumpable_segment(td, cb_size_segment, &seginfo);

	/*
	 * Collect info about the core file header area.
	 */
	hdrsize = sizeof(Elf_Ehdr) + sizeof(Elf_Phdr) * (1 + seginfo.count);
	if (seginfo.count + 1 >= PN_XNUM)
		hdrsize += sizeof(Elf_Shdr);
	__elfN(prepare_notes)(td, &notelst, &notesz);
	coresize = round_page(hdrsize + notesz) + seginfo.size;

	/* Set up core dump parameters.
	 */
	params.offset = 0;
	params.active_cred = cred;
	params.file_cred = NOCRED;
	params.td = td;
	params.vp = vp;
	params.gzs = NULL;

#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(td->td_proc);
		error = racct_add(td->td_proc, RACCT_CORE, coresize);
		PROC_UNLOCK(td->td_proc);
		if (error != 0) {
			error = EFAULT;
			goto done;
		}
	}
#endif
	if (coresize >= limit) {
		error = EFAULT;
		goto done;
	}

#ifdef GZIO
	/* Create a compression stream if necessary. */
	if (compress) {
		params.gzs = gzio_init(core_gz_write, GZIO_DEFLATE,
		    CORE_BUF_SIZE, compress_user_cores_gzlevel, &params);
		if (params.gzs == NULL) {
			error = EFAULT;
			goto done;
		}
		tmpbuf = malloc(CORE_BUF_SIZE, M_TEMP, M_WAITOK | M_ZERO);
	}
#endif

	/*
	 * Allocate memory for building the header, fill it up,
	 * and write it out following the notes.
	 */
	hdr = malloc(hdrsize, M_TEMP, M_WAITOK);
	error = __elfN(corehdr)(&params, seginfo.count, hdr, hdrsize, &notelst,
	    notesz);

	/* Write the contents of all of the writable segments. */
	if (error == 0) {
		Elf_Phdr *php;
		off_t offset;
		int i;

		php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1;
		offset = round_page(hdrsize + notesz);
		for (i = 0; i < seginfo.count; i++) {
			error = core_output((caddr_t)(uintptr_t)php->p_vaddr,
			    php->p_filesz, offset, &params, tmpbuf);
			if (error != 0)
				break;
			offset += php->p_filesz;
			php++;
		}
#ifdef GZIO
		if (error == 0 && compress)
			error = gzio_flush(params.gzs);
#endif
	}
	if (error) {
		log(LOG_WARNING,
		    "Failed to write core file for process %s (error %d)\n",
		    curproc->p_comm, error);
	}

done:
#ifdef GZIO
	if (compress) {
		free(tmpbuf, M_TEMP);
		if (params.gzs != NULL)
			gzio_fini(params.gzs);
	}
#endif
	while ((ninfo = TAILQ_FIRST(&notelst)) != NULL) {
		TAILQ_REMOVE(&notelst, ninfo, link);
		free(ninfo, M_TEMP);
	}
	if (hdr != NULL)
		free(hdr, M_TEMP);

	return (error);
}

/*
 * A callback for each_dumpable_segment() to write out the segment's
 * program header entry.
 */
static void
cb_put_phdr(vm_map_entry_t entry, void *closure)
{
	struct phdr_closure *phc = (struct phdr_closure *)closure;
	Elf_Phdr *phdr = phc->phdr;

	phc->offset = round_page(phc->offset);

	phdr->p_type = PT_LOAD;
	phdr->p_offset = phc->offset;
	phdr->p_vaddr = entry->start;
	phdr->p_paddr = 0;
	phdr->p_filesz = phdr->p_memsz = entry->end - entry->start;
	phdr->p_align = PAGE_SIZE;
	phdr->p_flags = __elfN(untrans_prot)(entry->protection);

	phc->offset += phdr->p_filesz;
	phc->phdr++;
}

/*
 * A callback for each_dumpable_segment() to gather information about
 * the number of segments and their total size.
 */
static void
cb_size_segment(vm_map_entry_t entry, void *closure)
{
	struct sseg_closure *ssc = (struct sseg_closure *)closure;

	ssc->count++;
	ssc->size += entry->end - entry->start;
}

/*
 * For each writable segment in the process's memory map, call the given
 * function with a pointer to the map entry and some arbitrary
 * caller-supplied data.
 */
static void
each_dumpable_segment(struct thread *td, segment_callback func, void *closure)
{
	struct proc *p = td->td_proc;
	vm_map_t map = &p->p_vmspace->vm_map;
	vm_map_entry_t entry;
	vm_object_t backing_object, object;
	boolean_t ignore_entry;

	vm_map_lock_read(map);
	for (entry = map->header.next; entry != &map->header;
	    entry = entry->next) {
		/*
		 * Don't dump inaccessible mappings, deal with legacy
		 * coredump mode.
		 *
		 * Note that read-only segments related to the elf binary
		 * are marked MAP_ENTRY_NOCOREDUMP now so we no longer
		 * need to arbitrarily ignore such segments.
		 */
		if (elf_legacy_coredump) {
			if ((entry->protection & VM_PROT_RW) != VM_PROT_RW)
				continue;
		} else {
			if ((entry->protection & VM_PROT_ALL) == 0)
				continue;
		}

		/*
		 * Don't include memory segments in the coredump if
		 * MAP_NOCORE is set in mmap(2) or MADV_NOCORE in
		 * madvise(2).  Do not dump submaps (i.e. parts of the
		 * kernel map).
		 */
		if (entry->eflags & (MAP_ENTRY_NOCOREDUMP|MAP_ENTRY_IS_SUB_MAP))
			continue;

		if ((object = entry->object.vm_object) == NULL)
			continue;

		/* Ignore memory-mapped devices and such things. */
		VM_OBJECT_RLOCK(object);
		while ((backing_object = object->backing_object) != NULL) {
			VM_OBJECT_RLOCK(backing_object);
			VM_OBJECT_RUNLOCK(object);
			object = backing_object;
		}
		ignore_entry = object->type != OBJT_DEFAULT &&
		    object->type != OBJT_SWAP && object->type != OBJT_VNODE &&
		    object->type != OBJT_PHYS;
		VM_OBJECT_RUNLOCK(object);
		if (ignore_entry)
			continue;

		(*func)(entry, closure);
	}
	vm_map_unlock_read(map);
}

/*
 * Write the core file header to the file, including padding up to
 * the page boundary.
 */
static int
__elfN(corehdr)(struct coredump_params *p, int numsegs, void *hdr,
    size_t hdrsize, struct note_info_list *notelst, size_t notesz)
{
	struct note_info *ninfo;
	struct sbuf *sb;
	int error;

	/* Fill in the header. */
	bzero(hdr, hdrsize);
	__elfN(puthdr)(p->td, hdr, hdrsize, numsegs, notesz);

	sb = sbuf_new(NULL, NULL, CORE_BUF_SIZE, SBUF_FIXEDLEN);
	sbuf_set_drain(sb, sbuf_drain_core_output, p);
	sbuf_start_section(sb, NULL);
	sbuf_bcat(sb, hdr, hdrsize);
	TAILQ_FOREACH(ninfo, notelst, link)
		__elfN(putnote)(ninfo, sb);
	/* Align up to a page boundary for the program segments. */
	sbuf_end_section(sb, -1, PAGE_SIZE, 0);
	error = sbuf_finish(sb);
	sbuf_delete(sb);

	return (error);
}

static void
__elfN(prepare_notes)(struct thread *td, struct note_info_list *list,
    size_t *sizep)
{
	struct proc *p;
	struct thread *thr;
	size_t size;

	p = td->td_proc;
	size = 0;

	size += register_note(list, NT_PRPSINFO, __elfN(note_prpsinfo), p);

	/*
	 * To have the debugger select the right thread (LWP) as the initial
	 * thread, we dump the state of the thread passed to us in td first.
	 * This is the thread that causes the core dump and thus likely to
	 * be the right thread one wants to have selected in the debugger.
	 */
	thr = td;
	while (thr != NULL) {
		size += register_note(list, NT_PRSTATUS,
		    __elfN(note_prstatus), thr);
		size += register_note(list, NT_FPREGSET,
		    __elfN(note_fpregset), thr);
		size += register_note(list, NT_THRMISC,
		    __elfN(note_thrmisc), thr);
		size += register_note(list, NT_PTLWPINFO,
		    __elfN(note_ptlwpinfo), thr);
		size += register_note(list, -1,
		    __elfN(note_threadmd), thr);

		thr = (thr == td) ? TAILQ_FIRST(&p->p_threads) :
		    TAILQ_NEXT(thr, td_plist);
		if (thr == td)
			thr = TAILQ_NEXT(thr, td_plist);
	}

	size += register_note(list, NT_PROCSTAT_PROC,
	    __elfN(note_procstat_proc), p);
	size += register_note(list, NT_PROCSTAT_FILES,
	    note_procstat_files, p);
	size += register_note(list, NT_PROCSTAT_VMMAP,
	    note_procstat_vmmap, p);
	size += register_note(list, NT_PROCSTAT_GROUPS,
	    note_procstat_groups, p);
	size += register_note(list, NT_PROCSTAT_UMASK,
	    note_procstat_umask, p);
	size += register_note(list, NT_PROCSTAT_RLIMIT,
	    note_procstat_rlimit, p);
	size += register_note(list, NT_PROCSTAT_OSREL,
	    note_procstat_osrel, p);
	size += register_note(list, NT_PROCSTAT_PSSTRINGS,
	    __elfN(note_procstat_psstrings), p);
	size += register_note(list, NT_PROCSTAT_AUXV,
	    __elfN(note_procstat_auxv), p);

	*sizep = size;
}

static void
__elfN(puthdr)(struct thread *td, void *hdr, size_t hdrsize, int numsegs,
    size_t notesz)
{
	Elf_Ehdr *ehdr;
	Elf_Phdr *phdr;
	Elf_Shdr *shdr;
	struct phdr_closure phc;

	ehdr = (Elf_Ehdr *)hdr;

	ehdr->e_ident[EI_MAG0] = ELFMAG0;
	ehdr->e_ident[EI_MAG1] = ELFMAG1;
	ehdr->e_ident[EI_MAG2] = ELFMAG2;
	ehdr->e_ident[EI_MAG3] = ELFMAG3;
	ehdr->e_ident[EI_CLASS] = ELF_CLASS;
	ehdr->e_ident[EI_DATA] = ELF_DATA;
	ehdr->e_ident[EI_VERSION] = EV_CURRENT;
	ehdr->e_ident[EI_OSABI] = ELFOSABI_FREEBSD;
	ehdr->e_ident[EI_ABIVERSION] = 0;
	ehdr->e_ident[EI_PAD] = 0;
	ehdr->e_type = ET_CORE;
	ehdr->e_machine = td->td_proc->p_elf_machine;
	ehdr->e_version = EV_CURRENT;
	ehdr->e_entry = 0;
	ehdr->e_phoff = sizeof(Elf_Ehdr);
	ehdr->e_flags = td->td_proc->p_elf_flags;
	ehdr->e_ehsize = sizeof(Elf_Ehdr);
	ehdr->e_phentsize = sizeof(Elf_Phdr);
	ehdr->e_shentsize = sizeof(Elf_Shdr);
	ehdr->e_shstrndx = SHN_UNDEF;
	if (numsegs + 1 < PN_XNUM) {
		ehdr->e_phnum = numsegs + 1;
		ehdr->e_shnum = 0;
	} else {
		ehdr->e_phnum = PN_XNUM;
		ehdr->e_shnum = 1;

		ehdr->e_shoff = ehdr->e_phoff +
		    (numsegs + 1) * ehdr->e_phentsize;
		KASSERT(ehdr->e_shoff == hdrsize - sizeof(Elf_Shdr),
		    ("e_shoff: %zu, hdrsize - shdr: %zu",
		     (size_t)ehdr->e_shoff, hdrsize - sizeof(Elf_Shdr)));

		shdr = (Elf_Shdr *)((char *)hdr + ehdr->e_shoff);
		memset(shdr, 0, sizeof(*shdr));
		/*
		 * A special first section is used to hold large segment and
		 * section counts.  This was proposed by Sun Microsystems in
		 * Solaris and has been adopted by Linux; the standard ELF
		 * tools are already familiar with the technique.
		 *
		 * See table 7-7 of the Solaris "Linker and Libraries Guide"
		 * (or 12-7 depending on the version of the document) for more
		 * details.
		 */
		shdr->sh_type = SHT_NULL;
		shdr->sh_size = ehdr->e_shnum;
		shdr->sh_link = ehdr->e_shstrndx;
		shdr->sh_info = numsegs + 1;
	}

	/*
	 * Fill in the program header entries.
	 */
	phdr = (Elf_Phdr *)((char *)hdr + ehdr->e_phoff);

	/* The note segment. */
	phdr->p_type = PT_NOTE;
	phdr->p_offset = hdrsize;
	phdr->p_vaddr = 0;
	phdr->p_paddr = 0;
	phdr->p_filesz = notesz;
	phdr->p_memsz = 0;
	phdr->p_flags = PF_R;
	phdr->p_align = ELF_NOTE_ROUNDSIZE;
	phdr++;

	/*
	 * All the writable segments from the program.
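	 * Each gets one PT_LOAD entry, with file offsets assigned
	 * sequentially starting at the first page boundary past the notes.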
	 */
	phc.phdr = phdr;
	phc.offset = round_page(hdrsize + notesz);
	each_dumpable_segment(td, cb_put_phdr, &phc);
}

static size_t
register_note(struct note_info_list *list, int type, outfunc_t out, void *arg)
{
	struct note_info *ninfo;
	size_t size, notesize;

	size = 0;
	out(arg, NULL, &size);
	ninfo = malloc(sizeof(*ninfo), M_TEMP, M_ZERO | M_WAITOK);
	ninfo->type = type;
	ninfo->outfunc = out;
	ninfo->outarg = arg;
	ninfo->outsize = size;
	TAILQ_INSERT_TAIL(list, ninfo, link);

	if (type == -1)
		return (size);

	notesize = sizeof(Elf_Note) +		/* note header */
	    roundup2(sizeof(FREEBSD_ABI_VENDOR), ELF_NOTE_ROUNDSIZE) +
						/* note name */
	    roundup2(size, ELF_NOTE_ROUNDSIZE);	/* note description */

	return (notesize);
}

static size_t
append_note_data(const void *src, void *dst, size_t len)
{
	size_t padded_len;

	padded_len = roundup2(len, ELF_NOTE_ROUNDSIZE);
	if (dst != NULL) {
		bcopy(src, dst, len);
		bzero((char *)dst + len, padded_len - len);
	}
	return (padded_len);
}

size_t
__elfN(populate_note)(int type, void *src, void *dst, size_t size,
    void **descp)
{
	Elf_Note *note;
	char *buf;
	size_t notesize;

	buf = dst;
	if (buf != NULL) {
		note = (Elf_Note *)buf;
		note->n_namesz = sizeof(FREEBSD_ABI_VENDOR);
		note->n_descsz = size;
		note->n_type = type;
		buf += sizeof(*note);
		buf += append_note_data(FREEBSD_ABI_VENDOR, buf,
		    sizeof(FREEBSD_ABI_VENDOR));
		append_note_data(src, buf, size);
		if (descp != NULL)
			*descp = buf;
	}

	notesize = sizeof(Elf_Note) +		/* note header */
	    roundup2(sizeof(FREEBSD_ABI_VENDOR), ELF_NOTE_ROUNDSIZE) +
						/* note name */
	    roundup2(size, ELF_NOTE_ROUNDSIZE);	/* note description */

	return (notesize);
}

static void
__elfN(putnote)(struct note_info *ninfo, struct sbuf *sb)
{
	Elf_Note note;
	ssize_t old_len, sect_len;
	size_t new_len, descsz, i;

	if (ninfo->type == -1) {
		ninfo->outfunc(ninfo->outarg, sb, &ninfo->outsize);
		return;
	}

	note.n_namesz = sizeof(FREEBSD_ABI_VENDOR);
	note.n_descsz = ninfo->outsize;
	note.n_type = ninfo->type;

	sbuf_bcat(sb, &note, sizeof(note));
	sbuf_start_section(sb, &old_len);
	sbuf_bcat(sb, FREEBSD_ABI_VENDOR, sizeof(FREEBSD_ABI_VENDOR));
	sbuf_end_section(sb, old_len, ELF_NOTE_ROUNDSIZE, 0);
	if (note.n_descsz == 0)
		return;
	sbuf_start_section(sb, &old_len);
	ninfo->outfunc(ninfo->outarg, sb, &ninfo->outsize);
	sect_len = sbuf_end_section(sb, old_len, ELF_NOTE_ROUNDSIZE, 0);
	if (sect_len < 0)
		return;

	new_len = (size_t)sect_len;
	descsz = roundup(note.n_descsz, ELF_NOTE_ROUNDSIZE);
	if (new_len < descsz) {
		/*
		 * It is expected that individual note emitters will correctly
		 * predict their expected output size and fill up to that size
		 * themselves, padding in a format-specific way if needed.
		 * However, in case they don't, just do it here with zeros.
		 */
		for (i = 0; i < descsz - new_len; i++)
			sbuf_putc(sb, 0);
	} else if (new_len > descsz) {
		/*
		 * We can't always truncate sb -- we may have drained some
		 * of it already.
		 */
		KASSERT(new_len == descsz, ("%s: Note type %u changed as we "
		    "read it (%zu > %zu).  Since it is longer than "
		    "expected, this coredump's notes are corrupt.  THIS "
		    "IS A BUG in the note_procstat routine for type %u.\n",
		    __func__, (unsigned)note.n_type, new_len, descsz,
		    (unsigned)note.n_type));
	}
}

/*
 * Miscellaneous note out functions.
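 *
 * Each out function is invoked twice: first with sb == NULL to report
 * its size through *sizep, then with a real sbuf to emit the payload.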
*/ #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 #include #include typedef struct prstatus32 elf_prstatus_t; typedef struct prpsinfo32 elf_prpsinfo_t; typedef struct fpreg32 elf_prfpregset_t; typedef struct fpreg32 elf_fpregset_t; typedef struct reg32 elf_gregset_t; typedef struct thrmisc32 elf_thrmisc_t; #define ELF_KERN_PROC_MASK KERN_PROC_MASK32 typedef struct kinfo_proc32 elf_kinfo_proc_t; typedef uint32_t elf_ps_strings_t; #else typedef prstatus_t elf_prstatus_t; typedef prpsinfo_t elf_prpsinfo_t; typedef prfpregset_t elf_prfpregset_t; typedef prfpregset_t elf_fpregset_t; typedef gregset_t elf_gregset_t; typedef thrmisc_t elf_thrmisc_t; #define ELF_KERN_PROC_MASK 0 typedef struct kinfo_proc elf_kinfo_proc_t; typedef vm_offset_t elf_ps_strings_t; #endif static void __elfN(note_prpsinfo)(void *arg, struct sbuf *sb, size_t *sizep) { struct sbuf sbarg; size_t len; char *cp, *end; struct proc *p; elf_prpsinfo_t *psinfo; int error; p = (struct proc *)arg; if (sb != NULL) { KASSERT(*sizep == sizeof(*psinfo), ("invalid size")); psinfo = malloc(sizeof(*psinfo), M_TEMP, M_ZERO | M_WAITOK); psinfo->pr_version = PRPSINFO_VERSION; psinfo->pr_psinfosz = sizeof(elf_prpsinfo_t); strlcpy(psinfo->pr_fname, p->p_comm, sizeof(psinfo->pr_fname)); PROC_LOCK(p); if (p->p_args != NULL) { len = sizeof(psinfo->pr_psargs) - 1; if (len > p->p_args->ar_length) len = p->p_args->ar_length; memcpy(psinfo->pr_psargs, p->p_args->ar_args, len); PROC_UNLOCK(p); error = 0; } else { _PHOLD(p); PROC_UNLOCK(p); sbuf_new(&sbarg, psinfo->pr_psargs, sizeof(psinfo->pr_psargs), SBUF_FIXEDLEN); error = proc_getargv(curthread, p, &sbarg); PRELE(p); if (sbuf_finish(&sbarg) == 0) len = sbuf_len(&sbarg) - 1; else len = sizeof(psinfo->pr_psargs) - 1; sbuf_delete(&sbarg); } if (error || len == 0) strlcpy(psinfo->pr_psargs, p->p_comm, sizeof(psinfo->pr_psargs)); else { KASSERT(len < sizeof(psinfo->pr_psargs), ("len is too long: %zu vs %zu", len, sizeof(psinfo->pr_psargs))); cp = psinfo->pr_psargs; end = cp + len - 1; for (;;) { cp = memchr(cp, '\0', end - cp); if (cp == NULL) break; *cp = ' '; } } psinfo->pr_pid = p->p_pid; sbuf_bcat(sb, psinfo, sizeof(*psinfo)); free(psinfo, M_TEMP); } *sizep = sizeof(*psinfo); } static void __elfN(note_prstatus)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; elf_prstatus_t *status; td = (struct thread *)arg; if (sb != NULL) { KASSERT(*sizep == sizeof(*status), ("invalid size")); status = malloc(sizeof(*status), M_TEMP, M_ZERO | M_WAITOK); status->pr_version = PRSTATUS_VERSION; status->pr_statussz = sizeof(elf_prstatus_t); status->pr_gregsetsz = sizeof(elf_gregset_t); status->pr_fpregsetsz = sizeof(elf_fpregset_t); status->pr_osreldate = osreldate; status->pr_cursig = td->td_proc->p_sig; status->pr_pid = td->td_tid; #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 fill_regs32(td, &status->pr_reg); #else fill_regs(td, &status->pr_reg); #endif sbuf_bcat(sb, status, sizeof(*status)); free(status, M_TEMP); } *sizep = sizeof(*status); } static void __elfN(note_fpregset)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; elf_prfpregset_t *fpregset; td = (struct thread *)arg; if (sb != NULL) { KASSERT(*sizep == sizeof(*fpregset), ("invalid size")); fpregset = malloc(sizeof(*fpregset), M_TEMP, M_ZERO | M_WAITOK); #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 fill_fpregs32(td, fpregset); #else fill_fpregs(td, fpregset); #endif sbuf_bcat(sb, fpregset, sizeof(*fpregset)); free(fpregset, M_TEMP); } *sizep = sizeof(*fpregset); } static void 
__elfN(note_thrmisc)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; elf_thrmisc_t thrmisc; td = (struct thread *)arg; if (sb != NULL) { KASSERT(*sizep == sizeof(thrmisc), ("invalid size")); bzero(&thrmisc._pad, sizeof(thrmisc._pad)); strcpy(thrmisc.pr_tname, td->td_name); sbuf_bcat(sb, &thrmisc, sizeof(thrmisc)); } *sizep = sizeof(thrmisc); } static void __elfN(note_ptlwpinfo)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; size_t size; int structsize; #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 struct ptrace_lwpinfo32 pl; #else struct ptrace_lwpinfo pl; #endif td = (struct thread *)arg; size = sizeof(structsize) + sizeof(pl); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(pl); sbuf_bcat(sb, &structsize, sizeof(structsize)); bzero(&pl, sizeof(pl)); pl.pl_lwpid = td->td_tid; pl.pl_event = PL_EVENT_NONE; pl.pl_sigmask = td->td_sigmask; pl.pl_siglist = td->td_siglist; if (td->td_si.si_signo != 0) { pl.pl_event = PL_EVENT_SIGNAL; pl.pl_flags |= PL_FLAG_SI; #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 siginfo_to_siginfo32(&td->td_si, &pl.pl_siginfo); #else pl.pl_siginfo = td->td_si; #endif } strcpy(pl.pl_tdname, td->td_name); /* XXX TODO: supply more information in struct ptrace_lwpinfo*/ sbuf_bcat(sb, &pl, sizeof(pl)); } *sizep = size; } /* * Allow for MD specific notes, as well as any MD * specific preparations for writing MI notes. */ static void __elfN(note_threadmd)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; void *buf; size_t size; td = (struct thread *)arg; size = *sizep; if (size != 0 && sb != NULL) buf = malloc(size, M_TEMP, M_ZERO | M_WAITOK); else buf = NULL; size = 0; __elfN(dump_thread)(td, buf, &size); KASSERT(sb == NULL || *sizep == size, ("invalid size")); if (size != 0 && sb != NULL) sbuf_bcat(sb, buf, size); free(buf, M_TEMP); *sizep = size; } #ifdef KINFO_PROC_SIZE CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE); #endif static void __elfN(note_procstat_proc)(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + p->p_numthreads * sizeof(elf_kinfo_proc_t); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(elf_kinfo_proc_t); sbuf_bcat(sb, &structsize, sizeof(structsize)); sx_slock(&proctree_lock); PROC_LOCK(p); kern_proc_out(p, sb, ELF_KERN_PROC_MASK); sx_sunlock(&proctree_lock); } *sizep = size; } #ifdef KINFO_FILE_SIZE CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE); #endif static void note_procstat_files(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size, sect_sz, i; ssize_t start_len, sect_len; int structsize, filedesc_flags; if (coredump_pack_fileinfo) filedesc_flags = KERN_FILEDESC_PACK_KINFO; else filedesc_flags = 0; p = (struct proc *)arg; structsize = sizeof(struct kinfo_file); if (sb == NULL) { size = 0; sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN); sbuf_set_drain(sb, sbuf_drain_count, &size); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); kern_proc_filedesc_out(p, sb, -1, filedesc_flags); sbuf_finish(sb); sbuf_delete(sb); *sizep = size; } else { sbuf_start_section(sb, &start_len); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); kern_proc_filedesc_out(p, sb, *sizep - sizeof(structsize), filedesc_flags); sect_len = sbuf_end_section(sb, start_len, 0, 0); if (sect_len < 0) return; sect_sz = sect_len; KASSERT(sect_sz <= *sizep, ("kern_proc_filedesc_out did not respect maxlen; " 
"requested %zu, got %zu", *sizep - sizeof(structsize), sect_sz - sizeof(structsize))); for (i = 0; i < *sizep - sect_sz && sb->s_error == 0; i++) sbuf_putc(sb, 0); } } #ifdef KINFO_VMENTRY_SIZE CTASSERT(sizeof(struct kinfo_vmentry) == KINFO_VMENTRY_SIZE); #endif static void note_procstat_vmmap(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize, vmmap_flags; if (coredump_pack_vmmapinfo) vmmap_flags = KERN_VMMAP_PACK_KINFO; else vmmap_flags = 0; p = (struct proc *)arg; structsize = sizeof(struct kinfo_vmentry); if (sb == NULL) { size = 0; sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN); sbuf_set_drain(sb, sbuf_drain_count, &size); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); kern_proc_vmmap_out(p, sb, -1, vmmap_flags); sbuf_finish(sb); sbuf_delete(sb); *sizep = size; } else { sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); kern_proc_vmmap_out(p, sb, *sizep - sizeof(structsize), vmmap_flags); } } static void note_procstat_groups(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + p->p_ucred->cr_ngroups * sizeof(gid_t); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(gid_t); sbuf_bcat(sb, &structsize, sizeof(structsize)); sbuf_bcat(sb, p->p_ucred->cr_groups, p->p_ucred->cr_ngroups * sizeof(gid_t)); } *sizep = size; } static void note_procstat_umask(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + sizeof(p->p_fd->fd_cmask); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(p->p_fd->fd_cmask); sbuf_bcat(sb, &structsize, sizeof(structsize)); sbuf_bcat(sb, &p->p_fd->fd_cmask, sizeof(p->p_fd->fd_cmask)); } *sizep = size; } static void note_procstat_rlimit(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; struct rlimit rlim[RLIM_NLIMITS]; size_t size; int structsize, i; p = (struct proc *)arg; size = sizeof(structsize) + sizeof(rlim); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(rlim); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); for (i = 0; i < RLIM_NLIMITS; i++) lim_rlimit_proc(p, i, &rlim[i]); PROC_UNLOCK(p); sbuf_bcat(sb, rlim, sizeof(rlim)); } *sizep = size; } static void note_procstat_osrel(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + sizeof(p->p_osrel); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(p->p_osrel); sbuf_bcat(sb, &structsize, sizeof(structsize)); sbuf_bcat(sb, &p->p_osrel, sizeof(p->p_osrel)); } *sizep = size; } static void __elfN(note_procstat_psstrings)(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; elf_ps_strings_t ps_strings; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + sizeof(ps_strings); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(ps_strings); #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 ps_strings = PTROUT(p->p_sysent->sv_psstrings); #else ps_strings = p->p_sysent->sv_psstrings; #endif sbuf_bcat(sb, &structsize, sizeof(structsize)); sbuf_bcat(sb, &ps_strings, sizeof(ps_strings)); } *sizep = size; } static void __elfN(note_procstat_auxv)(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; if (sb == NULL) { size = 0; sb 
= sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN); sbuf_set_drain(sb, sbuf_drain_count, &size); sbuf_bcat(sb, &structsize, sizeof(structsize)); PHOLD(p); proc_getauxv(curthread, p, sb); PRELE(p); sbuf_finish(sb); sbuf_delete(sb); *sizep = size; } else { structsize = sizeof(Elf_Auxinfo); sbuf_bcat(sb, &structsize, sizeof(structsize)); PHOLD(p); proc_getauxv(curthread, p, sb); PRELE(p); } } static boolean_t __elfN(parse_notes)(struct image_params *imgp, Elf_Brandnote *checknote, int32_t *osrel, const Elf_Phdr *pnote) { const Elf_Note *note, *note0, *note_end; const char *note_name; char *buf; int i, error; boolean_t res; /* We need some limit, might as well use PAGE_SIZE. */ if (pnote == NULL || pnote->p_filesz > PAGE_SIZE) return (FALSE); ASSERT_VOP_LOCKED(imgp->vp, "parse_notes"); if (pnote->p_offset > PAGE_SIZE || pnote->p_filesz > PAGE_SIZE - pnote->p_offset) { VOP_UNLOCK(imgp->vp, 0); buf = malloc(pnote->p_filesz, M_TEMP, M_WAITOK); vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); error = vn_rdwr(UIO_READ, imgp->vp, buf, pnote->p_filesz, pnote->p_offset, UIO_SYSSPACE, IO_NODELOCKED, curthread->td_ucred, NOCRED, NULL, curthread); if (error != 0) { uprintf("i/o error PT_NOTE\n"); res = FALSE; goto ret; } note = note0 = (const Elf_Note *)buf; note_end = (const Elf_Note *)(buf + pnote->p_filesz); } else { note = note0 = (const Elf_Note *)(imgp->image_header + pnote->p_offset); note_end = (const Elf_Note *)(imgp->image_header + pnote->p_offset + pnote->p_filesz); buf = NULL; } for (i = 0; i < 100 && note >= note0 && note < note_end; i++) { if (!aligned(note, Elf32_Addr) || (const char *)note_end - (const char *)note < sizeof(Elf_Note)) { res = FALSE; goto ret; } if (note->n_namesz != checknote->hdr.n_namesz || note->n_descsz != checknote->hdr.n_descsz || note->n_type != checknote->hdr.n_type) goto nextnote; note_name = (const char *)(note + 1); if (note_name + checknote->hdr.n_namesz >= (const char *)note_end || strncmp(checknote->vendor, note_name, checknote->hdr.n_namesz) != 0) goto nextnote; /* * Fetch the osreldate for binary * from the ELF OSABI-note if necessary. */ if ((checknote->flags & BN_TRANSLATE_OSREL) != 0 && checknote->trans_osrel != NULL) { res = checknote->trans_osrel(note, osrel); goto ret; } res = TRUE; goto ret; nextnote: note = (const Elf_Note *)((const char *)(note + 1) + roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE) + roundup2(note->n_descsz, ELF_NOTE_ROUNDSIZE)); } res = FALSE; ret: free(buf, M_TEMP); return (res); } /* * Try to find the appropriate ABI-note section for checknote, * fetch the osreldate for binary from the ELF OSABI-note. Only the * first page of the image is searched, the same as for headers. */ static boolean_t __elfN(check_note)(struct image_params *imgp, Elf_Brandnote *checknote, int32_t *osrel) { const Elf_Phdr *phdr; const Elf_Ehdr *hdr; int i; hdr = (const Elf_Ehdr *)imgp->image_header; phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); for (i = 0; i < hdr->e_phnum; i++) { if (phdr[i].p_type == PT_NOTE && __elfN(parse_notes)(imgp, checknote, osrel, &phdr[i])) return (TRUE); } return (FALSE); } /* * Tell kern_execve.c about it, with a little help from the linker. 
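 *
 * EXEC_SET() generates the registration glue: when this file is
 * compiled in or loaded as a module, the execsw entry is registered
 * (via exec_register()) so that kern_execve.c probes this activator
 * along with the others.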
*/ static struct execsw __elfN(execsw) = { __CONCAT(exec_, __elfN(imgact)), __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) }; EXEC_SET(__CONCAT(elf, __ELF_WORD_SIZE), __elfN(execsw)); static vm_prot_t __elfN(trans_prot)(Elf_Word flags) { vm_prot_t prot; prot = 0; if (flags & PF_X) prot |= VM_PROT_EXECUTE; if (flags & PF_W) prot |= VM_PROT_WRITE; if (flags & PF_R) prot |= VM_PROT_READ; #if __ELF_WORD_SIZE == 32 #if defined(__amd64__) if (i386_read_exec && (flags & PF_R)) prot |= VM_PROT_EXECUTE; #endif #endif return (prot); } static Elf_Word __elfN(untrans_prot)(vm_prot_t prot) { Elf_Word flags; flags = 0; if (prot & VM_PROT_EXECUTE) flags |= PF_X; if (prot & VM_PROT_READ) flags |= PF_R; if (prot & VM_PROT_WRITE) flags |= PF_W; return (flags); } Index: head/sys/kern/imgact_elf32.c =================================================================== --- head/sys/kern/imgact_elf32.c (revision 326270) +++ head/sys/kern/imgact_elf32.c (revision 326271) @@ -1,31 +1,33 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2002 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #define __ELF_WORD_SIZE 32 #include Index: head/sys/kern/imgact_elf64.c =================================================================== --- head/sys/kern/imgact_elf64.c (revision 326270) +++ head/sys/kern/imgact_elf64.c (revision 326271) @@ -1,31 +1,33 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2002 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #define __ELF_WORD_SIZE 64 #include Index: head/sys/kern/imgact_shell.c =================================================================== --- head/sys/kern/imgact_shell.c (revision 326270) +++ head/sys/kern/imgact_shell.c (revision 326271) @@ -1,257 +1,259 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #if BYTE_ORDER == LITTLE_ENDIAN #define SHELLMAGIC 0x2123 /* #! */ #else #define SHELLMAGIC 0x2321 #endif /* * At the time of this writing, MAXSHELLCMDLEN == PAGE_SIZE. This is * significant because the caller has only mapped in one page of the * file we're reading. */ #if MAXSHELLCMDLEN > PAGE_SIZE #error "MAXSHELLCMDLEN is larger than a single page!" #endif /* * MAXSHELLCMDLEN must be at least MAXINTERP plus the size of the `#!' * prefix and terminating newline. */ CTASSERT(MAXSHELLCMDLEN >= MAXINTERP + 3); /** * Shell interpreter image activator. An interpreter name beginning at * imgp->args->begin_argv is the minimal successful exit requirement. * * If the given file is a shell-script, then the first line will start * with the two characters `#!' (aka SHELLMAGIC), followed by the name * of the shell-interpreter to run, followed by zero or more tokens. * * The interpreter is then started up such that it will see: * arg[0] -> The name of interpreter as specified after `#!' in the * first line of the script. The interpreter name must * not be longer than MAXSHELLCMDLEN bytes. * arg[1] -> *If* there are any additional tokens on the first line, * then we add a new arg[1], which is a copy of the rest of * that line. 
The copy starts at the first token after the * interpreter name. We leave it to the interpreter to * parse the tokens in that value. * arg[x] -> the full pathname of the script. This will either be * arg[2] or arg[1], depending on whether or not tokens * were found after the interpreter name. * arg[x+1] -> all the arguments that were specified on the original * command line. * * This processing is described in the execve(2) man page. */ /* * HISTORICAL NOTE: From 1993 to mid-2005, FreeBSD parsed out the tokens as * found on the first line of the script, and set up each token as a separate * value in arg[]. This extra processing did not match the behavior of other * OSes, and caused a few subtle problems. For one, it meant the kernel was * deciding how those values should be parsed (wrt characters for quoting or * comments, etc), while the interpreter might have other rules for parsing. * It also meant the interpreter had no way of knowing which arguments came * from the first line of the shell script, and which arguments were specified * by the user on the command line. That extra processing was dropped in the * 6.x branch on May 28, 2005 (matching __FreeBSD_version 600029). */ int exec_shell_imgact(struct image_params *imgp) { const char *image_header = imgp->image_header; const char *ihp, *interpb, *interpe, *maxp, *optb, *opte, *fname; int error, offset; size_t length; struct vattr vattr; struct sbuf *sname; /* a shell script? */ if (((const short *)image_header)[0] != SHELLMAGIC) return (-1); /* * Don't allow a shell script to be the shell for a shell * script. :-) */ if (imgp->interpreted & IMGACT_SHELL) return (ENOEXEC); imgp->interpreted |= IMGACT_SHELL; /* * At this point we have the first page of the file mapped. * However, we don't know how far into the page the contents are * valid -- the actual file might be much shorter than the page. * So find out the file size. */ error = VOP_GETATTR(imgp->vp, &vattr, imgp->proc->p_ucred); if (error) return (error); /* * Copy shell name and arguments from image_header into a string * buffer. */ maxp = &image_header[MIN(vattr.va_size, MAXSHELLCMDLEN)]; ihp = &image_header[2]; /* * Find the beginning and end of the interpreter_name. If the * line does not include any interpreter, or if the name which * was found is too long, we bail out. */ while (ihp < maxp && ((*ihp == ' ') || (*ihp == '\t'))) ihp++; interpb = ihp; while (ihp < maxp && ((*ihp != ' ') && (*ihp != '\t') && (*ihp != '\n') && (*ihp != '\0'))) ihp++; interpe = ihp; if (interpb == interpe) return (ENOEXEC); if (interpe - interpb >= MAXINTERP) return (ENAMETOOLONG); /* * Find the beginning of the options (if any), and the end-of-line. * Then trim the trailing blanks off the value. Note that some * other operating systems do *not* trim the trailing whitespace... */ while (ihp < maxp && ((*ihp == ' ') || (*ihp == '\t'))) ihp++; optb = ihp; while (ihp < maxp && ((*ihp != '\n') && (*ihp != '\0'))) ihp++; opte = ihp; if (opte == maxp) return (ENOEXEC); while (--ihp > optb && ((*ihp == ' ') || (*ihp == '\t'))) opte = ihp; if (imgp->args->fname != NULL) { fname = imgp->args->fname; sname = NULL; } else { sname = sbuf_new_auto(); sbuf_printf(sname, "/dev/fd/%d", imgp->args->fd); sbuf_finish(sname); fname = sbuf_data(sname); } /* * We need to "pop" (remove) the present value of arg[0], and "push" * either two or three new values in the arg[] list. To do this, * we first shift all the other values in the `begin_argv' area to * provide the exact amount of room for the values added.
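 * (For instance, a script "/tmp/s.sh" whose first line is
 * "#!/bin/sh -x", run as "/tmp/s.sh foo", ends up executing with
 * arg[] = { "/bin/sh", "-x", "/tmp/s.sh", "foo" }.)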
Set up * `offset' as the number of bytes to be added to the `begin_argv' * area, and 'length' as the number of bytes being removed. */ offset = interpe - interpb + 1; /* interpreter */ if (opte > optb) /* options (if any) */ offset += opte - optb + 1; offset += strlen(fname) + 1; /* fname of script */ length = (imgp->args->argc == 0) ? 0 : strlen(imgp->args->begin_argv) + 1; /* bytes to delete */ if (offset > imgp->args->stringspace + length) { if (sname != NULL) sbuf_delete(sname); return (E2BIG); } bcopy(imgp->args->begin_argv + length, imgp->args->begin_argv + offset, imgp->args->endp - (imgp->args->begin_argv + length)); offset -= length; /* calculate actual adjustment */ imgp->args->begin_envv += offset; imgp->args->endp += offset; imgp->args->stringspace -= offset; /* * If there was no arg[0] when we started, then the interpreter_name * is adding an argument (instead of replacing the arg[0] we started * with). And we're always adding an argument when we include the * full pathname of the original script. */ if (imgp->args->argc == 0) imgp->args->argc = 1; imgp->args->argc++; /* * The original arg[] list has been shifted appropriately. Copy in * the interpreter name and options-string. */ length = interpe - interpb; bcopy(interpb, imgp->args->begin_argv, length); *(imgp->args->begin_argv + length) = '\0'; offset = length + 1; if (opte > optb) { length = opte - optb; bcopy(optb, imgp->args->begin_argv + offset, length); *(imgp->args->begin_argv + offset + length) = '\0'; offset += length + 1; imgp->args->argc++; } /* * Finally, add the filename onto the end for the interpreter to * use and copy the interpreter's name to imgp->interpreter_name * for exec to use. */ error = copystr(fname, imgp->args->begin_argv + offset, imgp->args->stringspace, NULL); if (error == 0) imgp->interpreter_name = imgp->args->begin_argv; if (sname != NULL) sbuf_delete(sname); return (error); } /* * Tell kern_execve.c about it, with a little help from the linker. */ static struct execsw shell_execsw = { exec_shell_imgact, "#!" }; EXEC_SET(shell, shell_execsw); Index: head/sys/kern/kern_alq.c =================================================================== --- head/sys/kern/kern_alq.c (revision 326270) +++ head/sys/kern/kern_alq.c (revision 326271) @@ -1,973 +1,975 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2002, Jeffrey Roberson * Copyright (c) 2008-2009, Lawrence Stewart * Copyright (c) 2009-2010, The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by Lawrence Stewart under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Async. Logging Queue */ struct alq { char *aq_entbuf; /* Buffer for stored entries */ int aq_entmax; /* Max entries */ int aq_entlen; /* Entry length */ int aq_freebytes; /* Bytes available in buffer */ int aq_buflen; /* Total length of our buffer */ int aq_writehead; /* Location for next write */ int aq_writetail; /* Flush starts at this location */ int aq_wrapearly; /* # bytes left blank at end of buf */ int aq_flags; /* Queue flags */ int aq_waiters; /* Num threads waiting for resources * NB: Used as a wait channel so must * not be first field in the alq struct */ struct ale aq_getpost; /* ALE for use by get/post */ struct mtx aq_mtx; /* Queue lock */ struct vnode *aq_vp; /* Open vnode handle */ struct ucred *aq_cred; /* Credentials of the opening thread */ LIST_ENTRY(alq) aq_act; /* List of active queues */ LIST_ENTRY(alq) aq_link; /* List of all queues */ }; #define AQ_WANTED 0x0001 /* Wakeup sleeper when io is done */ #define AQ_ACTIVE 0x0002 /* on the active list */ #define AQ_FLUSHING 0x0004 /* doing IO */ #define AQ_SHUTDOWN 0x0008 /* Queue no longer valid */ #define AQ_ORDERED 0x0010 /* Queue enforces ordered writes */ #define AQ_LEGACY 0x0020 /* Legacy queue (fixed length writes) */ #define ALQ_LOCK(alq) mtx_lock_spin(&(alq)->aq_mtx) #define ALQ_UNLOCK(alq) mtx_unlock_spin(&(alq)->aq_mtx) #define HAS_PENDING_DATA(alq) ((alq)->aq_freebytes != (alq)->aq_buflen) static MALLOC_DEFINE(M_ALD, "ALD", "ALD"); /* * The ald_mtx protects the ald_queues list and the ald_active list. */ static struct mtx ald_mtx; static LIST_HEAD(, alq) ald_queues; static LIST_HEAD(, alq) ald_active; static int ald_shutingdown = 0; struct thread *ald_thread; static struct proc *ald_proc; static eventhandler_tag alq_eventhandler_tag = NULL; #define ALD_LOCK() mtx_lock(&ald_mtx) #define ALD_UNLOCK() mtx_unlock(&ald_mtx) /* Daemon functions */ static int ald_add(struct alq *); static int ald_rem(struct alq *); static void ald_startup(void *); static void ald_daemon(void); static void ald_shutdown(void *, int); static void ald_activate(struct alq *); static void ald_deactivate(struct alq *); /* Internal queue functions */ static void alq_shutdown(struct alq *); static void alq_destroy(struct alq *); static int alq_doio(struct alq *); /* * Add a new queue to the global list. Fail if we're shutting down. */ static int ald_add(struct alq *alq) { int error; error = 0; ALD_LOCK(); if (ald_shutingdown) { error = EBUSY; goto done; } LIST_INSERT_HEAD(&ald_queues, alq, aq_link); done: ALD_UNLOCK(); return (error); } /* * Remove a queue from the global list unless we're shutting down. If so, * the ald will take care of cleaning up its resources.
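 *
 * Callers treat a non-zero return as "hands off": alq_close() below
 * only calls alq_destroy() when ald_rem() returns 0, leaving shutdown
 * cleanup to ald_shutdown() otherwise.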
*/ static int ald_rem(struct alq *alq) { int error; error = 0; ALD_LOCK(); if (ald_shutingdown) { error = EBUSY; goto done; } LIST_REMOVE(alq, aq_link); done: ALD_UNLOCK(); return (error); } /* * Put a queue on the active list. This will schedule it for writing. */ static void ald_activate(struct alq *alq) { LIST_INSERT_HEAD(&ald_active, alq, aq_act); wakeup(&ald_active); } static void ald_deactivate(struct alq *alq) { LIST_REMOVE(alq, aq_act); alq->aq_flags &= ~AQ_ACTIVE; } static void ald_startup(void *unused) { mtx_init(&ald_mtx, "ALDmtx", NULL, MTX_DEF|MTX_QUIET); LIST_INIT(&ald_queues); LIST_INIT(&ald_active); } static void ald_daemon(void) { int needwakeup; struct alq *alq; ald_thread = FIRST_THREAD_IN_PROC(ald_proc); alq_eventhandler_tag = EVENTHANDLER_REGISTER(shutdown_pre_sync, ald_shutdown, NULL, SHUTDOWN_PRI_FIRST); ALD_LOCK(); for (;;) { while ((alq = LIST_FIRST(&ald_active)) == NULL && !ald_shutingdown) mtx_sleep(&ald_active, &ald_mtx, PWAIT, "aldslp", 0); /* Don't shutdown until all active ALQs are flushed. */ if (ald_shutingdown && alq == NULL) { ALD_UNLOCK(); break; } ALQ_LOCK(alq); ald_deactivate(alq); ALD_UNLOCK(); needwakeup = alq_doio(alq); ALQ_UNLOCK(alq); if (needwakeup) wakeup_one(alq); ALD_LOCK(); } kproc_exit(0); } static void ald_shutdown(void *arg, int howto) { struct alq *alq; ALD_LOCK(); /* Ensure no new queues can be created. */ ald_shutingdown = 1; /* Shutdown all ALQs prior to terminating the ald_daemon. */ while ((alq = LIST_FIRST(&ald_queues)) != NULL) { LIST_REMOVE(alq, aq_link); ALD_UNLOCK(); alq_shutdown(alq); ALD_LOCK(); } /* At this point, all ALQs are flushed and shutdown. */ /* * Wake ald_daemon so that it exits. It won't be able to do * anything until we mtx_sleep because we hold the ald_mtx. */ wakeup(&ald_active); /* Wait for ald_daemon to exit. */ mtx_sleep(ald_proc, &ald_mtx, PWAIT, "aldslp", 0); ALD_UNLOCK(); } static void alq_shutdown(struct alq *alq) { ALQ_LOCK(alq); /* Stop any new writers. */ alq->aq_flags |= AQ_SHUTDOWN; /* * If the ALQ isn't active but has unwritten data (possible if * the ALQ_NOACTIVATE flag has been used), explicitly activate the * ALQ here so that the pending data gets flushed by the ald_daemon. */ if (!(alq->aq_flags & AQ_ACTIVE) && HAS_PENDING_DATA(alq)) { alq->aq_flags |= AQ_ACTIVE; ALQ_UNLOCK(alq); ALD_LOCK(); ald_activate(alq); ALD_UNLOCK(); ALQ_LOCK(alq); } /* Drain IO */ while (alq->aq_flags & AQ_ACTIVE) { alq->aq_flags |= AQ_WANTED; msleep_spin(alq, &alq->aq_mtx, "aldclose", 0); } ALQ_UNLOCK(alq); vn_close(alq->aq_vp, FWRITE, alq->aq_cred, curthread); crfree(alq->aq_cred); } void alq_destroy(struct alq *alq) { /* Drain all pending IO. */ alq_shutdown(alq); mtx_destroy(&alq->aq_mtx); free(alq->aq_entbuf, M_ALD); free(alq, M_ALD); } /* * Flush all pending data to disk. This operation will block. */ static int alq_doio(struct alq *alq) { struct thread *td; struct mount *mp; struct vnode *vp; struct uio auio; struct iovec aiov[2]; int totlen; int iov; int wrapearly; KASSERT((HAS_PENDING_DATA(alq)), ("%s: queue empty!", __func__)); vp = alq->aq_vp; td = curthread; totlen = 0; iov = 1; wrapearly = alq->aq_wrapearly; bzero(&aiov, sizeof(aiov)); bzero(&auio, sizeof(auio)); /* Start the write from the location of our buffer tail pointer. */ aiov[0].iov_base = alq->aq_entbuf + alq->aq_writetail; if (alq->aq_writetail < alq->aq_writehead) { /* Buffer not wrapped. 
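 * (aq_writetail < aq_writehead, so the pending bytes form a single
 * contiguous region: [....T######H....], where '#' marks unflushed
 * data between the tail and head indexes.)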
*/ totlen = aiov[0].iov_len = alq->aq_writehead - alq->aq_writetail; } else if (alq->aq_writehead == 0) { /* Buffer not wrapped (special case to avoid an empty iov). */ totlen = aiov[0].iov_len = alq->aq_buflen - alq->aq_writetail - wrapearly; } else { /* * Buffer wrapped, requires 2 aiov entries: * - first is from writetail to end of buffer * - second is from start of buffer to writehead */ aiov[0].iov_len = alq->aq_buflen - alq->aq_writetail - wrapearly; iov++; aiov[1].iov_base = alq->aq_entbuf; aiov[1].iov_len = alq->aq_writehead; totlen = aiov[0].iov_len + aiov[1].iov_len; } alq->aq_flags |= AQ_FLUSHING; ALQ_UNLOCK(alq); auio.uio_iov = &aiov[0]; auio.uio_offset = 0; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_iovcnt = iov; auio.uio_resid = totlen; auio.uio_td = td; /* * Do all of the junk required to write now. */ vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* * XXX: VOP_WRITE error checks are ignored. */ #ifdef MAC if (mac_vnode_check_write(alq->aq_cred, NOCRED, vp) == 0) #endif VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, alq->aq_cred); VOP_UNLOCK(vp, 0); vn_finished_write(mp); ALQ_LOCK(alq); alq->aq_flags &= ~AQ_FLUSHING; /* Adjust writetail as required, taking into account wrapping. */ alq->aq_writetail = (alq->aq_writetail + totlen + wrapearly) % alq->aq_buflen; alq->aq_freebytes += totlen + wrapearly; /* * If we just flushed part of the buffer which wrapped, reset the * wrapearly indicator. */ if (wrapearly) alq->aq_wrapearly = 0; /* * If we just flushed the buffer completely, reset indexes to 0 to * minimise buffer wraps. * This is also required to ensure alq_getn() can't wedge itself. */ if (!HAS_PENDING_DATA(alq)) alq->aq_writehead = alq->aq_writetail = 0; KASSERT((alq->aq_writetail >= 0 && alq->aq_writetail < alq->aq_buflen), ("%s: aq_writetail < 0 || aq_writetail >= aq_buflen", __func__)); if (alq->aq_flags & AQ_WANTED) { alq->aq_flags &= ~AQ_WANTED; return (1); } return(0); } static struct kproc_desc ald_kp = { "ALQ Daemon", ald_daemon, &ald_proc }; SYSINIT(aldthread, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &ald_kp); SYSINIT(ald, SI_SUB_LOCK, SI_ORDER_ANY, ald_startup, NULL); /* User visible queue functions */ /* * Create the queue data structure, allocate the buffer, and open the file. 
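 *
 * A minimal (hypothetical) usage sketch of this API, assuming a
 * credential "cred" and a message buffer "buf"/"buflen" are at hand:
 *
 *	struct alq *alq;
 *
 *	if (alq_open_flags(&alq, "/var/log/example.alq", cred, 0600,
 *	    64 * 1024, ALQ_ORDERED) == 0) {
 *		alq_writen(alq, buf, buflen, ALQ_WAITOK);
 *		alq_flush(alq);
 *		alq_close(alq);
 *	}
 */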
*/ int alq_open_flags(struct alq **alqp, const char *file, struct ucred *cred, int cmode, int size, int flags) { struct thread *td; struct nameidata nd; struct alq *alq; int oflags; int error; KASSERT((size > 0), ("%s: size <= 0", __func__)); *alqp = NULL; td = curthread; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, file, td); oflags = FWRITE | O_NOFOLLOW | O_CREAT; error = vn_open_cred(&nd, &oflags, cmode, 0, cred, NULL); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); /* We just unlock so we hold a reference */ VOP_UNLOCK(nd.ni_vp, 0); alq = malloc(sizeof(*alq), M_ALD, M_WAITOK|M_ZERO); alq->aq_vp = nd.ni_vp; alq->aq_cred = crhold(cred); mtx_init(&alq->aq_mtx, "ALD Queue", NULL, MTX_SPIN|MTX_QUIET); alq->aq_buflen = size; alq->aq_entmax = 0; alq->aq_entlen = 0; alq->aq_freebytes = alq->aq_buflen; alq->aq_entbuf = malloc(alq->aq_buflen, M_ALD, M_WAITOK|M_ZERO); alq->aq_writehead = alq->aq_writetail = 0; if (flags & ALQ_ORDERED) alq->aq_flags |= AQ_ORDERED; if ((error = ald_add(alq)) != 0) { alq_destroy(alq); return (error); } *alqp = alq; return (0); } int alq_open(struct alq **alqp, const char *file, struct ucred *cred, int cmode, int size, int count) { int ret; KASSERT((count >= 0), ("%s: count < 0", __func__)); if (count > 0) { if ((ret = alq_open_flags(alqp, file, cred, cmode, size*count, 0)) == 0) { (*alqp)->aq_flags |= AQ_LEGACY; (*alqp)->aq_entmax = count; (*alqp)->aq_entlen = size; } } else ret = alq_open_flags(alqp, file, cred, cmode, size, 0); return (ret); } /* * Copy a new entry into the queue. If the operation would block either * wait or return an error depending on the value of waitok. */ int alq_writen(struct alq *alq, void *data, int len, int flags) { int activate, copy, ret; void *waitchan; KASSERT((len > 0 && len <= alq->aq_buflen), ("%s: len <= 0 || len > aq_buflen", __func__)); activate = ret = 0; copy = len; waitchan = NULL; ALQ_LOCK(alq); /* * Fail to perform the write and return EWOULDBLOCK if: * - The message is larger than our underlying buffer. * - The ALQ is being shutdown. * - There is insufficient free space in our underlying buffer * to accept the message and the user can't wait for space. * - There is insufficient free space in our underlying buffer * to accept the message and the alq is inactive due to prior * use of the ALQ_NOACTIVATE flag (which would lead to deadlock). */ if (len > alq->aq_buflen || alq->aq_flags & AQ_SHUTDOWN || (((flags & ALQ_NOWAIT) || (!(alq->aq_flags & AQ_ACTIVE) && HAS_PENDING_DATA(alq))) && alq->aq_freebytes < len)) { ALQ_UNLOCK(alq); return (EWOULDBLOCK); } /* * If we want ordered writes and there is already at least one thread * waiting for resources to become available, sleep until we're woken. */ if (alq->aq_flags & AQ_ORDERED && alq->aq_waiters > 0) { KASSERT(!(flags & ALQ_NOWAIT), ("%s: ALQ_NOWAIT set but incorrectly ignored!", __func__)); alq->aq_waiters++; msleep_spin(&alq->aq_waiters, &alq->aq_mtx, "alqwnord", 0); alq->aq_waiters--; } /* * (ALQ_WAITOK && aq_freebytes < len) or aq_freebytes >= len, either * enter while loop and sleep until we have enough free bytes (former) * or skip (latter). If AQ_ORDERED is set, only 1 thread at a time will * be in this loop. Otherwise, multiple threads may be sleeping here * competing for ALQ resources. 
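 *
 * Note the hand-off below: a woken writer that still lacks space
 * re-raises AQ_WANTED and, if other writers are waiting, nudges the
 * next one via wakeup(), so no wakeup is lost while space trickles in.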
*/ while (alq->aq_freebytes < len && !(alq->aq_flags & AQ_SHUTDOWN)) { KASSERT(!(flags & ALQ_NOWAIT), ("%s: ALQ_NOWAIT set but incorrectly ignored!", __func__)); alq->aq_flags |= AQ_WANTED; alq->aq_waiters++; if (waitchan) wakeup(waitchan); msleep_spin(alq, &alq->aq_mtx, "alqwnres", 0); alq->aq_waiters--; /* * If we're the first thread to wake after an AQ_WANTED wakeup * but there isn't enough free space for us, we're going to loop * and sleep again. If there are other threads waiting in this * loop, schedule a wakeup so that they can see if the space * they require is available. */ if (alq->aq_waiters > 0 && !(alq->aq_flags & AQ_ORDERED) && alq->aq_freebytes < len && !(alq->aq_flags & AQ_WANTED)) waitchan = alq; else waitchan = NULL; } /* * If there are waiters, we need to signal the waiting threads after we * complete our work. The alq ptr is used as a wait channel for threads * requiring resources to be freed up. In the AQ_ORDERED case, threads * are not allowed to concurrently compete for resources in the above * while loop, so we use a different wait channel in this case. */ if (alq->aq_waiters > 0) { if (alq->aq_flags & AQ_ORDERED) waitchan = &alq->aq_waiters; else waitchan = alq; } else waitchan = NULL; /* Bail if we're shutting down. */ if (alq->aq_flags & AQ_SHUTDOWN) { ret = EWOULDBLOCK; goto unlock; } /* * If we need to wrap the buffer to accommodate the write, * we'll need 2 calls to bcopy. */ if ((alq->aq_buflen - alq->aq_writehead) < len) copy = alq->aq_buflen - alq->aq_writehead; /* Copy message (or part thereof if wrap required) to the buffer. */ bcopy(data, alq->aq_entbuf + alq->aq_writehead, copy); alq->aq_writehead += copy; if (alq->aq_writehead >= alq->aq_buflen) { KASSERT((alq->aq_writehead == alq->aq_buflen), ("%s: alq->aq_writehead (%d) > alq->aq_buflen (%d)", __func__, alq->aq_writehead, alq->aq_buflen)); alq->aq_writehead = 0; } if (copy != len) { /* * Wrap the buffer by copying the remainder of our message * to the start of the buffer and resetting aq_writehead. */ bcopy(((uint8_t *)data)+copy, alq->aq_entbuf, len - copy); alq->aq_writehead = len - copy; } KASSERT((alq->aq_writehead >= 0 && alq->aq_writehead < alq->aq_buflen), ("%s: aq_writehead < 0 || aq_writehead >= aq_buflen", __func__)); alq->aq_freebytes -= len; if (!(alq->aq_flags & AQ_ACTIVE) && !(flags & ALQ_NOACTIVATE)) { alq->aq_flags |= AQ_ACTIVE; activate = 1; } KASSERT((HAS_PENDING_DATA(alq)), ("%s: queue empty!", __func__)); unlock: ALQ_UNLOCK(alq); if (activate) { ALD_LOCK(); ald_activate(alq); ALD_UNLOCK(); } /* NB: We rely on wakeup_one waking threads in a FIFO manner. */ if (waitchan != NULL) wakeup_one(waitchan); return (ret); } int alq_write(struct alq *alq, void *data, int flags) { /* Should only be called in fixed length message (legacy) mode. */ KASSERT((alq->aq_flags & AQ_LEGACY), ("%s: fixed length write on variable length queue", __func__)); return (alq_writen(alq, data, alq->aq_entlen, flags)); } /* * Retrieve a pointer for the ALQ to write directly into, avoiding bcopy. */ struct ale * alq_getn(struct alq *alq, int len, int flags) { int contigbytes; void *waitchan; KASSERT((len > 0 && len <= alq->aq_buflen), ("%s: len <= 0 || len > alq->aq_buflen", __func__)); waitchan = NULL; ALQ_LOCK(alq); /* * Determine the number of free contiguous bytes. * We ensure elsewhere that if aq_writehead == aq_writetail because * the buffer is empty, they will both be set to 0 and therefore * aq_freebytes == aq_buflen and is fully contiguous. 
* If they are equal and the buffer is not empty, aq_freebytes will * be 0 indicating the buffer is full. */ if (alq->aq_writehead <= alq->aq_writetail) contigbytes = alq->aq_freebytes; else { contigbytes = alq->aq_buflen - alq->aq_writehead; if (contigbytes < len) { /* * Insufficient space at end of buffer to handle a * contiguous write. Wrap early if there's space at * the beginning. This will leave a hole at the end * of the buffer which we will have to skip over when * flushing the buffer to disk. */ if (alq->aq_writetail >= len || flags & ALQ_WAITOK) { /* Keep track of # bytes left blank. */ alq->aq_wrapearly = contigbytes; /* Do the wrap and adjust counters. */ contigbytes = alq->aq_freebytes = alq->aq_writetail; alq->aq_writehead = 0; } } } /* * Return a NULL ALE if: * - The message is larger than our underlying buffer. * - The ALQ is being shutdown. * - There is insufficient free space in our underlying buffer * to accept the message and the user can't wait for space. * - There is insufficient free space in our underlying buffer * to accept the message and the alq is inactive due to prior * use of the ALQ_NOACTIVATE flag (which would lead to deadlock). */ if (len > alq->aq_buflen || alq->aq_flags & AQ_SHUTDOWN || (((flags & ALQ_NOWAIT) || (!(alq->aq_flags & AQ_ACTIVE) && HAS_PENDING_DATA(alq))) && contigbytes < len)) { ALQ_UNLOCK(alq); return (NULL); } /* * If we want ordered writes and there is already at least one thread * waiting for resources to become available, sleep until we're woken. */ if (alq->aq_flags & AQ_ORDERED && alq->aq_waiters > 0) { KASSERT(!(flags & ALQ_NOWAIT), ("%s: ALQ_NOWAIT set but incorrectly ignored!", __func__)); alq->aq_waiters++; msleep_spin(&alq->aq_waiters, &alq->aq_mtx, "alqgnord", 0); alq->aq_waiters--; } /* * (ALQ_WAITOK && contigbytes < len) or contigbytes >= len, either enter * while loop and sleep until we have enough contiguous free bytes * (former) or skip (latter). If AQ_ORDERED is set, only 1 thread at a * time will be in this loop. Otherwise, multiple threads may be * sleeping here competing for ALQ resources. */ while (contigbytes < len && !(alq->aq_flags & AQ_SHUTDOWN)) { KASSERT(!(flags & ALQ_NOWAIT), ("%s: ALQ_NOWAIT set but incorrectly ignored!", __func__)); alq->aq_flags |= AQ_WANTED; alq->aq_waiters++; if (waitchan) wakeup(waitchan); msleep_spin(alq, &alq->aq_mtx, "alqgnres", 0); alq->aq_waiters--; if (alq->aq_writehead <= alq->aq_writetail) contigbytes = alq->aq_freebytes; else contigbytes = alq->aq_buflen - alq->aq_writehead; /* * If we're the first thread to wake after an AQ_WANTED wakeup * but there isn't enough free space for us, we're going to loop * and sleep again. If there are other threads waiting in this * loop, schedule a wakeup so that they can see if the space * they require is available. */ if (alq->aq_waiters > 0 && !(alq->aq_flags & AQ_ORDERED) && contigbytes < len && !(alq->aq_flags & AQ_WANTED)) waitchan = alq; else waitchan = NULL; } /* * If there are waiters, we need to signal the waiting threads after we * complete our work. The alq ptr is used as a wait channel for threads * requiring resources to be freed up. In the AQ_ORDERED case, threads * are not allowed to concurrently compete for resources in the above * while loop, so we use a different wait channel in this case. */ if (alq->aq_waiters > 0) { if (alq->aq_flags & AQ_ORDERED) waitchan = &alq->aq_waiters; else waitchan = alq; } else waitchan = NULL; /* Bail if we're shutting down. 
*/ if (alq->aq_flags & AQ_SHUTDOWN) { ALQ_UNLOCK(alq); if (waitchan != NULL) wakeup_one(waitchan); return (NULL); } /* * If we are here, we have a contiguous number of bytes >= len * available in our buffer starting at aq_writehead. */ alq->aq_getpost.ae_data = alq->aq_entbuf + alq->aq_writehead; alq->aq_getpost.ae_bytesused = len; return (&alq->aq_getpost); } struct ale * alq_get(struct alq *alq, int flags) { /* Should only be called in fixed length message (legacy) mode. */ KASSERT((alq->aq_flags & AQ_LEGACY), ("%s: fixed length get on variable length queue", __func__)); return (alq_getn(alq, alq->aq_entlen, flags)); } void alq_post_flags(struct alq *alq, struct ale *ale, int flags) { int activate; void *waitchan; activate = 0; if (ale->ae_bytesused > 0) { if (!(alq->aq_flags & AQ_ACTIVE) && !(flags & ALQ_NOACTIVATE)) { alq->aq_flags |= AQ_ACTIVE; activate = 1; } alq->aq_writehead += ale->ae_bytesused; alq->aq_freebytes -= ale->ae_bytesused; /* Wrap aq_writehead if we filled to the end of the buffer. */ if (alq->aq_writehead == alq->aq_buflen) alq->aq_writehead = 0; KASSERT((alq->aq_writehead >= 0 && alq->aq_writehead < alq->aq_buflen), ("%s: aq_writehead < 0 || aq_writehead >= aq_buflen", __func__)); KASSERT((HAS_PENDING_DATA(alq)), ("%s: queue empty!", __func__)); } /* * If there are waiters, we need to signal the waiting threads after we * complete our work. The alq ptr is used as a wait channel for threads * requiring resources to be freed up. In the AQ_ORDERED case, threads * are not allowed to concurrently compete for resources in the * alq_getn() while loop, so we use a different wait channel in this case. */ if (alq->aq_waiters > 0) { if (alq->aq_flags & AQ_ORDERED) waitchan = &alq->aq_waiters; else waitchan = alq; } else waitchan = NULL; ALQ_UNLOCK(alq); if (activate) { ALD_LOCK(); ald_activate(alq); ALD_UNLOCK(); } /* NB: We rely on wakeup_one waking threads in a FIFO manner. */ if (waitchan != NULL) wakeup_one(waitchan); } void alq_flush(struct alq *alq) { int needwakeup = 0; ALD_LOCK(); ALQ_LOCK(alq); /* * Pull the lever iff there is data to flush and we're * not already in the middle of a flush operation. */ if (HAS_PENDING_DATA(alq) && !(alq->aq_flags & AQ_FLUSHING)) { if (alq->aq_flags & AQ_ACTIVE) ald_deactivate(alq); ALD_UNLOCK(); needwakeup = alq_doio(alq); } else ALD_UNLOCK(); ALQ_UNLOCK(alq); if (needwakeup) wakeup_one(alq); } /* * Flush remaining data, close the file and free all resources. */ void alq_close(struct alq *alq) { /* Only flush and destroy alq if not already shutting down. */ if (ald_rem(alq) == 0) alq_destroy(alq); } static int alq_load_handler(module_t mod, int what, void *arg) { int ret; ret = 0; switch (what) { case MOD_LOAD: case MOD_SHUTDOWN: break; case MOD_QUIESCE: ALD_LOCK(); /* Only allow unload if there are no open queues. */ if (LIST_FIRST(&ald_queues) == NULL) { ald_shutingdown = 1; ALD_UNLOCK(); EVENTHANDLER_DEREGISTER(shutdown_pre_sync, alq_eventhandler_tag); ald_shutdown(NULL, 0); mtx_destroy(&ald_mtx); } else { ALD_UNLOCK(); ret = EBUSY; } break; case MOD_UNLOAD: /* If MOD_QUIESCE failed we must fail here too. 
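 * (ald_shutingdown is only set once MOD_QUIESCE has run with no open
 * queues, so finding it clear here means the quiesce was refused and
 * the module must stay loaded.)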
*/ if (ald_shutingdown == 0) ret = EBUSY; break; default: ret = EINVAL; break; } return (ret); } static moduledata_t alq_mod = { "alq", alq_load_handler, NULL }; DECLARE_MODULE(alq, alq_mod, SI_SUB_LAST, SI_ORDER_ANY); MODULE_VERSION(alq, 1); Index: head/sys/kern/kern_clocksource.c =================================================================== --- head/sys/kern/kern_clocksource.c (revision 326270) +++ head/sys/kern/kern_clocksource.c (revision 326271) @@ -1,967 +1,969 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2010-2013 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Common routines to manage event timers hardware. */ #include "opt_device_polling.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int cpu_disable_c2_sleep = 0; /* Timer dies in C2. */ int cpu_disable_c3_sleep = 0; /* Timer dies in C3. */ static void setuptimer(void); static void loadtimer(sbintime_t now, int first); static int doconfigtimer(void); static void configtimer(int start); static int round_freq(struct eventtimer *et, int freq); static sbintime_t getnextcpuevent(int idle); static sbintime_t getnextevent(void); static int handleevents(sbintime_t now, int fake); static struct mtx et_hw_mtx; #define ET_HW_LOCK(state) \ { \ if (timer->et_flags & ET_FLAGS_PERCPU) \ mtx_lock_spin(&(state)->et_hw_mtx); \ else \ mtx_lock_spin(&et_hw_mtx); \ } #define ET_HW_UNLOCK(state) \ { \ if (timer->et_flags & ET_FLAGS_PERCPU) \ mtx_unlock_spin(&(state)->et_hw_mtx); \ else \ mtx_unlock_spin(&et_hw_mtx); \ } static struct eventtimer *timer = NULL; static sbintime_t timerperiod; /* Timer period for periodic mode. */ static sbintime_t statperiod; /* statclock() events period. */ static sbintime_t profperiod; /* profclock() events period. */ static sbintime_t nexttick; /* Next global timer tick time. */ static u_int busy = 1; /* Reconfiguration is in progress. */ static int profiling; /* Profiling events enabled. */ static char timername[32]; /* Wanted timer. */ TUNABLE_STR("kern.eventtimer.timer", timername, sizeof(timername)); static int singlemul; /* Multiplier for periodic mode. 
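 *
 * In periodic mode one hardware rate has to service hardclock(),
 * statclock() and profclock() together; setuptimer() clamps this
 * multiplier to the range 1..20 and scales hz by it to reach a base
 * frequency high enough for all three.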
*/ SYSCTL_INT(_kern_eventtimer, OID_AUTO, singlemul, CTLFLAG_RWTUN, &singlemul, 0, "Multiplier for periodic mode"); static u_int idletick; /* Run periodic events when idle. */ SYSCTL_UINT(_kern_eventtimer, OID_AUTO, idletick, CTLFLAG_RWTUN, &idletick, 0, "Run periodic events when idle"); static int periodic; /* Periodic or one-shot mode. */ static int want_periodic; /* What mode to prefer. */ TUNABLE_INT("kern.eventtimer.periodic", &want_periodic); struct pcpu_state { struct mtx et_hw_mtx; /* Per-CPU timer mutex. */ u_int action; /* Reconfiguration requests. */ u_int handle; /* Immediate handle requests. */ sbintime_t now; /* Last tick time. */ sbintime_t nextevent; /* Next scheduled event on this CPU. */ sbintime_t nexttick; /* Next timer tick time. */ sbintime_t nexthard; /* Next hardclock() event. */ sbintime_t nextstat; /* Next statclock() event. */ sbintime_t nextprof; /* Next profclock() event. */ sbintime_t nextcall; /* Next callout event. */ sbintime_t nextcallopt; /* Next optional callout event. */ int ipi; /* This CPU needs IPI. */ int idle; /* This CPU is in idle mode. */ }; static DPCPU_DEFINE(struct pcpu_state, timerstate); DPCPU_DEFINE(sbintime_t, hardclocktime); /* * Timer broadcast IPI handler. */ int hardclockintr(void) { sbintime_t now; struct pcpu_state *state; int done; if (doconfigtimer() || busy) return (FILTER_HANDLED); state = DPCPU_PTR(timerstate); now = state->now; CTR3(KTR_SPARE2, "ipi at %d: now %d.%08x", curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); done = handleevents(now, 0); return (done ? FILTER_HANDLED : FILTER_STRAY); } /* * Handle all events for the specified time on this CPU. */ static int handleevents(sbintime_t now, int fake) { sbintime_t t, *hct; struct trapframe *frame; struct pcpu_state *state; int usermode; int done, runs; CTR3(KTR_SPARE2, "handle at %d: now %d.%08x", curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); done = 0; if (fake) { frame = NULL; usermode = 0; } else { frame = curthread->td_intr_frame; usermode = TRAPF_USERMODE(frame); } state = DPCPU_PTR(timerstate); runs = 0; while (now >= state->nexthard) { state->nexthard += tick_sbt; runs++; } if (runs) { hct = DPCPU_PTR(hardclocktime); *hct = state->nexthard - tick_sbt; if (fake < 2) { hardclock_cnt(runs, usermode); done = 1; } } runs = 0; while (now >= state->nextstat) { state->nextstat += statperiod; runs++; } if (runs && fake < 2) { statclock_cnt(runs, usermode); done = 1; } if (profiling) { runs = 0; while (now >= state->nextprof) { state->nextprof += profperiod; runs++; } if (runs && !fake) { profclock_cnt(runs, usermode, TRAPF_PC(frame)); done = 1; } } else state->nextprof = state->nextstat; if (now >= state->nextcallopt || now >= state->nextcall) { state->nextcall = state->nextcallopt = SBT_MAX; callout_process(now); } t = getnextcpuevent(0); ET_HW_LOCK(state); if (!busy) { state->idle = 0; state->nextevent = t; loadtimer(now, (fake == 2) && (timer->et_flags & ET_FLAGS_PERCPU)); } ET_HW_UNLOCK(state); return (done); } /* * Schedule binuptime of the next event on current CPU. */ static sbintime_t getnextcpuevent(int idle) { sbintime_t event; struct pcpu_state *state; u_int hardfreq; state = DPCPU_PTR(timerstate); /* Handle hardclock() events, skipping some if CPU is idle. */ event = state->nexthard; if (idle) { hardfreq = (u_int)hz / 2; if (tc_min_ticktock_freq > 2 #ifdef SMP && curcpu == CPU_FIRST() #endif ) hardfreq = hz / tc_min_ticktock_freq; if (hardfreq > 1) event += tick_sbt * (hardfreq - 1); } /* Handle callout events.
*/ if (event > state->nextcall) event = state->nextcall; if (!idle) { /* If CPU is active - handle other types of events. */ if (event > state->nextstat) event = state->nextstat; if (profiling && event > state->nextprof) event = state->nextprof; } return (event); } /* * Schedule binuptime of the next event on all CPUs. */ static sbintime_t getnextevent(void) { struct pcpu_state *state; sbintime_t event; #ifdef SMP int cpu; #endif int c; state = DPCPU_PTR(timerstate); event = state->nextevent; c = -1; #ifdef SMP if ((timer->et_flags & ET_FLAGS_PERCPU) == 0) { CPU_FOREACH(cpu) { state = DPCPU_ID_PTR(cpu, timerstate); if (event > state->nextevent) { event = state->nextevent; c = cpu; } } } #endif CTR4(KTR_SPARE2, "next at %d: next %d.%08x by %d", curcpu, (int)(event >> 32), (u_int)(event & 0xffffffff), c); return (event); } /* Hardware timer callback function. */ static void timercb(struct eventtimer *et, void *arg) { sbintime_t now; sbintime_t *next; struct pcpu_state *state; #ifdef SMP int cpu, bcast; #endif /* Do not touch anything if somebody reconfiguring timers. */ if (busy) return; /* Update present and next tick times. */ state = DPCPU_PTR(timerstate); if (et->et_flags & ET_FLAGS_PERCPU) { next = &state->nexttick; } else next = &nexttick; now = sbinuptime(); if (periodic) *next = now + timerperiod; else *next = -1; /* Next tick is not scheduled yet. */ state->now = now; CTR3(KTR_SPARE2, "intr at %d: now %d.%08x", curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); #ifdef SMP #ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); #endif /* Prepare broadcasting to other CPUs for non-per-CPU timers. */ bcast = 0; #ifdef EARLY_AP_STARTUP if ((et->et_flags & ET_FLAGS_PERCPU) == 0) { #else if ((et->et_flags & ET_FLAGS_PERCPU) == 0 && smp_started) { #endif CPU_FOREACH(cpu) { state = DPCPU_ID_PTR(cpu, timerstate); ET_HW_LOCK(state); state->now = now; if (now >= state->nextevent) { state->nextevent += SBT_1S; if (curcpu != cpu) { state->ipi = 1; bcast = 1; } } ET_HW_UNLOCK(state); } } #endif /* Handle events for this time on this CPU. */ handleevents(now, 0); #ifdef SMP /* Broadcast interrupt to other CPUs for non-per-CPU timers. */ if (bcast) { CPU_FOREACH(cpu) { if (curcpu == cpu) continue; state = DPCPU_ID_PTR(cpu, timerstate); if (state->ipi) { state->ipi = 0; ipi_cpu(cpu, IPI_HARDCLOCK); } } } #endif } /* * Load new value into hardware timer. */ static void loadtimer(sbintime_t now, int start) { struct pcpu_state *state; sbintime_t new; sbintime_t *next; uint64_t tmp; int eq; if (timer->et_flags & ET_FLAGS_PERCPU) { state = DPCPU_PTR(timerstate); next = &state->nexttick; } else next = &nexttick; if (periodic) { if (start) { /* * Try to start all periodic timers aligned * to period to make events synchronous. */ tmp = now % timerperiod; new = timerperiod - tmp; if (new < tmp) /* Left less then passed. */ new += timerperiod; CTR5(KTR_SPARE2, "load p at %d: now %d.%08x first in %d.%08x", curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff), (int)(new >> 32), (u_int)(new & 0xffffffff)); *next = new + now; et_start(timer, new, timerperiod); } } else { new = getnextevent(); eq = (new == *next); CTR4(KTR_SPARE2, "load at %d: next %d.%08x eq %d", curcpu, (int)(new >> 32), (u_int)(new & 0xffffffff), eq); if (!eq) { *next = new; et_start(timer, new - now, 0); } } } /* * Prepare event timer parameters after configuration changes. 
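*/

/*
 * In periodic mode, loadtimer() above aligns the first expiration to a
 * multiple of the period so that all CPUs tick in phase, skipping to
 * the following boundary when the nearest one is too close.  A runnable
 * sketch of just that alignment (the concrete values are made up):
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        int64_t period = 1000;          /* timer period, arbitrary units */
        int64_t now = 12345;            /* current time */
        int64_t passed = now % period;  /* time since the last boundary */
        int64_t left = period - passed; /* time to the next boundary */

        /* Less left than passed: take the boundary after the next one. */
        if (left < passed)
                left += period;
        printf("first expiration in %lld units, at absolute %lld\n",
            (long long)left, (long long)(now + left));
        return (0);
}

/*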
*/ static void setuptimer(void) { int freq; if (periodic && (timer->et_flags & ET_FLAGS_PERIODIC) == 0) periodic = 0; else if (!periodic && (timer->et_flags & ET_FLAGS_ONESHOT) == 0) periodic = 1; singlemul = MIN(MAX(singlemul, 1), 20); freq = hz * singlemul; while (freq < (profiling ? profhz : stathz)) freq += hz; freq = round_freq(timer, freq); timerperiod = SBT_1S / freq; } /* * Reconfigure specified per-CPU timer on other CPU. Called from IPI handler. */ static int doconfigtimer(void) { sbintime_t now; struct pcpu_state *state; state = DPCPU_PTR(timerstate); switch (atomic_load_acq_int(&state->action)) { case 1: now = sbinuptime(); ET_HW_LOCK(state); loadtimer(now, 1); ET_HW_UNLOCK(state); state->handle = 0; atomic_store_rel_int(&state->action, 0); return (1); case 2: ET_HW_LOCK(state); et_stop(timer); ET_HW_UNLOCK(state); state->handle = 0; atomic_store_rel_int(&state->action, 0); return (1); } if (atomic_readandclear_int(&state->handle) && !busy) { now = sbinuptime(); handleevents(now, 0); return (1); } return (0); } /* * Reconfigure specified timer. * For per-CPU timers use IPI to make other CPUs to reconfigure. */ static void configtimer(int start) { sbintime_t now, next; struct pcpu_state *state; int cpu; if (start) { setuptimer(); now = sbinuptime(); } else now = 0; critical_enter(); ET_HW_LOCK(DPCPU_PTR(timerstate)); if (start) { /* Initialize time machine parameters. */ next = now + timerperiod; if (periodic) nexttick = next; else nexttick = -1; #ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); #endif CPU_FOREACH(cpu) { state = DPCPU_ID_PTR(cpu, timerstate); state->now = now; #ifndef EARLY_AP_STARTUP if (!smp_started && cpu != CPU_FIRST()) state->nextevent = SBT_MAX; else #endif state->nextevent = next; if (periodic) state->nexttick = next; else state->nexttick = -1; state->nexthard = next; state->nextstat = next; state->nextprof = next; state->nextcall = next; state->nextcallopt = next; hardclock_sync(cpu); } busy = 0; /* Start global timer or per-CPU timer of this CPU. */ loadtimer(now, 1); } else { busy = 1; /* Stop global timer or per-CPU timer of this CPU. */ et_stop(timer); } ET_HW_UNLOCK(DPCPU_PTR(timerstate)); #ifdef SMP #ifdef EARLY_AP_STARTUP /* If timer is global we are done. */ if ((timer->et_flags & ET_FLAGS_PERCPU) == 0) { #else /* If timer is global or there is no other CPUs yet - we are done. */ if ((timer->et_flags & ET_FLAGS_PERCPU) == 0 || !smp_started) { #endif critical_exit(); return; } /* Set reconfigure flags for other CPUs. */ CPU_FOREACH(cpu) { state = DPCPU_ID_PTR(cpu, timerstate); atomic_store_rel_int(&state->action, (cpu == curcpu) ? 0 : ( start ? 1 : 2)); } /* Broadcast reconfigure IPI. */ ipi_all_but_self(IPI_HARDCLOCK); /* Wait for reconfiguration completed. */ restart: cpu_spinwait(); CPU_FOREACH(cpu) { if (cpu == curcpu) continue; state = DPCPU_ID_PTR(cpu, timerstate); if (atomic_load_acq_int(&state->action)) goto restart; } #endif critical_exit(); } /* * Calculate nearest frequency supported by hardware timer. 
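*/

/*
 * configtimer() above hands start/stop requests to the other CPUs
 * through a per-CPU atomic "action" word plus an IPI, then spins until
 * every CPU has acknowledged; doconfigtimer() is the receiving side.
 * A compressed, single-threaded sketch of that handshake using C11
 * atomics (the kernel uses atomic_load_acq_int()/atomic_store_rel_int()
 * and a real IPI; the demo names are hypothetical):
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint action;      /* 0 = idle, 1 = start, 2 = stop */

static void
target_cpu_handler(void)        /* stands in for the IPI handler */
{
        switch (atomic_load_explicit(&action, memory_order_acquire)) {
        case 1:
                printf("target: (re)load timer\n");
                break;
        case 2:
                printf("target: stop timer\n");
                break;
        }
        /* Release store acknowledges completion to the initiator. */
        atomic_store_explicit(&action, 0, memory_order_release);
}

int
main(void)
{
        atomic_store_explicit(&action, 1, memory_order_release);
        target_cpu_handler();   /* stands in for ipi_all_but_self() */
        /* The initiator spins until the ack; trivially satisfied here. */
        while (atomic_load_explicit(&action, memory_order_acquire) != 0)
                ;
        printf("initiator: reconfiguration complete\n");
        return (0);
}

/*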
*/ static int round_freq(struct eventtimer *et, int freq) { uint64_t div; if (et->et_frequency != 0) { div = lmax((et->et_frequency + freq / 2) / freq, 1); if (et->et_flags & ET_FLAGS_POW2DIV) div = 1 << (flsl(div + div / 2) - 1); freq = (et->et_frequency + div / 2) / div; } if (et->et_min_period > SBT_1S) panic("Event timer \"%s\" doesn't support sub-second periods!", et->et_name); else if (et->et_min_period != 0) freq = min(freq, SBT2FREQ(et->et_min_period)); if (et->et_max_period < SBT_1S && et->et_max_period != 0) freq = max(freq, SBT2FREQ(et->et_max_period)); return (freq); } /* * Configure and start event timers (BSP part). */ void cpu_initclocks_bsp(void) { struct pcpu_state *state; int base, div, cpu; mtx_init(&et_hw_mtx, "et_hw_mtx", NULL, MTX_SPIN); CPU_FOREACH(cpu) { state = DPCPU_ID_PTR(cpu, timerstate); mtx_init(&state->et_hw_mtx, "et_hw_mtx", NULL, MTX_SPIN); state->nextcall = SBT_MAX; state->nextcallopt = SBT_MAX; } periodic = want_periodic; /* Grab requested timer or the best of present. */ if (timername[0]) timer = et_find(timername, 0, 0); if (timer == NULL && periodic) { timer = et_find(NULL, ET_FLAGS_PERIODIC, ET_FLAGS_PERIODIC); } if (timer == NULL) { timer = et_find(NULL, ET_FLAGS_ONESHOT, ET_FLAGS_ONESHOT); } if (timer == NULL && !periodic) { timer = et_find(NULL, ET_FLAGS_PERIODIC, ET_FLAGS_PERIODIC); } if (timer == NULL) panic("No usable event timer found!"); et_init(timer, timercb, NULL, NULL); /* Adapt to timer capabilities. */ if (periodic && (timer->et_flags & ET_FLAGS_PERIODIC) == 0) periodic = 0; else if (!periodic && (timer->et_flags & ET_FLAGS_ONESHOT) == 0) periodic = 1; if (timer->et_flags & ET_FLAGS_C3STOP) cpu_disable_c3_sleep++; /* * We honor the requested 'hz' value. * We want to run stathz in the neighborhood of 128hz. * We would like profhz to run as often as possible. */ if (singlemul <= 0 || singlemul > 20) { if (hz >= 1500 || (hz % 128) == 0) singlemul = 1; else if (hz >= 750) singlemul = 2; else singlemul = 4; } if (periodic) { base = round_freq(timer, hz * singlemul); singlemul = max((base + hz / 2) / hz, 1); hz = (base + singlemul / 2) / singlemul; if (base <= 128) stathz = base; else { div = base / 128; if (div >= singlemul && (div % singlemul) == 0) div++; stathz = base / div; } profhz = stathz; while ((profhz + stathz) <= 128 * 64) profhz += stathz; profhz = round_freq(timer, profhz); } else { hz = round_freq(timer, hz); stathz = round_freq(timer, 127); profhz = round_freq(timer, stathz * 64); } tick = 1000000 / hz; tick_sbt = SBT_1S / hz; tick_bt = sbttobt(tick_sbt); statperiod = SBT_1S / stathz; profperiod = SBT_1S / profhz; ET_LOCK(); configtimer(1); ET_UNLOCK(); } /* * Start per-CPU event timers on APs. */ void cpu_initclocks_ap(void) { sbintime_t now; struct pcpu_state *state; struct thread *td; state = DPCPU_PTR(timerstate); now = sbinuptime(); ET_HW_LOCK(state); state->now = now; hardclock_sync(curcpu); spinlock_enter(); ET_HW_UNLOCK(state); td = curthread; td->td_intr_nesting_level++; handleevents(state->now, 2); td->td_intr_nesting_level--; spinlock_exit(); } /* * Switch to profiling clock rates. */ void cpu_startprofclock(void) { ET_LOCK(); if (profiling == 0) { if (periodic) { configtimer(0); profiling = 1; configtimer(1); } else profiling = 1; } else profiling++; ET_UNLOCK(); } /* * Switch to regular clock rates. 
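*/

/*
 * round_freq() above picks the divisor of the hardware base frequency
 * closest to the requested rate; timers flagged ET_FLAGS_POW2DIV only
 * accept power-of-two divisors.  A self-contained sketch of that
 * rounding with fls reimplemented inline (the sample base clock is
 * illustrative only):
 */
#include <stdio.h>
#include <stdint.h>

static int
demo_flsl(long v)               /* index of the highest set bit, 1-based */
{
        int i = 0;

        while (v != 0) {
                v >>= 1;
                i++;
        }
        return (i);
}

int
main(void)
{
        uint64_t hwfreq = 14318182;     /* sample base clock, made up */
        int freq = 1000;                /* requested event rate */
        uint64_t div;

        div = (hwfreq + freq / 2) / freq;       /* nearest divisor */
        if (div < 1)
                div = 1;
        /* Power-of-two restriction: round div to the nearest 2^n. */
        div = 1UL << (demo_flsl((long)(div + div / 2)) - 1);
        printf("div %llu -> actual freq %llu Hz\n",
            (unsigned long long)div,
            (unsigned long long)((hwfreq + div / 2) / div));
        return (0);
}

/*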
*/ void cpu_stopprofclock(void) { ET_LOCK(); if (profiling == 1) { if (periodic) { configtimer(0); profiling = 0; configtimer(1); } else profiling = 0; } else profiling--; ET_UNLOCK(); } /* * Switch to idle mode (all ticks handled). */ sbintime_t cpu_idleclock(void) { sbintime_t now, t; struct pcpu_state *state; if (idletick || busy || (periodic && (timer->et_flags & ET_FLAGS_PERCPU)) #ifdef DEVICE_POLLING || curcpu == CPU_FIRST() #endif ) return (-1); state = DPCPU_PTR(timerstate); if (periodic) now = state->now; else now = sbinuptime(); CTR3(KTR_SPARE2, "idle at %d: now %d.%08x", curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); t = getnextcpuevent(1); ET_HW_LOCK(state); state->idle = 1; state->nextevent = t; if (!periodic) loadtimer(now, 0); ET_HW_UNLOCK(state); return (MAX(t - now, 0)); } /* * Switch to active mode (skip empty ticks). */ void cpu_activeclock(void) { sbintime_t now; struct pcpu_state *state; struct thread *td; state = DPCPU_PTR(timerstate); if (state->idle == 0 || busy) return; if (periodic) now = state->now; else now = sbinuptime(); CTR3(KTR_SPARE2, "active at %d: now %d.%08x", curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); spinlock_enter(); td = curthread; td->td_intr_nesting_level++; handleevents(now, 1); td->td_intr_nesting_level--; spinlock_exit(); } /* * Change the frequency of the given timer. This changes et->et_frequency and * if et is the active timer it reconfigures the timer on all CPUs. This is * intended to be a private interface for the use of et_change_frequency() only. */ void cpu_et_frequency(struct eventtimer *et, uint64_t newfreq) { ET_LOCK(); if (et == timer) { configtimer(0); et->et_frequency = newfreq; configtimer(1); } else et->et_frequency = newfreq; ET_UNLOCK(); } void cpu_new_callout(int cpu, sbintime_t bt, sbintime_t bt_opt) { struct pcpu_state *state; /* Do not touch anything if somebody reconfiguring timers. */ if (busy) return; CTR6(KTR_SPARE2, "new co at %d: on %d at %d.%08x - %d.%08x", curcpu, cpu, (int)(bt_opt >> 32), (u_int)(bt_opt & 0xffffffff), (int)(bt >> 32), (u_int)(bt & 0xffffffff)); KASSERT(!CPU_ABSENT(cpu), ("Absent CPU %d", cpu)); state = DPCPU_ID_PTR(cpu, timerstate); ET_HW_LOCK(state); /* * If there is callout time already set earlier -- do nothing. * This check may appear redundant because we check already in * callout_process() but this double check guarantees we're safe * with respect to race conditions between interrupts execution * and scheduling. */ state->nextcallopt = bt_opt; if (bt >= state->nextcall) goto done; state->nextcall = bt; /* If there is some other event set earlier -- do nothing. */ if (bt >= state->nextevent) goto done; state->nextevent = bt; /* If timer is periodic -- there is nothing to reprogram. */ if (periodic) goto done; /* If timer is global or of the current CPU -- reprogram it. */ if ((timer->et_flags & ET_FLAGS_PERCPU) == 0 || cpu == curcpu) { loadtimer(sbinuptime(), 0); done: ET_HW_UNLOCK(state); return; } /* Otherwise make other CPU to reprogram it. */ state->handle = 1; ET_HW_UNLOCK(state); #ifdef SMP ipi_cpu(cpu, IPI_HARDCLOCK); #endif } /* * Report or change the active event timers hardware. 
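*/

/*
 * cpu_new_callout() above only ever moves a CPU's deadlines earlier: a
 * later request is already covered by whatever the hardware has
 * programmed, and only a strictly earlier one forces a one-shot
 * reprogram.  The essential "keep the minimum" logic as a runnable
 * sketch (deadlines are arbitrary numbers here):
 */
#include <stdio.h>
#include <stdint.h>

#define DEMO_SBT_MAX INT64_MAX

static int64_t nextcall = DEMO_SBT_MAX;   /* next callout deadline */
static int64_t nextevent = DEMO_SBT_MAX;  /* next programmed event */

static void
demo_new_callout(int64_t bt)
{
        if (bt >= nextcall)
                return;         /* an earlier callout is already set */
        nextcall = bt;
        if (bt >= nextevent)
                return;         /* hardware already fires soon enough */
        nextevent = bt;
        printf("reprogram one-shot timer for %lld\n", (long long)bt);
}

int
main(void)
{
        demo_new_callout(1000);         /* programs the timer */
        demo_new_callout(2000);         /* no-op: later than 1000 */
        demo_new_callout(500);          /* reprograms earlier */
        return (0);
}

/*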
*/ static int sysctl_kern_eventtimer_timer(SYSCTL_HANDLER_ARGS) { char buf[32]; struct eventtimer *et; int error; ET_LOCK(); et = timer; snprintf(buf, sizeof(buf), "%s", et->et_name); ET_UNLOCK(); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); ET_LOCK(); et = timer; if (error != 0 || req->newptr == NULL || strcasecmp(buf, et->et_name) == 0) { ET_UNLOCK(); return (error); } et = et_find(buf, 0, 0); if (et == NULL) { ET_UNLOCK(); return (ENOENT); } configtimer(0); et_free(timer); if (et->et_flags & ET_FLAGS_C3STOP) cpu_disable_c3_sleep++; if (timer->et_flags & ET_FLAGS_C3STOP) cpu_disable_c3_sleep--; periodic = want_periodic; timer = et; et_init(timer, timercb, NULL, NULL); configtimer(1); ET_UNLOCK(); return (error); } SYSCTL_PROC(_kern_eventtimer, OID_AUTO, timer, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_eventtimer_timer, "A", "Chosen event timer"); /* * Report or change the active event timer periodicity. */ static int sysctl_kern_eventtimer_periodic(SYSCTL_HANDLER_ARGS) { int error, val; val = periodic; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); ET_LOCK(); configtimer(0); periodic = want_periodic = val; configtimer(1); ET_UNLOCK(); return (error); } SYSCTL_PROC(_kern_eventtimer, OID_AUTO, periodic, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_eventtimer_periodic, "I", "Enable event timer periodic mode"); #include "opt_ddb.h" #ifdef DDB #include DB_SHOW_COMMAND(clocksource, db_show_clocksource) { struct pcpu_state *st; int c; CPU_FOREACH(c) { st = DPCPU_ID_PTR(c, timerstate); db_printf( "CPU %2d: action %d handle %d ipi %d idle %d\n" " now %#jx nevent %#jx (%jd)\n" " ntick %#jx (%jd) nhard %#jx (%jd)\n" " nstat %#jx (%jd) nprof %#jx (%jd)\n" " ncall %#jx (%jd) ncallopt %#jx (%jd)\n", c, st->action, st->handle, st->ipi, st->idle, (uintmax_t)st->now, (uintmax_t)st->nextevent, (uintmax_t)(st->nextevent - st->now) / tick_sbt, (uintmax_t)st->nexttick, (uintmax_t)(st->nexttick - st->now) / tick_sbt, (uintmax_t)st->nexthard, (uintmax_t)(st->nexthard - st->now) / tick_sbt, (uintmax_t)st->nextstat, (uintmax_t)(st->nextstat - st->now) / tick_sbt, (uintmax_t)st->nextprof, (uintmax_t)(st->nextprof - st->now) / tick_sbt, (uintmax_t)st->nextcall, (uintmax_t)(st->nextcall - st->now) / tick_sbt, (uintmax_t)st->nextcallopt, (uintmax_t)(st->nextcallopt - st->now) / tick_sbt); } } #endif Index: head/sys/kern/kern_condvar.c =================================================================== --- head/sys/kern/kern_condvar.c (revision 326270) +++ head/sys/kern/kern_condvar.c (revision 326271) @@ -1,446 +1,448 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2000 Jake Burkholder . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif /* * A bound below which cv_waiters is valid. Once cv_waiters reaches this bound, * cv_signal must manually check the wait queue for threads. */ #define CV_WAITERS_BOUND INT_MAX #define CV_WAITERS_INC(cvp) do { \ if ((cvp)->cv_waiters < CV_WAITERS_BOUND) \ (cvp)->cv_waiters++; \ } while (0) /* * Common sanity checks for cv_wait* functions. */ #define CV_ASSERT(cvp, lock, td) do { \ KASSERT((td) != NULL, ("%s: td NULL", __func__)); \ KASSERT(TD_IS_RUNNING(td), ("%s: not TDS_RUNNING", __func__)); \ KASSERT((cvp) != NULL, ("%s: cvp NULL", __func__)); \ KASSERT((lock) != NULL, ("%s: lock NULL", __func__)); \ } while (0) /* * Initialize a condition variable. Must be called before use. */ void cv_init(struct cv *cvp, const char *desc) { cvp->cv_description = desc; cvp->cv_waiters = 0; } /* * Destroy a condition variable. The condition variable must be re-initialized * in order to be re-used. */ void cv_destroy(struct cv *cvp) { #ifdef INVARIANTS struct sleepqueue *sq; sleepq_lock(cvp); sq = sleepq_lookup(cvp); sleepq_release(cvp); KASSERT(sq == NULL, ("%s: associated sleep queue non-empty", __func__)); #endif } /* * Wait on a condition variable. The current thread is placed on the condition * variable's wait queue and suspended. A cv_signal or cv_broadcast on the same * condition variable will resume the thread. The mutex is released before * sleeping and will be held on return. It is recommended that the mutex be * held when cv_signal or cv_broadcast are called. */ void _cv_wait(struct cv *cvp, struct lock_object *lock) { WITNESS_SAVE_DECL(lock_witness); struct lock_class *class; struct thread *td; uintptr_t lock_state; td = curthread; lock_state = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0, cv_wmesg(cvp)); #endif CV_ASSERT(cvp, lock, td); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Waiting on \"%s\"", cvp->cv_description); class = LOCK_CLASS(lock); if (SCHEDULER_STOPPED_TD(td)) return; sleepq_lock(cvp); CV_WAITERS_INC(cvp); if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0); if (lock != &Giant.lock_object) { if (class->lc_flags & LC_SLEEPABLE) sleepq_release(cvp); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); if (class->lc_flags & LC_SLEEPABLE) sleepq_lock(cvp); } sleepq_wait(cvp, 0); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, cv_wmesg(cvp)); #endif PICKUP_GIANT(); if (lock != &Giant.lock_object) { class->lc_lock(lock, lock_state); WITNESS_RESTORE(lock, lock_witness); } } /* * Wait on a condition variable. This function differs from cv_wait by * not acquiring the mutex after condition variable was signaled. 
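*/

/*
 * cv_waiters is a saturating counter: once it reaches CV_WAITERS_BOUND
 * it is pinned and no longer decremented, and cv_signal() must consult
 * the sleep queue directly to learn when the waiters are really gone.
 * A tiny sketch of the saturation rule (the bound is shrunk to 3 purely
 * for illustration):
 */
#include <stdio.h>

#define DEMO_BOUND 3

static int waiters;

static void
demo_inc(void)
{
        if (waiters < DEMO_BOUND)       /* saturate, never overflow */
                waiters++;
}

static void
demo_dec(void)
{
        if (waiters < DEMO_BOUND)       /* a pinned count stays pinned */
                waiters--;
}

int
main(void)
{
        for (int i = 0; i < 5; i++)
                demo_inc();
        printf("after 5 increments: %d (capped)\n", waiters);
        demo_dec();                     /* no-op: counter is pinned */
        printf("after 1 decrement: %d\n", waiters);
        return (0);
}

/*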
*/ void _cv_wait_unlock(struct cv *cvp, struct lock_object *lock) { struct lock_class *class; struct thread *td; td = curthread; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0, cv_wmesg(cvp)); #endif CV_ASSERT(cvp, lock, td); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Waiting on \"%s\"", cvp->cv_description); KASSERT(lock != &Giant.lock_object, ("cv_wait_unlock cannot be used with Giant")); class = LOCK_CLASS(lock); if (SCHEDULER_STOPPED_TD(td)) { class->lc_unlock(lock); return; } sleepq_lock(cvp); CV_WAITERS_INC(cvp); DROP_GIANT(); sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0); if (class->lc_flags & LC_SLEEPABLE) sleepq_release(cvp); class->lc_unlock(lock); if (class->lc_flags & LC_SLEEPABLE) sleepq_lock(cvp); sleepq_wait(cvp, 0); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, cv_wmesg(cvp)); #endif PICKUP_GIANT(); } /* * Wait on a condition variable, allowing interruption by signals. Return 0 if * the thread was resumed with cv_signal or cv_broadcast, EINTR or ERESTART if * a signal was caught. If ERESTART is returned the system call should be * restarted if possible. */ int _cv_wait_sig(struct cv *cvp, struct lock_object *lock) { WITNESS_SAVE_DECL(lock_witness); struct lock_class *class; struct thread *td; uintptr_t lock_state; int rval; td = curthread; lock_state = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0, cv_wmesg(cvp)); #endif CV_ASSERT(cvp, lock, td); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Waiting on \"%s\"", cvp->cv_description); class = LOCK_CLASS(lock); if (SCHEDULER_STOPPED_TD(td)) return (0); sleepq_lock(cvp); CV_WAITERS_INC(cvp); if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR | SLEEPQ_INTERRUPTIBLE, 0); if (lock != &Giant.lock_object) { if (class->lc_flags & LC_SLEEPABLE) sleepq_release(cvp); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); if (class->lc_flags & LC_SLEEPABLE) sleepq_lock(cvp); } rval = sleepq_wait_sig(cvp, 0); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, cv_wmesg(cvp)); #endif PICKUP_GIANT(); if (lock != &Giant.lock_object) { class->lc_lock(lock, lock_state); WITNESS_RESTORE(lock, lock_witness); } return (rval); } /* * Wait on a condition variable for (at most) the value specified in sbt * argument. Returns 0 if the process was resumed by cv_signal or cv_broadcast, * EWOULDBLOCK if the timeout expires. 
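*/

/*
 * The canonical consumer of a timed wait re-checks its predicate in a
 * loop and distinguishes a timeout from a wakeup.  A userspace analogue
 * using POSIX threads, not the kernel API itself (demo names are
 * hypothetical; in the kernel the loop would use mtx_lock() and
 * cv_timedwait() instead):
 */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t demo_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t demo_cv = PTHREAD_COND_INITIALIZER;
static int demo_ready;          /* the predicate being waited on */

static int
demo_wait(int seconds)
{
        struct timespec ts;
        int error = 0;

        clock_gettime(CLOCK_REALTIME, &ts);
        ts.tv_sec += seconds;
        pthread_mutex_lock(&demo_mtx);
        while (demo_ready == 0 && error == 0)
                error = pthread_cond_timedwait(&demo_cv, &demo_mtx, &ts);
        pthread_mutex_unlock(&demo_mtx);
        return (error);         /* 0 on wakeup, ETIMEDOUT on timeout */
}

int
main(void)
{
        printf("wait returned %d\n", demo_wait(1)); /* times out in 1 s */
        return (0);
}

/*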
*/ int _cv_timedwait_sbt(struct cv *cvp, struct lock_object *lock, sbintime_t sbt, sbintime_t pr, int flags) { WITNESS_SAVE_DECL(lock_witness); struct lock_class *class; struct thread *td; int lock_state, rval; td = curthread; lock_state = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0, cv_wmesg(cvp)); #endif CV_ASSERT(cvp, lock, td); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Waiting on \"%s\"", cvp->cv_description); class = LOCK_CLASS(lock); if (SCHEDULER_STOPPED_TD(td)) return (0); sleepq_lock(cvp); CV_WAITERS_INC(cvp); if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0); sleepq_set_timeout_sbt(cvp, sbt, pr, flags); if (lock != &Giant.lock_object) { if (class->lc_flags & LC_SLEEPABLE) sleepq_release(cvp); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); if (class->lc_flags & LC_SLEEPABLE) sleepq_lock(cvp); } rval = sleepq_timedwait(cvp, 0); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, cv_wmesg(cvp)); #endif PICKUP_GIANT(); if (lock != &Giant.lock_object) { class->lc_lock(lock, lock_state); WITNESS_RESTORE(lock, lock_witness); } return (rval); } /* * Wait on a condition variable for (at most) the value specified in sbt * argument, allowing interruption by signals. * Returns 0 if the thread was resumed by cv_signal or cv_broadcast, * EWOULDBLOCK if the timeout expires, and EINTR or ERESTART if a signal * was caught. */ int _cv_timedwait_sig_sbt(struct cv *cvp, struct lock_object *lock, sbintime_t sbt, sbintime_t pr, int flags) { WITNESS_SAVE_DECL(lock_witness); struct lock_class *class; struct thread *td; int lock_state, rval; td = curthread; lock_state = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0, cv_wmesg(cvp)); #endif CV_ASSERT(cvp, lock, td); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Waiting on \"%s\"", cvp->cv_description); class = LOCK_CLASS(lock); if (SCHEDULER_STOPPED_TD(td)) return (0); sleepq_lock(cvp); CV_WAITERS_INC(cvp); if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR | SLEEPQ_INTERRUPTIBLE, 0); sleepq_set_timeout_sbt(cvp, sbt, pr, flags); if (lock != &Giant.lock_object) { if (class->lc_flags & LC_SLEEPABLE) sleepq_release(cvp); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); if (class->lc_flags & LC_SLEEPABLE) sleepq_lock(cvp); } rval = sleepq_timedwait_sig(cvp, 0); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, cv_wmesg(cvp)); #endif PICKUP_GIANT(); if (lock != &Giant.lock_object) { class->lc_lock(lock, lock_state); WITNESS_RESTORE(lock, lock_witness); } return (rval); } /* * Signal a condition variable, wakes up one waiting thread. Will also wakeup * the swapper if the process is not in memory, so that it can bring the * sleeping process in. Note that this may also result in additional threads * being made runnable. Should be called with the same mutex as was passed to * cv_wait held. */ void cv_signal(struct cv *cvp) { int wakeup_swapper; if (cvp->cv_waiters == 0) return; wakeup_swapper = 0; sleepq_lock(cvp); if (cvp->cv_waiters > 0) { if (cvp->cv_waiters == CV_WAITERS_BOUND && sleepq_lookup(cvp) == NULL) { cvp->cv_waiters = 0; } else { if (cvp->cv_waiters < CV_WAITERS_BOUND) cvp->cv_waiters--; wakeup_swapper = sleepq_signal(cvp, SLEEPQ_CONDVAR, 0, 0); } } sleepq_release(cvp); if (wakeup_swapper) kick_proc0(); } /* * Broadcast a signal to a condition variable. Wakes up all waiting threads. 
* Should be called with the same mutex as was passed to cv_wait held. */ void cv_broadcastpri(struct cv *cvp, int pri) { int wakeup_swapper; if (cvp->cv_waiters == 0) return; /* * XXX sleepq_broadcast pri argument changed from -1 meaning * no pri to 0 meaning no pri. */ wakeup_swapper = 0; if (pri == -1) pri = 0; sleepq_lock(cvp); if (cvp->cv_waiters > 0) { cvp->cv_waiters = 0; wakeup_swapper = sleepq_broadcast(cvp, SLEEPQ_CONDVAR, pri, 0); } sleepq_release(cvp); if (wakeup_swapper) kick_proc0(); } Index: head/sys/kern/kern_conf.c =================================================================== --- head/sys/kern/kern_conf.c (revision 326270) +++ head/sys/kern/kern_conf.c (revision 326271) @@ -1,1564 +1,1566 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1999-2002 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_DEVT, "cdev", "cdev storage"); struct mtx devmtx; static void destroy_devl(struct cdev *dev); static int destroy_dev_sched_cbl(struct cdev *dev, void (*cb)(void *), void *arg); static void destroy_dev_tq(void *ctx, int pending); static int make_dev_credv(int flags, struct cdev **dres, struct cdevsw *devsw, int unit, struct ucred *cr, uid_t uid, gid_t gid, int mode, const char *fmt, va_list ap); static struct cdev_priv_list cdevp_free_list = TAILQ_HEAD_INITIALIZER(cdevp_free_list); static SLIST_HEAD(free_cdevsw, cdevsw) cdevsw_gt_post_list = SLIST_HEAD_INITIALIZER(cdevsw_gt_post_list); void dev_lock(void) { mtx_lock(&devmtx); } /* * Free all the memory collected while the cdev mutex was * locked. Since devmtx is after the system map mutex, free() cannot * be called immediately and is postponed until cdev mutex can be * dropped. */ static void dev_unlock_and_free(void) { struct cdev_priv_list cdp_free; struct free_cdevsw csw_free; struct cdev_priv *cdp; struct cdevsw *csw; mtx_assert(&devmtx, MA_OWNED); /* * Make the local copy of the list heads while the dev_mtx is * held. Free it later. 
*/ TAILQ_INIT(&cdp_free); TAILQ_CONCAT(&cdp_free, &cdevp_free_list, cdp_list); csw_free = cdevsw_gt_post_list; SLIST_INIT(&cdevsw_gt_post_list); mtx_unlock(&devmtx); while ((cdp = TAILQ_FIRST(&cdp_free)) != NULL) { TAILQ_REMOVE(&cdp_free, cdp, cdp_list); devfs_free(&cdp->cdp_c); } while ((csw = SLIST_FIRST(&csw_free)) != NULL) { SLIST_REMOVE_HEAD(&csw_free, d_postfree_list); free(csw, M_DEVT); } } static void dev_free_devlocked(struct cdev *cdev) { struct cdev_priv *cdp; mtx_assert(&devmtx, MA_OWNED); cdp = cdev2priv(cdev); KASSERT((cdp->cdp_flags & CDP_UNREF_DTR) == 0, ("destroy_dev() was not called after delist_dev(%p)", cdev)); TAILQ_INSERT_HEAD(&cdevp_free_list, cdp, cdp_list); } static void cdevsw_free_devlocked(struct cdevsw *csw) { mtx_assert(&devmtx, MA_OWNED); SLIST_INSERT_HEAD(&cdevsw_gt_post_list, csw, d_postfree_list); } void dev_unlock(void) { mtx_unlock(&devmtx); } void dev_ref(struct cdev *dev) { mtx_assert(&devmtx, MA_NOTOWNED); mtx_lock(&devmtx); dev->si_refcount++; mtx_unlock(&devmtx); } void dev_refl(struct cdev *dev) { mtx_assert(&devmtx, MA_OWNED); dev->si_refcount++; } void dev_rel(struct cdev *dev) { int flag = 0; mtx_assert(&devmtx, MA_NOTOWNED); dev_lock(); dev->si_refcount--; KASSERT(dev->si_refcount >= 0, ("dev_rel(%s) gave negative count", devtoname(dev))); #if 0 if (dev->si_usecount == 0 && (dev->si_flags & SI_CHEAPCLONE) && (dev->si_flags & SI_NAMED)) ; else #endif if (dev->si_devsw == NULL && dev->si_refcount == 0) { LIST_REMOVE(dev, si_list); flag = 1; } dev_unlock(); if (flag) devfs_free(dev); } struct cdevsw * dev_refthread(struct cdev *dev, int *ref) { struct cdevsw *csw; struct cdev_priv *cdp; mtx_assert(&devmtx, MA_NOTOWNED); if ((dev->si_flags & SI_ETERNAL) != 0) { *ref = 0; return (dev->si_devsw); } dev_lock(); csw = dev->si_devsw; if (csw != NULL) { cdp = cdev2priv(dev); if ((cdp->cdp_flags & CDP_SCHED_DTR) == 0) atomic_add_long(&dev->si_threadcount, 1); else csw = NULL; } dev_unlock(); *ref = 1; return (csw); } struct cdevsw * devvn_refthread(struct vnode *vp, struct cdev **devp, int *ref) { struct cdevsw *csw; struct cdev_priv *cdp; struct cdev *dev; mtx_assert(&devmtx, MA_NOTOWNED); if ((vp->v_vflag & VV_ETERNALDEV) != 0) { dev = vp->v_rdev; if (dev == NULL) return (NULL); KASSERT((dev->si_flags & SI_ETERNAL) != 0, ("Not eternal cdev")); *ref = 0; csw = dev->si_devsw; KASSERT(csw != NULL, ("Eternal cdev is destroyed")); *devp = dev; return (csw); } csw = NULL; dev_lock(); dev = vp->v_rdev; if (dev == NULL) { dev_unlock(); return (NULL); } cdp = cdev2priv(dev); if ((cdp->cdp_flags & CDP_SCHED_DTR) == 0) { csw = dev->si_devsw; if (csw != NULL) atomic_add_long(&dev->si_threadcount, 1); } dev_unlock(); if (csw != NULL) { *devp = dev; *ref = 1; } return (csw); } void dev_relthread(struct cdev *dev, int ref) { mtx_assert(&devmtx, MA_NOTOWNED); if (!ref) return; KASSERT(dev->si_threadcount > 0, ("%s threadcount is wrong", dev->si_name)); atomic_subtract_rel_long(&dev->si_threadcount, 1); } int nullop(void) { return (0); } int eopnotsupp(void) { return (EOPNOTSUPP); } static int enxio(void) { return (ENXIO); } static int enodev(void) { return (ENODEV); } /* Define a dead_cdevsw for use when devices leave unexpectedly. 
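*/

/*
 * The dead_cdevsw defined below is a method table whose entries all
 * fail, so a vanished device keeps absorbing calls safely instead of
 * chasing freed driver code.  A self-contained sketch of the same
 * pattern with a hypothetical two-method table:
 */
#include <stdio.h>

#define DEMO_ENXIO 6            /* mirrors errno ENXIO */

struct demo_ops {
        int (*open)(void);
        int (*read)(void);
};

static int demo_enxio(void) { return (DEMO_ENXIO); }
static int live_open(void) { return (0); }
static int live_read(void) { return (0); }

static const struct demo_ops live_ops = { live_open, live_read };
static const struct demo_ops dead_ops = { demo_enxio, demo_enxio };

int
main(void)
{
        const struct demo_ops *ops = &live_ops;

        printf("live open -> %d\n", ops->open());
        ops = &dead_ops;        /* device departed: swap in dead methods */
        printf("dead open -> %d\n", ops->open());
        return (0);
}

/*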
*/ #define dead_open (d_open_t *)enxio #define dead_close (d_close_t *)enxio #define dead_read (d_read_t *)enxio #define dead_write (d_write_t *)enxio #define dead_ioctl (d_ioctl_t *)enxio #define dead_poll (d_poll_t *)enodev #define dead_mmap (d_mmap_t *)enodev static void dead_strategy(struct bio *bp) { biofinish(bp, NULL, ENXIO); } #define dead_dump (dumper_t *)enxio #define dead_kqfilter (d_kqfilter_t *)enxio #define dead_mmap_single (d_mmap_single_t *)enodev static struct cdevsw dead_cdevsw = { .d_version = D_VERSION, .d_open = dead_open, .d_close = dead_close, .d_read = dead_read, .d_write = dead_write, .d_ioctl = dead_ioctl, .d_poll = dead_poll, .d_mmap = dead_mmap, .d_strategy = dead_strategy, .d_name = "dead", .d_dump = dead_dump, .d_kqfilter = dead_kqfilter, .d_mmap_single = dead_mmap_single }; /* Default methods if driver does not specify method */ #define null_open (d_open_t *)nullop #define null_close (d_close_t *)nullop #define no_read (d_read_t *)enodev #define no_write (d_write_t *)enodev #define no_ioctl (d_ioctl_t *)enodev #define no_mmap (d_mmap_t *)enodev #define no_kqfilter (d_kqfilter_t *)enodev #define no_mmap_single (d_mmap_single_t *)enodev static void no_strategy(struct bio *bp) { biofinish(bp, NULL, ENODEV); } static int no_poll(struct cdev *dev __unused, int events, struct thread *td __unused) { return (poll_no_poll(events)); } #define no_dump (dumper_t *)enodev static int giant_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_open(dev, oflags, devtype, td); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static int giant_fdopen(struct cdev *dev, int oflags, struct thread *td, struct file *fp) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_fdopen(dev, oflags, td, fp); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static int giant_close(struct cdev *dev, int fflag, int devtype, struct thread *td) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_close(dev, fflag, devtype, td); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static void giant_strategy(struct bio *bp) { struct cdevsw *dsw; struct cdev *dev; int ref; dev = bp->bio_dev; dsw = dev_refthread(dev, &ref); if (dsw == NULL) { biofinish(bp, NULL, ENXIO); return; } mtx_lock(&Giant); dsw->d_gianttrick->d_strategy(bp); mtx_unlock(&Giant); dev_relthread(dev, ref); } static int giant_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_ioctl(dev, cmd, data, fflag, td); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static int giant_read(struct cdev *dev, struct uio *uio, int ioflag) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_read(dev, uio, ioflag); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static int giant_write(struct cdev *dev, struct uio *uio, int ioflag) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); 
mtx_lock(&Giant); retval = dsw->d_gianttrick->d_write(dev, uio, ioflag); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static int giant_poll(struct cdev *dev, int events, struct thread *td) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_poll(dev, events, td); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static int giant_kqfilter(struct cdev *dev, struct knote *kn) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_kqfilter(dev, kn); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static int giant_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, int nprot, vm_memattr_t *memattr) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_mmap(dev, offset, paddr, nprot, memattr); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static int giant_mmap_single(struct cdev *dev, vm_ooffset_t *offset, vm_size_t size, vm_object_t *object, int nprot) { struct cdevsw *dsw; int ref, retval; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); mtx_lock(&Giant); retval = dsw->d_gianttrick->d_mmap_single(dev, offset, size, object, nprot); mtx_unlock(&Giant); dev_relthread(dev, ref); return (retval); } static void notify(struct cdev *dev, const char *ev, int flags) { static const char prefix[] = "cdev="; char *data; int namelen, mflags; if (cold) return; mflags = (flags & MAKEDEV_NOWAIT) ? M_NOWAIT : M_WAITOK; namelen = strlen(dev->si_name); data = malloc(namelen + sizeof(prefix), M_TEMP, mflags); if (data == NULL) return; memcpy(data, prefix, sizeof(prefix) - 1); memcpy(data + sizeof(prefix) - 1, dev->si_name, namelen + 1); devctl_notify_f("DEVFS", "CDEV", ev, data, mflags); free(data, M_TEMP); } static void notify_create(struct cdev *dev, int flags) { notify(dev, "CREATE", flags); } static void notify_destroy(struct cdev *dev) { notify(dev, "DESTROY", MAKEDEV_WAITOK); } static struct cdev * newdev(struct make_dev_args *args, struct cdev *si) { struct cdev *si2; struct cdevsw *csw; mtx_assert(&devmtx, MA_OWNED); csw = args->mda_devsw; if (csw->d_flags & D_NEEDMINOR) { /* We may want to return an existing device */ LIST_FOREACH(si2, &csw->d_devs, si_list) { if (dev2unit(si2) == args->mda_unit) { dev_free_devlocked(si); return (si2); } } } si->si_drv0 = args->mda_unit; si->si_devsw = csw; si->si_drv1 = args->mda_si_drv1; si->si_drv2 = args->mda_si_drv2; LIST_INSERT_HEAD(&csw->d_devs, si, si_list); return (si); } static void fini_cdevsw(struct cdevsw *devsw) { struct cdevsw *gt; if (devsw->d_gianttrick != NULL) { gt = devsw->d_gianttrick; memcpy(devsw, gt, sizeof *devsw); cdevsw_free_devlocked(gt); devsw->d_gianttrick = NULL; } devsw->d_flags &= ~D_INIT; } static int prep_cdevsw(struct cdevsw *devsw, int flags) { struct cdevsw *dsw2; mtx_assert(&devmtx, MA_OWNED); if (devsw->d_flags & D_INIT) return (0); if (devsw->d_flags & D_NEEDGIANT) { dev_unlock(); dsw2 = malloc(sizeof *dsw2, M_DEVT, (flags & MAKEDEV_NOWAIT) ? 
M_NOWAIT : M_WAITOK); dev_lock(); if (dsw2 == NULL && !(devsw->d_flags & D_INIT)) return (ENOMEM); } else dsw2 = NULL; if (devsw->d_flags & D_INIT) { if (dsw2 != NULL) cdevsw_free_devlocked(dsw2); return (0); } if (devsw->d_version != D_VERSION_03) { printf( "WARNING: Device driver \"%s\" has wrong version %s\n", devsw->d_name == NULL ? "???" : devsw->d_name, "and is disabled. Recompile KLD module."); devsw->d_open = dead_open; devsw->d_close = dead_close; devsw->d_read = dead_read; devsw->d_write = dead_write; devsw->d_ioctl = dead_ioctl; devsw->d_poll = dead_poll; devsw->d_mmap = dead_mmap; devsw->d_mmap_single = dead_mmap_single; devsw->d_strategy = dead_strategy; devsw->d_dump = dead_dump; devsw->d_kqfilter = dead_kqfilter; } if (devsw->d_flags & D_NEEDGIANT) { if (devsw->d_gianttrick == NULL) { memcpy(dsw2, devsw, sizeof *dsw2); devsw->d_gianttrick = dsw2; dsw2 = NULL; } } #define FIXUP(member, noop, giant) \ do { \ if (devsw->member == NULL) { \ devsw->member = noop; \ } else if (devsw->d_flags & D_NEEDGIANT) \ devsw->member = giant; \ } \ while (0) FIXUP(d_open, null_open, giant_open); FIXUP(d_fdopen, NULL, giant_fdopen); FIXUP(d_close, null_close, giant_close); FIXUP(d_read, no_read, giant_read); FIXUP(d_write, no_write, giant_write); FIXUP(d_ioctl, no_ioctl, giant_ioctl); FIXUP(d_poll, no_poll, giant_poll); FIXUP(d_mmap, no_mmap, giant_mmap); FIXUP(d_strategy, no_strategy, giant_strategy); FIXUP(d_kqfilter, no_kqfilter, giant_kqfilter); FIXUP(d_mmap_single, no_mmap_single, giant_mmap_single); if (devsw->d_dump == NULL) devsw->d_dump = no_dump; LIST_INIT(&devsw->d_devs); devsw->d_flags |= D_INIT; if (dsw2 != NULL) cdevsw_free_devlocked(dsw2); return (0); } static int prep_devname(struct cdev *dev, const char *fmt, va_list ap) { int len; char *from, *q, *s, *to; mtx_assert(&devmtx, MA_OWNED); len = vsnrprintf(dev->si_name, sizeof(dev->si_name), 32, fmt, ap); if (len > sizeof(dev->si_name) - 1) return (ENAMETOOLONG); /* Strip leading slashes. */ for (from = dev->si_name; *from == '/'; from++) ; for (to = dev->si_name; *from != '\0'; from++, to++) { /* * Spaces and double quotation marks cause * problems for the devctl(4) protocol. * Reject names containing those characters. */ if (isspace(*from) || *from == '"') return (EINVAL); /* Treat multiple sequential slashes as single. */ while (from[0] == '/' && from[1] == '/') from++; /* Trailing slash is considered invalid. */ if (from[0] == '/' && from[1] == '\0') return (EINVAL); *to = *from; } *to = '\0'; if (dev->si_name[0] == '\0') return (EINVAL); /* Disallow "." and ".." components. */ for (s = dev->si_name;;) { for (q = s; *q != '/' && *q != '\0'; q++) ; if (q - s == 1 && s[0] == '.') return (EINVAL); if (q - s == 2 && s[0] == '.' 
&& s[1] == '.') return (EINVAL); if (*q != '/') break; s = q + 1; } if (devfs_dev_exists(dev->si_name) != 0) return (EEXIST); return (0); } void make_dev_args_init_impl(struct make_dev_args *args, size_t sz) { bzero(args, sz); args->mda_size = sz; } static int make_dev_sv(struct make_dev_args *args1, struct cdev **dres, const char *fmt, va_list ap) { struct cdev *dev, *dev_new; struct make_dev_args args; int res; bzero(&args, sizeof(args)); if (sizeof(args) < args1->mda_size) return (EINVAL); bcopy(args1, &args, args1->mda_size); KASSERT((args.mda_flags & MAKEDEV_WAITOK) == 0 || (args.mda_flags & MAKEDEV_NOWAIT) == 0, ("make_dev_sv: both WAITOK and NOWAIT specified")); dev_new = devfs_alloc(args.mda_flags); if (dev_new == NULL) return (ENOMEM); dev_lock(); res = prep_cdevsw(args.mda_devsw, args.mda_flags); if (res != 0) { dev_unlock(); devfs_free(dev_new); return (res); } dev = newdev(&args, dev_new); if ((dev->si_flags & SI_NAMED) == 0) { res = prep_devname(dev, fmt, ap); if (res != 0) { if ((args.mda_flags & MAKEDEV_CHECKNAME) == 0) { panic( "make_dev_sv: bad si_name (error=%d, si_name=%s)", res, dev->si_name); } if (dev == dev_new) { LIST_REMOVE(dev, si_list); dev_unlock(); devfs_free(dev); } else dev_unlock(); return (res); } } if ((args.mda_flags & MAKEDEV_REF) != 0) dev_refl(dev); if ((args.mda_flags & MAKEDEV_ETERNAL) != 0) dev->si_flags |= SI_ETERNAL; if (dev->si_flags & SI_CHEAPCLONE && dev->si_flags & SI_NAMED) { /* * This is allowed as it removes races and generally * simplifies cloning devices. * XXX: still ?? */ dev_unlock_and_free(); *dres = dev; return (0); } KASSERT(!(dev->si_flags & SI_NAMED), ("make_dev() by driver %s on pre-existing device (min=%x, name=%s)", args.mda_devsw->d_name, dev2unit(dev), devtoname(dev))); dev->si_flags |= SI_NAMED; if (args.mda_cr != NULL) dev->si_cred = crhold(args.mda_cr); dev->si_uid = args.mda_uid; dev->si_gid = args.mda_gid; dev->si_mode = args.mda_mode; devfs_create(dev); clean_unrhdrl(devfs_inos); dev_unlock_and_free(); notify_create(dev, args.mda_flags); *dres = dev; return (0); } int make_dev_s(struct make_dev_args *args, struct cdev **dres, const char *fmt, ...) { va_list ap; int res; va_start(ap, fmt); res = make_dev_sv(args, dres, fmt, ap); va_end(ap); return (res); } static int make_dev_credv(int flags, struct cdev **dres, struct cdevsw *devsw, int unit, struct ucred *cr, uid_t uid, gid_t gid, int mode, const char *fmt, va_list ap) { struct make_dev_args args; make_dev_args_init(&args); args.mda_flags = flags; args.mda_devsw = devsw; args.mda_cr = cr; args.mda_uid = uid; args.mda_gid = gid; args.mda_mode = mode; args.mda_unit = unit; return (make_dev_sv(&args, dres, fmt, ap)); } struct cdev * make_dev(struct cdevsw *devsw, int unit, uid_t uid, gid_t gid, int mode, const char *fmt, ...) { struct cdev *dev; va_list ap; int res; va_start(ap, fmt); res = make_dev_credv(0, &dev, devsw, unit, NULL, uid, gid, mode, fmt, ap); va_end(ap); KASSERT(res == 0 && dev != NULL, ("make_dev: failed make_dev_credv (error=%d)", res)); return (dev); } struct cdev * make_dev_cred(struct cdevsw *devsw, int unit, struct ucred *cr, uid_t uid, gid_t gid, int mode, const char *fmt, ...) 
{ struct cdev *dev; va_list ap; int res; va_start(ap, fmt); res = make_dev_credv(0, &dev, devsw, unit, cr, uid, gid, mode, fmt, ap); va_end(ap); KASSERT(res == 0 && dev != NULL, ("make_dev_cred: failed make_dev_credv (error=%d)", res)); return (dev); } struct cdev * make_dev_credf(int flags, struct cdevsw *devsw, int unit, struct ucred *cr, uid_t uid, gid_t gid, int mode, const char *fmt, ...) { struct cdev *dev; va_list ap; int res; va_start(ap, fmt); res = make_dev_credv(flags, &dev, devsw, unit, cr, uid, gid, mode, fmt, ap); va_end(ap); KASSERT(((flags & MAKEDEV_NOWAIT) != 0 && res == ENOMEM) || ((flags & MAKEDEV_CHECKNAME) != 0 && res != ENOMEM) || res == 0, ("make_dev_credf: failed make_dev_credv (error=%d)", res)); return (res == 0 ? dev : NULL); } int make_dev_p(int flags, struct cdev **cdev, struct cdevsw *devsw, struct ucred *cr, uid_t uid, gid_t gid, int mode, const char *fmt, ...) { va_list ap; int res; va_start(ap, fmt); res = make_dev_credv(flags, cdev, devsw, 0, cr, uid, gid, mode, fmt, ap); va_end(ap); KASSERT(((flags & MAKEDEV_NOWAIT) != 0 && res == ENOMEM) || ((flags & MAKEDEV_CHECKNAME) != 0 && res != ENOMEM) || res == 0, ("make_dev_p: failed make_dev_credv (error=%d)", res)); return (res); } static void dev_dependsl(struct cdev *pdev, struct cdev *cdev) { cdev->si_parent = pdev; cdev->si_flags |= SI_CHILD; LIST_INSERT_HEAD(&pdev->si_children, cdev, si_siblings); } void dev_depends(struct cdev *pdev, struct cdev *cdev) { dev_lock(); dev_dependsl(pdev, cdev); dev_unlock(); } static int make_dev_alias_v(int flags, struct cdev **cdev, struct cdev *pdev, const char *fmt, va_list ap) { struct cdev *dev; int error; KASSERT(pdev != NULL, ("make_dev_alias_v: pdev is NULL")); KASSERT((flags & MAKEDEV_WAITOK) == 0 || (flags & MAKEDEV_NOWAIT) == 0, ("make_dev_alias_v: both WAITOK and NOWAIT specified")); KASSERT((flags & ~(MAKEDEV_WAITOK | MAKEDEV_NOWAIT | MAKEDEV_CHECKNAME)) == 0, ("make_dev_alias_v: invalid flags specified (flags=%02x)", flags)); dev = devfs_alloc(flags); if (dev == NULL) return (ENOMEM); dev_lock(); dev->si_flags |= SI_ALIAS; error = prep_devname(dev, fmt, ap); if (error != 0) { if ((flags & MAKEDEV_CHECKNAME) == 0) { panic("make_dev_alias_v: bad si_name " "(error=%d, si_name=%s)", error, dev->si_name); } dev_unlock(); devfs_free(dev); return (error); } dev->si_flags |= SI_NAMED; devfs_create(dev); dev_dependsl(pdev, dev); clean_unrhdrl(devfs_inos); dev_unlock(); notify_create(dev, flags); *cdev = dev; return (0); } struct cdev * make_dev_alias(struct cdev *pdev, const char *fmt, ...) { struct cdev *dev; va_list ap; int res; va_start(ap, fmt); res = make_dev_alias_v(MAKEDEV_WAITOK, &dev, pdev, fmt, ap); va_end(ap); KASSERT(res == 0 && dev != NULL, ("make_dev_alias: failed make_dev_alias_v (error=%d)", res)); return (dev); } int make_dev_alias_p(int flags, struct cdev **cdev, struct cdev *pdev, const char *fmt, ...) 
{ va_list ap; int res; va_start(ap, fmt); res = make_dev_alias_v(flags, cdev, pdev, fmt, ap); va_end(ap); return (res); } int make_dev_physpath_alias(int flags, struct cdev **cdev, struct cdev *pdev, struct cdev *old_alias, const char *physpath) { char *devfspath; int physpath_len; int max_parentpath_len; int parentpath_len; int devfspathbuf_len; int mflags; int ret; *cdev = NULL; devfspath = NULL; physpath_len = strlen(physpath); ret = EINVAL; if (physpath_len == 0) goto out; if (strncmp("id1,", physpath, 4) == 0) { physpath += 4; physpath_len -= 4; if (physpath_len == 0) goto out; } max_parentpath_len = SPECNAMELEN - physpath_len - /*/*/1; parentpath_len = strlen(pdev->si_name); if (max_parentpath_len < parentpath_len) { if (bootverbose) printf("WARNING: Unable to alias %s " "to %s/%s - path too long\n", pdev->si_name, physpath, pdev->si_name); ret = ENAMETOOLONG; goto out; } mflags = (flags & MAKEDEV_NOWAIT) ? M_NOWAIT : M_WAITOK; devfspathbuf_len = physpath_len + /*/*/1 + parentpath_len + /*NUL*/1; devfspath = malloc(devfspathbuf_len, M_DEVBUF, mflags); if (devfspath == NULL) { ret = ENOMEM; goto out; } sprintf(devfspath, "%s/%s", physpath, pdev->si_name); if (old_alias != NULL && strcmp(old_alias->si_name, devfspath) == 0) { /* Retain the existing alias. */ *cdev = old_alias; old_alias = NULL; ret = 0; } else { ret = make_dev_alias_p(flags, cdev, pdev, "%s", devfspath); } out: if (old_alias != NULL) destroy_dev(old_alias); if (devfspath != NULL) free(devfspath, M_DEVBUF); return (ret); } static void destroy_devl(struct cdev *dev) { struct cdevsw *csw; struct cdev_privdata *p; struct cdev_priv *cdp; mtx_assert(&devmtx, MA_OWNED); KASSERT(dev->si_flags & SI_NAMED, ("WARNING: Driver mistake: destroy_dev on %d\n", dev2unit(dev))); KASSERT((dev->si_flags & SI_ETERNAL) == 0, ("WARNING: Driver mistake: destroy_dev on eternal %d\n", dev2unit(dev))); cdp = cdev2priv(dev); if ((cdp->cdp_flags & CDP_UNREF_DTR) == 0) { /* * Avoid race with dev_rel(), e.g. from the populate * loop. If CDP_UNREF_DTR flag is set, the reference * to be dropped at the end of destroy_devl() was * already taken by delist_dev_locked(). 
*/ dev_refl(dev); devfs_destroy(dev); } /* Remove name marking */ dev->si_flags &= ~SI_NAMED; /* If we are a child, remove us from the parents list */ if (dev->si_flags & SI_CHILD) { LIST_REMOVE(dev, si_siblings); dev->si_flags &= ~SI_CHILD; } /* Kill our children */ while (!LIST_EMPTY(&dev->si_children)) destroy_devl(LIST_FIRST(&dev->si_children)); /* Remove from clone list */ if (dev->si_flags & SI_CLONELIST) { LIST_REMOVE(dev, si_clone); dev->si_flags &= ~SI_CLONELIST; } csw = dev->si_devsw; dev->si_devsw = NULL; /* already NULL for SI_ALIAS */ while (csw != NULL && csw->d_purge != NULL && dev->si_threadcount) { csw->d_purge(dev); msleep(csw, &devmtx, PRIBIO, "devprg", hz/10); if (dev->si_threadcount) printf("Still %lu threads in %s\n", dev->si_threadcount, devtoname(dev)); } while (dev->si_threadcount != 0) { /* Use unique dummy wait ident */ msleep(&csw, &devmtx, PRIBIO, "devdrn", hz / 10); } dev_unlock(); if ((cdp->cdp_flags & CDP_UNREF_DTR) == 0) { /* avoid out of order notify events */ notify_destroy(dev); } mtx_lock(&cdevpriv_mtx); while ((p = LIST_FIRST(&cdp->cdp_fdpriv)) != NULL) { devfs_destroy_cdevpriv(p); mtx_lock(&cdevpriv_mtx); } mtx_unlock(&cdevpriv_mtx); dev_lock(); dev->si_drv1 = 0; dev->si_drv2 = 0; bzero(&dev->__si_u, sizeof(dev->__si_u)); if (!(dev->si_flags & SI_ALIAS)) { /* Remove from cdevsw list */ LIST_REMOVE(dev, si_list); /* If cdevsw has no more struct cdev *'s, clean it */ if (LIST_EMPTY(&csw->d_devs)) { fini_cdevsw(csw); wakeup(&csw->d_devs); } } dev->si_flags &= ~SI_ALIAS; cdp->cdp_flags &= ~CDP_UNREF_DTR; dev->si_refcount--; if (dev->si_refcount > 0) LIST_INSERT_HEAD(&dead_cdevsw.d_devs, dev, si_list); else dev_free_devlocked(dev); } static void delist_dev_locked(struct cdev *dev) { struct cdev_priv *cdp; struct cdev *child; mtx_assert(&devmtx, MA_OWNED); cdp = cdev2priv(dev); if ((cdp->cdp_flags & CDP_UNREF_DTR) != 0) return; cdp->cdp_flags |= CDP_UNREF_DTR; dev_refl(dev); devfs_destroy(dev); LIST_FOREACH(child, &dev->si_children, si_siblings) delist_dev_locked(child); dev_unlock(); /* ensure the destroy event is queued in order */ notify_destroy(dev); dev_lock(); } /* * This function will delist a character device and its children from * the directory listing and create a destroy event without waiting * for all character device references to go away. At some later point * destroy_dev() must be called to complete the character device * destruction. After calling this function the character device name * can instantly be re-used. */ void delist_dev(struct cdev *dev) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "delist_dev"); dev_lock(); delist_dev_locked(dev); dev_unlock(); } void destroy_dev(struct cdev *dev) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "destroy_dev"); dev_lock(); destroy_devl(dev); dev_unlock_and_free(); } const char * devtoname(struct cdev *dev) { return (dev->si_name); } int dev_stdclone(char *name, char **namep, const char *stem, int *unit) { int u, i; i = strlen(stem); if (bcmp(stem, name, i) != 0) return (0); if (!isdigit(name[i])) return (0); u = 0; if (name[i] == '0' && isdigit(name[i+1])) return (0); while (isdigit(name[i])) { u *= 10; u += name[i++] - '0'; } if (u > 0xffffff) return (0); *unit = u; if (namep) *namep = &name[i]; if (name[i]) return (2); return (1); } /* * Helper functions for cloning device drivers. * * The objective here is to make it unnecessary for the device drivers to * use rman or similar to manage their unit number space. 
Due to the way * we do "on-demand" devices, using rman or other "private" methods * will be very tricky to lock down properly once we lock down this file. * * Instead we give the drivers these routines which puts the struct cdev *'s * that are to be managed on their own list, and gives the driver the ability * to ask for the first free unit number or a given specified unit number. * * In addition these routines support paired devices (pty, nmdm and similar) * by respecting a number of "flag" bits in the minor number. * */ struct clonedevs { LIST_HEAD(,cdev) head; }; void clone_setup(struct clonedevs **cdp) { *cdp = malloc(sizeof **cdp, M_DEVBUF, M_WAITOK | M_ZERO); LIST_INIT(&(*cdp)->head); } int clone_create(struct clonedevs **cdp, struct cdevsw *csw, int *up, struct cdev **dp, int extra) { struct clonedevs *cd; struct cdev *dev, *ndev, *dl, *de; struct make_dev_args args; int unit, low, u; KASSERT(*cdp != NULL, ("clone_setup() not called in driver \"%s\"", csw->d_name)); KASSERT(!(extra & CLONE_UNITMASK), ("Illegal extra bits (0x%x) in clone_create", extra)); KASSERT(*up <= CLONE_UNITMASK, ("Too high unit (0x%x) in clone_create", *up)); KASSERT(csw->d_flags & D_NEEDMINOR, ("clone_create() on cdevsw without minor numbers")); /* * Search the list for a lot of things in one go: * A preexisting match is returned immediately. * The lowest free unit number if we are passed -1, and the place * in the list where we should insert that new element. * The place to insert a specified unit number, if applicable * the end of the list. */ unit = *up; ndev = devfs_alloc(MAKEDEV_WAITOK); dev_lock(); prep_cdevsw(csw, MAKEDEV_WAITOK); low = extra; de = dl = NULL; cd = *cdp; LIST_FOREACH(dev, &cd->head, si_clone) { KASSERT(dev->si_flags & SI_CLONELIST, ("Dev %p(%s) should be on clonelist", dev, dev->si_name)); u = dev2unit(dev); if (u == (unit | extra)) { *dp = dev; dev_unlock(); devfs_free(ndev); return (0); } if (unit == -1 && u == low) { low++; de = dev; continue; } else if (u < (unit | extra)) { de = dev; continue; } else if (u > (unit | extra)) { dl = dev; break; } } if (unit == -1) unit = low & CLONE_UNITMASK; make_dev_args_init(&args); args.mda_unit = unit | extra; args.mda_devsw = csw; dev = newdev(&args, ndev); if (dev->si_flags & SI_CLONELIST) { printf("dev %p (%s) is on clonelist\n", dev, dev->si_name); printf("unit=%d, low=%d, extra=0x%x\n", unit, low, extra); LIST_FOREACH(dev, &cd->head, si_clone) { printf("\t%p %s\n", dev, dev->si_name); } panic("foo"); } KASSERT(!(dev->si_flags & SI_CLONELIST), ("Dev %p(%s) should not be on clonelist", dev, dev->si_name)); if (dl != NULL) LIST_INSERT_BEFORE(dl, dev, si_clone); else if (de != NULL) LIST_INSERT_AFTER(de, dev, si_clone); else LIST_INSERT_HEAD(&cd->head, dev, si_clone); dev->si_flags |= SI_CLONELIST; *up = unit; dev_unlock_and_free(); return (1); } /* * Kill everything still on the list. The driver should already have * disposed of any softc hung of the struct cdev *'s at this time. 
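*/

/*
 * clone_create() above finds the lowest free unit with one pass over
 * the unit-sorted clone list: while the list matches 0, 1, 2, ... the
 * candidate is bumped past each element, and the first gap wins.  The
 * same scan over a plain sorted array, as a runnable sketch (the
 * per-device "extra" flag bits are omitted for clarity):
 */
#include <stdio.h>

int
main(void)
{
        int used[] = { 0, 1, 2, 4, 7 }; /* units in use, sorted */
        int n = sizeof(used) / sizeof(used[0]);
        int low = 0;                    /* candidate unit */

        for (int i = 0; i < n; i++) {
                if (used[i] == low)
                        low++;          /* taken: try the next unit */
                else if (used[i] > low)
                        break;          /* gap found: low is free */
        }
        printf("lowest free unit: %d\n", low);  /* prints 3 */
        return (0);
}

/*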
*/ void clone_cleanup(struct clonedevs **cdp) { struct cdev *dev; struct cdev_priv *cp; struct clonedevs *cd; cd = *cdp; if (cd == NULL) return; dev_lock(); while (!LIST_EMPTY(&cd->head)) { dev = LIST_FIRST(&cd->head); LIST_REMOVE(dev, si_clone); KASSERT(dev->si_flags & SI_CLONELIST, ("Dev %p(%s) should be on clonelist", dev, dev->si_name)); dev->si_flags &= ~SI_CLONELIST; cp = cdev2priv(dev); if (!(cp->cdp_flags & CDP_SCHED_DTR)) { cp->cdp_flags |= CDP_SCHED_DTR; KASSERT(dev->si_flags & SI_NAMED, ("Driver has goofed in cloning underways udev %jx unit %x", (uintmax_t)dev2udev(dev), dev2unit(dev))); destroy_devl(dev); } } dev_unlock_and_free(); free(cd, M_DEVBUF); *cdp = NULL; } static TAILQ_HEAD(, cdev_priv) dev_ddtr = TAILQ_HEAD_INITIALIZER(dev_ddtr); static struct task dev_dtr_task = TASK_INITIALIZER(0, destroy_dev_tq, NULL); static void destroy_dev_tq(void *ctx, int pending) { struct cdev_priv *cp; struct cdev *dev; void (*cb)(void *); void *cb_arg; dev_lock(); while (!TAILQ_EMPTY(&dev_ddtr)) { cp = TAILQ_FIRST(&dev_ddtr); dev = &cp->cdp_c; KASSERT(cp->cdp_flags & CDP_SCHED_DTR, ("cdev %p in dev_destroy_tq without CDP_SCHED_DTR", cp)); TAILQ_REMOVE(&dev_ddtr, cp, cdp_dtr_list); cb = cp->cdp_dtr_cb; cb_arg = cp->cdp_dtr_cb_arg; destroy_devl(dev); dev_unlock_and_free(); dev_rel(dev); if (cb != NULL) cb(cb_arg); dev_lock(); } dev_unlock(); } /* * devmtx shall be locked on entry. devmtx will be unlocked after * function return. */ static int destroy_dev_sched_cbl(struct cdev *dev, void (*cb)(void *), void *arg) { struct cdev_priv *cp; mtx_assert(&devmtx, MA_OWNED); cp = cdev2priv(dev); if (cp->cdp_flags & CDP_SCHED_DTR) { dev_unlock(); return (0); } dev_refl(dev); cp->cdp_flags |= CDP_SCHED_DTR; cp->cdp_dtr_cb = cb; cp->cdp_dtr_cb_arg = arg; TAILQ_INSERT_TAIL(&dev_ddtr, cp, cdp_dtr_list); dev_unlock(); taskqueue_enqueue(taskqueue_swi_giant, &dev_dtr_task); return (1); } int destroy_dev_sched_cb(struct cdev *dev, void (*cb)(void *), void *arg) { dev_lock(); return (destroy_dev_sched_cbl(dev, cb, arg)); } int destroy_dev_sched(struct cdev *dev) { return (destroy_dev_sched_cb(dev, NULL, NULL)); } void destroy_dev_drain(struct cdevsw *csw) { dev_lock(); while (!LIST_EMPTY(&csw->d_devs)) { msleep(&csw->d_devs, &devmtx, PRIBIO, "devscd", hz/10); } dev_unlock(); } void drain_dev_clone_events(void) { sx_xlock(&clone_drain_lock); sx_xunlock(&clone_drain_lock); } #include "opt_ddb.h" #ifdef DDB #include #include DB_SHOW_COMMAND(cdev, db_show_cdev) { struct cdev_priv *cdp; struct cdev *dev; u_int flags; char buf[512]; if (!have_addr) { TAILQ_FOREACH(cdp, &cdevp_list, cdp_list) { dev = &cdp->cdp_c; db_printf("%s %p\n", dev->si_name, dev); if (db_pager_quit) break; } return; } dev = (struct cdev *)addr; cdp = cdev2priv(dev); db_printf("dev %s ref %d use %ld thr %ld inuse %u fdpriv %p\n", dev->si_name, dev->si_refcount, dev->si_usecount, dev->si_threadcount, cdp->cdp_inuse, cdp->cdp_fdpriv.lh_first); db_printf("devsw %p si_drv0 %d si_drv1 %p si_drv2 %p\n", dev->si_devsw, dev->si_drv0, dev->si_drv1, dev->si_drv2); flags = dev->si_flags; #define SI_FLAG(flag) do { \ if (flags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 3, sizeof(buf)); \ flags &= ~(flag); \ } \ } while (0) buf[0] = '\0'; SI_FLAG(SI_ETERNAL); SI_FLAG(SI_ALIAS); SI_FLAG(SI_NAMED); SI_FLAG(SI_CHEAPCLONE); SI_FLAG(SI_CHILD); SI_FLAG(SI_DUMPDEV); SI_FLAG(SI_CLONELIST); db_printf("si_flags %s\n", buf); flags = cdp->cdp_flags; #define CDP_FLAG(flag) do { \ if (flags & (flag)) { \ if (buf[0] != 
'\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 4, sizeof(buf)); \ flags &= ~(flag); \ } \ } while (0) buf[0] = '\0'; CDP_FLAG(CDP_ACTIVE); CDP_FLAG(CDP_SCHED_DTR); db_printf("cdp_flags %s\n", buf); } #endif Index: head/sys/kern/kern_context.c =================================================================== --- head/sys/kern/kern_context.c (revision 326270) +++ head/sys/kern/kern_context.c (revision 326271) @@ -1,129 +1,131 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2002 Daniel M. Eischen * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include /* * The first two fields of a ucontext_t are the signal mask and the machine * context. The next field is uc_link; we want to avoid destroying the link * when copying out contexts. */ #define UC_COPY_SIZE offsetof(ucontext_t, uc_link) #ifndef _SYS_SYSPROTO_H_ struct getcontext_args { struct __ucontext *ucp; } struct setcontext_args { const struct __ucontext_t *ucp; } struct swapcontext_args { struct __ucontext *oucp; const struct __ucontext_t *ucp; } #endif int sys_getcontext(struct thread *td, struct getcontext_args *uap) { ucontext_t uc; int ret; if (uap->ucp == NULL) ret = EINVAL; else { get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET); PROC_LOCK(td->td_proc); uc.uc_sigmask = td->td_sigmask; PROC_UNLOCK(td->td_proc); bzero(uc.__spare__, sizeof(uc.__spare__)); ret = copyout(&uc, uap->ucp, UC_COPY_SIZE); } return (ret); } int sys_setcontext(struct thread *td, struct setcontext_args *uap) { ucontext_t uc; int ret; if (uap->ucp == NULL) ret = EINVAL; else { ret = copyin(uap->ucp, &uc, UC_COPY_SIZE); if (ret == 0) { ret = set_mcontext(td, &uc.uc_mcontext); if (ret == 0) { kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0); } } } return (ret == 0 ? 
EJUSTRETURN : ret); } int sys_swapcontext(struct thread *td, struct swapcontext_args *uap) { ucontext_t uc; int ret; if (uap->oucp == NULL || uap->ucp == NULL) ret = EINVAL; else { get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET); bzero(uc.__spare__, sizeof(uc.__spare__)); PROC_LOCK(td->td_proc); uc.uc_sigmask = td->td_sigmask; PROC_UNLOCK(td->td_proc); ret = copyout(&uc, uap->oucp, UC_COPY_SIZE); if (ret == 0) { ret = copyin(uap->ucp, &uc, UC_COPY_SIZE); if (ret == 0) { ret = set_mcontext(td, &uc.uc_mcontext); if (ret == 0) { kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0); } } } } return (ret == 0 ? EJUSTRETURN : ret); } Index: head/sys/kern/kern_cpu.c =================================================================== --- head/sys/kern/kern_cpu.c (revision 326270) +++ head/sys/kern/kern_cpu.c (revision 326271) @@ -1,1067 +1,1069 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004-2007 Nate Lawson (SDG) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "cpufreq_if.h" /* * Common CPU frequency glue code. Drivers for specific hardware can * attach this interface to allow users to get/set the CPU frequency. */ /* * Number of levels we can handle. Levels are synthesized from settings * so for M settings and N drivers, there may be M*N levels. */ #define CF_MAX_LEVELS 64 struct cf_saved_freq { struct cf_level level; int priority; SLIST_ENTRY(cf_saved_freq) link; }; struct cpufreq_softc { struct sx lock; struct cf_level curr_level; int curr_priority; SLIST_HEAD(, cf_saved_freq) saved_freq; struct cf_level_lst all_levels; int all_count; int max_mhz; device_t dev; struct sysctl_ctx_list sysctl_ctx; struct task startup_task; struct cf_level *levels_buf; }; struct cf_setting_array { struct cf_setting sets[MAX_SETTINGS]; int count; TAILQ_ENTRY(cf_setting_array) link; }; TAILQ_HEAD(cf_setting_lst, cf_setting_array); #define CF_MTX_INIT(x) sx_init((x), "cpufreq lock") #define CF_MTX_LOCK(x) sx_xlock((x)) #define CF_MTX_UNLOCK(x) sx_xunlock((x)) #define CF_MTX_ASSERT(x) sx_assert((x), SX_XLOCKED) #define CF_DEBUG(msg...) 
do { \ if (cf_verbose) \ printf("cpufreq: " msg); \ } while (0) static int cpufreq_attach(device_t dev); static void cpufreq_startup_task(void *ctx, int pending); static int cpufreq_detach(device_t dev); static int cf_set_method(device_t dev, const struct cf_level *level, int priority); static int cf_get_method(device_t dev, struct cf_level *level); static int cf_levels_method(device_t dev, struct cf_level *levels, int *count); static int cpufreq_insert_abs(struct cpufreq_softc *sc, struct cf_setting *sets, int count); static int cpufreq_expand_set(struct cpufreq_softc *sc, struct cf_setting_array *set_arr); static struct cf_level *cpufreq_dup_set(struct cpufreq_softc *sc, struct cf_level *dup, struct cf_setting *set); static int cpufreq_curr_sysctl(SYSCTL_HANDLER_ARGS); static int cpufreq_levels_sysctl(SYSCTL_HANDLER_ARGS); static int cpufreq_settings_sysctl(SYSCTL_HANDLER_ARGS); static device_method_t cpufreq_methods[] = { DEVMETHOD(device_probe, bus_generic_probe), DEVMETHOD(device_attach, cpufreq_attach), DEVMETHOD(device_detach, cpufreq_detach), DEVMETHOD(cpufreq_set, cf_set_method), DEVMETHOD(cpufreq_get, cf_get_method), DEVMETHOD(cpufreq_levels, cf_levels_method), {0, 0} }; static driver_t cpufreq_driver = { "cpufreq", cpufreq_methods, sizeof(struct cpufreq_softc) }; static devclass_t cpufreq_dc; DRIVER_MODULE(cpufreq, cpu, cpufreq_driver, cpufreq_dc, 0, 0); static int cf_lowest_freq; static int cf_verbose; static SYSCTL_NODE(_debug, OID_AUTO, cpufreq, CTLFLAG_RD, NULL, "cpufreq debugging"); SYSCTL_INT(_debug_cpufreq, OID_AUTO, lowest, CTLFLAG_RWTUN, &cf_lowest_freq, 1, "Don't provide levels below this frequency."); SYSCTL_INT(_debug_cpufreq, OID_AUTO, verbose, CTLFLAG_RWTUN, &cf_verbose, 1, "Print verbose debugging messages"); static int cpufreq_attach(device_t dev) { struct cpufreq_softc *sc; struct pcpu *pc; device_t parent; uint64_t rate; int numdevs; CF_DEBUG("initializing %s\n", device_get_nameunit(dev)); sc = device_get_softc(dev); parent = device_get_parent(dev); sc->dev = dev; sysctl_ctx_init(&sc->sysctl_ctx); TAILQ_INIT(&sc->all_levels); CF_MTX_INIT(&sc->lock); sc->curr_level.total_set.freq = CPUFREQ_VAL_UNKNOWN; SLIST_INIT(&sc->saved_freq); /* Try to get nominal CPU freq to use it as maximum later if needed */ sc->max_mhz = cpu_get_nominal_mhz(dev); /* If that fails, try to measure the current rate */ if (sc->max_mhz <= 0) { pc = cpu_get_pcpu(dev); if (cpu_est_clockrate(pc->pc_cpuid, &rate) == 0) sc->max_mhz = rate / 1000000; else sc->max_mhz = CPUFREQ_VAL_UNKNOWN; } /* * Only initialize one set of sysctls for all CPUs. In the future, * if multiple CPUs can have different settings, we can move these * sysctls to be under every CPU instead of just the first one. */ numdevs = devclass_get_count(cpufreq_dc); if (numdevs > 1) return (0); CF_DEBUG("initializing one-time data for %s\n", device_get_nameunit(dev)); sc->levels_buf = malloc(CF_MAX_LEVELS * sizeof(*sc->levels_buf), M_DEVBUF, M_WAITOK); SYSCTL_ADD_PROC(&sc->sysctl_ctx, SYSCTL_CHILDREN(device_get_sysctl_tree(parent)), OID_AUTO, "freq", CTLTYPE_INT | CTLFLAG_RW, sc, 0, cpufreq_curr_sysctl, "I", "Current CPU frequency"); SYSCTL_ADD_PROC(&sc->sysctl_ctx, SYSCTL_CHILDREN(device_get_sysctl_tree(parent)), OID_AUTO, "freq_levels", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, cpufreq_levels_sysctl, "A", "CPU frequency levels"); /* * Queue a one-shot broadcast that levels have changed. * It will run once the system has completed booting. 
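 * (Editor's note: the task handler below simply forwards to
 * cpufreq_settings_changed(); queueing it on taskqueue_thread defers
 * the broadcast until the scheduler is running.  From userland, the
 * sysctls created above surface as dev.cpu.0.freq and
 * dev.cpu.0.freq_levels.)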
*/ TASK_INIT(&sc->startup_task, 0, cpufreq_startup_task, dev); taskqueue_enqueue(taskqueue_thread, &sc->startup_task); return (0); } /* Handle any work to be done for all drivers that attached during boot. */ static void cpufreq_startup_task(void *ctx, int pending) { cpufreq_settings_changed((device_t)ctx); } static int cpufreq_detach(device_t dev) { struct cpufreq_softc *sc; struct cf_saved_freq *saved_freq; int numdevs; CF_DEBUG("shutdown %s\n", device_get_nameunit(dev)); sc = device_get_softc(dev); sysctl_ctx_free(&sc->sysctl_ctx); while ((saved_freq = SLIST_FIRST(&sc->saved_freq)) != NULL) { SLIST_REMOVE_HEAD(&sc->saved_freq, link); free(saved_freq, M_TEMP); } /* Only clean up these resources when the last device is detaching. */ numdevs = devclass_get_count(cpufreq_dc); if (numdevs == 1) { CF_DEBUG("final shutdown for %s\n", device_get_nameunit(dev)); free(sc->levels_buf, M_DEVBUF); } return (0); } static int cf_set_method(device_t dev, const struct cf_level *level, int priority) { struct cpufreq_softc *sc; const struct cf_setting *set; struct cf_saved_freq *saved_freq, *curr_freq; struct pcpu *pc; int error, i; sc = device_get_softc(dev); error = 0; set = NULL; saved_freq = NULL; /* We are going to change levels so notify the pre-change handler. */ EVENTHANDLER_INVOKE(cpufreq_pre_change, level, &error); if (error != 0) { EVENTHANDLER_INVOKE(cpufreq_post_change, level, error); return (error); } CF_MTX_LOCK(&sc->lock); #ifdef SMP #ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); #else /* * If still booting and secondary CPUs not started yet, don't allow * changing the frequency until they're online. This is because we * can't switch to them using sched_bind() and thus we'd only be * switching the main CPU. XXXTODO: Need to think more about how to * handle having different CPUs at different frequencies. */ if (mp_ncpus > 1 && !smp_started) { device_printf(dev, "rejecting change, SMP not started yet\n"); error = ENXIO; goto out; } #endif #endif /* SMP */ /* * If the requested level has a lower priority, don't allow * the new level right now. */ if (priority < sc->curr_priority) { CF_DEBUG("ignoring, curr prio %d less than %d\n", priority, sc->curr_priority); error = EPERM; goto out; } /* * If the caller didn't specify a level and one is saved, prepare to * restore the saved level. If none has been saved, return an error. */ if (level == NULL) { saved_freq = SLIST_FIRST(&sc->saved_freq); if (saved_freq == NULL) { CF_DEBUG("NULL level, no saved level\n"); error = ENXIO; goto out; } level = &saved_freq->level; priority = saved_freq->priority; CF_DEBUG("restoring saved level, freq %d prio %d\n", level->total_set.freq, priority); } /* Reject levels that are below our specified threshold. */ if (level->total_set.freq < cf_lowest_freq) { CF_DEBUG("rejecting freq %d, less than %d limit\n", level->total_set.freq, cf_lowest_freq); error = EINVAL; goto out; } /* If already at this level, just return. */ if (sc->curr_level.total_set.freq == level->total_set.freq) { CF_DEBUG("skipping freq %d, same as current level %d\n", level->total_set.freq, sc->curr_level.total_set.freq); goto skip; } /* First, set the absolute frequency via its driver. */ set = &level->abs_set; if (set->dev) { if (!device_is_attached(set->dev)) { error = ENXIO; goto out; } /* Bind to the target CPU before switching. 
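 * (Editor's note: sched_bind() pins curthread to the CPU that owns the
 * setting so that the driver's CPUFREQ_DRV_SET() method executes there;
 * frequency controls are typically per-CPU resources, e.g. MSRs.)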
*/ pc = cpu_get_pcpu(set->dev); thread_lock(curthread); sched_bind(curthread, pc->pc_cpuid); thread_unlock(curthread); CF_DEBUG("setting abs freq %d on %s (cpu %d)\n", set->freq, device_get_nameunit(set->dev), PCPU_GET(cpuid)); error = CPUFREQ_DRV_SET(set->dev, set); thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); if (error) { goto out; } } /* Next, set any/all relative frequencies via their drivers. */ for (i = 0; i < level->rel_count; i++) { set = &level->rel_set[i]; if (!device_is_attached(set->dev)) { error = ENXIO; goto out; } /* Bind to the target CPU before switching. */ pc = cpu_get_pcpu(set->dev); thread_lock(curthread); sched_bind(curthread, pc->pc_cpuid); thread_unlock(curthread); CF_DEBUG("setting rel freq %d on %s (cpu %d)\n", set->freq, device_get_nameunit(set->dev), PCPU_GET(cpuid)); error = CPUFREQ_DRV_SET(set->dev, set); thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); if (error) { /* XXX Back out any successful setting? */ goto out; } } skip: /* * Before recording the current level, check if we're going to a * higher priority. If so, save the previous level and priority. */ if (sc->curr_level.total_set.freq != CPUFREQ_VAL_UNKNOWN && priority > sc->curr_priority) { CF_DEBUG("saving level, freq %d prio %d\n", sc->curr_level.total_set.freq, sc->curr_priority); curr_freq = malloc(sizeof(*curr_freq), M_TEMP, M_NOWAIT); if (curr_freq == NULL) { error = ENOMEM; goto out; } curr_freq->level = sc->curr_level; curr_freq->priority = sc->curr_priority; SLIST_INSERT_HEAD(&sc->saved_freq, curr_freq, link); } sc->curr_level = *level; sc->curr_priority = priority; /* If we were restoring a saved state, reset it to "unused". */ if (saved_freq != NULL) { CF_DEBUG("resetting saved level\n"); sc->curr_level.total_set.freq = CPUFREQ_VAL_UNKNOWN; SLIST_REMOVE_HEAD(&sc->saved_freq, link); free(saved_freq, M_TEMP); } out: CF_MTX_UNLOCK(&sc->lock); /* * We changed levels (or attempted to) so notify the post-change * handler of new frequency or error. */ EVENTHANDLER_INVOKE(cpufreq_post_change, level, error); if (error && set) device_printf(set->dev, "set freq failed, err %d\n", error); return (error); } static int cf_get_method(device_t dev, struct cf_level *level) { struct cpufreq_softc *sc; struct cf_level *levels; struct cf_setting *curr_set, set; struct pcpu *pc; device_t *devs; int bdiff, count, diff, error, i, n, numdevs; uint64_t rate; sc = device_get_softc(dev); error = 0; levels = NULL; /* If we already know the current frequency, we're done. */ CF_MTX_LOCK(&sc->lock); curr_set = &sc->curr_level.total_set; if (curr_set->freq != CPUFREQ_VAL_UNKNOWN) { CF_DEBUG("get returning known freq %d\n", curr_set->freq); goto out; } CF_MTX_UNLOCK(&sc->lock); /* * We need to figure out the current level. Loop through every * driver, getting the current setting. Then, attempt to get a best * match of settings against each level. */ count = CF_MAX_LEVELS; levels = malloc(count * sizeof(*levels), M_TEMP, M_NOWAIT); if (levels == NULL) return (ENOMEM); error = CPUFREQ_LEVELS(sc->dev, levels, &count); if (error) { if (error == E2BIG) printf("cpufreq: need to increase CF_MAX_LEVELS\n"); free(levels, M_TEMP); return (error); } error = device_get_children(device_get_parent(dev), &devs, &numdevs); if (error) { free(levels, M_TEMP); return (error); } /* * Reacquire the lock and search for the given level. 
* * XXX Note: this is not quite right since we really need to go * through each level and compare both absolute and relative * settings for each driver in the system before making a match. * The estimation code below catches this case though. */ CF_MTX_LOCK(&sc->lock); for (n = 0; n < numdevs && curr_set->freq == CPUFREQ_VAL_UNKNOWN; n++) { if (!device_is_attached(devs[n])) continue; if (CPUFREQ_DRV_GET(devs[n], &set) != 0) continue; for (i = 0; i < count; i++) { if (set.freq == levels[i].total_set.freq) { sc->curr_level = levels[i]; break; } } } free(devs, M_TEMP); if (curr_set->freq != CPUFREQ_VAL_UNKNOWN) { CF_DEBUG("get matched freq %d from drivers\n", curr_set->freq); goto out; } /* * We couldn't find an exact match, so attempt to estimate and then * match against a level. */ pc = cpu_get_pcpu(dev); if (pc == NULL) { error = ENXIO; goto out; } cpu_est_clockrate(pc->pc_cpuid, &rate); rate /= 1000000; bdiff = 1 << 30; for (i = 0; i < count; i++) { diff = abs(levels[i].total_set.freq - rate); if (diff < bdiff) { bdiff = diff; sc->curr_level = levels[i]; } } CF_DEBUG("get estimated freq %d\n", curr_set->freq); out: if (error == 0) *level = sc->curr_level; CF_MTX_UNLOCK(&sc->lock); if (levels) free(levels, M_TEMP); return (error); } static int cf_levels_method(device_t dev, struct cf_level *levels, int *count) { struct cf_setting_array *set_arr; struct cf_setting_lst rel_sets; struct cpufreq_softc *sc; struct cf_level *lev; struct cf_setting *sets; struct pcpu *pc; device_t *devs; int error, i, numdevs, set_count, type; uint64_t rate; if (levels == NULL || count == NULL) return (EINVAL); TAILQ_INIT(&rel_sets); sc = device_get_softc(dev); error = device_get_children(device_get_parent(dev), &devs, &numdevs); if (error) return (error); sets = malloc(MAX_SETTINGS * sizeof(*sets), M_TEMP, M_NOWAIT); if (sets == NULL) { free(devs, M_TEMP); return (ENOMEM); } /* Get settings from all cpufreq drivers. */ CF_MTX_LOCK(&sc->lock); for (i = 0; i < numdevs; i++) { /* Skip devices that aren't ready. */ if (!device_is_attached(devs[i])) continue; /* * Get settings, skipping drivers that offer no settings or * provide settings for informational purposes only. */ error = CPUFREQ_DRV_TYPE(devs[i], &type); if (error || (type & CPUFREQ_FLAG_INFO_ONLY)) { if (error == 0) { CF_DEBUG("skipping info-only driver %s\n", device_get_nameunit(devs[i])); } continue; } set_count = MAX_SETTINGS; error = CPUFREQ_DRV_SETTINGS(devs[i], sets, &set_count); if (error || set_count == 0) continue; /* Add the settings to our absolute/relative lists. */ switch (type & CPUFREQ_TYPE_MASK) { case CPUFREQ_TYPE_ABSOLUTE: error = cpufreq_insert_abs(sc, sets, set_count); break; case CPUFREQ_TYPE_RELATIVE: CF_DEBUG("adding %d relative settings\n", set_count); set_arr = malloc(sizeof(*set_arr), M_TEMP, M_NOWAIT); if (set_arr == NULL) { error = ENOMEM; goto out; } bcopy(sets, set_arr->sets, set_count * sizeof(*sets)); set_arr->count = set_count; TAILQ_INSERT_TAIL(&rel_sets, set_arr, link); break; default: error = EINVAL; } if (error) goto out; } /* * If there are no absolute levels, create a fake one at 100%. We * then cache the clockrate for later use as our base frequency. */ if (TAILQ_EMPTY(&sc->all_levels)) { if (sc->max_mhz == CPUFREQ_VAL_UNKNOWN) { sc->max_mhz = cpu_get_nominal_mhz(dev); /* * If the CPU can't report a rate for 100%, hope * the CPU is running at its nominal rate right now, * and use that instead. 
*/ if (sc->max_mhz <= 0) { pc = cpu_get_pcpu(dev); cpu_est_clockrate(pc->pc_cpuid, &rate); sc->max_mhz = rate / 1000000; } } memset(&sets[0], CPUFREQ_VAL_UNKNOWN, sizeof(*sets)); sets[0].freq = sc->max_mhz; sets[0].dev = NULL; error = cpufreq_insert_abs(sc, sets, 1); if (error) goto out; } /* Create a combined list of absolute + relative levels. */ TAILQ_FOREACH(set_arr, &rel_sets, link) cpufreq_expand_set(sc, set_arr); /* If the caller doesn't have enough space, return the actual count. */ if (sc->all_count > *count) { *count = sc->all_count; error = E2BIG; goto out; } /* Finally, output the list of levels. */ i = 0; TAILQ_FOREACH(lev, &sc->all_levels, link) { /* Skip levels that have a frequency that is too low. */ if (lev->total_set.freq < cf_lowest_freq) { sc->all_count--; continue; } levels[i] = *lev; i++; } *count = sc->all_count; error = 0; out: /* Clear all levels since we regenerate them each time. */ while ((lev = TAILQ_FIRST(&sc->all_levels)) != NULL) { TAILQ_REMOVE(&sc->all_levels, lev, link); free(lev, M_TEMP); } sc->all_count = 0; CF_MTX_UNLOCK(&sc->lock); while ((set_arr = TAILQ_FIRST(&rel_sets)) != NULL) { TAILQ_REMOVE(&rel_sets, set_arr, link); free(set_arr, M_TEMP); } free(devs, M_TEMP); free(sets, M_TEMP); return (error); } /* * Create levels for an array of absolute settings and insert them in * sorted order in the specified list. */ static int cpufreq_insert_abs(struct cpufreq_softc *sc, struct cf_setting *sets, int count) { struct cf_level_lst *list; struct cf_level *level, *search; int i; CF_MTX_ASSERT(&sc->lock); list = &sc->all_levels; for (i = 0; i < count; i++) { level = malloc(sizeof(*level), M_TEMP, M_NOWAIT | M_ZERO); if (level == NULL) return (ENOMEM); level->abs_set = sets[i]; level->total_set = sets[i]; level->total_set.dev = NULL; sc->all_count++; if (TAILQ_EMPTY(list)) { CF_DEBUG("adding abs setting %d at head\n", sets[i].freq); TAILQ_INSERT_HEAD(list, level, link); continue; } TAILQ_FOREACH_REVERSE(search, list, cf_level_lst, link) { if (sets[i].freq <= search->total_set.freq) { CF_DEBUG("adding abs setting %d after %d\n", sets[i].freq, search->total_set.freq); TAILQ_INSERT_AFTER(list, search, level, link); break; } } } return (0); } /* * Expand a group of relative settings, creating derived levels from them. */ static int cpufreq_expand_set(struct cpufreq_softc *sc, struct cf_setting_array *set_arr) { struct cf_level *fill, *search; struct cf_setting *set; int i; CF_MTX_ASSERT(&sc->lock); /* * Walk the set of all existing levels in reverse. This is so we * create derived states from the lowest absolute settings first * and discard duplicates created from higher absolute settings. * For instance, a level of 50 MHz derived from 100 MHz + 50% is * preferable to 200 MHz + 25% because absolute settings are more * efficient since they often change the voltage as well. */ TAILQ_FOREACH_REVERSE(search, &sc->all_levels, cf_level_lst, link) { /* Add each setting to the level, duplicating if necessary. */ for (i = 0; i < set_arr->count; i++) { set = &set_arr->sets[i]; /* * If this setting is less than 100%, split the level * into two and add this setting to the new level. */ fill = search; if (set->freq < 10000) { fill = cpufreq_dup_set(sc, search, set); /* * The new level was a duplicate of an existing * level or its absolute setting is too high, * so we freed it. For example, we discard a * derived level of 1000 MHz/25% if a level * of 500 MHz/100% already exists. */ if (fill == NULL) break; } /* Add this setting to the existing or new level.
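 * (Editor's worked example: combining a 2000 MHz absolute level with a
 * 7500 (75%) relative setting creates a derived level of
 * 2000 * 7500 / 10000 = 1500 MHz in cpufreq_dup_set(); a 10000 (100%)
 * setting is attached to the existing level without splitting it.)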
*/ KASSERT(fill->rel_count < MAX_SETTINGS, ("cpufreq: too many relative drivers (%d)", MAX_SETTINGS)); fill->rel_set[fill->rel_count] = *set; fill->rel_count++; CF_DEBUG( "expand set added rel setting %d%% to %d level\n", set->freq / 100, fill->total_set.freq); } } return (0); } static struct cf_level * cpufreq_dup_set(struct cpufreq_softc *sc, struct cf_level *dup, struct cf_setting *set) { struct cf_level_lst *list; struct cf_level *fill, *itr; struct cf_setting *fill_set, *itr_set; int i; CF_MTX_ASSERT(&sc->lock); /* * Create a new level, copy it from the old one, and update the * total frequency and power by the percentage specified in the * relative setting. */ fill = malloc(sizeof(*fill), M_TEMP, M_NOWAIT); if (fill == NULL) return (NULL); *fill = *dup; fill_set = &fill->total_set; fill_set->freq = ((uint64_t)fill_set->freq * set->freq) / 10000; if (fill_set->power != CPUFREQ_VAL_UNKNOWN) { fill_set->power = ((uint64_t)fill_set->power * set->freq) / 10000; } if (set->lat != CPUFREQ_VAL_UNKNOWN) { if (fill_set->lat != CPUFREQ_VAL_UNKNOWN) fill_set->lat += set->lat; else fill_set->lat = set->lat; } CF_DEBUG("dup set considering derived setting %d\n", fill_set->freq); /* * If we copied an old level that we already modified (say, at 100%), * we need to remove that setting before adding this one. Since we * process each setting array in order, we know any settings for this * driver will be found at the end. */ for (i = fill->rel_count; i != 0; i--) { if (fill->rel_set[i - 1].dev != set->dev) break; CF_DEBUG("removed last relative driver: %s\n", device_get_nameunit(set->dev)); fill->rel_count--; } /* * Insert the new level in sorted order. If it is a duplicate of an * existing level (1) or has an absolute setting higher than the * existing level (2), do not add it. We can do this since any such * level is guaranteed to use less power. For example (1), a level with * one absolute setting of 800 MHz uses less power than one composed * of an absolute setting of 1600 MHz and a relative setting at 50%. * Also for example (2), a level of 800 MHz/75% is preferable to * 1600 MHz/25% even though the latter has a lower total frequency. */ list = &sc->all_levels; KASSERT(!TAILQ_EMPTY(list), ("all levels list empty in dup set")); TAILQ_FOREACH_REVERSE(itr, list, cf_level_lst, link) { itr_set = &itr->total_set; if (CPUFREQ_CMP(fill_set->freq, itr_set->freq)) { CF_DEBUG("dup set rejecting %d (dupe)\n", fill_set->freq); itr = NULL; break; } else if (fill_set->freq < itr_set->freq) { if (fill->abs_set.freq <= itr->abs_set.freq) { CF_DEBUG( "dup done, inserting new level %d after %d\n", fill_set->freq, itr_set->freq); TAILQ_INSERT_AFTER(list, itr, fill, link); sc->all_count++; } else { CF_DEBUG("dup set rejecting %d (abs too big)\n", fill_set->freq); itr = NULL; } break; } } /* We didn't find a good place for this new level, so free it.
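 * (Editor's note: itr is NULL here either because the reverse walk
 * above rejected the level as a duplicate or as less efficient than an
 * existing one, or because the walk ended without finding an existing
 * level faster than the new one.)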
*/ if (itr == NULL) { CF_DEBUG("dup set freeing new level %d (not optimal)\n", fill_set->freq); free(fill, M_TEMP); fill = NULL; } return (fill); } static int cpufreq_curr_sysctl(SYSCTL_HANDLER_ARGS) { struct cpufreq_softc *sc; struct cf_level *levels; int best, count, diff, bdiff, devcount, error, freq, i, n; device_t *devs; devs = NULL; sc = oidp->oid_arg1; levels = sc->levels_buf; error = CPUFREQ_GET(sc->dev, &levels[0]); if (error) goto out; freq = levels[0].total_set.freq; error = sysctl_handle_int(oidp, &freq, 0, req); if (error != 0 || req->newptr == NULL) goto out; /* * While we only call cpufreq_get() on one device (assuming all * CPUs have equal levels), we call cpufreq_set() on all CPUs. * This is needed for some MP systems. */ error = devclass_get_devices(cpufreq_dc, &devs, &devcount); if (error) goto out; for (n = 0; n < devcount; n++) { count = CF_MAX_LEVELS; error = CPUFREQ_LEVELS(devs[n], levels, &count); if (error) { if (error == E2BIG) printf( "cpufreq: need to increase CF_MAX_LEVELS\n"); break; } best = 0; bdiff = 1 << 30; for (i = 0; i < count; i++) { diff = abs(levels[i].total_set.freq - freq); if (diff < bdiff) { bdiff = diff; best = i; } } error = CPUFREQ_SET(devs[n], &levels[best], CPUFREQ_PRIO_USER); } out: if (devs) free(devs, M_TEMP); return (error); } static int cpufreq_levels_sysctl(SYSCTL_HANDLER_ARGS) { struct cpufreq_softc *sc; struct cf_level *levels; struct cf_setting *set; struct sbuf sb; int count, error, i; sc = oidp->oid_arg1; sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND); /* Get settings from the device and generate the output string. */ count = CF_MAX_LEVELS; levels = sc->levels_buf; if (levels == NULL) { sbuf_delete(&sb); return (ENOMEM); } error = CPUFREQ_LEVELS(sc->dev, levels, &count); if (error) { if (error == E2BIG) printf("cpufreq: need to increase CF_MAX_LEVELS\n"); goto out; } if (count) { for (i = 0; i < count; i++) { set = &levels[i].total_set; sbuf_printf(&sb, "%d/%d ", set->freq, set->power); } } else sbuf_cpy(&sb, "0"); sbuf_trim(&sb); sbuf_finish(&sb); error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); out: sbuf_delete(&sb); return (error); } static int cpufreq_settings_sysctl(SYSCTL_HANDLER_ARGS) { device_t dev; struct cf_setting *sets; struct sbuf sb; int error, i, set_count; dev = oidp->oid_arg1; sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND); /* Get settings from the device and generate the output string. */ set_count = MAX_SETTINGS; sets = malloc(set_count * sizeof(*sets), M_TEMP, M_NOWAIT); if (sets == NULL) { sbuf_delete(&sb); return (ENOMEM); } error = CPUFREQ_DRV_SETTINGS(dev, sets, &set_count); if (error) goto out; if (set_count) { for (i = 0; i < set_count; i++) sbuf_printf(&sb, "%d/%d ", sets[i].freq, sets[i].power); } else sbuf_cpy(&sb, "0"); sbuf_trim(&sb); sbuf_finish(&sb); error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); out: free(sets, M_TEMP); sbuf_delete(&sb); return (error); } int cpufreq_register(device_t dev) { struct cpufreq_softc *sc; device_t cf_dev, cpu_dev; /* Add a sysctl to get each driver's settings separately. */ SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "freq_settings", CTLTYPE_STRING | CTLFLAG_RD, dev, 0, cpufreq_settings_sysctl, "A", "CPU frequency driver settings"); /* * Add only one cpufreq device to each CPU. Currently, all CPUs * must offer the same levels and be switched at the same time. 
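 * (Editor's note: when a cpufreq child already exists, the code below
 * only resets max_mhz to "unknown", which forces the level list to be
 * rebuilt with the newly registered driver's settings on the next
 * query.)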
*/ cpu_dev = device_get_parent(dev); if ((cf_dev = device_find_child(cpu_dev, "cpufreq", -1))) { sc = device_get_softc(cf_dev); sc->max_mhz = CPUFREQ_VAL_UNKNOWN; return (0); } /* Add the child device and possibly sysctls. */ cf_dev = BUS_ADD_CHILD(cpu_dev, 0, "cpufreq", -1); if (cf_dev == NULL) return (ENOMEM); device_quiet(cf_dev); return (device_probe_and_attach(cf_dev)); } int cpufreq_unregister(device_t dev) { device_t cf_dev, *devs; int cfcount, devcount, error, i, type; /* * If this is the last cpufreq child device, remove the control * device as well. We identify cpufreq children by calling a method * they support. */ error = device_get_children(device_get_parent(dev), &devs, &devcount); if (error) return (error); cf_dev = device_find_child(device_get_parent(dev), "cpufreq", -1); if (cf_dev == NULL) { device_printf(dev, "warning: cpufreq_unregister called with no cpufreq device active\n"); free(devs, M_TEMP); return (0); } cfcount = 0; for (i = 0; i < devcount; i++) { if (!device_is_attached(devs[i])) continue; if (CPUFREQ_DRV_TYPE(devs[i], &type) == 0) cfcount++; } if (cfcount <= 1) device_delete_child(device_get_parent(cf_dev), cf_dev); free(devs, M_TEMP); return (0); } int cpufreq_settings_changed(device_t dev) { EVENTHANDLER_INVOKE(cpufreq_levels_changed, device_get_unit(device_get_parent(dev))); return (0); } Index: head/sys/kern/kern_cpuset.c =================================================================== --- head/sys/kern/kern_cpuset.c (revision 326270) +++ head/sys/kern/kern_cpuset.c (revision 326271) @@ -1,1349 +1,1351 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2008, Jeffrey Roberson * All rights reserved. * * Copyright (c) 2008 Nokia Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif /* DDB */ /* * cpusets provide a mechanism for creating and manipulating sets of * processors for the purpose of constraining the scheduling of threads to * specific processors. 
* * Each process belongs to an identified set, by default this is set 1. Each * thread may further restrict the cpus it may run on to a subset of this * named set. This creates an anonymous set which other threads and processes * may not join by number. * * The named set is referred to herein as the 'base' set to avoid ambiguity. * This set is usually a child of a 'root' set while the anonymous set may * simply be referred to as a mask. In the syscall api these are referred to * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here. * * Threads inherit their set from their creator whether it be anonymous or * not. This means that anonymous sets are immutable because they may be * shared. To modify an anonymous set a new set is created with the desired * mask and the same parent as the existing anonymous set. This gives the * illusion of each thread having a private mask. * * Via the syscall apis a user may ask to retrieve or modify the root, base, * or mask that is discovered via a pid, tid, or setid. Modifying a set * modifies all numbered and anonymous child sets to comply with the new mask. * Modifying a pid or tid's mask applies only to that tid but must still * exist within the assigned parent set. * * A thread may not be assigned to a group separate from other threads in * the process. This is to remove ambiguity when the setid is queried with * a pid argument. There is no other technical limitation. * * This somewhat complex arrangement is intended to make it easy for * applications to query available processors and bind their threads to * specific processors while also allowing administrators to dynamically * reprovision by changing sets which apply to groups of processes. * * A simple application should not concern itself with sets at all and * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id * meaning 'curthread'. It may query available cpus for that tid with a * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...). */ static uma_zone_t cpuset_zone; static struct mtx cpuset_lock; static struct setlist cpuset_ids; static struct unrhdr *cpuset_unr; static struct cpuset *cpuset_zero, *cpuset_default; /* Return the size of cpuset_t at the kernel level */ SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD | CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, sizeof(cpuset_t), "sizeof(cpuset_t)"); cpuset_t *cpuset_root; cpuset_t cpuset_domain[MAXMEMDOM]; /* * Acquire a reference to a cpuset, all pointers must be tracked with refs. */ struct cpuset * cpuset_ref(struct cpuset *set) { refcount_acquire(&set->cs_ref); return (set); } /* * Walks up the tree from 'set' to find the root. Returns the root * referenced. */ static struct cpuset * cpuset_refroot(struct cpuset *set) { for (; set->cs_parent != NULL; set = set->cs_parent) if (set->cs_flags & CPU_SET_ROOT) break; cpuset_ref(set); return (set); } /* * Find the first non-anonymous set starting from 'set'. Returns this set * referenced. May return the passed in set with an extra ref if it is * not anonymous. */ static struct cpuset * cpuset_refbase(struct cpuset *set) { if (set->cs_id == CPUSET_INVALID) set = set->cs_parent; cpuset_ref(set); return (set); } /* * Release a reference in a context where it is safe to allocate. 
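 */

/*
 * Illustrative sketch (editor's addition): the reference discipline the
 * functions above and below implement.  Any cpuset pointer kept after a
 * lock is dropped must own a reference; example_hold_base() is a
 * hypothetical helper mirroring the pattern used by kern_cpuset_getid()
 * later in this file.
 */
static struct cpuset *
example_hold_base(struct thread *td)
{
	struct cpuset *set;

	thread_lock(td);
	set = cpuset_refbase(td->td_cpuset);	/* returns with a new ref */
	thread_unlock(td);
	return (set);	/* caller must cpuset_rel() it when done */
}

/*
 * cpuset_rel(), described above, follows: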
*/ void cpuset_rel(struct cpuset *set) { cpusetid_t id; if (refcount_release(&set->cs_ref) == 0) return; mtx_lock_spin(&cpuset_lock); LIST_REMOVE(set, cs_siblings); id = set->cs_id; if (id != CPUSET_INVALID) LIST_REMOVE(set, cs_link); mtx_unlock_spin(&cpuset_lock); cpuset_rel(set->cs_parent); uma_zfree(cpuset_zone, set); if (id != CPUSET_INVALID) free_unr(cpuset_unr, id); } /* * Deferred release must be used when in a context that is not safe to * allocate/free. This places any unreferenced sets on the list 'head'. */ static void cpuset_rel_defer(struct setlist *head, struct cpuset *set) { if (refcount_release(&set->cs_ref) == 0) return; mtx_lock_spin(&cpuset_lock); LIST_REMOVE(set, cs_siblings); if (set->cs_id != CPUSET_INVALID) LIST_REMOVE(set, cs_link); LIST_INSERT_HEAD(head, set, cs_link); mtx_unlock_spin(&cpuset_lock); } /* * Complete a deferred release. Removes the set from the list provided to * cpuset_rel_defer. */ static void cpuset_rel_complete(struct cpuset *set) { LIST_REMOVE(set, cs_link); cpuset_rel(set->cs_parent); uma_zfree(cpuset_zone, set); } /* * Find a set based on an id. Returns it with a ref. */ static struct cpuset * cpuset_lookup(cpusetid_t setid, struct thread *td) { struct cpuset *set; if (setid == CPUSET_INVALID) return (NULL); mtx_lock_spin(&cpuset_lock); LIST_FOREACH(set, &cpuset_ids, cs_link) if (set->cs_id == setid) break; if (set) cpuset_ref(set); mtx_unlock_spin(&cpuset_lock); KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__)); if (set != NULL && jailed(td->td_ucred)) { struct cpuset *jset, *tset; jset = td->td_ucred->cr_prison->pr_cpuset; for (tset = set; tset != NULL; tset = tset->cs_parent) if (tset == jset) break; if (tset == NULL) { cpuset_rel(set); set = NULL; } } return (set); } /* * Create a set in the space provided in 'set' with the provided parameters. * The set is returned with a single ref. May return EDEADLK if the set * will have no valid cpu based on restrictions from the parent. */ static int _cpuset_create(struct cpuset *set, struct cpuset *parent, const cpuset_t *mask, cpusetid_t id) { if (!CPU_OVERLAP(&parent->cs_mask, mask)) return (EDEADLK); CPU_COPY(mask, &set->cs_mask); LIST_INIT(&set->cs_children); refcount_init(&set->cs_ref, 1); set->cs_flags = 0; mtx_lock_spin(&cpuset_lock); CPU_AND(&set->cs_mask, &parent->cs_mask); set->cs_id = id; set->cs_parent = cpuset_ref(parent); LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings); if (set->cs_id != CPUSET_INVALID) LIST_INSERT_HEAD(&cpuset_ids, set, cs_link); mtx_unlock_spin(&cpuset_lock); return (0); } /* * Create a new non-anonymous set with the requested parent and mask. May * return failures if the mask is invalid or a new number can not be * allocated. */ static int cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask) { struct cpuset *set; cpusetid_t id; int error; id = alloc_unr(cpuset_unr); if (id == -1) return (ENFILE); *setp = set = uma_zalloc(cpuset_zone, M_WAITOK); error = _cpuset_create(set, parent, mask, id); if (error == 0) return (0); free_unr(cpuset_unr, id); uma_zfree(cpuset_zone, set); return (error); } /* * Recursively check for errors that would occur from applying mask to * the tree of sets starting at 'set'. Checks for sets that would become * empty as well as RDONLY flags. 
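 * (Editor's worked example: with a child whose mask is {2,3}, applying
 * a parent mask of {0,1,2} succeeds and narrows the child to {2}, while
 * a parent mask of {0,1} fails with EDEADLK because the child would be
 * left with no CPUs.)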
*/ static int cpuset_testupdate(struct cpuset *set, cpuset_t *mask, int check_mask) { struct cpuset *nset; cpuset_t newmask; int error; mtx_assert(&cpuset_lock, MA_OWNED); if (set->cs_flags & CPU_SET_RDONLY) return (EPERM); if (check_mask) { if (!CPU_OVERLAP(&set->cs_mask, mask)) return (EDEADLK); CPU_COPY(&set->cs_mask, &newmask); CPU_AND(&newmask, mask); } else CPU_COPY(mask, &newmask); error = 0; LIST_FOREACH(nset, &set->cs_children, cs_siblings) if ((error = cpuset_testupdate(nset, &newmask, 1)) != 0) break; return (error); } /* * Applies the mask 'mask' without checking for empty sets or permissions. */ static void cpuset_update(struct cpuset *set, cpuset_t *mask) { struct cpuset *nset; mtx_assert(&cpuset_lock, MA_OWNED); CPU_AND(&set->cs_mask, mask); LIST_FOREACH(nset, &set->cs_children, cs_siblings) cpuset_update(nset, &set->cs_mask); return; } /* * Modify the set 'set' to use a copy of the mask provided. Apply this new * mask to restrict all children in the tree. Checks for validity before * applying the changes. */ static int cpuset_modify(struct cpuset *set, cpuset_t *mask) { struct cpuset *root; int error; error = priv_check(curthread, PRIV_SCHED_CPUSET); if (error) return (error); /* * In case we are called from within the jail * we do not allow modifying the dedicated root * cpuset of the jail but may still allow to * change child sets. */ if (jailed(curthread->td_ucred) && set->cs_flags & CPU_SET_ROOT) return (EPERM); /* * Verify that we have access to this set of * cpus. */ root = set->cs_parent; if (root && !CPU_SUBSET(&root->cs_mask, mask)) return (EINVAL); mtx_lock_spin(&cpuset_lock); error = cpuset_testupdate(set, mask, 0); if (error) goto out; CPU_COPY(mask, &set->cs_mask); cpuset_update(set, mask); out: mtx_unlock_spin(&cpuset_lock); return (error); } /* * Resolve the 'which' parameter of several cpuset apis. * * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid. Also * checks for permission via p_cansched(). * * For WHICH_SET returns a valid set with a new reference. * * -1 may be supplied for any argument to mean the current proc/thread or * the base set of the current thread. May fail with ESRCH/EPERM. */ int cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp, struct cpuset **setp) { struct cpuset *set; struct thread *td; struct proc *p; int error; *pp = p = NULL; *tdp = td = NULL; *setp = set = NULL; switch (which) { case CPU_WHICH_PID: if (id == -1) { PROC_LOCK(curproc); p = curproc; break; } if ((p = pfind(id)) == NULL) return (ESRCH); break; case CPU_WHICH_TID: if (id == -1) { PROC_LOCK(curproc); p = curproc; td = curthread; break; } td = tdfind(id, -1); if (td == NULL) return (ESRCH); p = td->td_proc; break; case CPU_WHICH_CPUSET: if (id == -1) { thread_lock(curthread); set = cpuset_refbase(curthread->td_cpuset); thread_unlock(curthread); } else set = cpuset_lookup(id, curthread); if (set) { *setp = set; return (0); } return (ESRCH); case CPU_WHICH_JAIL: { /* Find `set' for prison with given id. 
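 * (Editor's note: prison_find_child() returns the prison with its
 * pr_mtx held, which is why the code below only needs mtx_unlock()
 * after taking a reference on the prison's cpuset.)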
*/ struct prison *pr; sx_slock(&allprison_lock); pr = prison_find_child(curthread->td_ucred->cr_prison, id); sx_sunlock(&allprison_lock); if (pr == NULL) return (ESRCH); cpuset_ref(pr->pr_cpuset); *setp = pr->pr_cpuset; mtx_unlock(&pr->pr_mtx); return (0); } case CPU_WHICH_IRQ: case CPU_WHICH_DOMAIN: return (0); default: return (EINVAL); } error = p_cansched(curthread, p); if (error) { PROC_UNLOCK(p); return (error); } if (td == NULL) td = FIRST_THREAD_IN_PROC(p); *pp = p; *tdp = td; return (0); } /* * Create an anonymous set with the provided mask in the space provided by * 'fset'. If the passed in set is anonymous we use its parent otherwise * the new set is a child of 'set'. */ static int cpuset_shadow(struct cpuset *set, struct cpuset *fset, const cpuset_t *mask) { struct cpuset *parent; if (set->cs_id == CPUSET_INVALID) parent = set->cs_parent; else parent = set; if (!CPU_SUBSET(&parent->cs_mask, mask)) return (EDEADLK); return (_cpuset_create(fset, parent, mask, CPUSET_INVALID)); } /* * Handle two cases for replacing the base set or mask of an entire process. * * 1) Set is non-null and mask is null. This reparents all anonymous sets * to the provided set and replaces all non-anonymous td_cpusets with the * provided set. * 2) Mask is non-null and set is null. This replaces or creates anonymous * sets for every thread with the existing base as a parent. * * This is overly complicated because we can't allocate while holding a * spinlock and spinlocks must be held while changing and examining thread * state. */ static int cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask) { struct setlist freelist; struct setlist droplist; struct cpuset *tdset; struct cpuset *nset; struct thread *td; struct proc *p; int threads; int nfree; int error; /* * The algorithm requires two passes due to locking considerations. * * 1) Lookup the process and acquire the locks in the required order. * 2) If enough cpusets have not been allocated release the locks and * allocate them. Loop. */ LIST_INIT(&freelist); LIST_INIT(&droplist); nfree = 0; for (;;) { error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset); if (error) goto out; if (nfree >= p->p_numthreads) break; threads = p->p_numthreads; PROC_UNLOCK(p); for (; nfree < threads; nfree++) { nset = uma_zalloc(cpuset_zone, M_WAITOK); LIST_INSERT_HEAD(&freelist, nset, cs_link); } } PROC_LOCK_ASSERT(p, MA_OWNED); /* * Now that the appropriate locks are held and we have enough cpusets, * make sure the operation will succeed before applying changes. The * proc lock prevents td_cpuset from changing between calls. */ error = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); tdset = td->td_cpuset; /* * Verify that a new mask doesn't specify cpus outside of * the set the thread is a member of. */ if (mask) { if (tdset->cs_id == CPUSET_INVALID) tdset = tdset->cs_parent; if (!CPU_SUBSET(&tdset->cs_mask, mask)) error = EDEADLK; /* * Verify that a new set won't leave an existing thread * mask without a cpu to run on. It can, however, restrict * the set. */ } else if (tdset->cs_id == CPUSET_INVALID) { if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask)) error = EDEADLK; } thread_unlock(td); if (error) goto unlock_out; } /* * Replace each thread's cpuset while using deferred release. We * must do this because the thread lock must be held while operating * on the thread and this limits the type of operations allowed. 
*/ FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); /* * If we presently have an anonymous set or are applying a * mask we must create an anonymous shadow set. That is * either parented to our existing base or the supplied set. * * If we have a base set with no anonymous shadow we simply * replace it outright. */ tdset = td->td_cpuset; if (tdset->cs_id == CPUSET_INVALID || mask) { nset = LIST_FIRST(&freelist); LIST_REMOVE(nset, cs_link); if (mask) error = cpuset_shadow(tdset, nset, mask); else error = _cpuset_create(nset, set, &tdset->cs_mask, CPUSET_INVALID); if (error) { LIST_INSERT_HEAD(&freelist, nset, cs_link); thread_unlock(td); break; } } else nset = cpuset_ref(set); cpuset_rel_defer(&droplist, tdset); td->td_cpuset = nset; sched_affinity(td); thread_unlock(td); } unlock_out: PROC_UNLOCK(p); out: while ((nset = LIST_FIRST(&droplist)) != NULL) cpuset_rel_complete(nset); while ((nset = LIST_FIRST(&freelist)) != NULL) { LIST_REMOVE(nset, cs_link); uma_zfree(cpuset_zone, nset); } return (error); } /* * Return a string representing a valid layout for a cpuset_t object. * It expects an incoming buffer at least sized as CPUSETBUFSIZ. */ char * cpusetobj_strprint(char *buf, const cpuset_t *set) { char *tbuf; size_t i, bytesp, bufsiz; tbuf = buf; bytesp = 0; bufsiz = CPUSETBUFSIZ; for (i = 0; i < (_NCPUWORDS - 1); i++) { bytesp = snprintf(tbuf, bufsiz, "%lx,", set->__bits[i]); bufsiz -= bytesp; tbuf += bytesp; } snprintf(tbuf, bufsiz, "%lx", set->__bits[_NCPUWORDS - 1]); return (buf); } /* * Build a valid cpuset_t object from a string representation. * It expects an incoming buffer at least sized as CPUSETBUFSIZ. */ int cpusetobj_strscan(cpuset_t *set, const char *buf) { u_int nwords; int i, ret; if (strlen(buf) > CPUSETBUFSIZ - 1) return (-1); /* Allow to pass a shorter version of the mask when necessary. */ nwords = 1; for (i = 0; buf[i] != '\0'; i++) if (buf[i] == ',') nwords++; if (nwords > _NCPUWORDS) return (-1); CPU_ZERO(set); for (i = 0; i < (nwords - 1); i++) { ret = sscanf(buf, "%lx,", &set->__bits[i]); if (ret == 0 || ret == -1) return (-1); buf = strstr(buf, ","); if (buf == NULL) return (-1); buf++; } ret = sscanf(buf, "%lx", &set->__bits[nwords - 1]); if (ret == 0 || ret == -1) return (-1); return (0); } /* * Apply an anonymous mask to a single thread. */ int cpuset_setthread(lwpid_t id, cpuset_t *mask) { struct cpuset *nset; struct cpuset *set; struct thread *td; struct proc *p; int error; nset = uma_zalloc(cpuset_zone, M_WAITOK); error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set); if (error) goto out; set = NULL; thread_lock(td); error = cpuset_shadow(td->td_cpuset, nset, mask); if (error == 0) { set = td->td_cpuset; td->td_cpuset = nset; sched_affinity(td); nset = NULL; } thread_unlock(td); PROC_UNLOCK(p); if (set) cpuset_rel(set); out: if (nset) uma_zfree(cpuset_zone, nset); return (error); } /* * Apply new cpumask to the ithread. */ int cpuset_setithread(lwpid_t id, int cpu) { struct cpuset *nset, *rset; struct cpuset *parent, *old_set; struct thread *td; struct proc *p; cpusetid_t cs_id; cpuset_t mask; int error; nset = uma_zalloc(cpuset_zone, M_WAITOK); rset = uma_zalloc(cpuset_zone, M_WAITOK); cs_id = CPUSET_INVALID; CPU_ZERO(&mask); if (cpu == NOCPU) CPU_COPY(cpuset_root, &mask); else CPU_SET(cpu, &mask); error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &old_set); if (error != 0 || ((cs_id = alloc_unr(cpuset_unr)) == CPUSET_INVALID)) goto out; /* cpuset_which() returns with PROC_LOCK held. 
*/ old_set = td->td_cpuset; if (cpu == NOCPU) { /* * roll back to default set. We're not using cpuset_shadow() * here because we can fail CPU_SUBSET() check. This can happen * if default set does not contain all CPUs. */ error = _cpuset_create(nset, cpuset_default, &mask, CPUSET_INVALID); goto applyset; } if (old_set->cs_id == 1 || (old_set->cs_id == CPUSET_INVALID && old_set->cs_parent->cs_id == 1)) { /* * Current set is either default (1) or * shadowed version of default set. * * Allocate new root set to be able to shadow it * with any mask. */ error = _cpuset_create(rset, cpuset_zero, &cpuset_zero->cs_mask, cs_id); if (error != 0) { PROC_UNLOCK(p); goto out; } rset->cs_flags |= CPU_SET_ROOT; parent = rset; rset = NULL; cs_id = CPUSET_INVALID; } else { /* Assume existing set was already allocated by previous call */ parent = old_set; old_set = NULL; } error = cpuset_shadow(parent, nset, &mask); applyset: if (error == 0) { thread_lock(td); td->td_cpuset = nset; sched_affinity(td); thread_unlock(td); nset = NULL; } else old_set = NULL; PROC_UNLOCK(p); if (old_set != NULL) cpuset_rel(old_set); out: if (nset != NULL) uma_zfree(cpuset_zone, nset); if (rset != NULL) uma_zfree(cpuset_zone, rset); if (cs_id != CPUSET_INVALID) free_unr(cpuset_unr, cs_id); return (error); } /* * Creates system-wide cpusets and the cpuset for thread0 including two * sets: * * 0 - The root set which should represent all valid processors in the * system. It is initially created with a mask of all processors * because we don't know what processors are valid until cpuset_init() * runs. This set is immutable. * 1 - The default set which all processes are a member of until changed. * This allows an administrator to move all threads off of given cpus to * dedicate them to high priority tasks or save power etc. */ struct cpuset * cpuset_thread0(void) { struct cpuset *set; int error, i; cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE); /* * Create the root system set for the whole machine. Doesn't use * cpuset_create() due to NULL parent. */ set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); CPU_FILL(&set->cs_mask); LIST_INIT(&set->cs_children); LIST_INSERT_HEAD(&cpuset_ids, set, cs_link); set->cs_ref = 1; set->cs_flags = CPU_SET_ROOT; cpuset_zero = set; cpuset_root = &set->cs_mask; /* * Now derive a default, modifiable set from that to give out. */ set = uma_zalloc(cpuset_zone, M_WAITOK); error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1); KASSERT(error == 0, ("Error creating default set: %d\n", error)); cpuset_default = set; /* * Initialize the unit allocator. 0 and 1 are allocated above. */ cpuset_unr = new_unrhdr(2, INT_MAX, NULL); /* * If MD code has not initialized per-domain cpusets, place all * CPUs in domain 0. */ for (i = 0; i < MAXMEMDOM; i++) if (!CPU_EMPTY(&cpuset_domain[i])) goto domains_set; CPU_COPY(&all_cpus, &cpuset_domain[0]); domains_set: return (set); } /* * Create a cpuset, which would be cpuset_create() but * mark the new 'set' as root. * * We are not going to reparent the td to it. Use cpuset_setproc_update_set() * for that. * * In case of no error, returns the set in *setp locked with a reference. 
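 */

/*
 * Illustrative sketch (editor's addition): the userland view of the set
 * hierarchy described above.  This is a separate program rather than
 * kernel code, so it is guarded out; cpuset(2) and cpuset_setaffinity(2)
 * are the public system calls implemented later in this file.
 */
#if 0
#include <sys/param.h>
#include <sys/cpuset.h>

/* Create a new numbered set, join it, and pin the calling thread. */
int
pin_self_to_cpu0(void)
{
	cpusetid_t setid;
	cpuset_t mask;

	if (cpuset(&setid) != 0)	/* new base set; process joins it */
		return (-1);
	CPU_ZERO(&mask);
	CPU_SET(0, &mask);
	/* id -1 with CPU_WHICH_TID means "the calling thread". */
	return (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
	    sizeof(mask), &mask));
}
#endif

/*
 * cpuset_create_root(), described above, follows: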
*/ int cpuset_create_root(struct prison *pr, struct cpuset **setp) { struct cpuset *set; int error; KASSERT(pr != NULL, ("[%s:%d] invalid pr", __func__, __LINE__)); KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__)); error = cpuset_create(setp, pr->pr_cpuset, &pr->pr_cpuset->cs_mask); if (error) return (error); KASSERT(*setp != NULL, ("[%s:%d] cpuset_create returned invalid data", __func__, __LINE__)); /* Mark the set as root. */ set = *setp; set->cs_flags |= CPU_SET_ROOT; return (0); } int cpuset_setproc_update_set(struct proc *p, struct cpuset *set) { int error; KASSERT(p != NULL, ("[%s:%d] invalid proc", __func__, __LINE__)); KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__)); cpuset_ref(set); error = cpuset_setproc(p->p_pid, set, NULL); if (error) return (error); cpuset_rel(set); return (0); } /* * This is called once the final set of system cpus is known. Modifies * the root set and all children and mark the root read-only. */ static void cpuset_init(void *arg) { cpuset_t mask; mask = all_cpus; if (cpuset_modify(cpuset_zero, &mask)) panic("Can't set initial cpuset mask.\n"); cpuset_zero->cs_flags |= CPU_SET_RDONLY; } SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL); #ifndef _SYS_SYSPROTO_H_ struct cpuset_args { cpusetid_t *setid; }; #endif int sys_cpuset(struct thread *td, struct cpuset_args *uap) { struct cpuset *root; struct cpuset *set; int error; thread_lock(td); root = cpuset_refroot(td->td_cpuset); thread_unlock(td); error = cpuset_create(&set, root, &root->cs_mask); cpuset_rel(root); if (error) return (error); error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id)); if (error == 0) error = cpuset_setproc(-1, set, NULL); cpuset_rel(set); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_setid_args { cpuwhich_t which; id_t id; cpusetid_t setid; }; #endif int sys_cpuset_setid(struct thread *td, struct cpuset_setid_args *uap) { return (kern_cpuset_setid(td, uap->which, uap->id, uap->setid)); } int kern_cpuset_setid(struct thread *td, cpuwhich_t which, id_t id, cpusetid_t setid) { struct cpuset *set; int error; /* * Presently we only support per-process sets. 
*/ if (which != CPU_WHICH_PID) return (EINVAL); set = cpuset_lookup(setid, td); if (set == NULL) return (ESRCH); error = cpuset_setproc(id, set, NULL); cpuset_rel(set); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_getid_args { cpulevel_t level; cpuwhich_t which; id_t id; cpusetid_t *setid; }; #endif int sys_cpuset_getid(struct thread *td, struct cpuset_getid_args *uap) { return (kern_cpuset_getid(td, uap->level, uap->which, uap->id, uap->setid)); } int kern_cpuset_getid(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, cpusetid_t *setid) { struct cpuset *nset; struct cpuset *set; struct thread *ttd; struct proc *p; cpusetid_t tmpid; int error; if (level == CPU_LEVEL_WHICH && which != CPU_WHICH_CPUSET) return (EINVAL); error = cpuset_which(which, id, &p, &ttd, &set); if (error) return (error); switch (which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_refbase(ttd->td_cpuset); thread_unlock(ttd); PROC_UNLOCK(p); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_DOMAIN: return (EINVAL); } switch (level) { case CPU_LEVEL_ROOT: nset = cpuset_refroot(set); cpuset_rel(set); set = nset; break; case CPU_LEVEL_CPUSET: break; case CPU_LEVEL_WHICH: break; } tmpid = set->cs_id; cpuset_rel(set); if (error == 0) error = copyout(&tmpid, setid, sizeof(tmpid)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_getaffinity_args { cpulevel_t level; cpuwhich_t which; id_t id; size_t cpusetsize; cpuset_t *mask; }; #endif int sys_cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap) { return (kern_cpuset_getaffinity(td, uap->level, uap->which, uap->id, uap->cpusetsize, uap->mask)); } int kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t cpusetsize, cpuset_t *maskp) { struct thread *ttd; struct cpuset *nset; struct cpuset *set; struct proc *p; cpuset_t *mask; int error; size_t size; if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY) return (ERANGE); /* In Capability mode, you can only get your own CPU set. 
*/ if (IN_CAPABILITY_MODE(td)) { if (level != CPU_LEVEL_WHICH) return (ECAPMODE); if (which != CPU_WHICH_TID && which != CPU_WHICH_PID) return (ECAPMODE); if (id != -1) return (ECAPMODE); } size = cpusetsize; mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO); error = cpuset_which(which, id, &p, &ttd, &set); if (error) goto out; switch (level) { case CPU_LEVEL_ROOT: case CPU_LEVEL_CPUSET: switch (which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_ref(ttd->td_cpuset); thread_unlock(ttd); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: case CPU_WHICH_DOMAIN: error = EINVAL; goto out; } if (level == CPU_LEVEL_ROOT) nset = cpuset_refroot(set); else nset = cpuset_refbase(set); CPU_COPY(&nset->cs_mask, mask); cpuset_rel(nset); break; case CPU_LEVEL_WHICH: switch (which) { case CPU_WHICH_TID: thread_lock(ttd); CPU_COPY(&ttd->td_cpuset->cs_mask, mask); thread_unlock(ttd); break; case CPU_WHICH_PID: FOREACH_THREAD_IN_PROC(p, ttd) { thread_lock(ttd); CPU_OR(mask, &ttd->td_cpuset->cs_mask); thread_unlock(ttd); } break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: CPU_COPY(&set->cs_mask, mask); break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: error = intr_getaffinity(id, which, mask); break; case CPU_WHICH_DOMAIN: if (id < 0 || id >= MAXMEMDOM) error = ESRCH; else CPU_COPY(&cpuset_domain[id], mask); break; } break; default: error = EINVAL; break; } if (set) cpuset_rel(set); if (p) PROC_UNLOCK(p); if (error == 0) error = copyout(mask, maskp, size); out: free(mask, M_TEMP); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_setaffinity_args { cpulevel_t level; cpuwhich_t which; id_t id; size_t cpusetsize; const cpuset_t *mask; }; #endif int sys_cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap) { return (kern_cpuset_setaffinity(td, uap->level, uap->which, uap->id, uap->cpusetsize, uap->mask)); } int kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t cpusetsize, const cpuset_t *maskp) { struct cpuset *nset; struct cpuset *set; struct thread *ttd; struct proc *p; cpuset_t *mask; int error; if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY) return (ERANGE); /* In Capability mode, you can only set your own CPU set. */ if (IN_CAPABILITY_MODE(td)) { if (level != CPU_LEVEL_WHICH) return (ECAPMODE); if (which != CPU_WHICH_TID && which != CPU_WHICH_PID) return (ECAPMODE); if (id != -1) return (ECAPMODE); } mask = malloc(cpusetsize, M_TEMP, M_WAITOK | M_ZERO); error = copyin(maskp, mask, cpusetsize); if (error) goto out; /* * Verify that no high bits are set. 
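 *
 * The caller may pass a cpusetsize larger than the kernel's cpuset_t;
 * since CPUs past CPU_SETSIZE cannot be represented, every byte beyond
 * sizeof(cpuset_t) must be zero or the request is rejected.  In effect
 * (an illustrative restatement of the check that follows):
 *
 *	for (cp = (char *)mask + sizeof(cpuset_t);
 *	    cp < (char *)mask + cpusetsize; cp++)
 *		if (*cp != 0)
 *			return (EINVAL);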
*/ if (cpusetsize > sizeof(cpuset_t)) { char *end; char *cp; end = cp = (char *)&mask->__bits; end += cpusetsize; cp += sizeof(cpuset_t); while (cp != end) if (*cp++ != 0) { error = EINVAL; goto out; } } switch (level) { case CPU_LEVEL_ROOT: case CPU_LEVEL_CPUSET: error = cpuset_which(which, id, &p, &ttd, &set); if (error) break; switch (which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_ref(ttd->td_cpuset); thread_unlock(ttd); PROC_UNLOCK(p); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: case CPU_WHICH_DOMAIN: error = EINVAL; goto out; } if (level == CPU_LEVEL_ROOT) nset = cpuset_refroot(set); else nset = cpuset_refbase(set); error = cpuset_modify(nset, mask); cpuset_rel(nset); cpuset_rel(set); break; case CPU_LEVEL_WHICH: switch (which) { case CPU_WHICH_TID: error = cpuset_setthread(id, mask); break; case CPU_WHICH_PID: error = cpuset_setproc(id, NULL, mask); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: error = cpuset_which(which, id, &p, &ttd, &set); if (error == 0) { error = cpuset_modify(set, mask); cpuset_rel(set); } break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: error = intr_setaffinity(id, which, mask); break; default: error = EINVAL; break; } break; default: error = EINVAL; break; } out: free(mask, M_TEMP); return (error); } #ifdef DDB void ddb_display_cpuset(const cpuset_t *set) { int cpu, once; for (once = 0, cpu = 0; cpu < CPU_SETSIZE; cpu++) { if (CPU_ISSET(cpu, set)) { if (once == 0) { db_printf("%d", cpu); once = 1; } else db_printf(",%d", cpu); } } if (once == 0) db_printf("<none>"); } DB_SHOW_COMMAND(cpusets, db_show_cpusets) { struct cpuset *set; LIST_FOREACH(set, &cpuset_ids, cs_link) { db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n", set, set->cs_id, set->cs_ref, set->cs_flags, (set->cs_parent != NULL) ? set->cs_parent->cs_id : 0); db_printf(" mask="); ddb_display_cpuset(&set->cs_mask); db_printf("\n"); if (db_pager_quit) break; } } #endif /* DDB */ Index: head/sys/kern/kern_ctf.c =================================================================== --- head/sys/kern/kern_ctf.c (revision 326270) +++ head/sys/kern/kern_ctf.c (revision 326271) @@ -1,324 +1,326 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2008 John Birrell * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Note this file is included by both link_elf.c and link_elf_obj.c. * * The CTF header structure definition can't be used here because it's * (annoyingly) covered by the CDDL. We will just use a few bytes from * it as an integer array where we 'know' what they mean. */ #define CTF_HDR_SIZE 36 #define CTF_HDR_STRTAB_U32 7 #define CTF_HDR_STRLEN_U32 8 #ifdef DDB_CTF static void * z_alloc(void *nil, u_int items, u_int size) { void *ptr; ptr = malloc(items * size, M_TEMP, M_NOWAIT); return ptr; } static void z_free(void *nil, void *ptr) { free(ptr, M_TEMP); } #endif static int link_elf_ctf_get(linker_file_t lf, linker_ctf_t *lc) { #ifdef DDB_CTF Elf_Ehdr *hdr = NULL; Elf_Shdr *shdr = NULL; caddr_t ctftab = NULL; caddr_t raw = NULL; caddr_t shstrtab = NULL; elf_file_t ef = (elf_file_t) lf; int flags; int i; int nbytes; size_t sz; struct nameidata nd; struct thread *td = curthread; uint8_t ctf_hdr[CTF_HDR_SIZE]; #endif int error = 0; if (lf == NULL || lc == NULL) return (EINVAL); /* Set the defaults for no CTF present. That's not a crime! */ bzero(lc, sizeof(*lc)); #ifdef DDB_CTF /* * First check if we've tried to load CTF data previously and the * CTF ELF section wasn't found. We flag that condition by setting * ctfcnt to -1. See below. */ if (ef->ctfcnt < 0) return (EFTYPE); /* Now check if we've already loaded the CTF data. */ if (ef->ctfcnt > 0) { /* We only need to load once. */ lc->ctftab = ef->ctftab; lc->ctfcnt = ef->ctfcnt; lc->symtab = ef->ddbsymtab; lc->strtab = ef->ddbstrtab; lc->strcnt = ef->ddbstrcnt; lc->nsym = ef->ddbsymcnt; lc->ctfoffp = (uint32_t **) &ef->ctfoff; lc->typoffp = (uint32_t **) &ef->typoff; lc->typlenp = &ef->typlen; return (0); } /* * We need to try reading the CTF data. Flag no CTF data present * by default and if we actually succeed in reading it, we'll * update ctfcnt to the number of bytes read. */ ef->ctfcnt = -1; NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, lf->pathname, td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); /* Allocate memory for the ELF header. */ hdr = malloc(sizeof(*hdr), M_LINKER, M_WAITOK); /* Read the ELF header. */ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, hdr, sizeof(*hdr), 0, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, NULL, td)) != 0) goto out; /* Sanity check. */ if (!IS_ELF(*hdr)) { error = ENOEXEC; goto out; } nbytes = hdr->e_shnum * hdr->e_shentsize; if (nbytes == 0 || hdr->e_shoff == 0 || hdr->e_shentsize != sizeof(Elf_Shdr)) { error = ENOEXEC; goto out; } /* Allocate memory for all the section headers */ shdr = malloc(nbytes, M_LINKER, M_WAITOK); /* Read all the section headers */ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)shdr, nbytes, hdr->e_shoff, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, NULL, td)) != 0) goto out; /* * We need to search for the CTF section by name, so if the * section names aren't present, then we can't locate the * .SUNW_ctf section containing the CTF data.
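 *
 * For reference (illustrative, not part of this commit): sh_name is a
 * byte offset into that string table, so the lookup below reduces to
 * something like:
 *
 *	name = shstrtab + shdr[i].sh_name;
 *	if (strcmp(".SUNW_ctf", name) == 0)
 *		... section i holds the CTF data ...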
*/ if (hdr->e_shstrndx == 0 || shdr[hdr->e_shstrndx].sh_type != SHT_STRTAB) { printf("%s(%d): module %s e_shstrndx is %d, sh_type is %d\n", __func__, __LINE__, lf->pathname, hdr->e_shstrndx, shdr[hdr->e_shstrndx].sh_type); error = EFTYPE; goto out; } /* Allocate memory to buffer the section header strings. */ shstrtab = malloc(shdr[hdr->e_shstrndx].sh_size, M_LINKER, M_WAITOK); /* Read the section header strings. */ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, shstrtab, shdr[hdr->e_shstrndx].sh_size, shdr[hdr->e_shstrndx].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, NULL, td)) != 0) goto out; /* Search for the section containing the CTF data. */ for (i = 0; i < hdr->e_shnum; i++) if (strcmp(".SUNW_ctf", shstrtab + shdr[i].sh_name) == 0) break; /* Check if the CTF section wasn't found. */ if (i >= hdr->e_shnum) { printf("%s(%d): module %s has no .SUNW_ctf section\n", __func__, __LINE__, lf->pathname); error = EFTYPE; goto out; } /* Read the CTF header. */ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, ctf_hdr, sizeof(ctf_hdr), shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, NULL, td)) != 0) goto out; /* Check the CTF magic number. (XXX check for big endian!) */ if (ctf_hdr[0] != 0xf1 || ctf_hdr[1] != 0xcf) { printf("%s(%d): module %s has invalid format\n", __func__, __LINE__, lf->pathname); error = EFTYPE; goto out; } /* Check if version 2. */ if (ctf_hdr[2] != 2) { printf("%s(%d): module %s CTF format version is %d " "(2 expected)\n", __func__, __LINE__, lf->pathname, ctf_hdr[2]); error = EFTYPE; goto out; } /* Check if the data is compressed. */ if ((ctf_hdr[3] & 0x1) != 0) { uint32_t *u32 = (uint32_t *) ctf_hdr; /* * The last two fields in the CTF header are the offset * from the end of the header to the start of the string * data and the length of that string data. Use this * information to determine the decompressed CTF data * buffer required. */ sz = u32[CTF_HDR_STRTAB_U32] + u32[CTF_HDR_STRLEN_U32] + sizeof(ctf_hdr); /* * Allocate memory for the compressed CTF data, including * the header (which isn't compressed). */ raw = malloc(shdr[i].sh_size, M_LINKER, M_WAITOK); } else { /* * The CTF data is not compressed, so the ELF section * size is the same as the buffer size required. */ sz = shdr[i].sh_size; } /* * Allocate memory to buffer the CTF data in its decompressed * form. */ ctftab = malloc(sz, M_LINKER, M_WAITOK); /* * Read the CTF data into the raw buffer if compressed, or * directly into the CTF buffer otherwise. */ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, raw == NULL ? ctftab : raw, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, NULL, td)) != 0) goto out; /* Check if decompression is required. */ if (raw != NULL) { z_stream zs; int ret; /* * The header isn't compressed, so copy that into the * CTF buffer first. */ bcopy(ctf_hdr, ctftab, sizeof(ctf_hdr)); /* Initialise the zlib structure. */ bzero(&zs, sizeof(zs)); zs.zalloc = z_alloc; zs.zfree = z_free; if (inflateInit(&zs) != Z_OK) { error = EIO; goto out; } zs.avail_in = shdr[i].sh_size - sizeof(ctf_hdr); zs.next_in = ((uint8_t *) raw) + sizeof(ctf_hdr); zs.avail_out = sz - sizeof(ctf_hdr); zs.next_out = ((uint8_t *) ctftab) + sizeof(ctf_hdr); ret = inflate(&zs, Z_FINISH); inflateEnd(&zs); if (ret != Z_STREAM_END) { printf("%s(%d): zlib inflate returned %d\n", __func__, __LINE__, ret); error = EIO; goto out; } } /* Got the CTF data! */ ef->ctftab = ctftab; ef->ctfcnt = shdr[i].sh_size; /* We'll retain the memory allocated for the CTF data.
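 * Clearing the local pointer below hands ownership of the buffer to
 * ef->ctftab, so the shared cleanup at "out:" will not free the data
 * we just populated.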
*/ ctftab = NULL; /* Let the caller use the CTF data read. */ lc->ctftab = ef->ctftab; lc->ctfcnt = ef->ctfcnt; lc->symtab = ef->ddbsymtab; lc->strtab = ef->ddbstrtab; lc->strcnt = ef->ddbstrcnt; lc->nsym = ef->ddbsymcnt; lc->ctfoffp = (uint32_t **) &ef->ctfoff; lc->typoffp = (uint32_t **) &ef->typoff; lc->typlenp = &ef->typlen; out: VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); if (hdr != NULL) free(hdr, M_LINKER); if (shdr != NULL) free(shdr, M_LINKER); if (shstrtab != NULL) free(shstrtab, M_LINKER); if (ctftab != NULL) free(ctftab, M_LINKER); if (raw != NULL) free(raw, M_LINKER); #else error = EOPNOTSUPP; #endif return (error); } Index: head/sys/kern/kern_dtrace.c =================================================================== --- head/sys/kern/kern_dtrace.c (revision 326270) +++ head/sys/kern/kern_dtrace.c (revision 326271) @@ -1,126 +1,128 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2007-2008 John Birrell * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_kdb.h" #include #include #include #include #include #include #include #include #include #include #define KDTRACE_PROC_SIZE 64 #define KDTRACE_THREAD_SIZE 256 FEATURE(kdtrace_hooks, "Kernel DTrace hooks which are required to load DTrace kernel modules"); static MALLOC_DEFINE(M_KDTRACE, "kdtrace", "DTrace hooks"); /* Hooks used in the machine-dependent trap handlers. */ dtrace_trap_func_t dtrace_trap_func; dtrace_doubletrap_func_t dtrace_doubletrap_func; dtrace_pid_probe_ptr_t dtrace_pid_probe_ptr; dtrace_return_probe_ptr_t dtrace_return_probe_ptr; systrace_probe_func_t __read_frequently systrace_probe_func; /* Return the DTrace process data size compiled in the kernel hooks. */ size_t kdtrace_proc_size() { return (KDTRACE_PROC_SIZE); } static void kdtrace_proc_ctor(void *arg __unused, struct proc *p) { p->p_dtrace = malloc(KDTRACE_PROC_SIZE, M_KDTRACE, M_WAITOK|M_ZERO); } static void kdtrace_proc_dtor(void *arg __unused, struct proc *p) { if (p->p_dtrace != NULL) { free(p->p_dtrace, M_KDTRACE); p->p_dtrace = NULL; } } /* Return the DTrace thread data size compiled in the kernel hooks. 
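 *
 * As with the per-process size above, the hooks reserve a fixed-size,
 * zeroed blob per thread so that dtrace.ko can be loaded later without
 * the core kernel having to know the layout of DTrace's private state.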
*/ size_t kdtrace_thread_size() { return (KDTRACE_THREAD_SIZE); } static void kdtrace_thread_ctor(void *arg __unused, struct thread *td) { td->td_dtrace = malloc(KDTRACE_THREAD_SIZE, M_KDTRACE, M_WAITOK|M_ZERO); } static void kdtrace_thread_dtor(void *arg __unused, struct thread *td) { if (td->td_dtrace != NULL) { free(td->td_dtrace, M_KDTRACE); td->td_dtrace = NULL; } } /* * Initialise the kernel DTrace hooks. */ static void init_dtrace(void *dummy __unused) { EVENTHANDLER_REGISTER(process_ctor, kdtrace_proc_ctor, NULL, EVENTHANDLER_PRI_ANY); EVENTHANDLER_REGISTER(process_dtor, kdtrace_proc_dtor, NULL, EVENTHANDLER_PRI_ANY); EVENTHANDLER_REGISTER(thread_ctor, kdtrace_thread_ctor, NULL, EVENTHANDLER_PRI_ANY); EVENTHANDLER_REGISTER(thread_dtor, kdtrace_thread_dtor, NULL, EVENTHANDLER_PRI_ANY); } SYSINIT(kdtrace, SI_SUB_KDTRACE, SI_ORDER_FIRST, init_dtrace, NULL); Index: head/sys/kern/kern_environment.c =================================================================== --- head/sys/kern/kern_environment.c (revision 326270) +++ head/sys/kern/kern_environment.c (revision 326271) @@ -1,712 +1,714 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1998 Michael Smith * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * The unified bootloader passes us a pointer to a preserved copy of * bootstrap/kernel environment variables. We convert them to a * dynamic array of strings later when the VM subsystem is up. * * We make these available through the kenv(2) syscall for userland * and through kern_getenv()/freeenv() kern_setenv() kern_unsetenv() testenv() for * the kernel. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_KENV, "kenv", "kernel environment"); #define KENV_SIZE 512 /* Maximum number of environment strings */ /* pointer to the static environment */ char *kern_envp; static int env_len; static int env_pos; static char *kernenv_next(char *); /* dynamic environment variables */ char **kenvp; struct mtx kenv_lock; /* * No need to protect this with a mutex since SYSINITS are single threaded. 
*/ int dynamic_kenv = 0; #define KENV_CHECK if (!dynamic_kenv) \ panic("%s: called before SI_SUB_KMEM", __func__) int sys_kenv(td, uap) struct thread *td; struct kenv_args /* { int what; const char *name; char *value; int len; } */ *uap; { char *name, *value, *buffer = NULL; size_t len, done, needed, buflen; int error, i; KASSERT(dynamic_kenv, ("kenv: dynamic_kenv = 0")); error = 0; if (uap->what == KENV_DUMP) { #ifdef MAC error = mac_kenv_check_dump(td->td_ucred); if (error) return (error); #endif done = needed = 0; buflen = uap->len; if (buflen > KENV_SIZE * (KENV_MNAMELEN + KENV_MVALLEN + 2)) buflen = KENV_SIZE * (KENV_MNAMELEN + KENV_MVALLEN + 2); if (uap->len > 0 && uap->value != NULL) buffer = malloc(buflen, M_TEMP, M_WAITOK|M_ZERO); mtx_lock(&kenv_lock); for (i = 0; kenvp[i] != NULL; i++) { len = strlen(kenvp[i]) + 1; needed += len; len = min(len, buflen - done); /* * If called with a NULL or insufficiently large * buffer, just keep computing the required size. */ if (uap->value != NULL && buffer != NULL && len > 0) { bcopy(kenvp[i], buffer + done, len); done += len; } } mtx_unlock(&kenv_lock); if (buffer != NULL) { error = copyout(buffer, uap->value, done); free(buffer, M_TEMP); } td->td_retval[0] = ((done == needed) ? 0 : needed); return (error); } switch (uap->what) { case KENV_SET: error = priv_check(td, PRIV_KENV_SET); if (error) return (error); break; case KENV_UNSET: error = priv_check(td, PRIV_KENV_UNSET); if (error) return (error); break; } name = malloc(KENV_MNAMELEN + 1, M_TEMP, M_WAITOK); error = copyinstr(uap->name, name, KENV_MNAMELEN + 1, NULL); if (error) goto done; switch (uap->what) { case KENV_GET: #ifdef MAC error = mac_kenv_check_get(td->td_ucred, name); if (error) goto done; #endif value = kern_getenv(name); if (value == NULL) { error = ENOENT; goto done; } len = strlen(value) + 1; if (len > uap->len) len = uap->len; error = copyout(value, uap->value, len); freeenv(value); if (error) goto done; td->td_retval[0] = len; break; case KENV_SET: len = uap->len; if (len < 1) { error = EINVAL; goto done; } if (len > KENV_MVALLEN + 1) len = KENV_MVALLEN + 1; value = malloc(len, M_TEMP, M_WAITOK); error = copyinstr(uap->value, value, len, NULL); if (error) { free(value, M_TEMP); goto done; } #ifdef MAC error = mac_kenv_check_set(td->td_ucred, name, value); if (error == 0) #endif kern_setenv(name, value); free(value, M_TEMP); break; case KENV_UNSET: #ifdef MAC error = mac_kenv_check_unset(td->td_ucred, name); if (error) goto done; #endif error = kern_unsetenv(name); if (error) error = ENOENT; break; default: error = EINVAL; break; } done: free(name, M_TEMP); return (error); } /* * Populate the initial kernel environment. * * This is called very early in MD startup, either to provide a copy of the * environment obtained from a boot loader, or to provide an empty buffer into * which MD code can store an initial environment using kern_setenv() calls. * * When a copy of an initial environment is passed in, we start by scanning that * env for overrides to the compiled-in envmode and hintmode variables. * * If the global envmode is 1, the environment is initialized from the global * static_env[], regardless of the arguments passed. This implements the env * keyword described in config(5). In this case env_pos is set to env_len, * causing kern_setenv() to return -1 (if len > 0) or panic (if len == 0) until * the dynamic environment is available. The envmode and static_env variables * are defined in env.c which is generated by config(8). 
* * If len is non-zero, the caller is providing an empty buffer. The caller will * subsequently use kern_setenv() to add up to len bytes of initial environment * before the dynamic environment is available. * * If len is zero, the caller is providing a pre-loaded buffer containing * environment strings. Additional strings cannot be added until the dynamic * environment is available. The memory pointed to must remain stable at least * until sysinit runs init_dynamic_kenv(). If no initial environment is * available from the boot loader, passing a NULL pointer allows the static_env * to be installed if it is configured. */ void init_static_kenv(char *buf, size_t len) { char *cp; for (cp = buf; cp != NULL && cp[0] != '\0'; cp += strlen(cp) + 1) { if (strcmp(cp, "static_env.disabled=1") == 0) envmode = 0; if (strcmp(cp, "static_hints.disabled=1") == 0) hintmode = 0; } if (envmode == 1) { kern_envp = static_env; env_len = len; env_pos = len; } else { kern_envp = buf; env_len = len; env_pos = 0; } } /* * Setup the dynamic kernel environment. */ static void init_dynamic_kenv(void *data __unused) { char *cp, *cpnext; size_t len; int i; kenvp = malloc((KENV_SIZE + 1) * sizeof(char *), M_KENV, M_WAITOK | M_ZERO); i = 0; if (kern_envp && *kern_envp != '\0') { for (cp = kern_envp; cp != NULL; cp = cpnext) { cpnext = kernenv_next(cp); len = strlen(cp) + 1; if (len > KENV_MNAMELEN + 1 + KENV_MVALLEN + 1) { printf( "WARNING: too long kenv string, ignoring %s\n", cp); continue; } if (i < KENV_SIZE) { kenvp[i] = malloc(len, M_KENV, M_WAITOK); strcpy(kenvp[i++], cp); memset(cp, 0, strlen(cp)); } else printf( "WARNING: too many kenv strings, ignoring %s\n", cp); } } kenvp[i] = NULL; mtx_init(&kenv_lock, "kernel environment", NULL, MTX_DEF); dynamic_kenv = 1; } SYSINIT(kenv, SI_SUB_KMEM, SI_ORDER_ANY, init_dynamic_kenv, NULL); void freeenv(char *env) { if (dynamic_kenv && env != NULL) { memset(env, 0, strlen(env)); free(env, M_KENV); } } /* * Internal functions for string lookup. */ static char * _getenv_dynamic(const char *name, int *idx) { char *cp; int len, i; mtx_assert(&kenv_lock, MA_OWNED); len = strlen(name); for (cp = kenvp[0], i = 0; cp != NULL; cp = kenvp[++i]) { if ((strncmp(cp, name, len) == 0) && (cp[len] == '=')) { if (idx != NULL) *idx = i; return (cp + len + 1); } } return (NULL); } static char * _getenv_static(const char *name) { char *cp, *ep; int len; for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) { for (ep = cp; (*ep != '=') && (*ep != 0); ep++) ; if (*ep != '=') continue; len = ep - cp; ep++; if (!strncmp(name, cp, len) && name[len] == 0) return (ep); } return (NULL); } /* * Look up an environment variable by name. * Return a pointer to the string if found. * The pointer has to be freed with freeenv() * after use. */ char * kern_getenv(const char *name) { char buf[KENV_MNAMELEN + 1 + KENV_MVALLEN + 1]; char *ret; if (dynamic_kenv) { if (getenv_string(name, buf, sizeof(buf))) { ret = strdup(buf, M_KENV); } else { ret = NULL; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "getenv"); } } else ret = _getenv_static(name); return (ret); } /* * Test if an environment variable is defined. 
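 *
 * Illustrative in-kernel usage (not part of this commit):
 *
 *	if (testenv("vfs.root.mountfrom"))
 *		... a root device hint was supplied ...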
*/ int testenv(const char *name) { char *cp; if (dynamic_kenv) { mtx_lock(&kenv_lock); cp = _getenv_dynamic(name, NULL); mtx_unlock(&kenv_lock); } else cp = _getenv_static(name); if (cp != NULL) return (1); return (0); } static int setenv_static(const char *name, const char *value) { int len; if (env_pos >= env_len) return (-1); /* Check space for x=y and two nuls */ len = strlen(name) + strlen(value); if (len + 3 < env_len - env_pos) { len = sprintf(&kern_envp[env_pos], "%s=%s", name, value); env_pos += len+1; kern_envp[env_pos] = '\0'; return (0); } else return (-1); } /* * Set an environment variable by name. */ int kern_setenv(const char *name, const char *value) { char *buf, *cp, *oldenv; int namelen, vallen, i; if (dynamic_kenv == 0 && env_len > 0) return (setenv_static(name, value)); KENV_CHECK; namelen = strlen(name) + 1; if (namelen > KENV_MNAMELEN + 1) return (-1); vallen = strlen(value) + 1; if (vallen > KENV_MVALLEN + 1) return (-1); buf = malloc(namelen + vallen, M_KENV, M_WAITOK); sprintf(buf, "%s=%s", name, value); mtx_lock(&kenv_lock); cp = _getenv_dynamic(name, &i); if (cp != NULL) { oldenv = kenvp[i]; kenvp[i] = buf; mtx_unlock(&kenv_lock); free(oldenv, M_KENV); } else { /* We add the option if it wasn't found */ for (i = 0; (cp = kenvp[i]) != NULL; i++) ; /* Bounds checking */ if (i < 0 || i >= KENV_SIZE) { free(buf, M_KENV); mtx_unlock(&kenv_lock); return (-1); } kenvp[i] = buf; kenvp[i + 1] = NULL; mtx_unlock(&kenv_lock); } return (0); } /* * Unset an environment variable string. */ int kern_unsetenv(const char *name) { char *cp, *oldenv; int i, j; KENV_CHECK; mtx_lock(&kenv_lock); cp = _getenv_dynamic(name, &i); if (cp != NULL) { oldenv = kenvp[i]; for (j = i + 1; kenvp[j] != NULL; j++) kenvp[i++] = kenvp[j]; kenvp[i] = NULL; mtx_unlock(&kenv_lock); memset(oldenv, 0, strlen(oldenv)); free(oldenv, M_KENV); return (0); } mtx_unlock(&kenv_lock); return (-1); } /* * Return a string value from an environment variable. */ int getenv_string(const char *name, char *data, int size) { char *cp; if (dynamic_kenv) { mtx_lock(&kenv_lock); cp = _getenv_dynamic(name, NULL); if (cp != NULL) strlcpy(data, cp, size); mtx_unlock(&kenv_lock); } else { cp = _getenv_static(name); if (cp != NULL) strlcpy(data, cp, size); } return (cp != NULL); } /* * Return an integer value from an environment variable. */ int getenv_int(const char *name, int *data) { quad_t tmp; int rval; rval = getenv_quad(name, &tmp); if (rval) *data = (int) tmp; return (rval); } /* * Return an unsigned integer value from an environment variable. */ int getenv_uint(const char *name, unsigned int *data) { quad_t tmp; int rval; rval = getenv_quad(name, &tmp); if (rval) *data = (unsigned int) tmp; return (rval); } /* * Return an int64_t value from an environment variable. */ int getenv_int64(const char *name, int64_t *data) { quad_t tmp; int64_t rval; rval = getenv_quad(name, &tmp); if (rval) *data = (int64_t) tmp; return (rval); } /* * Return an uint64_t value from an environment variable. */ int getenv_uint64(const char *name, uint64_t *data) { quad_t tmp; uint64_t rval; rval = getenv_quad(name, &tmp); if (rval) *data = (uint64_t) tmp; return (rval); } /* * Return a long value from an environment variable. */ int getenv_long(const char *name, long *data) { quad_t tmp; int rval; rval = getenv_quad(name, &tmp); if (rval) *data = (long) tmp; return (rval); } /* * Return an unsigned long value from an environment variable. 
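 *
 * Like the other typed getenv_*() wrappers, this parses through
 * getenv_quad(), so the k/m/g/t suffixes accepted there apply here as
 * well.  Illustrative usage (hypothetical variable name, not part of
 * this commit):
 *
 *	unsigned long val;
 *	if (getenv_ulong("example.bufsize", &val))
 *		... "example.bufsize=1m" yields val == 1048576 ...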
*/ int getenv_ulong(const char *name, unsigned long *data) { quad_t tmp; int rval; rval = getenv_quad(name, &tmp); if (rval) *data = (unsigned long) tmp; return (rval); } /* * Return a quad_t value from an environment variable. */ int getenv_quad(const char *name, quad_t *data) { char value[KENV_MNAMELEN + 1 + KENV_MVALLEN + 1]; char *vtp; quad_t iv; if (!getenv_string(name, value, sizeof(value))) return (0); iv = strtoq(value, &vtp, 0); if (vtp == value || (vtp[0] != '\0' && vtp[1] != '\0')) return (0); switch (vtp[0]) { case 't': case 'T': iv *= 1024; case 'g': case 'G': iv *= 1024; case 'm': case 'M': iv *= 1024; case 'k': case 'K': iv *= 1024; case '\0': break; default: return (0); } *data = iv; return (1); } /* * Find the next entry after the one which (cp) falls within, return a * pointer to its start or NULL if there are no more. */ static char * kernenv_next(char *cp) { if (cp != NULL) { while (*cp != 0) cp++; cp++; if (*cp == 0) cp = NULL; } return (cp); } void tunable_int_init(void *data) { struct tunable_int *d = (struct tunable_int *)data; TUNABLE_INT_FETCH(d->path, d->var); } void tunable_long_init(void *data) { struct tunable_long *d = (struct tunable_long *)data; TUNABLE_LONG_FETCH(d->path, d->var); } void tunable_ulong_init(void *data) { struct tunable_ulong *d = (struct tunable_ulong *)data; TUNABLE_ULONG_FETCH(d->path, d->var); } void tunable_int64_init(void *data) { struct tunable_int64 *d = (struct tunable_int64 *)data; TUNABLE_INT64_FETCH(d->path, d->var); } void tunable_uint64_init(void *data) { struct tunable_uint64 *d = (struct tunable_uint64 *)data; TUNABLE_UINT64_FETCH(d->path, d->var); } void tunable_quad_init(void *data) { struct tunable_quad *d = (struct tunable_quad *)data; TUNABLE_QUAD_FETCH(d->path, d->var); } void tunable_str_init(void *data) { struct tunable_str *d = (struct tunable_str *)data; TUNABLE_STR_FETCH(d->path, d->var, d->size); } Index: head/sys/kern/kern_et.c =================================================================== --- head/sys/kern/kern_et.c (revision 326270) +++ head/sys/kern/kern_et.c (revision 326271) @@ -1,265 +1,267 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2010-2013 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include "opt_timer.h" SLIST_HEAD(et_eventtimers_list, eventtimer); static struct et_eventtimers_list eventtimers = SLIST_HEAD_INITIALIZER(et_eventtimers); struct mtx et_eventtimers_mtx; MTX_SYSINIT(et_eventtimers_init, &et_eventtimers_mtx, "et_mtx", MTX_DEF); SYSCTL_NODE(_kern, OID_AUTO, eventtimer, CTLFLAG_RW, 0, "Event timers"); static SYSCTL_NODE(_kern_eventtimer, OID_AUTO, et, CTLFLAG_RW, 0, ""); /* * Register a new event timer hardware. */ int et_register(struct eventtimer *et) { struct eventtimer *tmp, *next; if (et->et_quality >= 0 || bootverbose) { if (et->et_frequency == 0) { printf("Event timer \"%s\" quality %d\n", et->et_name, et->et_quality); } else { printf("Event timer \"%s\" " "frequency %ju Hz quality %d\n", et->et_name, (uintmax_t)et->et_frequency, et->et_quality); } } KASSERT(et->et_start, ("et_register: timer has no start function")); et->et_sysctl = SYSCTL_ADD_NODE_WITH_LABEL(NULL, SYSCTL_STATIC_CHILDREN(_kern_eventtimer_et), OID_AUTO, et->et_name, CTLFLAG_RW, 0, "event timer description", "eventtimer"); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(et->et_sysctl), OID_AUTO, "flags", CTLFLAG_RD, &(et->et_flags), 0, "Event timer capabilities"); SYSCTL_ADD_UQUAD(NULL, SYSCTL_CHILDREN(et->et_sysctl), OID_AUTO, "frequency", CTLFLAG_RD, &(et->et_frequency), "Event timer base frequency"); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(et->et_sysctl), OID_AUTO, "quality", CTLFLAG_RD, &(et->et_quality), 0, "Goodness of event timer"); ET_LOCK(); if (SLIST_EMPTY(&eventtimers) || SLIST_FIRST(&eventtimers)->et_quality < et->et_quality) { SLIST_INSERT_HEAD(&eventtimers, et, et_all); } else { SLIST_FOREACH(tmp, &eventtimers, et_all) { next = SLIST_NEXT(tmp, et_all); if (next == NULL || next->et_quality < et->et_quality) { SLIST_INSERT_AFTER(tmp, et, et_all); break; } } } ET_UNLOCK(); return (0); } /* * Deregister event timer hardware. */ int et_deregister(struct eventtimer *et) { int err = 0; if (et->et_deregister_cb != NULL) { if ((err = et->et_deregister_cb(et, et->et_arg)) != 0) return (err); } ET_LOCK(); SLIST_REMOVE(&eventtimers, et, eventtimer, et_all); ET_UNLOCK(); sysctl_remove_oid(et->et_sysctl, 1, 1); return (0); } /* * Change the frequency of the given timer. If it is the active timer, * reconfigure it on all CPUs (reschedules all current events based on the new * timer frequency). */ void et_change_frequency(struct eventtimer *et, uint64_t newfreq) { #ifndef NO_EVENTTIMERS cpu_et_frequency(et, newfreq); #endif } /* * Find free event timer hardware with specified parameters. */ struct eventtimer * et_find(const char *name, int check, int want) { struct eventtimer *et = NULL; SLIST_FOREACH(et, &eventtimers, et_all) { if (et->et_active) continue; if (name != NULL && strcasecmp(et->et_name, name) != 0) continue; if (name == NULL && et->et_quality < 0) continue; if ((et->et_flags & check) != want) continue; break; } return (et); } /* * Initialize event timer hardware. Set callbacks. */ int et_init(struct eventtimer *et, et_event_cb_t *event, et_deregister_cb_t *deregister, void *arg) { if (event == NULL) return (EINVAL); if (et->et_active) return (EBUSY); et->et_active = 1; et->et_event_cb = event; et->et_deregister_cb = deregister; et->et_arg = arg; return (0); } /* * Start event timer hardware. * first - delay before first tick. * period - period of subsequent periodic ticks. 
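 *
 * Illustrative consumer sketch (not part of this commit; my_event_cb is
 * hypothetical): a subsystem typically claims a timer and then arms it,
 * e.g. one-shot, one millisecond from now:
 *
 *	et = et_find(NULL, ET_FLAGS_ONESHOT, ET_FLAGS_ONESHOT);
 *	if (et != NULL && et_init(et, my_event_cb, NULL, NULL) == 0)
 *		error = et_start(et, SBT_1MS, 0);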
*/ int et_start(struct eventtimer *et, sbintime_t first, sbintime_t period) { if (!et->et_active) return (ENXIO); KASSERT(period >= 0, ("et_start: negative period")); KASSERT((et->et_flags & ET_FLAGS_PERIODIC) || period == 0, ("et_start: period specified for oneshot-only timer")); KASSERT((et->et_flags & ET_FLAGS_ONESHOT) || period != 0, ("et_start: period not specified for periodic-only timer")); if (period != 0) { if (period < et->et_min_period) period = et->et_min_period; else if (period > et->et_max_period) period = et->et_max_period; } if (period == 0 || first != 0) { if (first < et->et_min_period) first = et->et_min_period; else if (first > et->et_max_period) first = et->et_max_period; } return (et->et_start(et, first, period)); } /* Stop event timer hardware. */ int et_stop(struct eventtimer *et) { if (!et->et_active) return (ENXIO); if (et->et_stop) return (et->et_stop(et)); return (0); } /* Mark event timer hardware as broken. */ int et_ban(struct eventtimer *et) { et->et_flags &= ~(ET_FLAGS_PERIODIC | ET_FLAGS_ONESHOT); return (0); } /* Free event timer hardware. */ int et_free(struct eventtimer *et) { if (!et->et_active) return (ENXIO); et->et_active = 0; return (0); } /* Report list of supported event timer hardware via sysctl. */ static int sysctl_kern_eventtimer_choice(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct eventtimer *et; int error; sbuf_new(&sb, NULL, 256, SBUF_AUTOEXTEND | SBUF_INCLUDENUL); ET_LOCK(); SLIST_FOREACH(et, &eventtimers, et_all) { if (et != SLIST_FIRST(&eventtimers)) sbuf_putc(&sb, ' '); sbuf_printf(&sb, "%s(%d)", et->et_name, et->et_quality); } ET_UNLOCK(); error = sbuf_finish(&sb); if (error == 0) error = SYSCTL_OUT(req, sbuf_data(&sb), sbuf_len(&sb)); sbuf_delete(&sb); return (error); } SYSCTL_PROC(_kern_eventtimer, OID_AUTO, choice, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_eventtimer_choice, "A", "Present event timers"); Index: head/sys/kern/kern_event.c =================================================================== --- head/sys/kern/kern_event.c (revision 326270) +++ head/sys/kern/kern_event.c (revision 326271) @@ -1,2639 +1,2641 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1999,2000,2001 Jonathan Lemon * Copyright 2004 John-Mark Gurney * Copyright (c) 2009 Apple, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ktrace.h" #include "opt_kqueue.h" #ifdef COMPAT_FREEBSD11 #define _WANT_FREEBSD11_KEVENT #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); /* * This lock is used if multiple kq locks are required. This possibly * should be made into a per proc lock. */ static struct mtx kq_global; MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF); #define KQ_GLOBAL_LOCK(lck, haslck) do { \ if (!haslck) \ mtx_lock(lck); \ haslck = 1; \ } while (0) #define KQ_GLOBAL_UNLOCK(lck, haslck) do { \ if (haslck) \ mtx_unlock(lck); \ haslck = 0; \ } while (0) TASKQUEUE_DEFINE_THREAD(kqueue_ctx); static int kevent_copyout(void *arg, struct kevent *kevp, int count); static int kevent_copyin(void *arg, struct kevent *kevp, int count); static int kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok); static int kqueue_acquire(struct file *fp, struct kqueue **kqp); static void kqueue_release(struct kqueue *kq, int locked); static void kqueue_destroy(struct kqueue *kq); static void kqueue_drain(struct kqueue *kq, struct thread *td); static int kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident, int waitok); static void kqueue_task(void *arg, int pending); static int kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, const struct timespec *timeout, struct kevent *keva, struct thread *td); static void kqueue_wakeup(struct kqueue *kq); static struct filterops *kqueue_fo_find(int filt); static void kqueue_fo_release(int filt); struct g_kevent_args; static int kern_kevent_generic(struct thread *td, struct g_kevent_args *uap, struct kevent_copyops *k_ops, const char *struct_name); static fo_ioctl_t kqueue_ioctl; static fo_poll_t kqueue_poll; static fo_kqfilter_t kqueue_kqfilter; static fo_stat_t kqueue_stat; static fo_close_t kqueue_close; static fo_fill_kinfo_t kqueue_fill_kinfo; static struct fileops kqueueops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = kqueue_ioctl, .fo_poll = kqueue_poll, .fo_kqfilter = kqueue_kqfilter, .fo_stat = kqueue_stat, .fo_close = kqueue_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = kqueue_fill_kinfo, }; static int knote_attach(struct knote *kn, struct kqueue *kq); static void knote_drop(struct knote *kn, struct thread *td); static void knote_drop_detached(struct knote *kn, struct thread *td); static void knote_enqueue(struct knote *kn); static void knote_dequeue(struct knote *kn); static void knote_init(void); static struct knote *knote_alloc(int waitok); 
static void knote_free(struct knote *kn); static void filt_kqdetach(struct knote *kn); static int filt_kqueue(struct knote *kn, long hint); static int filt_procattach(struct knote *kn); static void filt_procdetach(struct knote *kn); static int filt_proc(struct knote *kn, long hint); static int filt_fileattach(struct knote *kn); static void filt_timerexpire(void *knx); static int filt_timerattach(struct knote *kn); static void filt_timerdetach(struct knote *kn); static int filt_timer(struct knote *kn, long hint); static int filt_userattach(struct knote *kn); static void filt_userdetach(struct knote *kn); static int filt_user(struct knote *kn, long hint); static void filt_usertouch(struct knote *kn, struct kevent *kev, u_long type); static struct filterops file_filtops = { .f_isfd = 1, .f_attach = filt_fileattach, }; static struct filterops kqread_filtops = { .f_isfd = 1, .f_detach = filt_kqdetach, .f_event = filt_kqueue, }; /* XXX - move to kern_proc.c? */ static struct filterops proc_filtops = { .f_isfd = 0, .f_attach = filt_procattach, .f_detach = filt_procdetach, .f_event = filt_proc, }; static struct filterops timer_filtops = { .f_isfd = 0, .f_attach = filt_timerattach, .f_detach = filt_timerdetach, .f_event = filt_timer, }; static struct filterops user_filtops = { .f_attach = filt_userattach, .f_detach = filt_userdetach, .f_event = filt_user, .f_touch = filt_usertouch, }; static uma_zone_t knote_zone; static unsigned int kq_ncallouts = 0; static unsigned int kq_calloutmax = 4 * 1024; SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW, &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue"); /* XXX - ensure not influx ? */ #define KNOTE_ACTIVATE(kn, islock) do { \ if ((islock)) \ mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED); \ else \ KQ_LOCK((kn)->kn_kq); \ (kn)->kn_status |= KN_ACTIVE; \ if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \ knote_enqueue((kn)); \ if (!(islock)) \ KQ_UNLOCK((kn)->kn_kq); \ } while(0) #define KQ_LOCK(kq) do { \ mtx_lock(&(kq)->kq_lock); \ } while (0) #define KQ_FLUX_WAKEUP(kq) do { \ if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) { \ (kq)->kq_state &= ~KQ_FLUXWAIT; \ wakeup((kq)); \ } \ } while (0) #define KQ_UNLOCK_FLUX(kq) do { \ KQ_FLUX_WAKEUP(kq); \ mtx_unlock(&(kq)->kq_lock); \ } while (0) #define KQ_UNLOCK(kq) do { \ mtx_unlock(&(kq)->kq_lock); \ } while (0) #define KQ_OWNED(kq) do { \ mtx_assert(&(kq)->kq_lock, MA_OWNED); \ } while (0) #define KQ_NOTOWNED(kq) do { \ mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \ } while (0) static struct knlist * kn_list_lock(struct knote *kn) { struct knlist *knl; knl = kn->kn_knlist; if (knl != NULL) knl->kl_lock(knl->kl_lockarg); return (knl); } static void kn_list_unlock(struct knlist *knl) { bool do_free; if (knl == NULL) return; do_free = knl->kl_autodestroy && knlist_empty(knl); knl->kl_unlock(knl->kl_lockarg); if (do_free) { knlist_destroy(knl); free(knl, M_KQUEUE); } } static bool kn_in_flux(struct knote *kn) { return (kn->kn_influx > 0); } static void kn_enter_flux(struct knote *kn) { KQ_OWNED(kn->kn_kq); MPASS(kn->kn_influx < INT_MAX); kn->kn_influx++; } static bool kn_leave_flux(struct knote *kn) { KQ_OWNED(kn->kn_kq); MPASS(kn->kn_influx > 0); kn->kn_influx--; return (kn->kn_influx == 0); } #define KNL_ASSERT_LOCK(knl, islocked) do { \ if (islocked) \ KNL_ASSERT_LOCKED(knl); \ else \ KNL_ASSERT_UNLOCKED(knl); \ } while (0) #ifdef INVARIANTS #define KNL_ASSERT_LOCKED(knl) do { \ knl->kl_assert_locked((knl)->kl_lockarg); \ } while (0) #define KNL_ASSERT_UNLOCKED(knl) do { \ 
knl->kl_assert_unlocked((knl)->kl_lockarg); \ } while (0) #else /* !INVARIANTS */ #define KNL_ASSERT_LOCKED(knl) do {} while(0) #define KNL_ASSERT_UNLOCKED(knl) do {} while (0) #endif /* INVARIANTS */ #ifndef KN_HASHSIZE #define KN_HASHSIZE 64 /* XXX should be tunable */ #endif #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) static int filt_nullattach(struct knote *kn) { return (ENXIO); }; struct filterops null_filtops = { .f_isfd = 0, .f_attach = filt_nullattach, }; /* XXX - make SYSINIT to add these, and move into respective modules. */ extern struct filterops sig_filtops; extern struct filterops fs_filtops; /* * Table for all system-defined filters. */ static struct mtx filterops_lock; MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops", MTX_DEF); static struct { struct filterops *for_fop; int for_nolock; int for_refcnt; } sysfilt_ops[EVFILT_SYSCOUNT] = { { &file_filtops, 1 }, /* EVFILT_READ */ { &file_filtops, 1 }, /* EVFILT_WRITE */ { &null_filtops }, /* EVFILT_AIO */ { &file_filtops, 1 }, /* EVFILT_VNODE */ { &proc_filtops, 1 }, /* EVFILT_PROC */ { &sig_filtops, 1 }, /* EVFILT_SIGNAL */ { &timer_filtops, 1 }, /* EVFILT_TIMER */ { &file_filtops, 1 }, /* EVFILT_PROCDESC */ { &fs_filtops, 1 }, /* EVFILT_FS */ { &null_filtops }, /* EVFILT_LIO */ { &user_filtops, 1 }, /* EVFILT_USER */ { &null_filtops }, /* EVFILT_SENDFILE */ { &file_filtops, 1 }, /* EVFILT_EMPTY */ }; /* * Simple redirection for all cdevsw style objects to call their fo_kqfilter * method. */ static int filt_fileattach(struct knote *kn) { return (fo_kqfilter(kn->kn_fp, kn)); } /*ARGSUSED*/ static int kqueue_kqfilter(struct file *fp, struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; if (kn->kn_filter != EVFILT_READ) return (EINVAL); kn->kn_status |= KN_KQUEUE; kn->kn_fop = &kqread_filtops; knlist_add(&kq->kq_sel.si_note, kn, 0); return (0); } static void filt_kqdetach(struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; knlist_remove(&kq->kq_sel.si_note, kn, 0); } /*ARGSUSED*/ static int filt_kqueue(struct knote *kn, long hint) { struct kqueue *kq = kn->kn_fp->f_data; kn->kn_data = kq->kq_count; return (kn->kn_data > 0); } /* XXX - move to kern_proc.c? */ static int filt_procattach(struct knote *kn) { struct proc *p; int error; bool exiting, immediate; exiting = immediate = false; if (kn->kn_sfflags & NOTE_EXIT) p = pfind_any(kn->kn_id); else p = pfind(kn->kn_id); if (p == NULL) return (ESRCH); if (p->p_flag & P_WEXIT) exiting = true; if ((error = p_cansee(curthread, p))) { PROC_UNLOCK(p); return (error); } kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ /* * Internal flag indicating registration done by kernel for the * purposes of getting a NOTE_CHILD notification. */ if (kn->kn_flags & EV_FLAG2) { kn->kn_flags &= ~EV_FLAG2; kn->kn_data = kn->kn_sdata; /* ppid */ kn->kn_fflags = NOTE_CHILD; kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK); immediate = true; /* Force immediate activation of child note. */ } /* * Internal flag indicating registration done by kernel (for other than * NOTE_CHILD). */ if (kn->kn_flags & EV_FLAG1) { kn->kn_flags &= ~EV_FLAG1; } knlist_add(p->p_klist, kn, 1); /* * Immediately activate any child notes or, in the case of a zombie * target process, exit notes. The latter is necessary to handle the * case where the target process, e.g. a child, dies before the kevent * is registered.
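 *
 * Illustrative userland sketch (not part of this commit): the zombie
 * case matters for registrations such as
 *
 *	struct kevent kev;
 *	EV_SET(&kev, (uintptr_t)pid, EVFILT_PROC, EV_ADD | EV_ONESHOT,
 *	    NOTE_EXIT, 0, NULL);
 *	n = kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * where the watched child may already have exited by the time the
 * kevent(2) call reaches this attach routine.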
*/ if (immediate || (exiting && filt_proc(kn, NOTE_EXIT))) KNOTE_ACTIVATE(kn, 0); PROC_UNLOCK(p); return (0); } /* * The knote may be attached to a different process, which may exit, * leaving nothing for the knote to be attached to. So when the process * exits, the knote is marked as DETACHED and also flagged as ONESHOT so * it will be deleted when read out. However, as part of the knote deletion, * this routine is called, so a check is needed to avoid actually performing * a detach, because the original process does not exist any more. */ /* XXX - move to kern_proc.c? */ static void filt_procdetach(struct knote *kn) { knlist_remove(kn->kn_knlist, kn, 0); kn->kn_ptr.p_proc = NULL; } /* XXX - move to kern_proc.c? */ static int filt_proc(struct knote *kn, long hint) { struct proc *p; u_int event; p = kn->kn_ptr.p_proc; if (p == NULL) /* already activated, from attach filter */ return (0); /* Mask off extra data. */ event = (u_int)hint & NOTE_PCTRLMASK; /* If the user is interested in this event, record it. */ if (kn->kn_sfflags & event) kn->kn_fflags |= event; /* Process is gone, so flag the event as finished. */ if (event == NOTE_EXIT) { kn->kn_flags |= EV_EOF | EV_ONESHOT; kn->kn_ptr.p_proc = NULL; if (kn->kn_fflags & NOTE_EXIT) kn->kn_data = KW_EXITCODE(p->p_xexit, p->p_xsig); if (kn->kn_fflags == 0) kn->kn_flags |= EV_DROP; return (1); } return (kn->kn_fflags != 0); } /* * Called when the process forks. It mostly does the same as knote(), * activating all knotes registered to be activated when the process * forks. Additionally, for each knote attached to the parent, check * whether the user wants to track the new process. If so, attach a new * knote to it, and immediately report an event with the child's pid. */ void knote_fork(struct knlist *list, int pid) { struct kqueue *kq; struct knote *kn; struct kevent kev; int error; if (list == NULL) return; list->kl_lock(list->kl_lockarg); SLIST_FOREACH(kn, &list->kl_list, kn_selnext) { kq = kn->kn_kq; KQ_LOCK(kq); if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) { KQ_UNLOCK(kq); continue; } /* * The same as knote(), activate the event. */ if ((kn->kn_sfflags & NOTE_TRACK) == 0) { kn->kn_status |= KN_HASKQLOCK; if (kn->kn_fop->f_event(kn, NOTE_FORK)) KNOTE_ACTIVATE(kn, 1); kn->kn_status &= ~KN_HASKQLOCK; KQ_UNLOCK(kq); continue; } /* * The NOTE_TRACK case. In addition to the activation * of the event, we need to register new events to * track the child. Drop the locks in preparation for * the call to kqueue_register(). */ kn_enter_flux(kn); KQ_UNLOCK(kq); list->kl_unlock(list->kl_lockarg); /* * Activate existing knote and register tracking knotes with * new process. * * First register a knote to get just the child notice. This * must be a separate note from a potential NOTE_EXIT * notification since both NOTE_CHILD and NOTE_EXIT are defined * to use the data field (in conflicting ways). */ kev.ident = pid; kev.filter = kn->kn_filter; kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT | EV_FLAG2; kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_kevent.udata;/* preserve udata */ error = kqueue_register(kq, &kev, NULL, 0); if (error) kn->kn_fflags |= NOTE_TRACKERR; /* * Then register another knote to track other potential events * from the new process.
*/ kev.ident = pid; kev.filter = kn->kn_filter; kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1; kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_kevent.udata;/* preserve udata */ error = kqueue_register(kq, &kev, NULL, 0); if (error) kn->kn_fflags |= NOTE_TRACKERR; if (kn->kn_fop->f_event(kn, NOTE_FORK)) KNOTE_ACTIVATE(kn, 0); KQ_LOCK(kq); kn_leave_flux(kn); KQ_UNLOCK_FLUX(kq); list->kl_lock(list->kl_lockarg); } list->kl_unlock(list->kl_lockarg); } /* * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the * interval timer support code. */ #define NOTE_TIMER_PRECMASK \ (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS) static sbintime_t timer2sbintime(intptr_t data, int flags) { int64_t secs; /* * Macros for converting to the fractional second portion of an * sbintime_t using 64bit multiplication to improve precision. */ #define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32) #define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32) #define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32) switch (flags & NOTE_TIMER_PRECMASK) { case NOTE_SECONDS: #ifdef __LP64__ if (data > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return ((sbintime_t)data << 32); case NOTE_MSECONDS: /* FALLTHROUGH */ case 0: if (data >= 1000) { secs = data / 1000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | MS_TO_SBT(data % 1000)); } return (MS_TO_SBT(data)); case NOTE_USECONDS: if (data >= 1000000) { secs = data / 1000000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | US_TO_SBT(data % 1000000)); } return (US_TO_SBT(data)); case NOTE_NSECONDS: if (data >= 1000000000) { secs = data / 1000000000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | US_TO_SBT(data % 1000000000)); } return (NS_TO_SBT(data)); default: break; } return (-1); } struct kq_timer_cb_data { struct callout c; sbintime_t next; /* next timer event fires at */ sbintime_t to; /* precalculated timer period, 0 for abs */ }; static void filt_timerexpire(void *knx) { struct knote *kn; struct kq_timer_cb_data *kc; kn = knx; kn->kn_data++; KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */ if ((kn->kn_flags & EV_ONESHOT) != 0) return; kc = kn->kn_ptr.p_v; if (kc->to == 0) return; kc->next += kc->to; callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn, PCPU_GET(cpuid), C_ABSOLUTE); } /* * data contains amount of time to sleep */ static int filt_timerattach(struct knote *kn) { struct kq_timer_cb_data *kc; struct bintime bt; sbintime_t to, sbt; unsigned int ncallouts; if (kn->kn_sdata < 0) return (EINVAL); if (kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0) kn->kn_sdata = 1; /* Only precision unit are supported in flags so far */ if ((kn->kn_sfflags & ~(NOTE_TIMER_PRECMASK | NOTE_ABSTIME)) != 0) return (EINVAL); to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags); if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) { getboottimebin(&bt); sbt = bttosbt(bt); to -= sbt; } if (to < 0) return (EINVAL); do { ncallouts = kq_ncallouts; if (ncallouts >= kq_calloutmax) return (ENOMEM); } while (!atomic_cmpset_int(&kq_ncallouts, ncallouts, ncallouts + 1)); if ((kn->kn_sfflags & NOTE_ABSTIME) == 0) kn->kn_flags |= EV_CLEAR; /* automatically set */ kn->kn_status &= ~KN_DETACHED; /* knlist_add clears it */ kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK); callout_init(&kc->c, 1); if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) { 
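		/*
		 * NOTE_ABSTIME: "to" was rebased from wall-clock to the
		 * uptime base above, so arm the callout once at that
		 * absolute time; kc->to == 0 tells filt_timerexpire()
		 * not to re-arm.
		 */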
kc->next = to; kc->to = 0; } else { kc->next = to + sbinuptime(); kc->to = to; } callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn, PCPU_GET(cpuid), C_ABSOLUTE); return (0); } static void filt_timerdetach(struct knote *kn) { struct kq_timer_cb_data *kc; unsigned int old; kc = kn->kn_ptr.p_v; callout_drain(&kc->c); free(kc, M_KQUEUE); old = atomic_fetchadd_int(&kq_ncallouts, -1); KASSERT(old > 0, ("Number of callouts cannot become negative")); kn->kn_status |= KN_DETACHED; /* knlist_remove sets it */ } static int filt_timer(struct knote *kn, long hint) { return (kn->kn_data != 0); } static int filt_userattach(struct knote *kn) { /* * EVFILT_USER knotes are not attached to anything in the kernel. */ kn->kn_hook = NULL; if (kn->kn_fflags & NOTE_TRIGGER) kn->kn_hookid = 1; else kn->kn_hookid = 0; return (0); } static void filt_userdetach(__unused struct knote *kn) { /* * EVFILT_USER knotes are not attached to anything in the kernel. */ } static int filt_user(struct knote *kn, __unused long hint) { return (kn->kn_hookid); } static void filt_usertouch(struct knote *kn, struct kevent *kev, u_long type) { u_int ffctrl; switch (type) { case EVENT_REGISTER: if (kev->fflags & NOTE_TRIGGER) kn->kn_hookid = 1; ffctrl = kev->fflags & NOTE_FFCTRLMASK; kev->fflags &= NOTE_FFLAGSMASK; switch (ffctrl) { case NOTE_FFNOP: break; case NOTE_FFAND: kn->kn_sfflags &= kev->fflags; break; case NOTE_FFOR: kn->kn_sfflags |= kev->fflags; break; case NOTE_FFCOPY: kn->kn_sfflags = kev->fflags; break; default: /* XXX Return error? */ break; } kn->kn_sdata = kev->data; if (kev->flags & EV_CLEAR) { kn->kn_hookid = 0; kn->kn_data = 0; kn->kn_fflags = 0; } break; case EVENT_PROCESS: *kev = kn->kn_kevent; kev->fflags = kn->kn_sfflags; kev->data = kn->kn_sdata; if (kn->kn_flags & EV_CLEAR) { kn->kn_hookid = 0; kn->kn_data = 0; kn->kn_fflags = 0; } break; default: panic("filt_usertouch() - invalid type (%ld)", type); break; } } int sys_kqueue(struct thread *td, struct kqueue_args *uap) { return (kern_kqueue(td, 0, NULL)); } static void kqueue_init(struct kqueue *kq) { mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK); TAILQ_INIT(&kq->kq_head); knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock); TASK_INIT(&kq->kq_task, 0, kqueue_task, kq); } int kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps) { struct filedesc *fdp; struct kqueue *kq; struct file *fp; struct ucred *cred; int fd, error; fdp = td->td_proc->p_fd; cred = td->td_ucred; if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES))) return (ENOMEM); error = falloc_caps(td, &fp, &fd, flags, fcaps); if (error != 0) { chgkqcnt(cred->cr_ruidinfo, -1, 0); return (error); } /* An extra reference on `fp' has been held for us by falloc(). 
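 * It is dropped via fdrop() below, once finit() has published the
 * kqueue through fp.
 *
 * Aside: the EVFILT_USER filter implemented above is driven entirely
 * from userspace. A minimal usage sketch (illustration only; assumes a
 * kqueue descriptor kq):
 *
 *	struct kevent kev;
 *	EV_SET(&kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);	register
 *	EV_SET(&kev, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);	fire the event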
*/ kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO); kqueue_init(kq); kq->kq_fdp = fdp; kq->kq_cred = crhold(cred); FILEDESC_XLOCK(fdp); TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list); FILEDESC_XUNLOCK(fdp); finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops); fdrop(fp, td); td->td_retval[0] = fd; return (0); } struct g_kevent_args { int fd; void *changelist; int nchanges; void *eventlist; int nevents; const struct timespec *timeout; }; int sys_kevent(struct thread *td, struct kevent_args *uap) { struct kevent_copyops k_ops = { .arg = uap, .k_copyout = kevent_copyout, .k_copyin = kevent_copyin, .kevent_size = sizeof(struct kevent), }; struct g_kevent_args gk_args = { .fd = uap->fd, .changelist = uap->changelist, .nchanges = uap->nchanges, .eventlist = uap->eventlist, .nevents = uap->nevents, .timeout = uap->timeout, }; return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent")); } static int kern_kevent_generic(struct thread *td, struct g_kevent_args *uap, struct kevent_copyops *k_ops, const char *struct_name) { struct timespec ts, *tsp; #ifdef KTRACE struct kevent *eventlist = uap->eventlist; #endif int error; if (uap->timeout != NULL) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); tsp = &ts; } else tsp = NULL; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT_ARRAY)) ktrstructarray(struct_name, UIO_USERSPACE, uap->changelist, uap->nchanges, k_ops->kevent_size); #endif error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents, k_ops, tsp); #ifdef KTRACE if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY)) ktrstructarray(struct_name, UIO_USERSPACE, eventlist, td->td_retval[0], k_ops->kevent_size); #endif return (error); } /* * Copy 'count' items into the destination list pointed to by uap->eventlist. */ static int kevent_copyout(void *arg, struct kevent *kevp, int count) { struct kevent_args *uap; int error; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct kevent_args *)arg; error = copyout(kevp, uap->eventlist, count * sizeof *kevp); if (error == 0) uap->eventlist += count; return (error); } /* * Copy 'count' items from the list pointed to by uap->changelist. */ static int kevent_copyin(void *arg, struct kevent *kevp, int count) { struct kevent_args *uap; int error; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct kevent_args *)arg; error = copyin(uap->changelist, kevp, count * sizeof *kevp); if (error == 0) uap->changelist += count; return (error); } #ifdef COMPAT_FREEBSD11 static int kevent11_copyout(void *arg, struct kevent *kevp, int count) { struct freebsd11_kevent_args *uap; struct kevent_freebsd11 kev11; int error, i; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct freebsd11_kevent_args *)arg; for (i = 0; i < count; i++) { kev11.ident = kevp->ident; kev11.filter = kevp->filter; kev11.flags = kevp->flags; kev11.fflags = kevp->fflags; kev11.data = kevp->data; kev11.udata = kevp->udata; error = copyout(&kev11, uap->eventlist, sizeof(kev11)); if (error != 0) break; uap->eventlist++; kevp++; } return (error); } /* * Copy 'count' items from the list pointed to by uap->changelist. 
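 * The FreeBSD 11 ABI kevent has no ext[] members, so kevent11_copyin()
 * below zeroes kevp->ext after translating each entry.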
*/ static int kevent11_copyin(void *arg, struct kevent *kevp, int count) { struct freebsd11_kevent_args *uap; struct kevent_freebsd11 kev11; int error, i; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct freebsd11_kevent_args *)arg; for (i = 0; i < count; i++) { error = copyin(uap->changelist, &kev11, sizeof(kev11)); if (error != 0) break; kevp->ident = kev11.ident; kevp->filter = kev11.filter; kevp->flags = kev11.flags; kevp->fflags = kev11.fflags; kevp->data = (uintptr_t)kev11.data; kevp->udata = kev11.udata; bzero(&kevp->ext, sizeof(kevp->ext)); uap->changelist++; kevp++; } return (error); } int freebsd11_kevent(struct thread *td, struct freebsd11_kevent_args *uap) { struct kevent_copyops k_ops = { .arg = uap, .k_copyout = kevent11_copyout, .k_copyin = kevent11_copyin, .kevent_size = sizeof(struct kevent_freebsd11), }; struct g_kevent_args gk_args = { .fd = uap->fd, .changelist = uap->changelist, .nchanges = uap->nchanges, .eventlist = uap->eventlist, .nevents = uap->nevents, .timeout = uap->timeout, }; return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent_freebsd11")); } #endif int kern_kevent(struct thread *td, int fd, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { cap_rights_t rights; struct file *fp; int error; cap_rights_init(&rights); if (nchanges > 0) cap_rights_set(&rights, CAP_KQUEUE_CHANGE); if (nevents > 0) cap_rights_set(&rights, CAP_KQUEUE_EVENT); error = fget(td, fd, &rights, &fp); if (error != 0) return (error); error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout); fdrop(fp, td); return (error); } static int kqueue_kevent(struct kqueue *kq, struct thread *td, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { struct kevent keva[KQ_NEVENTS]; struct kevent *kevp, *changes; int i, n, nerrors, error; nerrors = 0; while (nchanges > 0) { n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges; error = k_ops->k_copyin(k_ops->arg, keva, n); if (error) return (error); changes = keva; for (i = 0; i < n; i++) { kevp = &changes[i]; if (!kevp->filter) continue; kevp->flags &= ~EV_SYSFLAGS; error = kqueue_register(kq, kevp, td, 1); if (error || (kevp->flags & EV_RECEIPT)) { if (nevents == 0) return (error); kevp->flags = EV_ERROR; kevp->data = error; (void)k_ops->k_copyout(k_ops->arg, kevp, 1); nevents--; nerrors++; } } nchanges -= n; } if (nerrors) { td->td_retval[0] = nerrors; return (0); } return (kqueue_scan(kq, nevents, k_ops, timeout, keva, td)); } int kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { struct kqueue *kq; int error; error = kqueue_acquire(fp, &kq); if (error != 0) return (error); error = kqueue_kevent(kq, td, nchanges, nevents, k_ops, timeout); kqueue_release(kq, 0); return (error); } /* * Performs a kevent() call on a temporarily created kqueue. This can be * used to perform one-shot polling, similar to poll() and select(). 
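 * The caller supplies kevent_copyops that move events to and from
 * kernel memory rather than userspace. A minimal array-backed sketch
 * (illustration only; the names here are hypothetical):
 *
 *	static int
 *	kev_copyin_array(void *arg, struct kevent *kevp, int count)
 *	{
 *		memcpy(kevp, arg, count * sizeof(*kevp));
 *		return (0);
 *	}
 *
 * with an analogous k_copyout, wired up as
 *
 *	struct kevent_copyops k_ops = { .arg = kevs,
 *	    .k_copyin = kev_copyin_array, .k_copyout = kev_copyout_array,
 *	    .kevent_size = sizeof(struct kevent) };
 *	error = kern_kevent_anonymous(td, nevents, &k_ops);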
*/ int kern_kevent_anonymous(struct thread *td, int nevents, struct kevent_copyops *k_ops) { struct kqueue kq = {}; int error; kqueue_init(&kq); kq.kq_refcnt = 1; error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL); kqueue_drain(&kq, td); kqueue_destroy(&kq); return (error); } int kqueue_add_filteropts(int filt, struct filterops *filtops) { int error; error = 0; if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) { printf( "trying to add a filterop that is out of range: %d is beyond %d\n", ~filt, EVFILT_SYSCOUNT); return EINVAL; } mtx_lock(&filterops_lock); if (sysfilt_ops[~filt].for_fop != &null_filtops && sysfilt_ops[~filt].for_fop != NULL) error = EEXIST; else { sysfilt_ops[~filt].for_fop = filtops; sysfilt_ops[~filt].for_refcnt = 0; } mtx_unlock(&filterops_lock); return (error); } int kqueue_del_filteropts(int filt) { int error; error = 0; if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return EINVAL; mtx_lock(&filterops_lock); if (sysfilt_ops[~filt].for_fop == &null_filtops || sysfilt_ops[~filt].for_fop == NULL) error = EINVAL; else if (sysfilt_ops[~filt].for_refcnt != 0) error = EBUSY; else { sysfilt_ops[~filt].for_fop = &null_filtops; sysfilt_ops[~filt].for_refcnt = 0; } mtx_unlock(&filterops_lock); return error; } static struct filterops * kqueue_fo_find(int filt) { if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return NULL; if (sysfilt_ops[~filt].for_nolock) return sysfilt_ops[~filt].for_fop; mtx_lock(&filterops_lock); sysfilt_ops[~filt].for_refcnt++; if (sysfilt_ops[~filt].for_fop == NULL) sysfilt_ops[~filt].for_fop = &null_filtops; mtx_unlock(&filterops_lock); return sysfilt_ops[~filt].for_fop; } static void kqueue_fo_release(int filt) { if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return; if (sysfilt_ops[~filt].for_nolock) return; mtx_lock(&filterops_lock); KASSERT(sysfilt_ops[~filt].for_refcnt > 0, ("filter object refcount not valid on release")); sysfilt_ops[~filt].for_refcnt--; mtx_unlock(&filterops_lock); } /* * A ref to kq (obtained via kqueue_acquire) must be held. waitok will * influence if memory allocation should wait. Make sure it is 0 if you * hold any mutexes. */ static int kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok) { struct filterops *fops; struct file *fp; struct knote *kn, *tkn; struct knlist *knl; cap_rights_t rights; int error, filt, event; int haskqglobal, filedesc_unlock; if ((kev->flags & (EV_ENABLE | EV_DISABLE)) == (EV_ENABLE | EV_DISABLE)) return (EINVAL); fp = NULL; kn = NULL; knl = NULL; error = 0; haskqglobal = 0; filedesc_unlock = 0; filt = kev->filter; fops = kqueue_fo_find(filt); if (fops == NULL) return EINVAL; if (kev->flags & EV_ADD) { /* * Prevent waiting with locks. Non-sleepable * allocation failures are handled in the loop, only * if the spare knote appears to be actually required. */ tkn = knote_alloc(waitok); } else { tkn = NULL; } findkn: if (fops->f_isfd) { KASSERT(td != NULL, ("td is NULL")); if (kev->ident > INT_MAX) error = EBADF; else error = fget(td, kev->ident, cap_rights_init(&rights, CAP_EVENT), &fp); if (error) goto done; if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops, kev->ident, 0) != 0) { /* try again */ fdrop(fp, td); fp = NULL; error = kqueue_expand(kq, fops, kev->ident, waitok); if (error) goto done; goto findkn; } if (fp->f_type == DTYPE_KQUEUE) { /* * If we add some intelligence about what we are doing, * we should be able to support events on ourselves. 
* We need to know when we are doing this to prevent * getting both the knlist lock and the kq lock since * they are the same thing. */ if (fp->f_data == kq) { error = EINVAL; goto done; } /* * Pre-lock the filedesc before the global * lock mutex, see the comment in * kqueue_close(). */ FILEDESC_XLOCK(td->td_proc->p_fd); filedesc_unlock = 1; KQ_GLOBAL_LOCK(&kq_global, haskqglobal); } KQ_LOCK(kq); if (kev->ident < kq->kq_knlistsize) { SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link) if (kev->filter == kn->kn_filter) break; } } else { if ((kev->flags & EV_ADD) == EV_ADD) kqueue_expand(kq, fops, kev->ident, waitok); KQ_LOCK(kq); /* * If possible, find an existing knote to use for this kevent. */ if (kev->filter == EVFILT_PROC && (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) { /* This is an internal creation of a process tracking * note. Don't attempt to coalesce this with an * existing note. */ ; } else if (kq->kq_knhashmask != 0) { struct klist *list; list = &kq->kq_knhash[ KN_HASH((u_long)kev->ident, kq->kq_knhashmask)]; SLIST_FOREACH(kn, list, kn_link) if (kev->ident == kn->kn_id && kev->filter == kn->kn_filter) break; } } /* knote is in the process of changing, wait for it to stabilize. */ if (kn != NULL && kn_in_flux(kn)) { KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (filedesc_unlock) { FILEDESC_XUNLOCK(td->td_proc->p_fd); filedesc_unlock = 0; } kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0); if (fp != NULL) { fdrop(fp, td); fp = NULL; } goto findkn; } /* * kn now contains the matching knote, or NULL if no match */ if (kn == NULL) { if (kev->flags & EV_ADD) { kn = tkn; tkn = NULL; if (kn == NULL) { KQ_UNLOCK(kq); error = ENOMEM; goto done; } kn->kn_fp = fp; kn->kn_kq = kq; kn->kn_fop = fops; /* * apply reference counts to knote structure, and * do not release it at the end of this routine. */ fops = NULL; fp = NULL; kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; kev->fflags = 0; kev->data = 0; kn->kn_kevent = *kev; kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE | EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT); kn->kn_status = KN_DETACHED; kn_enter_flux(kn); error = knote_attach(kn, kq); KQ_UNLOCK(kq); if (error != 0) { tkn = kn; goto done; } if ((error = kn->kn_fop->f_attach(kn)) != 0) { knote_drop_detached(kn, td); goto done; } knl = kn_list_lock(kn); goto done_ev_add; } else { /* No matching knote and the EV_ADD flag is not set. */ KQ_UNLOCK(kq); error = ENOENT; goto done; } } if (kev->flags & EV_DELETE) { kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop(kn, td); goto done; } if (kev->flags & EV_FORCEONESHOT) { kn->kn_flags |= EV_ONESHOT; KNOTE_ACTIVATE(kn, 1); } /* * The user may change some filter values after the initial EV_ADD, * but doing so will not reset any filter which has already been * triggered. */ kn->kn_status |= KN_SCAN; kn_enter_flux(kn); KQ_UNLOCK(kq); knl = kn_list_lock(kn); kn->kn_kevent.udata = kev->udata; if (!fops->f_isfd && fops->f_touch != NULL) { fops->f_touch(kn, kev, EVENT_REGISTER); } else { kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; } /* * We can get here with kn->kn_knlist == NULL. This can happen when * the initial attach event decides that the event is "completed" * already. i.e. filt_procattach is called on a zombie process. It * will call filt_proc which will remove it from the list, and NULL * kn_knlist. 
*/ done_ev_add: if ((kev->flags & EV_ENABLE) != 0) kn->kn_status &= ~KN_DISABLED; else if ((kev->flags & EV_DISABLE) != 0) kn->kn_status |= KN_DISABLED; if ((kn->kn_status & KN_DISABLED) == 0) event = kn->kn_fop->f_event(kn, 0); else event = 0; KQ_LOCK(kq); if (event) kn->kn_status |= KN_ACTIVE; if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) == KN_ACTIVE) knote_enqueue(kn); kn->kn_status &= ~KN_SCAN; kn_leave_flux(kn); kn_list_unlock(knl); KQ_UNLOCK_FLUX(kq); done: KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (filedesc_unlock) FILEDESC_XUNLOCK(td->td_proc->p_fd); if (fp != NULL) fdrop(fp, td); knote_free(tkn); if (fops != NULL) kqueue_fo_release(filt); return (error); } static int kqueue_acquire(struct file *fp, struct kqueue **kqp) { int error; struct kqueue *kq; error = 0; kq = fp->f_data; if (fp->f_type != DTYPE_KQUEUE || kq == NULL) return (EBADF); *kqp = kq; KQ_LOCK(kq); if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) { KQ_UNLOCK(kq); return (EBADF); } kq->kq_refcnt++; KQ_UNLOCK(kq); return error; } static void kqueue_release(struct kqueue *kq, int locked) { if (locked) KQ_OWNED(kq); else KQ_LOCK(kq); kq->kq_refcnt--; if (kq->kq_refcnt == 1) wakeup(&kq->kq_refcnt); if (!locked) KQ_UNLOCK(kq); } static void kqueue_schedtask(struct kqueue *kq) { KQ_OWNED(kq); KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN), ("scheduling kqueue task while draining")); if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) { taskqueue_enqueue(taskqueue_kqueue_ctx, &kq->kq_task); kq->kq_state |= KQ_TASKSCHED; } } /* * Expand the kq to make sure we have storage for fops/ident pair. * * Return 0 on success (or no work necessary), return errno on failure. * * Not calling hashinit w/ waitok (proper malloc flag) should be safe. * If kqueue_register is called from a non-fd context, there usually/should * be no locks held. */ static int kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident, int waitok) { struct klist *list, *tmp_knhash, *to_free; u_long tmp_knhashmask; int size; int fd; int mflag = waitok ? M_WAITOK : M_NOWAIT; KQ_NOTOWNED(kq); to_free = NULL; if (fops->f_isfd) { fd = ident; if (kq->kq_knlistsize <= fd) { size = kq->kq_knlistsize; while (size <= fd) size += KQEXTENT; list = malloc(size * sizeof(*list), M_KQUEUE, mflag); if (list == NULL) return ENOMEM; KQ_LOCK(kq); if (kq->kq_knlistsize > fd) { to_free = list; list = NULL; } else { if (kq->kq_knlist != NULL) { bcopy(kq->kq_knlist, list, kq->kq_knlistsize * sizeof(*list)); to_free = kq->kq_knlist; kq->kq_knlist = NULL; } bzero((caddr_t)list + kq->kq_knlistsize * sizeof(*list), (size - kq->kq_knlistsize) * sizeof(*list)); kq->kq_knlistsize = size; kq->kq_knlist = list; } KQ_UNLOCK(kq); } } else { if (kq->kq_knhashmask == 0) { tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE, &tmp_knhashmask); if (tmp_knhash == NULL) return ENOMEM; KQ_LOCK(kq); if (kq->kq_knhashmask == 0) { kq->kq_knhash = tmp_knhash; kq->kq_knhashmask = tmp_knhashmask; } else { to_free = tmp_knhash; } KQ_UNLOCK(kq); } } free(to_free, M_KQUEUE); KQ_NOTOWNED(kq); return 0; } static void kqueue_task(void *arg, int pending) { struct kqueue *kq; int haskqglobal; haskqglobal = 0; kq = arg; KQ_GLOBAL_LOCK(&kq_global, haskqglobal); KQ_LOCK(kq); KNOTE_LOCKED(&kq->kq_sel.si_note, 0); kq->kq_state &= ~KQ_TASKSCHED; if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) { wakeup(&kq->kq_state); } KQ_UNLOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); } /* * Scan, update kn_data (if not ONESHOT), and copyout triggered events. 
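 * A private marker knote is enqueued at the tail to bound the scan: the
 * pass is complete once the marker is dequeued, so knotes requeued
 * behind it while scanning are not revisited.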
* We treat KN_MARKER knotes as if they are in flux. */ static int kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, const struct timespec *tsp, struct kevent *keva, struct thread *td) { struct kevent *kevp; struct knote *kn, *marker; struct knlist *knl; sbintime_t asbt, rsbt; int count, error, haskqglobal, influx, nkev, touch; count = maxevents; nkev = 0; error = 0; haskqglobal = 0; if (maxevents == 0) goto done_nl; rsbt = 0; if (tsp != NULL) { if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 || tsp->tv_nsec >= 1000000000) { error = EINVAL; goto done_nl; } if (timespecisset(tsp)) { if (tsp->tv_sec <= INT32_MAX) { rsbt = tstosbt(*tsp); if (TIMESEL(&asbt, rsbt)) asbt += tc_tick_sbt; if (asbt <= SBT_MAX - rsbt) asbt += rsbt; else asbt = 0; rsbt >>= tc_precexp; } else asbt = 0; } else asbt = -1; } else asbt = 0; marker = knote_alloc(1); marker->kn_status = KN_MARKER; KQ_LOCK(kq); retry: kevp = keva; if (kq->kq_count == 0) { if (asbt == -1) { error = EWOULDBLOCK; } else { kq->kq_state |= KQ_SLEEP; error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH, "kqread", asbt, rsbt, C_ABSOLUTE); } if (error == 0) goto retry; /* don't restart after signals... */ if (error == ERESTART) error = EINTR; else if (error == EWOULDBLOCK) error = 0; goto done; } TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe); influx = 0; while (count) { KQ_OWNED(kq); kn = TAILQ_FIRST(&kq->kq_head); if ((kn->kn_status == KN_MARKER && kn != marker) || kn_in_flux(kn)) { if (influx) { influx = 0; KQ_FLUX_WAKEUP(kq); } kq->kq_state |= KQ_FLUXWAIT; error = msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0); continue; } TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) { kn->kn_status &= ~KN_QUEUED; kq->kq_count--; continue; } if (kn == marker) { KQ_FLUX_WAKEUP(kq); if (count == maxevents) goto retry; goto done; } KASSERT(!kn_in_flux(kn), ("knote %p is unexpectedly in flux", kn)); if ((kn->kn_flags & EV_DROP) == EV_DROP) { kn->kn_status &= ~KN_QUEUED; kn_enter_flux(kn); kq->kq_count--; KQ_UNLOCK(kq); /* * We don't need to lock the list since we've * marked it as in flux. */ knote_drop(kn, td); KQ_LOCK(kq); continue; } else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) { kn->kn_status &= ~KN_QUEUED; kn_enter_flux(kn); kq->kq_count--; KQ_UNLOCK(kq); /* * We don't need to lock the list since we've * marked the knote as being in flux. */ *kevp = kn->kn_kevent; knote_drop(kn, td); KQ_LOCK(kq); kn = NULL; } else { kn->kn_status |= KN_SCAN; kn_enter_flux(kn); KQ_UNLOCK(kq); if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE) KQ_GLOBAL_LOCK(&kq_global, haskqglobal); knl = kn_list_lock(kn); if (kn->kn_fop->f_event(kn, 0) == 0) { KQ_LOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE | KN_SCAN); kn_leave_flux(kn); kq->kq_count--; kn_list_unlock(knl); influx = 1; continue; } touch = (!kn->kn_fop->f_isfd && kn->kn_fop->f_touch != NULL); if (touch) kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS); else *kevp = kn->kn_kevent; KQ_LOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) { /* * Manually clear knotes who weren't * 'touch'ed. 
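 * EV_CLEAR resets kn_data and kn_fflags once the event is reported;
 * EV_DISPATCH leaves them intact but disables the knote until the
 * user re-enables it with EV_ENABLE.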
*/ if (touch == 0 && kn->kn_flags & EV_CLEAR) { kn->kn_data = 0; kn->kn_fflags = 0; } if (kn->kn_flags & EV_DISPATCH) kn->kn_status |= KN_DISABLED; kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); kq->kq_count--; } else TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_SCAN; kn_leave_flux(kn); kn_list_unlock(knl); influx = 1; } /* we are returning a copy to the user */ kevp++; nkev++; count--; if (nkev == KQ_NEVENTS) { influx = 0; KQ_UNLOCK_FLUX(kq); error = k_ops->k_copyout(k_ops->arg, keva, nkev); nkev = 0; kevp = keva; KQ_LOCK(kq); if (error) break; } } TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe); done: KQ_OWNED(kq); KQ_UNLOCK_FLUX(kq); knote_free(marker); done_nl: KQ_NOTOWNED(kq); if (nkev != 0) error = k_ops->k_copyout(k_ops->arg, keva, nkev); td->td_retval[0] = maxevents - count; return (error); } /*ARGSUSED*/ static int kqueue_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { /* * Enabling sigio causes two major problems: * 1) infinite recursion: * Synopsis: kevent is being used to track signals and has FIOASYNC * set. On receipt of a signal this will cause a kqueue to recurse * into itself over and over. Sending the sigio causes the kqueue * to become ready, which in turn posts sigio again, forever. * Solution: this can be solved by setting a flag in the kqueue that * we have a SIGIO in progress. * 2) locking problems: * Synopsis: Kqueue is a leaf subsystem, but adding signalling puts * us above the proc and pgrp locks. * Solution: Post a signal using an async mechanism, being sure to * record a generation count in the delivery so that we do not deliver * a signal to the wrong process. * * Note, these two mechanisms are somewhat mutually exclusive! */ #if 0 struct kqueue *kq; kq = fp->f_data; switch (cmd) { case FIOASYNC: if (*(int *)data) { kq->kq_state |= KQ_ASYNC; } else { kq->kq_state &= ~KQ_ASYNC; } return (0); case FIOSETOWN: return (fsetown(*(int *)data, &kq->kq_sigio)); case FIOGETOWN: *(int *)data = fgetown(&kq->kq_sigio); return (0); } #endif return (ENOTTY); } /*ARGSUSED*/ static int kqueue_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct kqueue *kq; int revents = 0; int error; if ((error = kqueue_acquire(fp, &kq))) return POLLERR; KQ_LOCK(kq); if (events & (POLLIN | POLLRDNORM)) { if (kq->kq_count) { revents |= events & (POLLIN | POLLRDNORM); } else { selrecord(td, &kq->kq_sel); if (SEL_WAITING(&kq->kq_sel)) kq->kq_state |= KQ_SEL; } } kqueue_release(kq, 1); KQ_UNLOCK(kq); return (revents); } /*ARGSUSED*/ static int kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred, struct thread *td) { bzero((void *)st, sizeof *st); /* * We no longer return kq_count because the unlocked value is useless. * If you spent all this time getting the count, why not spend your * syscall better by calling kevent? * * XXX - This is needed for libc_r.
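 * Only st_mode is filled in, so fstat(2) on a kqueue reports a FIFO
 * with otherwise zeroed attributes.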
*/ st->st_mode = S_IFIFO; return (0); } static void kqueue_drain(struct kqueue *kq, struct thread *td) { struct knote *kn; int i; KQ_LOCK(kq); KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING, ("kqueue already closing")); kq->kq_state |= KQ_CLOSING; if (kq->kq_refcnt > 1) msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0); KASSERT(kq->kq_refcnt == 1, ("other refs are out there!")); KASSERT(knlist_empty(&kq->kq_sel.si_note), ("kqueue's knlist not empty")); for (i = 0; i < kq->kq_knlistsize; i++) { while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) { if (kn_in_flux(kn)) { kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0); continue; } kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop(kn, td); KQ_LOCK(kq); } } if (kq->kq_knhashmask != 0) { for (i = 0; i <= kq->kq_knhashmask; i++) { while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) { if (kn_in_flux(kn)) { kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqclo2", 0); continue; } kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop(kn, td); KQ_LOCK(kq); } } } if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) { kq->kq_state |= KQ_TASKDRAIN; msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0); } if ((kq->kq_state & KQ_SEL) == KQ_SEL) { selwakeuppri(&kq->kq_sel, PSOCK); if (!SEL_WAITING(&kq->kq_sel)) kq->kq_state &= ~KQ_SEL; } KQ_UNLOCK(kq); } static void kqueue_destroy(struct kqueue *kq) { KASSERT(kq->kq_fdp == NULL, ("kqueue still attached to a file descriptor")); seldrain(&kq->kq_sel); knlist_destroy(&kq->kq_sel.si_note); mtx_destroy(&kq->kq_lock); if (kq->kq_knhash != NULL) free(kq->kq_knhash, M_KQUEUE); if (kq->kq_knlist != NULL) free(kq->kq_knlist, M_KQUEUE); funsetown(&kq->kq_sigio); } /*ARGSUSED*/ static int kqueue_close(struct file *fp, struct thread *td) { struct kqueue *kq = fp->f_data; struct filedesc *fdp; int error; int filedesc_unlock; if ((error = kqueue_acquire(fp, &kq))) return error; kqueue_drain(kq, td); /* * We could be called due to the knote_drop() doing fdrop(), * called from kqueue_register(). In this case the global * lock is owned, and filedesc sx is locked before, to not * take the sleepable lock after non-sleepable. */ fdp = kq->kq_fdp; kq->kq_fdp = NULL; if (!sx_xlocked(FILEDESC_LOCK(fdp))) { FILEDESC_XLOCK(fdp); filedesc_unlock = 1; } else filedesc_unlock = 0; TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list); if (filedesc_unlock) FILEDESC_XUNLOCK(fdp); kqueue_destroy(kq); chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0); crfree(kq->kq_cred); free(kq, M_KQUEUE); fp->f_data = NULL; return (0); } static int kqueue_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { kif->kf_type = KF_TYPE_KQUEUE; return (0); } static void kqueue_wakeup(struct kqueue *kq) { KQ_OWNED(kq); if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) { kq->kq_state &= ~KQ_SLEEP; wakeup(kq); } if ((kq->kq_state & KQ_SEL) == KQ_SEL) { selwakeuppri(&kq->kq_sel, PSOCK); if (!SEL_WAITING(&kq->kq_sel)) kq->kq_state &= ~KQ_SEL; } if (!knlist_empty(&kq->kq_sel.si_note)) kqueue_schedtask(kq); if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) { pgsigio(&kq->kq_sigio, SIGIO, 0); } } /* * Walk down a list of knotes, activating them if their event has triggered. * * There is a possibility to optimize in the case of one kq watching another. * Instead of scheduling a task to wake it up, you could pass enough state * down the chain to make up the parent kqueue. Make this code functional * first. 
*/ void knote(struct knlist *list, long hint, int lockflags) { struct kqueue *kq; struct knote *kn, *tkn; int error; if (list == NULL) return; KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED); if ((lockflags & KNF_LISTLOCKED) == 0) list->kl_lock(list->kl_lockarg); /* * If we unlock the list lock (and enter influx), we can * eliminate the kqueue scheduling, but this will introduce * four lock/unlock's for each knote to test. Also, a marker * would be needed to keep iteration position, since filters * or other threads could remove events. */ SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) { kq = kn->kn_kq; KQ_LOCK(kq); if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) { /* * Do not process the influx notes, except for * the influx coming from the kq unlock in the * kqueue_scan(). In the latter case, we do * not interfere with the scan, since the code * fragment in kqueue_scan() locks the knlist, * and cannot proceed until we have finished. */ KQ_UNLOCK(kq); } else if ((lockflags & KNF_NOKQLOCK) != 0) { kn_enter_flux(kn); KQ_UNLOCK(kq); error = kn->kn_fop->f_event(kn, hint); KQ_LOCK(kq); kn_leave_flux(kn); if (error) KNOTE_ACTIVATE(kn, 1); KQ_UNLOCK_FLUX(kq); } else { kn->kn_status |= KN_HASKQLOCK; if (kn->kn_fop->f_event(kn, hint)) KNOTE_ACTIVATE(kn, 1); kn->kn_status &= ~KN_HASKQLOCK; KQ_UNLOCK(kq); } } if ((lockflags & KNF_LISTLOCKED) == 0) list->kl_unlock(list->kl_lockarg); } /* * add a knote to a knlist */ void knlist_add(struct knlist *knl, struct knote *kn, int islocked) { KNL_ASSERT_LOCK(knl, islocked); KQ_NOTOWNED(kn->kn_kq); KASSERT(kn_in_flux(kn), ("knote %p not in flux", kn)); KASSERT((kn->kn_status & KN_DETACHED) != 0, ("knote %p was not detached", kn)); if (!islocked) knl->kl_lock(knl->kl_lockarg); SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext); if (!islocked) knl->kl_unlock(knl->kl_lockarg); KQ_LOCK(kn->kn_kq); kn->kn_knlist = knl; kn->kn_status &= ~KN_DETACHED; KQ_UNLOCK(kn->kn_kq); } static void knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked) { KASSERT(!kqislocked || knlislocked, ("kq locked w/o knl locked")); KNL_ASSERT_LOCK(knl, knlislocked); mtx_assert(&kn->kn_kq->kq_lock, kqislocked ?
MA_OWNED : MA_NOTOWNED); KASSERT(kqislocked || kn_in_flux(kn), ("knote %p not in flux", kn)); KASSERT((kn->kn_status & KN_DETACHED) == 0, ("knote %p was already detached", kn)); if (!knlislocked) knl->kl_lock(knl->kl_lockarg); SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext); kn->kn_knlist = NULL; if (!knlislocked) kn_list_unlock(knl); if (!kqislocked) KQ_LOCK(kn->kn_kq); kn->kn_status |= KN_DETACHED; if (!kqislocked) KQ_UNLOCK(kn->kn_kq); } /* * remove knote from the specified knlist */ void knlist_remove(struct knlist *knl, struct knote *kn, int islocked) { knlist_remove_kq(knl, kn, islocked, 0); } int knlist_empty(struct knlist *knl) { KNL_ASSERT_LOCKED(knl); return (SLIST_EMPTY(&knl->kl_list)); } static struct mtx knlist_lock; MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects", MTX_DEF); static void knlist_mtx_lock(void *arg); static void knlist_mtx_unlock(void *arg); static void knlist_mtx_lock(void *arg) { mtx_lock((struct mtx *)arg); } static void knlist_mtx_unlock(void *arg) { mtx_unlock((struct mtx *)arg); } static void knlist_mtx_assert_locked(void *arg) { mtx_assert((struct mtx *)arg, MA_OWNED); } static void knlist_mtx_assert_unlocked(void *arg) { mtx_assert((struct mtx *)arg, MA_NOTOWNED); } static void knlist_rw_rlock(void *arg) { rw_rlock((struct rwlock *)arg); } static void knlist_rw_runlock(void *arg) { rw_runlock((struct rwlock *)arg); } static void knlist_rw_assert_locked(void *arg) { rw_assert((struct rwlock *)arg, RA_LOCKED); } static void knlist_rw_assert_unlocked(void *arg) { rw_assert((struct rwlock *)arg, RA_UNLOCKED); } void knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *), void (*kl_unlock)(void *), void (*kl_assert_locked)(void *), void (*kl_assert_unlocked)(void *)) { if (lock == NULL) knl->kl_lockarg = &knlist_lock; else knl->kl_lockarg = lock; if (kl_lock == NULL) knl->kl_lock = knlist_mtx_lock; else knl->kl_lock = kl_lock; if (kl_unlock == NULL) knl->kl_unlock = knlist_mtx_unlock; else knl->kl_unlock = kl_unlock; if (kl_assert_locked == NULL) knl->kl_assert_locked = knlist_mtx_assert_locked; else knl->kl_assert_locked = kl_assert_locked; if (kl_assert_unlocked == NULL) knl->kl_assert_unlocked = knlist_mtx_assert_unlocked; else knl->kl_assert_unlocked = kl_assert_unlocked; knl->kl_autodestroy = 0; SLIST_INIT(&knl->kl_list); } void knlist_init_mtx(struct knlist *knl, struct mtx *lock) { knlist_init(knl, lock, NULL, NULL, NULL, NULL); } struct knlist * knlist_alloc(struct mtx *lock) { struct knlist *knl; knl = malloc(sizeof(struct knlist), M_KQUEUE, M_WAITOK); knlist_init_mtx(knl, lock); return (knl); } void knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock) { knlist_init(knl, lock, knlist_rw_rlock, knlist_rw_runlock, knlist_rw_assert_locked, knlist_rw_assert_unlocked); } void knlist_destroy(struct knlist *knl) { KASSERT(KNLIST_EMPTY(knl), ("destroying knlist %p with knotes on it", knl)); } void knlist_detach(struct knlist *knl) { KNL_ASSERT_LOCKED(knl); knl->kl_autodestroy = 1; if (knlist_empty(knl)) { knlist_destroy(knl); free(knl, M_KQUEUE); } } /* * Even if we are locked, we may need to drop the lock to allow any influx * knotes time to "settle". 
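 * In-flux knotes are skipped on the first pass; if any remain
 * afterwards, the list lock is dropped, the thread sleeps on the kqueue
 * with KQ_FLUXWAIT set, and the scan restarts from the top.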
*/ void knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn) { struct knote *kn, *kn2; struct kqueue *kq; KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl)); if (islocked) KNL_ASSERT_LOCKED(knl); else { KNL_ASSERT_UNLOCKED(knl); again: /* need to reacquire lock since we have dropped it */ knl->kl_lock(knl->kl_lockarg); } SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) { kq = kn->kn_kq; KQ_LOCK(kq); if (kn_in_flux(kn)) { KQ_UNLOCK(kq); continue; } knlist_remove_kq(knl, kn, 1, 1); if (killkn) { kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop_detached(kn, td); } else { /* Make sure cleared knotes disappear soon */ kn->kn_flags |= EV_EOF | EV_ONESHOT; KQ_UNLOCK(kq); } kq = NULL; } if (!SLIST_EMPTY(&knl->kl_list)) { /* there are still in flux knotes remaining */ kn = SLIST_FIRST(&knl->kl_list); kq = kn->kn_kq; KQ_LOCK(kq); KASSERT(kn_in_flux(kn), ("knote removed w/o list lock")); knl->kl_unlock(knl->kl_lockarg); kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0); kq = NULL; goto again; } if (islocked) KNL_ASSERT_LOCKED(knl); else { knl->kl_unlock(knl->kl_lockarg); KNL_ASSERT_UNLOCKED(knl); } } /* * Remove all knotes referencing a specified fd; must be called with the * FILEDESC lock held. This prevents a race where a new fd comes along and * occupies the entry and we attach a knote to the fd. */ void knote_fdclose(struct thread *td, int fd) { struct filedesc *fdp = td->td_proc->p_fd; struct kqueue *kq; struct knote *kn; int influx; FILEDESC_XLOCK_ASSERT(fdp); /* * We shouldn't have to worry about new kevents appearing on fd * since filedesc is locked. */ TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) { KQ_LOCK(kq); again: influx = 0; while (kq->kq_knlistsize > fd && (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) { if (kn_in_flux(kn)) { /* someone else might be waiting on our knote */ if (influx) wakeup(kq); kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0); goto again; } kn_enter_flux(kn); KQ_UNLOCK(kq); influx = 1; knote_drop(kn, td); KQ_LOCK(kq); } KQ_UNLOCK_FLUX(kq); } } static int knote_attach(struct knote *kn, struct kqueue *kq) { struct klist *list; KASSERT(kn_in_flux(kn), ("knote %p not marked influx", kn)); KQ_OWNED(kq); if (kn->kn_fop->f_isfd) { if (kn->kn_id >= kq->kq_knlistsize) return (ENOMEM); list = &kq->kq_knlist[kn->kn_id]; } else { if (kq->kq_knhash == NULL) return (ENOMEM); list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; } SLIST_INSERT_HEAD(list, kn, kn_link); return (0); } static void knote_drop(struct knote *kn, struct thread *td) { if ((kn->kn_status & KN_DETACHED) == 0) kn->kn_fop->f_detach(kn); knote_drop_detached(kn, td); } static void knote_drop_detached(struct knote *kn, struct thread *td) { struct kqueue *kq; struct klist *list; kq = kn->kn_kq; KASSERT((kn->kn_status & KN_DETACHED) != 0, ("knote %p still attached", kn)); KQ_NOTOWNED(kq); KQ_LOCK(kq); KASSERT(kn->kn_influx == 1, ("knote_drop called on %p with influx %d", kn, kn->kn_influx)); if (kn->kn_fop->f_isfd) list = &kq->kq_knlist[kn->kn_id]; else list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; if (!SLIST_EMPTY(list)) SLIST_REMOVE(list, kn, knote, kn_link); if (kn->kn_status & KN_QUEUED) knote_dequeue(kn); KQ_UNLOCK_FLUX(kq); if (kn->kn_fop->f_isfd) { fdrop(kn->kn_fp, td); kn->kn_fp = NULL; } kqueue_fo_release(kn->kn_kevent.filter); kn->kn_fop = NULL; knote_free(kn); } static void knote_enqueue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; KQ_OWNED(kn->kn_kq); KASSERT((kn->kn_status &
KN_QUEUED) == 0, ("knote already queued")); TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status |= KN_QUEUED; kq->kq_count++; kqueue_wakeup(kq); } static void knote_dequeue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; KQ_OWNED(kn->kn_kq); KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued")); TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_QUEUED; kq->kq_count--; } static void knote_init(void) { knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL); static struct knote * knote_alloc(int waitok) { return (uma_zalloc(knote_zone, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO)); } static void knote_free(struct knote *kn) { uma_zfree(knote_zone, kn); } /* * Register the kev w/ the kq specified by fd. */ int kqfd_register(int fd, struct kevent *kev, struct thread *td, int waitok) { struct kqueue *kq; struct file *fp; cap_rights_t rights; int error; error = fget(td, fd, cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &fp); if (error != 0) return (error); if ((error = kqueue_acquire(fp, &kq)) != 0) goto noacquire; error = kqueue_register(kq, kev, td, waitok); kqueue_release(kq, 0); noacquire: fdrop(fp, td); return (error); } Index: head/sys/kern/kern_exec.c =================================================================== --- head/sys/kern/kern_exec.c (revision 326270) +++ head/sys/kern/kern_exec.c (revision 326271) @@ -1,1740 +1,1742 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_compat.h" #include "opt_hwpmc_hooks.h" #include "opt_ktrace.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #ifdef KDTRACE_HOOKS #include dtrace_execexit_func_t dtrace_fasttrap_exec; #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE1(proc, , , exec, "char *"); SDT_PROBE_DEFINE1(proc, , , exec__failure, "int"); SDT_PROBE_DEFINE1(proc, , , exec__success, "char *"); MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments"); int coredump_pack_fileinfo = 1; SYSCTL_INT(_kern, OID_AUTO, coredump_pack_fileinfo, CTLFLAG_RWTUN, &coredump_pack_fileinfo, 0, "Enable file path packing in 'procstat -f' coredump notes"); int coredump_pack_vmmapinfo = 1; SYSCTL_INT(_kern, OID_AUTO, coredump_pack_vmmapinfo, CTLFLAG_RWTUN, &coredump_pack_vmmapinfo, 0, "Enable file path packing in 'procstat -v' coredump notes"); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS); static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS); static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS); static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD| CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_ps_strings, "LU", ""); /* XXX This should be vm_size_t. 
*/ SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD| CTLFLAG_CAPRD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_usrstack, "LU", ""); SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_stackprot, "I", ""); u_long ps_arg_cache_limit = PAGE_SIZE / 16; SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, &ps_arg_cache_limit, 0, ""); static int disallow_high_osrel; SYSCTL_INT(_kern, OID_AUTO, disallow_high_osrel, CTLFLAG_RW, &disallow_high_osrel, 0, "Disallow execution of binaries built for higher version of the world"); static int map_at_zero = 0; SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RWTUN, &map_at_zero, 0, "Permit processes to map an object at virtual address 0."); EVENTHANDLER_LIST_DECLARE(process_exec); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_psstrings; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings, sizeof(p->p_sysent->sv_psstrings)); return error; } static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_usrstack; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack, sizeof(p->p_sysent->sv_usrstack)); return error; } static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS) { struct proc *p; p = curproc; return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot, sizeof(p->p_sysent->sv_stackprot))); } /* * Each of the items is a pointer to a `const struct execsw', hence the * double pointer here. 
*/ static const struct execsw **execsw; #ifndef _SYS_SYSPROTO_H_ struct execve_args { char *fname; char **argv; char **envv; }; #endif int sys_execve(struct thread *td, struct execve_args *uap) { struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, NULL); post_execve(td, error, oldvmspace); return (error); } #ifndef _SYS_SYSPROTO_H_ struct fexecve_args { int fd; char **argv; char **envv; }; #endif int sys_fexecve(struct thread *td, struct fexecve_args *uap) { struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, NULL, UIO_SYSSPACE, uap->argv, uap->envv); if (error == 0) { args.fd = uap->fd; error = kern_execve(td, &args, NULL); } post_execve(td, error, oldvmspace); return (error); } #ifndef _SYS_SYSPROTO_H_ struct __mac_execve_args { char *fname; char **argv; char **envv; struct mac *mac_p; }; #endif int sys___mac_execve(struct thread *td, struct __mac_execve_args *uap) { #ifdef MAC struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, uap->mac_p); post_execve(td, error, oldvmspace); return (error); #else return (ENOSYS); #endif } int pre_execve(struct thread *td, struct vmspace **oldvmspace) { struct proc *p; int error; KASSERT(td == curthread, ("non-current thread %p", td)); error = 0; p = td->td_proc; if ((p->p_flag & P_HADTHREADS) != 0) { PROC_LOCK(p); if (thread_single(p, SINGLE_BOUNDARY) != 0) error = ERESTART; PROC_UNLOCK(p); } KASSERT(error != 0 || (td->td_pflags & TDP_EXECVMSPC) == 0, ("nested execve")); *oldvmspace = p->p_vmspace; return (error); } void post_execve(struct thread *td, int error, struct vmspace *oldvmspace) { struct proc *p; KASSERT(td == curthread, ("non-current thread %p", td)); p = td->td_proc; if ((p->p_flag & P_HADTHREADS) != 0) { PROC_LOCK(p); /* * If successful, we upgrade to SINGLE_EXIT state to * force other threads to suicide. */ if (error == EJUSTRETURN) thread_single(p, SINGLE_EXIT); else thread_single_end(p, SINGLE_BOUNDARY); PROC_UNLOCK(p); } if ((td->td_pflags & TDP_EXECVMSPC) != 0) { KASSERT(p->p_vmspace != oldvmspace, ("oldvmspace still used")); vmspace_free(oldvmspace); td->td_pflags &= ~TDP_EXECVMSPC; } } /* * XXX: kern_execve has the astonishing property of not always returning to * the caller. If sufficiently bad things happen during the call to * do_execve(), it can end up calling exit1(); as a result, callers must * avoid doing anything which they might need to undo (e.g., allocating * memory). */ int kern_execve(struct thread *td, struct image_args *args, struct mac *mac_p) { AUDIT_ARG_ARGV(args->begin_argv, args->argc, args->begin_envv - args->begin_argv); AUDIT_ARG_ENVV(args->begin_envv, args->envc, args->endp - args->begin_envv); return (do_execve(td, args, mac_p)); } /* * In-kernel implementation of execve(). All arguments are assumed to be * userspace pointers from the passed thread.
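 * For reference, the descriptor-based path above is reached from
 * userspace roughly as follows (illustration only):
 *
 *	int fd = open("/bin/ls", O_EXEC);
 *	char *argv[] = { "ls", NULL };
 *	fexecve(fd, argv, environ);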
*/ static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p) { struct proc *p = td->td_proc; struct nameidata nd; struct ucred *oldcred; struct uidinfo *euip = NULL; register_t *stack_base; int error, i; struct image_params image_params, *imgp; struct vattr attr; int (*img_first)(struct image_params *); struct pargs *oldargs = NULL, *newargs = NULL; struct sigacts *oldsigacts = NULL, *newsigacts = NULL; #ifdef KTRACE struct vnode *tracevp = NULL; struct ucred *tracecred = NULL; #endif struct vnode *oldtextvp = NULL, *newtextvp; cap_rights_t rights; int credential_changing; int textset; #ifdef MAC struct label *interpvplabel = NULL; int will_transition; #endif #ifdef HWPMC_HOOKS struct pmckern_procexec pe; #endif static const char fexecv_proc_title[] = "(fexecv)"; imgp = &image_params; /* * Lock the process and set the P_INEXEC flag to indicate that * it should be left alone until we're done here. This is * necessary to avoid race conditions - e.g. in ptrace() - * that might allow a local user to illicitly obtain elevated * privileges. */ PROC_LOCK(p); KASSERT((p->p_flag & P_INEXEC) == 0, ("%s(): process already has P_INEXEC flag", __func__)); p->p_flag |= P_INEXEC; PROC_UNLOCK(p); /* * Initialize part of the common data */ bzero(imgp, sizeof(*imgp)); imgp->proc = p; imgp->attr = &attr; imgp->args = args; oldcred = p->p_ucred; #ifdef MAC error = mac_execve_enter(imgp, mac_p); if (error) goto exec_fail; #endif /* * Translate the file name. namei() returns a vnode pointer * in ni_vp among other things. * * XXXAUDIT: It would be desirable to also audit the name of the * interpreter if this is an interpreted binary. */ if (args->fname != NULL) { NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME | AUDITVNODE1, UIO_SYSSPACE, args->fname, td); } SDT_PROBE1(proc, , , exec, args->fname); interpret: if (args->fname != NULL) { #ifdef CAPABILITY_MODE /* * While capability mode can't reach this point via direct * path arguments to execve(), we also don't allow * interpreters to be used in capability mode (for now). * Catch indirect lookups and return a permissions error. */ if (IN_CAPABILITY_MODE(td)) { error = ECAPMODE; goto exec_fail; } #endif error = namei(&nd); if (error) goto exec_fail; newtextvp = nd.ni_vp; imgp->vp = newtextvp; } else { AUDIT_ARG_FD(args->fd); /* * Descriptors opened only with O_EXEC or O_RDONLY are allowed. */ error = fgetvp_exec(td, args->fd, cap_rights_init(&rights, CAP_FEXECVE), &newtextvp); if (error) goto exec_fail; vn_lock(newtextvp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(newtextvp); imgp->vp = newtextvp; } /* * Check file permissions (also 'opens' file) */ error = exec_check_permissions(imgp); if (error) goto exec_fail_dealloc; imgp->object = imgp->vp->v_object; if (imgp->object != NULL) vm_object_reference(imgp->object); /* * Set VV_TEXT now so no one can write to the executable while we're * activating it. * * Remember if this was set before and unset it in case this is not * actually an executable image. */ textset = VOP_IS_TEXT(imgp->vp); VOP_SET_TEXT(imgp->vp); error = exec_map_first_page(imgp); if (error) goto exec_fail_dealloc; imgp->proc->p_osrel = 0; /* * Implement image setuid/setgid. * * Determine new credentials before attempting image activators * so that it can be used by process_exec handlers to determine * credential/setid changes. * * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. 
* * We disable setuid/setgid/etc in capability mode on the basis * that most setugid applications are not written with that * environment in mind, and will therefore almost certainly operate * incorrectly. In principle there's no reason that setugid * applications might not be useful in capability mode, so we may want * to reconsider this conservative design choice in the future. * * XXXMAC: For the time being, use NOSUID to also prohibit * transitions on the file system. */ credential_changing = 0; credential_changing |= (attr.va_mode & S_ISUID) && oldcred->cr_uid != attr.va_uid; credential_changing |= (attr.va_mode & S_ISGID) && oldcred->cr_gid != attr.va_gid; #ifdef MAC will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp, interpvplabel, imgp); credential_changing |= will_transition; #endif if (credential_changing && #ifdef CAPABILITY_MODE ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) && #endif (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { imgp->credential_setid = true; VOP_UNLOCK(imgp->vp, 0); imgp->newcred = crdup(oldcred); if (attr.va_mode & S_ISUID) { euip = uifind(attr.va_uid); change_euid(imgp->newcred, euip); } vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); if (attr.va_mode & S_ISGID) change_egid(imgp->newcred, attr.va_gid); /* * Implement correct POSIX saved-id behavior. * * XXXMAC: Note that the current logic will save the * uid and gid if a MAC domain transition occurs, even * though maybe it shouldn't. */ change_svuid(imgp->newcred, imgp->newcred->cr_uid); change_svgid(imgp->newcred, imgp->newcred->cr_gid); } else { /* * Implement correct POSIX saved-id behavior. * * XXX: It's not clear that the existing behavior is * POSIX-compliant. A number of sources indicate that the * saved uid/gid should only be updated if the new ruid is * not equal to the old ruid, or the new euid is not equal * to the old euid and the new euid is not equal to the old * ruid. The FreeBSD code always updates the saved uid/gid. * Also, this code uses the new (replaced) euid and egid as * the source, which may or may not be the right ones to use. */ if (oldcred->cr_svuid != oldcred->cr_uid || oldcred->cr_svgid != oldcred->cr_gid) { VOP_UNLOCK(imgp->vp, 0); imgp->newcred = crdup(oldcred); vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); change_svuid(imgp->newcred, imgp->newcred->cr_uid); change_svgid(imgp->newcred, imgp->newcred->cr_gid); } } /* The new credentials are installed into the process later. */ /* * Do the best to calculate the full path to the image file. */ if (args->fname != NULL && args->fname[0] == '/') imgp->execpath = args->fname; else { VOP_UNLOCK(imgp->vp, 0); if (vn_fullpath(td, imgp->vp, &imgp->execpath, &imgp->freepath) != 0) imgp->execpath = args->fname; vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); } /* * If the current process has a special image activator it * wants to try first, call it. For example, emulating shell * scripts differently. */ error = -1; if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL) error = img_first(imgp); /* * Loop through the list of image activators, calling each one. * An activator returns -1 if there is no match, 0 on success, * and an error otherwise. 
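 * A minimal activator is registered with EXEC_SET(); a sketch, with a
 * hypothetical FOO_MAGIC (illustration only):
 *
 *	static int
 *	exec_foo_imgact(struct image_params *imgp)
 *	{
 *		if (memcmp(imgp->image_header, FOO_MAGIC, 4) != 0)
 *			return (-1);
 *		... map sections, set imgp->entry_addr ...
 *		return (0);
 *	}
 *	static struct execsw foo_execsw = { exec_foo_imgact, "foo" };
 *	EXEC_SET(foo, foo_execsw);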
*/ for (i = 0; error == -1 && execsw[i]; ++i) { if (execsw[i]->ex_imgact == NULL || execsw[i]->ex_imgact == img_first) { continue; } error = (*execsw[i]->ex_imgact)(imgp); } if (error) { if (error == -1) { if (textset == 0) VOP_UNSET_TEXT(imgp->vp); error = ENOEXEC; } goto exec_fail_dealloc; } /* * Special interpreter operation, cleanup and loop up to try to * activate the interpreter. */ if (imgp->interpreted) { exec_unmap_first_page(imgp); /* * VV_TEXT needs to be unset for scripts. There is a short * period before we determine that something is a script where * VV_TEXT will be set. The vnode lock is held over this * entire period so nothing should illegitimately be blocked. */ VOP_UNSET_TEXT(imgp->vp); /* free name buffer and old vnode */ if (args->fname != NULL) NDFREE(&nd, NDF_ONLY_PNBUF); #ifdef MAC mac_execve_interpreter_enter(newtextvp, &interpvplabel); #endif if (imgp->opened) { VOP_CLOSE(newtextvp, FREAD, td->td_ucred, td); imgp->opened = 0; } vput(newtextvp); vm_object_deallocate(imgp->object); imgp->object = NULL; imgp->credential_setid = false; if (imgp->newcred != NULL) { crfree(imgp->newcred); imgp->newcred = NULL; } imgp->execpath = NULL; free(imgp->freepath, M_TEMP); imgp->freepath = NULL; /* set new name to that of the interpreter */ NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_SYSSPACE, imgp->interpreter_name, td); args->fname = imgp->interpreter_name; goto interpret; } /* * NB: We unlock the vnode here because it is believed that none * of the sv_copyout_strings/sv_fixup operations require the vnode. */ VOP_UNLOCK(imgp->vp, 0); if (disallow_high_osrel && P_OSREL_MAJOR(p->p_osrel) > P_OSREL_MAJOR(__FreeBSD_version)) { error = ENOEXEC; uprintf("Osrel %d for image %s too high\n", p->p_osrel, imgp->execpath != NULL ? imgp->execpath : ""); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); goto exec_fail_dealloc; } /* ABI enforces the use of Capsicum. Switch into capabilities mode. */ if (SV_PROC_FLAG(p, SV_CAPSICUM)) sys_cap_enter(td, NULL); /* * Copy out strings (args and env) and initialize stack base */ if (p->p_sysent->sv_copyout_strings) stack_base = (*p->p_sysent->sv_copyout_strings)(imgp); else stack_base = exec_copyout_strings(imgp); /* * If custom stack fixup routine present for this process * let it do the stack setup. * Else stuff argument count as first item on stack */ if (p->p_sysent->sv_fixup != NULL) (*p->p_sysent->sv_fixup)(&stack_base, imgp); else suword(--stack_base, imgp->args->argc); if (args->fdp != NULL) { /* Install a brand new file descriptor table. */ fdinstall_remapped(td, args->fdp); args->fdp = NULL; } else { /* * Keep on using the existing file descriptor table. For * security and other reasons, the file descriptor table * cannot be shared after an exec. */ fdunshare(td); /* close files on exec */ fdcloseexec(td); } /* * Malloc things before we need locks. */ i = imgp->args->begin_envv - imgp->args->begin_argv; /* Cache arguments if they fit inside our allowance */ if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { newargs = pargs_alloc(i); bcopy(imgp->args->begin_argv, newargs->ar_args, i); } /* * For security and other reasons, signal handlers cannot * be shared after an exec. The new process gets a copy of the old * handlers. In execsigs(), the new process will have its signals * reset. 
*/ if (sigacts_shared(p->p_sigacts)) { oldsigacts = p->p_sigacts; newsigacts = sigacts_alloc(); sigacts_copy(newsigacts, oldsigacts); } vn_lock(imgp->vp, LK_SHARED | LK_RETRY); PROC_LOCK(p); if (oldsigacts) p->p_sigacts = newsigacts; /* Stop profiling */ stopprofclock(p); /* reset caught signals */ execsigs(p); /* name this process - nameiexec(p, ndp) */ bzero(p->p_comm, sizeof(p->p_comm)); if (args->fname) bcopy(nd.ni_cnd.cn_nameptr, p->p_comm, min(nd.ni_cnd.cn_namelen, MAXCOMLEN)); else if (vn_commname(newtextvp, p->p_comm, sizeof(p->p_comm)) != 0) bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title)); bcopy(p->p_comm, td->td_name, sizeof(td->td_name)); #ifdef KTR sched_clear_tdname(td); #endif /* * mark as execed, wakeup the process that vforked (if any) and tell * it that it now has its own resources back */ p->p_flag |= P_EXEC; if ((p->p_flag2 & P2_NOTRACE_EXEC) == 0) p->p_flag2 &= ~P2_NOTRACE; if (p->p_flag & P_PPWAIT) { p->p_flag &= ~(P_PPWAIT | P_PPTRACE); cv_broadcast(&p->p_pwait); /* STOPs are no longer ignored, arrange for AST */ signotify(td); } /* * Implement image setuid/setgid installation. */ if (imgp->credential_setid) { /* * Turn off syscall tracing for set-id programs, except for * root. Record any set-id flags first to make sure that * we do not regain any tracing during a possible block. */ setsugid(p); #ifdef KTRACE if (p->p_tracecred != NULL && priv_check_cred(p->p_tracecred, PRIV_DEBUG_DIFFCRED, 0)) ktrprocexec(p, &tracecred, &tracevp); #endif /* * Close any file descriptors 0..2 that reference procfs, * then make sure file descriptors 0..2 are in use. * * Both fdsetugidsafety() and fdcheckstd() may call functions * taking sleepable locks, so temporarily drop our locks. */ PROC_UNLOCK(p); VOP_UNLOCK(imgp->vp, 0); fdsetugidsafety(td); error = fdcheckstd(td); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); if (error != 0) goto exec_fail_dealloc; PROC_LOCK(p); #ifdef MAC if (will_transition) { mac_vnode_execve_transition(oldcred, imgp->newcred, imgp->vp, interpvplabel, imgp); } #endif } else { if (oldcred->cr_uid == oldcred->cr_ruid && oldcred->cr_gid == oldcred->cr_rgid) p->p_flag &= ~P_SUGID; } /* * Set the new credentials. */ if (imgp->newcred != NULL) { proc_set_cred(p, imgp->newcred); crfree(oldcred); oldcred = NULL; } /* * Store the vp for use in procfs. This vnode was referenced by namei * or fgetvp_exec. */ oldtextvp = p->p_textvp; p->p_textvp = newtextvp; #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the exec if it * has declared an interest. */ if (dtrace_fasttrap_exec) dtrace_fasttrap_exec(p); #endif /* * Notify others that we exec'd, and clear the P_INEXEC flag * as we're now a bona fide freshly-execed process. */ KNOTE_LOCKED(p->p_klist, NOTE_EXEC); p->p_flag &= ~P_INEXEC; /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* * Free any previous argument cache and replace it with * the new argument cache, if any. */ oldargs = p->p_args; p->p_args = newargs; newargs = NULL; PROC_UNLOCK(p); #ifdef HWPMC_HOOKS /* * Check if system-wide sampling is in effect or if the * current process is using PMCs. If so, do exec() time * processing. This processing needs to happen AFTER the * P_INEXEC flag is cleared. 
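The P_PPWAIT handling above is the kernel side of vfork(2): the parent is parked on p_pwait until the child execs or exits, and the cv_broadcast() here is what releases it. A userland sketch of the pattern being served:

#include <sys/types.h>
#include <unistd.h>

int
main(void)
{
	pid_t pid;

	pid = vfork();
	if (pid == 0) {
		/* The parent stays suspended until this exec succeeds
		 * (the cv_broadcast(&p->p_pwait) above) or _exit() runs. */
		execl("/bin/echo", "echo", "hello", (char *)NULL);
		_exit(127);
	}
	return (pid == -1);
}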
*/ if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) { VOP_UNLOCK(imgp->vp, 0); pe.pm_credentialschanged = credential_changing; pe.pm_entryaddr = imgp->entry_addr; PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); } #endif /* Set values passed into the program in registers. */ if (p->p_sysent->sv_setregs) (*p->p_sysent->sv_setregs)(td, imgp, (u_long)(uintptr_t)stack_base); else exec_setregs(td, imgp, (u_long)(uintptr_t)stack_base); vfs_mark_atime(imgp->vp, td->td_ucred); SDT_PROBE1(proc, , , exec__success, args->fname); exec_fail_dealloc: if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); if (imgp->vp != NULL) { if (args->fname) NDFREE(&nd, NDF_ONLY_PNBUF); if (imgp->opened) VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td); if (error != 0) vput(imgp->vp); else VOP_UNLOCK(imgp->vp, 0); } if (imgp->object != NULL) vm_object_deallocate(imgp->object); free(imgp->freepath, M_TEMP); if (error == 0) { if (p->p_ptevents & PTRACE_EXEC) { PROC_LOCK(p); if (p->p_ptevents & PTRACE_EXEC) td->td_dbgflags |= TDB_EXEC; PROC_UNLOCK(p); } /* * Stop the process here if its stop event mask has * the S_EXEC bit set. */ STOPEVENT(p, S_EXEC, 0); } else { exec_fail: /* we're done here, clear P_INEXEC */ PROC_LOCK(p); p->p_flag &= ~P_INEXEC; PROC_UNLOCK(p); SDT_PROBE1(proc, , , exec__failure, error); } if (imgp->newcred != NULL && oldcred != NULL) crfree(imgp->newcred); #ifdef MAC mac_execve_exit(imgp); mac_execve_interpreter_exit(interpvplabel); #endif exec_free_args(args); /* * Handle deferred decrement of ref counts. */ if (oldtextvp != NULL) vrele(oldtextvp); #ifdef KTRACE if (tracevp != NULL) vrele(tracevp); if (tracecred != NULL) crfree(tracecred); #endif pargs_drop(oldargs); pargs_drop(newargs); if (oldsigacts != NULL) sigacts_free(oldsigacts); if (euip != NULL) uifree(euip); if (error && imgp->vmspace_destroyed) { /* sorry, no more process anymore. exit gracefully */ exit1(td, 0, SIGABRT); /* NOT REACHED */ } #ifdef KTRACE if (error == 0) ktrprocctor(p); #endif /* * We don't want cpu_set_syscall_retval() to overwrite any of * the register values put in place by exec_setregs(). * Implementations of cpu_set_syscall_retval() will leave * registers unmodified when returning EJUSTRETURN. */ return (error == 0 ? 
EJUSTRETURN : error); } int exec_map_first_page(imgp) struct image_params *imgp; { int rv, i, after, initial_pagein; vm_page_t ma[VM_INITIAL_PAGEIN]; vm_object_t object; if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); object = imgp->vp->v_object; if (object == NULL) return (EACCES); VM_OBJECT_WLOCK(object); #if VM_NRESERVLEVEL > 0 vm_object_color(object, 0); #endif ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY); if (ma[0]->valid != VM_PAGE_BITS_ALL) { vm_page_xbusy(ma[0]); if (!vm_pager_has_page(object, 0, NULL, &after)) { vm_page_lock(ma[0]); vm_page_free(ma[0]); vm_page_unlock(ma[0]); VM_OBJECT_WUNLOCK(object); return (EIO); } initial_pagein = min(after, VM_INITIAL_PAGEIN); KASSERT(initial_pagein <= object->size, ("%s: initial_pagein %d object->size %ju", __func__, initial_pagein, (uintmax_t )object->size)); for (i = 1; i < initial_pagein; i++) { if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) { if (ma[i]->valid) break; if (vm_page_tryxbusy(ma[i])) break; } else { ma[i] = vm_page_alloc(object, i, VM_ALLOC_NORMAL); if (ma[i] == NULL) break; } } initial_pagein = i; rv = vm_pager_get_pages(object, ma, initial_pagein, NULL, NULL); if (rv != VM_PAGER_OK) { for (i = 0; i < initial_pagein; i++) { vm_page_lock(ma[i]); vm_page_free(ma[i]); vm_page_unlock(ma[i]); } VM_OBJECT_WUNLOCK(object); return (EIO); } vm_page_xunbusy(ma[0]); for (i = 1; i < initial_pagein; i++) vm_page_readahead_finish(ma[i]); } vm_page_lock(ma[0]); vm_page_hold(ma[0]); vm_page_activate(ma[0]); vm_page_unlock(ma[0]); VM_OBJECT_WUNLOCK(object); imgp->firstpage = sf_buf_alloc(ma[0], 0); imgp->image_header = (char *)sf_buf_kva(imgp->firstpage); return (0); } void exec_unmap_first_page(struct image_params *imgp) { vm_page_t m; if (imgp->firstpage != NULL) { m = sf_buf_page(imgp->firstpage); sf_buf_free(imgp->firstpage); imgp->firstpage = NULL; vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); } } /* * Destroy old address space, and allocate a new stack. * The new stack is only sgrowsiz large because it is grown * automatically on a page fault. */ int exec_new_vmspace(struct image_params *imgp, struct sysentvec *sv) { int error; struct proc *p = imgp->proc; struct vmspace *vmspace = p->p_vmspace; vm_object_t obj; struct rlimit rlim_stack; vm_offset_t sv_minuser, stack_addr; vm_map_t map; u_long ssiz; imgp->vmspace_destroyed = 1; imgp->sysent = sv; /* May be called with Giant held */ EVENTHANDLER_DIRECT_INVOKE(process_exec, p, imgp); /* * Blow away entire process VM, if address space not shared, * otherwise, create a new VM space so that other threads are * not disrupted */ map = &vmspace->vm_map; if (map_at_zero) sv_minuser = sv->sv_minuser; else sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE); if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser && vm_map_max(map) == sv->sv_maxuser) { shmexit(vmspace); pmap_remove_pages(vmspace_pmap(vmspace)); vm_map_remove(map, vm_map_min(map), vm_map_max(map)); /* An exec terminates mlockall(MCL_FUTURE). 
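The MAP_WIREFUTURE clearing just below is observable from userland: wiring requested with mlockall(MCL_FUTURE) does not carry across exec. A small illustration:

#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
	char *argv[] = { "true", NULL };

	/* Future mappings in this image are wired... */
	mlockall(MCL_CURRENT | MCL_FUTURE);
	/* ...but exec_new_vmspace() clears MAP_WIREFUTURE, so the new
	 * image starts without it. */
	execv("/usr/bin/true", argv);
	return (1);
}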
*/ vm_map_lock(map); vm_map_modflags(map, 0, MAP_WIREFUTURE); vm_map_unlock(map); } else { error = vmspace_exec(p, sv_minuser, sv->sv_maxuser); if (error) return (error); vmspace = p->p_vmspace; map = &vmspace->vm_map; } /* Map a shared page */ obj = sv->sv_shared_page_obj; if (obj != NULL) { vm_object_reference(obj); error = vm_map_fixed(map, obj, 0, sv->sv_shared_page_base, sv->sv_shared_page_len, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_READ | VM_PROT_EXECUTE, MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE); if (error != KERN_SUCCESS) { vm_object_deallocate(obj); return (vm_mmap_to_errno(error)); } } /* Allocate a new stack */ if (imgp->stack_sz != 0) { ssiz = trunc_page(imgp->stack_sz); PROC_LOCK(p); lim_rlimit_proc(p, RLIMIT_STACK, &rlim_stack); PROC_UNLOCK(p); if (ssiz > rlim_stack.rlim_max) ssiz = rlim_stack.rlim_max; if (ssiz > rlim_stack.rlim_cur) { rlim_stack.rlim_cur = ssiz; kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack); } } else if (sv->sv_maxssiz != NULL) { ssiz = *sv->sv_maxssiz; } else { ssiz = maxssiz; } stack_addr = sv->sv_usrstack - ssiz; error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz, obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot : sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN); if (error != KERN_SUCCESS) return (vm_mmap_to_errno(error)); /* * vm_ssize and vm_maxsaddr are somewhat antiquated concepts, but they * are still used to enforce the stack rlimit on the process stack. */ vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; vmspace->vm_maxsaddr = (char *)stack_addr; return (0); } /* * Copy out argument and environment strings from the old process address * space into the temporary string buffer. */ int exec_copyin_args(struct image_args *args, char *fname, enum uio_seg segflg, char **argv, char **envv) { u_long argp, envp; int error; size_t length; bzero(args, sizeof(*args)); if (argv == NULL) return (EFAULT); /* * Allocate demand-paged memory for the file name, argument, and * environment strings. */ error = exec_alloc_args(args); if (error != 0) return (error); /* * Copy the file name. */ if (fname != NULL) { args->fname = args->buf; error = (segflg == UIO_SYSSPACE) ? 
copystr(fname, args->fname, PATH_MAX, &length) : copyinstr(fname, args->fname, PATH_MAX, &length); if (error != 0) goto err_exit; } else length = 0; args->begin_argv = args->buf + length; args->endp = args->begin_argv; args->stringspace = ARG_MAX; /* * extract arguments first */ for (;;) { error = fueword(argv++, &argp); if (error == -1) { error = EFAULT; goto err_exit; } if (argp == 0) break; error = copyinstr((void *)(uintptr_t)argp, args->endp, args->stringspace, &length); if (error != 0) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->argc++; } args->begin_envv = args->endp; /* * extract environment strings */ if (envv) { for (;;) { error = fueword(envv++, &envp); if (error == -1) { error = EFAULT; goto err_exit; } if (envp == 0) break; error = copyinstr((void *)(uintptr_t)envp, args->endp, args->stringspace, &length); if (error != 0) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->envc++; } } return (0); err_exit: exec_free_args(args); return (error); } int exec_copyin_data_fds(struct thread *td, struct image_args *args, const void *data, size_t datalen, const int *fds, size_t fdslen) { struct filedesc *ofdp; const char *p; int *kfds; int error; memset(args, '\0', sizeof(*args)); ofdp = td->td_proc->p_fd; if (datalen >= ARG_MAX || fdslen > ofdp->fd_lastfile + 1) return (E2BIG); error = exec_alloc_args(args); if (error != 0) return (error); args->begin_argv = args->buf; args->stringspace = ARG_MAX; if (datalen > 0) { /* * Argument buffer has been provided. Copy it into the * kernel as a single string and add a terminating null * byte. */ error = copyin(data, args->begin_argv, datalen); if (error != 0) goto err_exit; args->begin_argv[datalen] = '\0'; args->endp = args->begin_argv + datalen + 1; args->stringspace -= datalen + 1; /* * Traditional argument counting. Count the number of * null bytes. */ for (p = args->begin_argv; p < args->endp; ++p) if (*p == '\0') ++args->argc; } else { /* No argument buffer provided. */ args->endp = args->begin_argv; } /* There are no environment variables. */ args->begin_envv = args->endp; /* Create new file descriptor table. 
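For reference, the userland shape consumed by the loops above: exec_copyin_args() fetches one pointer at a time with fueword() and one string at a time with copyinstr(), stopping at the NULL sentinels.

#include <unistd.h>

int
main(void)
{
	char *argv[] = { "ls", "-l", NULL };	/* walked until argp == 0 */
	char *envv[] = { "HOME=/root", NULL };	/* walked until envp == 0 */

	execve("/bin/ls", argv, envv);
	return (1);		/* only reached on error */
}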
*/ kfds = malloc(fdslen * sizeof(int), M_TEMP, M_WAITOK); error = copyin(fds, kfds, fdslen * sizeof(int)); if (error != 0) { free(kfds, M_TEMP); goto err_exit; } error = fdcopy_remapped(ofdp, kfds, fdslen, &args->fdp); free(kfds, M_TEMP); if (error != 0) goto err_exit; return (0); err_exit: exec_free_args(args); return (error); } struct exec_args_kva { vm_offset_t addr; u_int gen; SLIST_ENTRY(exec_args_kva) next; }; static DPCPU_DEFINE(struct exec_args_kva *, exec_args_kva); static SLIST_HEAD(, exec_args_kva) exec_args_kva_freelist; static struct mtx exec_args_kva_mtx; static u_int exec_args_gen; static void exec_prealloc_args_kva(void *arg __unused) { struct exec_args_kva *argkva; u_int i; SLIST_INIT(&exec_args_kva_freelist); mtx_init(&exec_args_kva_mtx, "exec args kva", NULL, MTX_DEF); for (i = 0; i < exec_map_entries; i++) { argkva = malloc(sizeof(*argkva), M_PARGS, M_WAITOK); argkva->addr = kmap_alloc_wait(exec_map, exec_map_entry_size); argkva->gen = exec_args_gen; SLIST_INSERT_HEAD(&exec_args_kva_freelist, argkva, next); } } SYSINIT(exec_args_kva, SI_SUB_EXEC, SI_ORDER_ANY, exec_prealloc_args_kva, NULL); static vm_offset_t exec_alloc_args_kva(void **cookie) { struct exec_args_kva *argkva; argkva = (void *)atomic_readandclear_ptr( (uintptr_t *)DPCPU_PTR(exec_args_kva)); if (argkva == NULL) { mtx_lock(&exec_args_kva_mtx); while ((argkva = SLIST_FIRST(&exec_args_kva_freelist)) == NULL) (void)mtx_sleep(&exec_args_kva_freelist, &exec_args_kva_mtx, 0, "execkva", 0); SLIST_REMOVE_HEAD(&exec_args_kva_freelist, next); mtx_unlock(&exec_args_kva_mtx); } *(struct exec_args_kva **)cookie = argkva; return (argkva->addr); } static void exec_release_args_kva(struct exec_args_kva *argkva, u_int gen) { vm_offset_t base; base = argkva->addr; if (argkva->gen != gen) { vm_map_madvise(exec_map, base, base + exec_map_entry_size, MADV_FREE); argkva->gen = gen; } if (!atomic_cmpset_ptr((uintptr_t *)DPCPU_PTR(exec_args_kva), (uintptr_t)NULL, (uintptr_t)argkva)) { mtx_lock(&exec_args_kva_mtx); SLIST_INSERT_HEAD(&exec_args_kva_freelist, argkva, next); wakeup_one(&exec_args_kva_freelist); mtx_unlock(&exec_args_kva_mtx); } } static void exec_free_args_kva(void *cookie) { exec_release_args_kva(cookie, exec_args_gen); } static void exec_args_kva_lowmem(void *arg __unused) { SLIST_HEAD(, exec_args_kva) head; struct exec_args_kva *argkva; u_int gen; int i; gen = atomic_fetchadd_int(&exec_args_gen, 1) + 1; /* * Force an madvise of each KVA range. Any currently allocated ranges * will have MADV_FREE applied once they are freed. */ SLIST_INIT(&head); mtx_lock(&exec_args_kva_mtx); SLIST_SWAP(&head, &exec_args_kva_freelist, exec_args_kva); mtx_unlock(&exec_args_kva_mtx); while ((argkva = SLIST_FIRST(&head)) != NULL) { SLIST_REMOVE_HEAD(&head, next); exec_release_args_kva(argkva, gen); } CPU_FOREACH(i) { argkva = (void *)atomic_readandclear_ptr( (uintptr_t *)DPCPU_ID_PTR(i, exec_args_kva)); if (argkva != NULL) exec_release_args_kva(argkva, gen); } } EVENTHANDLER_DEFINE(vm_lowmem, exec_args_kva_lowmem, NULL, EVENTHANDLER_PRI_ANY); /* * Allocate temporary demand-paged, zero-filled memory for the file name, * argument, and environment strings. 
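The KVA cache above pairs a lock-free per-CPU slot with a mutex-protected freelist. A hedged userland reduction of the same shape, using C11 atomics and pthreads in place of the kernel primitives (all names hypothetical):

#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

static _Atomic(void *) fast_slot;		/* stands in for the DPCPU slot */
static pthread_mutex_t fl_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t fl_cv = PTHREAD_COND_INITIALIZER;
static void *freelist[16];
static int nfree;

static void *
buf_get(void)
{
	void *p;

	/* Fast path: steal the cached buffer, as the kernel does with
	 * atomic_readandclear_ptr() on the per-CPU pointer. */
	p = atomic_exchange(&fast_slot, NULL);
	if (p != NULL)
		return (p);
	/* Slow path: block on the freelist, like the "execkva" sleep. */
	pthread_mutex_lock(&fl_mtx);
	while (nfree == 0)
		pthread_cond_wait(&fl_cv, &fl_mtx);
	p = freelist[--nfree];
	pthread_mutex_unlock(&fl_mtx);
	return (p);
}

static void
buf_put(void *p)
{
	void *expected = NULL;

	/* Prefer refilling the fast slot; otherwise hand it back to the
	 * freelist and wake one waiter. */
	if (atomic_compare_exchange_strong(&fast_slot, &expected, p))
		return;
	pthread_mutex_lock(&fl_mtx);
	freelist[nfree++] = p;
	pthread_cond_signal(&fl_cv);
	pthread_mutex_unlock(&fl_mtx);
}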
*/ int exec_alloc_args(struct image_args *args) { args->buf = (char *)exec_alloc_args_kva(&args->bufkva); return (0); } void exec_free_args(struct image_args *args) { if (args->buf != NULL) { exec_free_args_kva(args->bufkva); args->buf = NULL; } if (args->fname_buf != NULL) { free(args->fname_buf, M_TEMP); args->fname_buf = NULL; } if (args->fdp != NULL) fdescfree_remapped(args->fdp); } /* * Copy strings out to the new process address space, constructing new arg * and env vector tables. Return a pointer to the base so that it can be used * as the initial stack pointer. */ register_t * exec_copyout_strings(struct image_params *imgp) { int argc, envc; char **vectp; char *stringp; uintptr_t destp; register_t *stack_base; struct ps_strings *arginfo; struct proc *p; size_t execpath_len; int szsigcode, szps; char canary[sizeof(long) * 8]; szps = sizeof(pagesizes[0]) * MAXPAGESIZES; /* * Calculate string base and vector table pointers. * Also deal with signal trampoline code for this exec type. */ if (imgp->execpath != NULL && imgp->auxargs != NULL) execpath_len = strlen(imgp->execpath) + 1; else execpath_len = 0; p = imgp->proc; szsigcode = 0; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; if (p->p_sysent->sv_sigcode_base == 0) { if (p->p_sysent->sv_szsigcode != NULL) szsigcode = *(p->p_sysent->sv_szsigcode); } destp = (uintptr_t)arginfo; /* * install sigcode */ if (szsigcode != 0) { destp -= szsigcode; destp = rounddown2(destp, sizeof(void *)); copyout(p->p_sysent->sv_sigcode, (void *)destp, szsigcode); } /* * Copy the image path for the rtld. */ if (execpath_len != 0) { destp -= execpath_len; imgp->execpathp = destp; copyout(imgp->execpath, (void *)destp, execpath_len); } /* * Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); destp -= sizeof(canary); imgp->canary = destp; copyout(canary, (void *)destp, sizeof(canary)); imgp->canarylen = sizeof(canary); /* * Prepare the pagesizes array. */ destp -= szps; destp = rounddown2(destp, sizeof(void *)); imgp->pagesizes = destp; copyout(pagesizes, (void *)destp, szps); imgp->pagesizeslen = szps; destp -= ARG_MAX - imgp->args->stringspace; destp = rounddown2(destp, sizeof(void *)); /* * If we have a valid auxargs ptr, prepare some room * on the stack. */ if (imgp->auxargs) { /* * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for * lower compatibility. */ imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size : (AT_COUNT * 2); /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets,and imgp->auxarg_size is room * for argument of Runtime loader. */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2 + imgp->auxarg_size) * sizeof(char *)); } else { /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(char *)); } /* * vectp also becomes our initial stack base */ stack_base = (register_t *)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* * Copy out strings - arguments and environment. */ copyout(stringp, (void *)destp, ARG_MAX - imgp->args->stringspace); /* * Fill in "ps_strings" struct for ps, w, etc. */ suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); suword32(&arginfo->ps_nargvstr, argc); /* * Fill in argument portion of vector table. 
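Following the destp arithmetic above, the top of the new image's stack ends up laid out roughly as sketched below (highest address first; the execpath and auxargs pieces exist only when imgp->auxargs is set, and the trampoline only when no shared-page sigcode base is in use):

	sv_psstrings ->  struct ps_strings
	                 signal trampoline (szsigcode bytes)
	                 execpath (for the runtime loader)
	                 SSP canary (sizeof(long) * 8 bytes)
	                 pagesizes[] copy (szps bytes)
	                 argument and environment strings
	                 room for ELF auxargs (imgp->auxarg_size pointers)
	                 NULL, env vector, NULL, arg vector
	stack_base ----> (== vectp, returned to the caller)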
*/ for (; argc > 0; --argc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* a null vector table pointer separates the argp's from the envp's */ suword(vectp++, 0); suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); suword32(&arginfo->ps_nenvstr, envc); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* end of vector table is a null pointer */ suword(vectp, 0); return (stack_base); } /* * Check permissions of file to execute. * Called with imgp->vp locked. * Return 0 for success or error code on failure. */ int exec_check_permissions(struct image_params *imgp) { struct vnode *vp = imgp->vp; struct vattr *attr = imgp->attr; struct thread *td; int error, writecount; td = curthread; /* Get file attributes */ error = VOP_GETATTR(vp, attr, td->td_ucred); if (error) return (error); #ifdef MAC error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp); if (error) return (error); #endif /* * 1) Check if file execution is disabled for the filesystem that * this file resides on. * 2) Ensure that at least one execute bit is on. Otherwise, a * privileged user will always succeed, and we don't want this * to happen unless the file really is executable. * 3) Ensure that the file is a regular file. */ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || (attr->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0 || (attr->va_type != VREG)) return (EACCES); /* * Zero length files can't be exec'd */ if (attr->va_size == 0) return (ENOEXEC); /* * Check for execute permission to file based on current credentials. */ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) return (error); /* * Check number of open-for-writes on the file and deny execution * if there are any. */ error = VOP_GET_WRITECOUNT(vp, &writecount); if (error != 0) return (error); if (writecount != 0) return (ETXTBSY); /* * Call filesystem specific open routine (which does nothing in the * general case). 
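A worked example of the vector-fill loops above: with argc = 2, envc = 1 and the string area holding "ls\0-l\0HOME=/\0" starting at destp, the loops store the addresses of "ls" and "-l" into the argument slots (destp hopping past each NUL), then a NULL separator, then the address of "HOME=/", then the final NULL, while ps_argvstr/ps_nargvstr and ps_envstr/ps_nenvstr in the ps_strings struct record the two sub-vectors for ps(1) and friends.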
*/ error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL); if (error == 0) imgp->opened = 1; return (error); } /* * Exec handler registration */ int exec_register(const struct execsw *execsw_arg) { const struct execsw **es, **xs, **newexecsw; int count = 2; /* New slot and trailing NULL */ if (execsw) for (es = execsw; *es; es++) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); xs = newexecsw; if (execsw) for (es = execsw; *es; es++) *xs++ = *es; *xs++ = execsw_arg; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } int exec_unregister(const struct execsw *execsw_arg) { const struct execsw **es, **xs, **newexecsw; int count = 1; if (execsw == NULL) panic("unregister with no handlers left?\n"); for (es = execsw; *es; es++) { if (*es == execsw_arg) break; } if (*es == NULL) return (ENOENT); for (es = execsw; *es; es++) if (*es != execsw_arg) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); xs = newexecsw; for (es = execsw; *es; es++) if (*es != execsw_arg) *xs++ = *es; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } Index: head/sys/kern/kern_fail.c =================================================================== --- head/sys/kern/kern_fail.c (revision 326270) +++ head/sys/kern/kern_fail.c (revision 326271) @@ -1,1124 +1,1126 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2009 Isilon Inc http://www.isilon.com/ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /** * @file * * fail(9) Facility. * * @ingroup failpoint_private */ /** * @defgroup failpoint fail(9) Facility * * Failpoints allow for injecting fake errors into running code on the fly, * without modifying code or recompiling with flags. Failpoints are always * present, and are very efficient when disabled. Failpoints are described * in man fail(9). */ /** * @defgroup failpoint_private Private fail(9) Implementation functions * * Private implementations for the actual failpoint code. 
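As a concrete use of the facility described above, a sketch in the fail(9) style; mymod_read and the point name are hypothetical, while KFAIL_POINT_RETURN and the DEBUG_FP parent appear later in this file:

static int
mymod_read(void *softc)
{
	/* Evaluates debug.fail_point.mymod_read_err; when set to e.g.
	 * "return(5)" or "1%return(5)" this returns early with that
	 * errno, and when unset it costs almost nothing. */
	KFAIL_POINT_RETURN(DEBUG_FP, mymod_read_err);
	/* ... normal read path ... */
	return (0);
}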
* * @ingroup failpoint */ /** * @addtogroup failpoint_private * @{ */ #include __FBSDID("$FreeBSD$"); #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef ILOG_DEFINE_FOR_FILE ILOG_DEFINE_FOR_FILE(L_ISI_FAIL_POINT, L_ILOG, fail_point); #endif static MALLOC_DEFINE(M_FAIL_POINT, "Fail Points", "fail points system"); #define fp_free(ptr) free(ptr, M_FAIL_POINT) #define fp_malloc(size, flags) malloc((size), M_FAIL_POINT, (flags)) #define fs_free(ptr) fp_free(ptr) #define fs_malloc() fp_malloc(sizeof(struct fail_point_setting), \ M_WAITOK | M_ZERO) /** * These define the wchans that are used for sleeping, pausing respectively. * They are chosen arbitrarily but need to be distinct to the failpoint and * the sleep/pause distinction. */ #define FP_SLEEP_CHANNEL(fp) (void*)(fp) #define FP_PAUSE_CHANNEL(fp) __DEVOLATILE(void*, &fp->fp_setting) /** * Don't allow more than this many entries in a fail point set by sysctl. * The 99.99...% case is to have 1 entry. I can't imagine having this many * entries, so it should not limit us. Saves on re-mallocs while holding * a non-sleepable lock. */ #define FP_MAX_ENTRY_COUNT 20 /* Used to drain sbufs to the sysctl output */ int fail_sysctl_drain_func(void *, const char *, int); /* Head of tailq of struct fail_point_entry */ TAILQ_HEAD(fail_point_entry_queue, fail_point_entry); /** * fp entries garbage list; outstanding entries are cleaned up in the * garbage collector */ STAILQ_HEAD(fail_point_setting_garbage, fail_point_setting); static struct fail_point_setting_garbage fp_setting_garbage = STAILQ_HEAD_INITIALIZER(fp_setting_garbage); static struct mtx mtx_garbage_list; MTX_SYSINIT(mtx_garbage_list, &mtx_garbage_list, "fail point garbage mtx", MTX_SPIN); static struct sx sx_fp_set; SX_SYSINIT(sx_fp_set, &sx_fp_set, "fail point set sx"); /** * Failpoint types. * Don't change these without changing fail_type_strings in fail.c. * @ingroup failpoint_private */ enum fail_point_t { FAIL_POINT_OFF, /**< don't fail */ FAIL_POINT_PANIC, /**< panic */ FAIL_POINT_RETURN, /**< return an errorcode */ FAIL_POINT_BREAK, /**< break into the debugger */ FAIL_POINT_PRINT, /**< print a message */ FAIL_POINT_SLEEP, /**< sleep for some msecs */ FAIL_POINT_PAUSE, /**< sleep until failpoint is set to off */ FAIL_POINT_YIELD, /**< yield the cpu */ FAIL_POINT_DELAY, /**< busy wait the cpu */ FAIL_POINT_NUMTYPES, FAIL_POINT_INVALID = -1 }; static struct { const char *name; int nmlen; } fail_type_strings[] = { #define FP_TYPE_NM_LEN(s) { s, sizeof(s) - 1 } [FAIL_POINT_OFF] = FP_TYPE_NM_LEN("off"), [FAIL_POINT_PANIC] = FP_TYPE_NM_LEN("panic"), [FAIL_POINT_RETURN] = FP_TYPE_NM_LEN("return"), [FAIL_POINT_BREAK] = FP_TYPE_NM_LEN("break"), [FAIL_POINT_PRINT] = FP_TYPE_NM_LEN("print"), [FAIL_POINT_SLEEP] = FP_TYPE_NM_LEN("sleep"), [FAIL_POINT_PAUSE] = FP_TYPE_NM_LEN("pause"), [FAIL_POINT_YIELD] = FP_TYPE_NM_LEN("yield"), [FAIL_POINT_DELAY] = FP_TYPE_NM_LEN("delay"), }; #define FE_COUNT_UNTRACKED (INT_MIN) /** * Internal structure tracking a single term of a complete failpoint. * @ingroup failpoint_private */ struct fail_point_entry { volatile bool fe_stale; enum fail_point_t fe_type; /**< type of entry */ int fe_arg; /**< argument to type (e.g. 
return value) */ int fe_prob; /**< likelihood of firing in millionths */ int32_t fe_count; /**< number of times to fire, -1 means infinite */ pid_t fe_pid; /**< only fail for this process */ struct fail_point *fe_parent; /**< backpointer to fp */ TAILQ_ENTRY(fail_point_entry) fe_entries; /**< next entry ptr */ }; struct fail_point_setting { STAILQ_ENTRY(fail_point_setting) fs_garbage_link; struct fail_point_entry_queue fp_entry_queue; struct fail_point * fs_parent; struct mtx feq_mtx; /* Gives fail_point_pause something to do. */ }; /** * Defines stating the equivalent of probablilty one (100%) */ enum { PROB_MAX = 1000000, /* probability between zero and this number */ PROB_DIGITS = 6 /* number of zero's in above number */ }; /* Get a ref on an fp's fp_setting */ static inline struct fail_point_setting *fail_point_setting_get_ref( struct fail_point *fp); /* Release a ref on an fp_setting */ static inline void fail_point_setting_release_ref(struct fail_point *fp); /* Allocate and initialize a struct fail_point_setting */ static struct fail_point_setting *fail_point_setting_new(struct fail_point *); /* Free a struct fail_point_setting */ static void fail_point_setting_destroy(struct fail_point_setting *fp_setting); /* Allocate and initialize a struct fail_point_entry */ static struct fail_point_entry *fail_point_entry_new(struct fail_point_setting *); /* Free a struct fail_point_entry */ static void fail_point_entry_destroy(struct fail_point_entry *fp_entry); /* Append fp setting to garbage list */ static inline void fail_point_setting_garbage_append( struct fail_point_setting *fp_setting); /* Swap fp's setting with fp_setting_new */ static inline struct fail_point_setting * fail_point_swap_settings(struct fail_point *fp, struct fail_point_setting *fp_setting_new); /* Free up any zero-ref setting in the garbage queue */ static void fail_point_garbage_collect(void); /* If this fail point's setting are empty, then swap it out to NULL. 
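A worked example of the fixed-point probability encoding above: the term "1.5%return(5)" parses to units = 1 and a PROB_DIGITS - 2 = 4 digit decimal of 5000, so fe_prob = 1 * (PROB_MAX / 100) + 5000 = 15000, i.e. the entry fires on roughly 15000 of every 1000000 (PROB_MAX) evaluations.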
*/ static inline void fail_point_eval_swap_out(struct fail_point *fp, struct fail_point_setting *fp_setting); bool fail_point_is_off(struct fail_point *fp) { bool return_val; struct fail_point_setting *fp_setting; struct fail_point_entry *ent; return_val = true; fp_setting = fail_point_setting_get_ref(fp); if (fp_setting != NULL) { TAILQ_FOREACH(ent, &fp_setting->fp_entry_queue, fe_entries) { if (!ent->fe_stale) { return_val = false; break; } } } fail_point_setting_release_ref(fp); return (return_val); } /* Allocate and initialize a struct fail_point_setting */ static struct fail_point_setting * fail_point_setting_new(struct fail_point *fp) { struct fail_point_setting *fs_new; fs_new = fs_malloc(); fs_new->fs_parent = fp; TAILQ_INIT(&fs_new->fp_entry_queue); mtx_init(&fs_new->feq_mtx, "fail point entries", NULL, MTX_SPIN); fail_point_setting_garbage_append(fs_new); return (fs_new); } /* Free a struct fail_point_setting */ static void fail_point_setting_destroy(struct fail_point_setting *fp_setting) { struct fail_point_entry *ent; while (!TAILQ_EMPTY(&fp_setting->fp_entry_queue)) { ent = TAILQ_FIRST(&fp_setting->fp_entry_queue); TAILQ_REMOVE(&fp_setting->fp_entry_queue, ent, fe_entries); fail_point_entry_destroy(ent); } fs_free(fp_setting); } /* Allocate and initialize a struct fail_point_entry */ static struct fail_point_entry * fail_point_entry_new(struct fail_point_setting *fp_setting) { struct fail_point_entry *fp_entry; fp_entry = fp_malloc(sizeof(struct fail_point_entry), M_WAITOK | M_ZERO); fp_entry->fe_parent = fp_setting->fs_parent; fp_entry->fe_prob = PROB_MAX; fp_entry->fe_pid = NO_PID; fp_entry->fe_count = FE_COUNT_UNTRACKED; TAILQ_INSERT_TAIL(&fp_setting->fp_entry_queue, fp_entry, fe_entries); return (fp_entry); } /* Free a struct fail_point_entry */ static void fail_point_entry_destroy(struct fail_point_entry *fp_entry) { fp_free(fp_entry); } /* Get a ref on an fp's fp_setting */ static inline struct fail_point_setting * fail_point_setting_get_ref(struct fail_point *fp) { struct fail_point_setting *fp_setting; /* Invariant: if we have a ref, our pointer to fp_setting is safe */ atomic_add_acq_32(&fp->fp_ref_cnt, 1); fp_setting = fp->fp_setting; return (fp_setting); } /* Release a ref on an fp_setting */ static inline void fail_point_setting_release_ref(struct fail_point *fp) { KASSERT(&fp->fp_ref_cnt > 0, ("Attempting to deref w/no refs")); atomic_subtract_rel_32(&fp->fp_ref_cnt, 1); } /* Append fp entries to fp garbage list */ static inline void fail_point_setting_garbage_append(struct fail_point_setting *fp_setting) { mtx_lock_spin(&mtx_garbage_list); STAILQ_INSERT_TAIL(&fp_setting_garbage, fp_setting, fs_garbage_link); mtx_unlock_spin(&mtx_garbage_list); } /* Swap fp's entries with fp_setting_new */ static struct fail_point_setting * fail_point_swap_settings(struct fail_point *fp, struct fail_point_setting *fp_setting_new) { struct fail_point_setting *fp_setting_old; fp_setting_old = fp->fp_setting; fp->fp_setting = fp_setting_new; return (fp_setting_old); } static inline void fail_point_eval_swap_out(struct fail_point *fp, struct fail_point_setting *fp_setting) { /* We may have already been swapped out and replaced; ignore. 
*/ if (fp->fp_setting == fp_setting) fail_point_swap_settings(fp, NULL); } /* Free up any zero-ref entries in the garbage queue */ static void fail_point_garbage_collect(void) { struct fail_point_setting *fs_current, *fs_next; struct fail_point_setting_garbage fp_ents_free_list; /** * We will transfer the entries to free to fp_ents_free_list while holding * the spin mutex, then free it after we drop the lock. This avoids * triggering witness due to sleepable mutexes in the memory * allocator. */ STAILQ_INIT(&fp_ents_free_list); mtx_lock_spin(&mtx_garbage_list); STAILQ_FOREACH_SAFE(fs_current, &fp_setting_garbage, fs_garbage_link, fs_next) { if (fs_current->fs_parent->fp_setting != fs_current && fs_current->fs_parent->fp_ref_cnt == 0) { STAILQ_REMOVE(&fp_setting_garbage, fs_current, fail_point_setting, fs_garbage_link); STAILQ_INSERT_HEAD(&fp_ents_free_list, fs_current, fs_garbage_link); } } mtx_unlock_spin(&mtx_garbage_list); STAILQ_FOREACH_SAFE(fs_current, &fp_ents_free_list, fs_garbage_link, fs_next) fail_point_setting_destroy(fs_current); } /* Drain out all refs from this fail point */ static inline void fail_point_drain(struct fail_point *fp, int expected_ref) { struct fail_point_setting *entries; entries = fail_point_swap_settings(fp, NULL); /** * We have unpaused all threads; so we will wait no longer * than the time taken for the longest remaining sleep, or * the length of time of a long-running code block. */ while (fp->fp_ref_cnt > expected_ref) { wakeup(FP_PAUSE_CHANNEL(fp)); tsleep(&fp, PWAIT, "fail_point_drain", hz / 100); } fail_point_swap_settings(fp, entries); } static inline void fail_point_pause(struct fail_point *fp, enum fail_point_return_code *pret, struct mtx *mtx_sleep) { if (fp->fp_pre_sleep_fn) fp->fp_pre_sleep_fn(fp->fp_pre_sleep_arg); msleep_spin(FP_PAUSE_CHANNEL(fp), mtx_sleep, "failpt", 0); if (fp->fp_post_sleep_fn) fp->fp_post_sleep_fn(fp->fp_post_sleep_arg); } static inline void fail_point_sleep(struct fail_point *fp, int msecs, enum fail_point_return_code *pret) { int timo; /* Convert from millisecs to ticks, rounding up */ timo = howmany((int64_t)msecs * hz, 1000L); if (timo > 0) { if (!(fp->fp_flags & FAIL_POINT_USE_TIMEOUT_PATH)) { if (fp->fp_pre_sleep_fn) fp->fp_pre_sleep_fn(fp->fp_pre_sleep_arg); tsleep(FP_SLEEP_CHANNEL(fp), PWAIT, "failpt", timo); if (fp->fp_post_sleep_fn) fp->fp_post_sleep_fn(fp->fp_post_sleep_arg); } else { if (fp->fp_pre_sleep_fn) fp->fp_pre_sleep_fn(fp->fp_pre_sleep_arg); timeout(fp->fp_post_sleep_fn, fp->fp_post_sleep_arg, timo); *pret = FAIL_POINT_RC_QUEUED; } } } static char *parse_fail_point(struct fail_point_setting *, char *); static char *parse_term(struct fail_point_setting *, char *); static char *parse_number(int *out_units, int *out_decimal, char *); static char *parse_type(struct fail_point_entry *, char *); /** * Initialize a fail_point. The name is formed in a printf-like fashion * from "fmt" and subsequent arguments. This function is generally used * for custom failpoints located at odd places in the sysctl tree, and is * not explicitly needed for standard in-line-declared failpoints. * * @ingroup failpoint */ void fail_point_init(struct fail_point *fp, const char *fmt, ...) { va_list ap; char *name; int n; fp->fp_setting = NULL; fp->fp_flags = 0; /* Figure out the size of the name. */ va_start(ap, fmt); n = vsnprintf(NULL, 0, fmt, ap); va_end(ap); /* Allocate the name and fill it in. 
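Since the name is built printf-style, a driver can mint per-instance fail points, paired with fail_point_destroy() defined just below. A hedged sketch (the mymod names are hypothetical):

static struct fail_point mymod_fp;

static void
mymod_attach(int unit)
{
	/* Mints a dynamically named point, e.g. "mymod0_io". */
	fail_point_init(&mymod_fp, "mymod%d_io", unit);
}

static void
mymod_detach(void)
{
	/* Frees the FAIL_POINT_DYNAMIC_NAME buffer and wakes pausers. */
	fail_point_destroy(&mymod_fp);
}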
*/ name = fp_malloc(n + 1, M_WAITOK); if (name != NULL) { va_start(ap, fmt); vsnprintf(name, n + 1, fmt, ap); va_end(ap); } fp->fp_name = name; fp->fp_location = ""; fp->fp_flags |= FAIL_POINT_DYNAMIC_NAME; fp->fp_pre_sleep_fn = NULL; fp->fp_pre_sleep_arg = NULL; fp->fp_post_sleep_fn = NULL; fp->fp_post_sleep_arg = NULL; } /** * Free the resources held by a fail_point, and wake any paused threads. * Thou shalt not allow threads to hit this fail point after you enter this * function, nor shall you call this multiple times for a given fp. * @ingroup failpoint */ void fail_point_destroy(struct fail_point *fp) { fail_point_drain(fp, 0); if ((fp->fp_flags & FAIL_POINT_DYNAMIC_NAME) != 0) { fp_free(__DECONST(void *, fp->fp_name)); fp->fp_name = NULL; } fp->fp_flags = 0; sx_xlock(&sx_fp_set); fail_point_garbage_collect(); sx_xunlock(&sx_fp_set); } /** * This does the real work of evaluating a fail point. If the fail point tells * us to return a value, this function returns 1 and fills in 'return_value' * (return_value is allowed to be null). If the fail point tells us to panic, * we never return. Otherwise we just return 0 after doing some work, which * means "keep going". */ enum fail_point_return_code fail_point_eval_nontrivial(struct fail_point *fp, int *return_value) { bool execute = false; struct fail_point_entry *ent; struct fail_point_setting *fp_setting; enum fail_point_return_code ret; int cont; int count; int msecs; int usecs; ret = FAIL_POINT_RC_CONTINUE; cont = 0; /* don't continue by default */ fp_setting = fail_point_setting_get_ref(fp); if (fp_setting == NULL) goto abort; TAILQ_FOREACH(ent, &fp_setting->fp_entry_queue, fe_entries) { if (ent->fe_stale) continue; if (ent->fe_prob < PROB_MAX && ent->fe_prob < random() % PROB_MAX) continue; if (ent->fe_pid != NO_PID && ent->fe_pid != curproc->p_pid) continue; if (ent->fe_count != FE_COUNT_UNTRACKED) { count = ent->fe_count; while (count > 0) { if (atomic_cmpset_32(&ent->fe_count, count, count - 1)) { count--; execute = true; break; } count = ent->fe_count; } if (execute == false) /* We lost the race; consider the entry stale and bail now */ continue; if (count == 0) ent->fe_stale = true; } switch (ent->fe_type) { case FAIL_POINT_PANIC: panic("fail point %s panicking", fp->fp_name); /* NOTREACHED */ case FAIL_POINT_RETURN: if (return_value != NULL) *return_value = ent->fe_arg; ret = FAIL_POINT_RC_RETURN; break; case FAIL_POINT_BREAK: printf("fail point %s breaking to debugger\n", fp->fp_name); breakpoint(); break; case FAIL_POINT_PRINT: printf("fail point %s executing\n", fp->fp_name); cont = ent->fe_arg; break; case FAIL_POINT_SLEEP: msecs = ent->fe_arg; if (msecs) fail_point_sleep(fp, msecs, &ret); break; case FAIL_POINT_PAUSE: /** * Pausing is inherently strange with multiple * entries given our design. That is because some * entries could be unreachable, for instance in cases like: * pause->return. We can never reach the return entry. * The sysctl layer actually truncates all entries after * a pause for this reason. */ mtx_lock_spin(&fp_setting->feq_mtx); fail_point_pause(fp, &ret, &fp_setting->feq_mtx); mtx_unlock_spin(&fp_setting->feq_mtx); break; case FAIL_POINT_YIELD: kern_yield(PRI_UNCHANGED); break; case FAIL_POINT_DELAY: usecs = ent->fe_arg; DELAY(usecs); break; default: break; } if (cont == 0) break; } if (fail_point_is_off(fp)) fail_point_eval_swap_out(fp, fp_setting); abort: fail_point_setting_release_ref(fp); return (ret); } /** * Translate internal fail_point structure into human-readable text. 
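Two worked evaluations of the loop above: "5*return(55)" starts fe_count at 5 and decrements it through the atomic_cmpset_32() race-recovery loop, so exactly five hook evaluations return 55 before the entry goes stale; and "1%print(1)->return(7)", when its 1% roll succeeds, prints the "executing" message and, because the print term's nonzero argument becomes cont, falls through to the next entry and returns 7.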
*/ static void fail_point_get(struct fail_point *fp, struct sbuf *sb, bool verbose) { struct fail_point_entry *ent; struct fail_point_setting *fp_setting; struct fail_point_entry *fp_entry_cpy; int cnt_sleeping; int idx; int printed_entry_count; cnt_sleeping = 0; idx = 0; printed_entry_count = 0; fp_entry_cpy = fp_malloc(sizeof(struct fail_point_entry) * (FP_MAX_ENTRY_COUNT + 1), M_WAITOK); fp_setting = fail_point_setting_get_ref(fp); if (fp_setting != NULL) { TAILQ_FOREACH(ent, &fp_setting->fp_entry_queue, fe_entries) { if (ent->fe_stale) continue; KASSERT(printed_entry_count < FP_MAX_ENTRY_COUNT, ("FP entry list larger than allowed")); fp_entry_cpy[printed_entry_count] = *ent; ++printed_entry_count; } } fail_point_setting_release_ref(fp); /* This is our equivalent of a NULL terminator */ fp_entry_cpy[printed_entry_count].fe_type = FAIL_POINT_INVALID; while (idx < printed_entry_count) { ent = &fp_entry_cpy[idx]; ++idx; if (ent->fe_prob < PROB_MAX) { int decimal = ent->fe_prob % (PROB_MAX / 100); int units = ent->fe_prob / (PROB_MAX / 100); sbuf_printf(sb, "%d", units); if (decimal) { int digits = PROB_DIGITS - 2; while (!(decimal % 10)) { digits--; decimal /= 10; } sbuf_printf(sb, ".%0*d", digits, decimal); } sbuf_printf(sb, "%%"); } if (ent->fe_count >= 0) sbuf_printf(sb, "%d*", ent->fe_count); sbuf_printf(sb, "%s", fail_type_strings[ent->fe_type].name); if (ent->fe_arg) sbuf_printf(sb, "(%d)", ent->fe_arg); if (ent->fe_pid != NO_PID) sbuf_printf(sb, "[pid %d]", ent->fe_pid); if (TAILQ_NEXT(ent, fe_entries)) sbuf_printf(sb, "->"); } if (!printed_entry_count) sbuf_printf(sb, "off"); fp_free(fp_entry_cpy); if (verbose) { #ifdef STACK /* Print number of sleeping threads. queue=0 is the argument * used by msleep when sending our threads to sleep. */ sbuf_printf(sb, "\nsleeping_thread_stacks = {\n"); sleepq_sbuf_print_stacks(sb, FP_SLEEP_CHANNEL(fp), 0, &cnt_sleeping); sbuf_printf(sb, "},\n"); #endif sbuf_printf(sb, "sleeping_thread_count = %d,\n", cnt_sleeping); #ifdef STACK sbuf_printf(sb, "paused_thread_stacks = {\n"); sleepq_sbuf_print_stacks(sb, FP_PAUSE_CHANNEL(fp), 0, &cnt_sleeping); sbuf_printf(sb, "},\n"); #endif sbuf_printf(sb, "paused_thread_count = %d\n", cnt_sleeping); } } /** * Set an internal fail_point structure from a human-readable failpoint string * in a lock-safe manner. */ static int fail_point_set(struct fail_point *fp, char *buf) { struct fail_point_entry *ent, *ent_next; struct fail_point_setting *entries; bool should_wake_paused; bool should_truncate; int error; error = 0; should_wake_paused = false; should_truncate = false; /* Parse new entries. */ /** * ref protects our new malloc'd stuff from being garbage collected * before we link it. */ fail_point_setting_get_ref(fp); entries = fail_point_setting_new(fp); if (parse_fail_point(entries, buf) == NULL) { STAILQ_REMOVE(&fp_setting_garbage, entries, fail_point_setting, fs_garbage_link); fail_point_setting_destroy(entries); error = EINVAL; goto end; } /** * Transfer the entries we are going to keep to a new list. * Get rid of useless zero probability entries, and entries with hit * count 0. * If 'off' is present, and it has no hit count set, then all entries * after it are discarded since they are unreachable. 
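Round-tripping the earlier probability example through the printing code above: for fe_prob = 15000, units = 15000 / (PROB_MAX / 100) = 1, and decimal = 5000 is reduced by its trailing zeros down to 5 with a remaining width of 1, yielding "1.5%"; with the type name and argument appended, the setting reads back as "1.5%return(5)".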
*/ TAILQ_FOREACH_SAFE(ent, &entries->fp_entry_queue, fe_entries, ent_next) { if (ent->fe_prob == 0 || ent->fe_count == 0) { printf("Discarding entry which cannot execute %s\n", fail_type_strings[ent->fe_type].name); TAILQ_REMOVE(&entries->fp_entry_queue, ent, fe_entries); fp_free(ent); continue; } else if (should_truncate) { printf("Discarding unreachable entry %s\n", fail_type_strings[ent->fe_type].name); TAILQ_REMOVE(&entries->fp_entry_queue, ent, fe_entries); fp_free(ent); continue; } if (ent->fe_type == FAIL_POINT_OFF) { should_wake_paused = true; if (ent->fe_count == FE_COUNT_UNTRACKED) { should_truncate = true; TAILQ_REMOVE(&entries->fp_entry_queue, ent, fe_entries); fp_free(ent); } } else if (ent->fe_type == FAIL_POINT_PAUSE) { should_truncate = true; } else if (ent->fe_type == FAIL_POINT_SLEEP && (fp->fp_flags & FAIL_POINT_NONSLEEPABLE)) { /** * If this fail point is annotated as being in a * non-sleepable ctx, convert sleep to delay and * convert the msec argument to usecs. */ printf("Sleep call request on fail point in " "non-sleepable context; using delay instead " "of sleep\n"); ent->fe_type = FAIL_POINT_DELAY; ent->fe_arg *= 1000; } } if (TAILQ_EMPTY(&entries->fp_entry_queue)) { entries = fail_point_swap_settings(fp, NULL); if (entries != NULL) wakeup(FP_PAUSE_CHANNEL(fp)); } else { if (should_wake_paused) wakeup(FP_PAUSE_CHANNEL(fp)); fail_point_swap_settings(fp, entries); } end: #ifdef IWARNING if (error) IWARNING("Failed to set %s %s to %s", fp->fp_name, fp->fp_location, buf); else INOTICE("Set %s %s to %s", fp->fp_name, fp->fp_location, buf); #endif /* IWARNING */ fail_point_setting_release_ref(fp); return (error); } #define MAX_FAIL_POINT_BUF 1023 /** * Handle kernel failpoint set/get. */ int fail_point_sysctl(SYSCTL_HANDLER_ARGS) { struct fail_point *fp; char *buf; struct sbuf sb, *sb_check; int error; buf = NULL; error = 0; fp = arg1; sb_check = sbuf_new(&sb, NULL, 1024, SBUF_AUTOEXTEND); if (sb_check != &sb) return (ENOMEM); sbuf_set_drain(&sb, (sbuf_drain_func *)fail_sysctl_drain_func, req); /* Setting */ /** * Lock protects any new entries from being garbage collected before we * can link them to the fail point. */ sx_xlock(&sx_fp_set); if (req->newptr) { if (req->newlen > MAX_FAIL_POINT_BUF) { error = EINVAL; goto out; } buf = fp_malloc(req->newlen + 1, M_WAITOK); error = SYSCTL_IN(req, buf, req->newlen); if (error) goto out; buf[req->newlen] = '\0'; error = fail_point_set(fp, buf); } fail_point_garbage_collect(); sx_xunlock(&sx_fp_set); /* Retrieving. */ fail_point_get(fp, &sb, false); out: sbuf_finish(&sb); sbuf_delete(&sb); if (buf) fp_free(buf); return (error); } int fail_point_sysctl_status(SYSCTL_HANDLER_ARGS) { struct fail_point *fp; struct sbuf sb, *sb_check; fp = arg1; sb_check = sbuf_new(&sb, NULL, 1024, SBUF_AUTOEXTEND); if (sb_check != &sb) return (ENOMEM); sbuf_set_drain(&sb, (sbuf_drain_func *)fail_sysctl_drain_func, req); /* Retrieving. */ fail_point_get(fp, &sb, true); sbuf_finish(&sb); sbuf_delete(&sb); /** * Lock protects any new entries from being garbage collected before we * can link them to the fail point. */ sx_xlock(&sx_fp_set); fail_point_garbage_collect(); sx_xunlock(&sx_fp_set); return (0); } int fail_sysctl_drain_func(void *sysctl_args, const char *buf, int len) { struct sysctl_req *sa; int error; sa = sysctl_args; error = SYSCTL_OUT(sa, buf, len); if (error == ENOMEM) return (-1); else return (len); } /** * Internal helper function to translate a human-readable failpoint string * into a internally-parsable fail_point structure. 
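From userland the whole cycle above is driven through sysctl(3). A minimal sketch using the in-tree test point declared at the end of this file, assuming (as fail(9) describes) that KFAIL_POINT_RETURN(DEBUG_FP, test_fail_point) publishes it as debug.fail_point.test_fail_point:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <string.h>

int
main(void)
{
	const char *spec = "1.5%return(5)";

	/* Lands in fail_point_sysctl() and then fail_point_set(). */
	return (sysctlbyname("debug.fail_point.test_fail_point",
	    NULL, NULL, spec, strlen(spec)) != 0);
}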
*/ static char * parse_fail_point(struct fail_point_setting *ents, char *p) { /* :: * ( "->" )* */ uint8_t term_count; term_count = 1; p = parse_term(ents, p); if (p == NULL) return (NULL); while (*p != '\0') { term_count++; if (p[0] != '-' || p[1] != '>' || (p = parse_term(ents, p+2)) == NULL || term_count > FP_MAX_ENTRY_COUNT) return (NULL); } return (p); } /** * Internal helper function to parse an individual term from a failpoint. */ static char * parse_term(struct fail_point_setting *ents, char *p) { struct fail_point_entry *ent; ent = fail_point_entry_new(ents); /* * :: * ( ( "%") | ( "*" ) )* * * [ "(" ")" ] * [ "[pid " "]" ] */ /* ( ( "%") | ( "*" ) )* */ while (isdigit(*p) || *p == '.') { int units, decimal; p = parse_number(&units, &decimal, p); if (p == NULL) return (NULL); if (*p == '%') { if (units > 100) /* prevent overflow early */ units = 100; ent->fe_prob = units * (PROB_MAX / 100) + decimal; if (ent->fe_prob > PROB_MAX) ent->fe_prob = PROB_MAX; } else if (*p == '*') { if (!units || units < 0 || decimal) return (NULL); ent->fe_count = units; } else return (NULL); p++; } /* */ p = parse_type(ent, p); if (p == NULL) return (NULL); if (*p == '\0') return (p); /* [ "(" ")" ] */ if (*p != '(') return (p); p++; if (!isdigit(*p) && *p != '-') return (NULL); ent->fe_arg = strtol(p, &p, 0); if (*p++ != ')') return (NULL); /* [ "[pid " "]" ] */ #define PID_STRING "[pid " if (strncmp(p, PID_STRING, sizeof(PID_STRING) - 1) != 0) return (p); p += sizeof(PID_STRING) - 1; if (!isdigit(*p)) return (NULL); ent->fe_pid = strtol(p, &p, 0); if (*p++ != ']') return (NULL); return (p); } /** * Internal helper function to parse a numeric for a failpoint term. */ static char * parse_number(int *out_units, int *out_decimal, char *p) { char *old_p; /** * :: * [ "." ] | * "." */ /* whole part */ old_p = p; *out_units = strtol(p, &p, 10); if (p == old_p && *p != '.') return (NULL); /* fractional part */ *out_decimal = 0; if (*p == '.') { int digits = 0; p++; while (isdigit(*p)) { int digit = *p - '0'; if (digits < PROB_DIGITS - 2) *out_decimal = *out_decimal * 10 + digit; else if (digits == PROB_DIGITS - 2 && digit >= 5) (*out_decimal)++; digits++; p++; } if (!digits) /* need at least one digit after '.' */ return (NULL); while (digits++ < PROB_DIGITS - 2) /* add implicit zeros */ *out_decimal *= 10; } return (p); /* success */ } /** * Internal helper function to parse an individual type for a failpoint term. */ static char * parse_type(struct fail_point_entry *ent, char *beg) { enum fail_point_t type; int len; for (type = FAIL_POINT_OFF; type < FAIL_POINT_NUMTYPES; type++) { len = fail_type_strings[type].nmlen; if (strncmp(fail_type_strings[type].name, beg, len) == 0) { ent->fe_type = type; return (beg + len); } } return (NULL); } /* The fail point sysctl tree. 
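Putting the grammar together: the input "1.5%2*return(5)[pid 77]" parses into a single entry with fe_prob = 15000, fe_count = 2, fe_type = FAIL_POINT_RETURN, fe_arg = 5 and fe_pid = 77, while a chain such as "print(1)->sleep(100)" simply repeats parse_term() across the "->" separators, up to FP_MAX_ENTRY_COUNT terms.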
*/ SYSCTL_NODE(_debug, OID_AUTO, fail_point, CTLFLAG_RW, 0, "fail points"); /* Debugging/testing stuff for fail point */ static int sysctl_test_fail_point(SYSCTL_HANDLER_ARGS) { KFAIL_POINT_RETURN(DEBUG_FP, test_fail_point); return (0); } SYSCTL_OID(_debug_fail_point, OID_AUTO, test_trigger_fail_point, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_test_fail_point, "A", "Trigger test fail points"); Index: head/sys/kern/kern_ffclock.c =================================================================== --- head/sys/kern/kern_ffclock.c (revision 326270) +++ head/sys/kern/kern_ffclock.c (revision 326271) @@ -1,482 +1,484 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 The University of Melbourne * All rights reserved. * * This software was developed by Julien Ridoux at the University of Melbourne * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ffclock.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef FFCLOCK FEATURE(ffclock, "Feed-forward clock support"); extern struct ffclock_estimate ffclock_estimate; extern struct bintime ffclock_boottime; extern int8_t ffclock_updated; extern struct mtx ffclock_mtx; /* * Feed-forward clock absolute time. This should be the preferred way to read * the feed-forward clock for "wall-clock" type time. The flags allow to compose * various flavours of absolute time (e.g. with or without leap seconds taken * into account). If valid pointers are provided, the ffcounter value and an * upper bound on clock error associated with the bintime are provided. * NOTE: use ffclock_convert_abs() to differ the conversion of a ffcounter value * read earlier. */ void ffclock_abstime(ffcounter *ffcount, struct bintime *bt, struct bintime *error_bound, uint32_t flags) { struct ffclock_estimate cest; ffcounter ffc; ffcounter update_ffcount; ffcounter ffdelta_error; /* Get counter and corresponding time. */ if ((flags & FFCLOCK_FAST) == FFCLOCK_FAST) ffclock_last_tick(&ffc, bt, flags); else { ffclock_read_counter(&ffc); ffclock_convert_abs(ffc, bt, flags); } /* Current ffclock estimate, use update_ffcount as generation number. 
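A sketch of a kernel caller composing time flavours through the flags described above; the surrounding function is hypothetical, and the flag combination matches the ffclock_binuptime() wrapper later in this file:

static void
sample_uptime(void)
{
	struct bintime bt, error_bound;
	ffcounter ffc;

	/* Monotonic (boot-relative) time plus an error bound, from a
	 * fresh counter read; or-in FFCLOCK_FAST to accept the
	 * last-tick value instead of reading the counter. */
	ffclock_abstime(&ffc, &bt, &error_bound,
	    FFCLOCK_LERP | FFCLOCK_UPTIME);
}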
*/ do { update_ffcount = ffclock_estimate.update_ffcount; bcopy(&ffclock_estimate, &cest, sizeof(struct ffclock_estimate)); } while (update_ffcount != ffclock_estimate.update_ffcount); /* * Leap second adjustment. Total as seen by synchronisation algorithm * since it started. cest.leapsec_next is the ffcounter prediction of * when the next leapsecond occurs. */ if ((flags & FFCLOCK_LEAPSEC) == FFCLOCK_LEAPSEC) { bt->sec -= cest.leapsec_total; if (ffc > cest.leapsec_next) bt->sec -= cest.leapsec; } /* Boot time adjustment, for uptime/monotonic clocks. */ if ((flags & FFCLOCK_UPTIME) == FFCLOCK_UPTIME) { bintime_sub(bt, &ffclock_boottime); } /* Compute error bound if a valid pointer has been passed. */ if (error_bound) { ffdelta_error = ffc - cest.update_ffcount; ffclock_convert_diff(ffdelta_error, error_bound); /* 18446744073709 = int(2^64/1e12), err_bound_rate in [ps/s] */ bintime_mul(error_bound, cest.errb_rate * (uint64_t)18446744073709LL); /* 18446744073 = int(2^64 / 1e9), since err_abs in [ns] */ bintime_addx(error_bound, cest.errb_abs * (uint64_t)18446744073LL); } if (ffcount) *ffcount = ffc; } /* * Feed-forward difference clock. This should be the preferred way to convert a * time interval in ffcounter values into a time interval in seconds. If a valid * pointer is passed, an upper bound on the error in computing the time interval * in seconds is provided. */ void ffclock_difftime(ffcounter ffdelta, struct bintime *bt, struct bintime *error_bound) { ffcounter update_ffcount; uint32_t err_rate; ffclock_convert_diff(ffdelta, bt); if (error_bound) { do { update_ffcount = ffclock_estimate.update_ffcount; err_rate = ffclock_estimate.errb_rate; } while (update_ffcount != ffclock_estimate.update_ffcount); ffclock_convert_diff(ffdelta, error_bound); /* 18446744073709 = int(2^64/1e12), err_bound_rate in [ps/s] */ bintime_mul(error_bound, err_rate * (uint64_t)18446744073709LL); } } /* * Create a new kern.sysclock sysctl node, which will be home to some generic * sysclock configuration variables. Feed-forward clock specific variables will * live under the ffclock subnode. */ SYSCTL_NODE(_kern, OID_AUTO, sysclock, CTLFLAG_RW, 0, "System clock related configuration"); SYSCTL_NODE(_kern_sysclock, OID_AUTO, ffclock, CTLFLAG_RW, 0, "Feed-forward clock configuration"); static char *sysclocks[] = {"feedback", "feed-forward"}; #define MAX_SYSCLOCK_NAME_LEN 16 #define NUM_SYSCLOCKS nitems(sysclocks) static int ffclock_version = 2; SYSCTL_INT(_kern_sysclock_ffclock, OID_AUTO, version, CTLFLAG_RD, &ffclock_version, 0, "Feed-forward clock kernel version"); /* List available sysclocks. */ static int sysctl_kern_sysclock_available(SYSCTL_HANDLER_ARGS) { struct sbuf *s; int clk, error; s = sbuf_new_for_sysctl(NULL, NULL, MAX_SYSCLOCK_NAME_LEN * NUM_SYSCLOCKS, req); if (s == NULL) return (ENOMEM); for (clk = 0; clk < NUM_SYSCLOCKS; clk++) { sbuf_cat(s, sysclocks[clk]); if (clk + 1 < NUM_SYSCLOCKS) sbuf_cat(s, " "); } error = sbuf_finish(s); sbuf_delete(s); return (error); } SYSCTL_PROC(_kern_sysclock, OID_AUTO, available, CTLTYPE_STRING | CTLFLAG_RD, 0, 0, sysctl_kern_sysclock_available, "A", "List of available system clocks"); /* * Return the name of the active system clock if read, or attempt to change * the active system clock to the user specified one if written to. The active * system clock is read when calling any of the [get]{bin,nano,micro}[up]time() * functions. 
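Selecting the serving clock through the handler below is likewise a string sysctl from userland; a small sketch:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <string.h>

int
main(void)
{
	const char *clk = "feed-forward";

	/* After this succeeds, the [get]{bin,nano,micro}[up]time()
	 * calls are served by the feed-forward clock. */
	return (sysctlbyname("kern.sysclock.active", NULL, NULL,
	    clk, strlen(clk)) != 0);
}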
*/ static int sysctl_kern_sysclock_active(SYSCTL_HANDLER_ARGS) { char newclock[MAX_SYSCLOCK_NAME_LEN]; int error; int clk; /* Return the name of the current active sysclock. */ strlcpy(newclock, sysclocks[sysclock_active], sizeof(newclock)); error = sysctl_handle_string(oidp, newclock, sizeof(newclock), req); /* Check for error or no change */ if (error != 0 || req->newptr == NULL) goto done; /* Change the active sysclock to the user specified one: */ error = EINVAL; for (clk = 0; clk < NUM_SYSCLOCKS; clk++) { if (strncmp(newclock, sysclocks[clk], MAX_SYSCLOCK_NAME_LEN - 1)) { continue; } sysclock_active = clk; error = 0; break; } done: return (error); } SYSCTL_PROC(_kern_sysclock, OID_AUTO, active, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, sysctl_kern_sysclock_active, "A", "Name of the active system clock which is currently serving time"); static int sysctl_kern_ffclock_ffcounter_bypass = 0; SYSCTL_INT(_kern_sysclock_ffclock, OID_AUTO, ffcounter_bypass, CTLFLAG_RW, &sysctl_kern_ffclock_ffcounter_bypass, 0, "Use reliable hardware timecounter as the feed-forward counter"); /* * High level functions to access the Feed-Forward Clock. */ void ffclock_bintime(struct bintime *bt) { ffclock_abstime(NULL, bt, NULL, FFCLOCK_LERP | FFCLOCK_LEAPSEC); } void ffclock_nanotime(struct timespec *tsp) { struct bintime bt; ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_LEAPSEC); bintime2timespec(&bt, tsp); } void ffclock_microtime(struct timeval *tvp) { struct bintime bt; ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_LEAPSEC); bintime2timeval(&bt, tvp); } void ffclock_getbintime(struct bintime *bt) { ffclock_abstime(NULL, bt, NULL, FFCLOCK_LERP | FFCLOCK_LEAPSEC | FFCLOCK_FAST); } void ffclock_getnanotime(struct timespec *tsp) { struct bintime bt; ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_LEAPSEC | FFCLOCK_FAST); bintime2timespec(&bt, tsp); } void ffclock_getmicrotime(struct timeval *tvp) { struct bintime bt; ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_LEAPSEC | FFCLOCK_FAST); bintime2timeval(&bt, tvp); } void ffclock_binuptime(struct bintime *bt) { ffclock_abstime(NULL, bt, NULL, FFCLOCK_LERP | FFCLOCK_UPTIME); } void ffclock_nanouptime(struct timespec *tsp) { struct bintime bt; ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_UPTIME); bintime2timespec(&bt, tsp); } void ffclock_microuptime(struct timeval *tvp) { struct bintime bt; ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_UPTIME); bintime2timeval(&bt, tvp); } void ffclock_getbinuptime(struct bintime *bt) { ffclock_abstime(NULL, bt, NULL, FFCLOCK_LERP | FFCLOCK_UPTIME | FFCLOCK_FAST); } void ffclock_getnanouptime(struct timespec *tsp) { struct bintime bt; ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_UPTIME | FFCLOCK_FAST); bintime2timespec(&bt, tsp); } void ffclock_getmicrouptime(struct timeval *tvp) { struct bintime bt; ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_UPTIME | FFCLOCK_FAST); bintime2timeval(&bt, tvp); } void ffclock_bindifftime(ffcounter ffdelta, struct bintime *bt) { ffclock_difftime(ffdelta, bt, NULL); } void ffclock_nanodifftime(ffcounter ffdelta, struct timespec *tsp) { struct bintime bt; ffclock_difftime(ffdelta, &bt, NULL); bintime2timespec(&bt, tsp); } void ffclock_microdifftime(ffcounter ffdelta, struct timeval *tvp) { struct bintime bt; ffclock_difftime(ffdelta, &bt, NULL); bintime2timeval(&bt, tvp); } /* * System call allowing userland applications to retrieve the current value of * the Feed-Forward Clock counter. 
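The system call defined next is reachable from userland as a thin wrapper; a sketch assuming the usual userland prototype from sys/timeffc.h:

#include <sys/types.h>
#include <sys/timeffc.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	ffcounter ffc;

	/* The kernel side returns EAGAIN if the counter reads as zero. */
	if (ffclock_getcounter(&ffc) != 0)
		return (1);
	printf("ffcounter: %ju\n", (uintmax_t)ffc);
	return (0);
}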
*/ #ifndef _SYS_SYSPROTO_H_ struct ffclock_getcounter_args { ffcounter *ffcount; }; #endif /* ARGSUSED */ int sys_ffclock_getcounter(struct thread *td, struct ffclock_getcounter_args *uap) { ffcounter ffcount; int error; ffcount = 0; ffclock_read_counter(&ffcount); if (ffcount == 0) return (EAGAIN); error = copyout(&ffcount, uap->ffcount, sizeof(ffcounter)); return (error); } /* * System call allowing the synchronisation daemon to push new feed-forward clock * estimates to the kernel. Acquire ffclock_mtx to prevent concurrent updates * and ensure data consistency. * NOTE: ffclock_updated signals the fftimehands that new estimates are * available. The updated estimates are picked up by the fftimehands on the next * tick, which could take as long as 1/hz seconds (if ticks are not missed). */ #ifndef _SYS_SYSPROTO_H_ struct ffclock_setestimate_args { struct ffclock_estimate *cest; }; #endif /* ARGSUSED */ int sys_ffclock_setestimate(struct thread *td, struct ffclock_setestimate_args *uap) { struct ffclock_estimate cest; int error; /* Reuse of PRIV_CLOCK_SETTIME. */ if ((error = priv_check(td, PRIV_CLOCK_SETTIME)) != 0) return (error); if ((error = copyin(uap->cest, &cest, sizeof(struct ffclock_estimate))) != 0) return (error); mtx_lock(&ffclock_mtx); memcpy(&ffclock_estimate, &cest, sizeof(struct ffclock_estimate)); ffclock_updated++; mtx_unlock(&ffclock_mtx); return (error); } /* * System call allowing userland applications to retrieve the clock estimates * stored within the kernel. It is useful to kickstart the synchronisation * daemon with the kernel's knowledge of the hardware timecounter. */ #ifndef _SYS_SYSPROTO_H_ struct ffclock_getestimate_args { struct ffclock_estimate *cest; }; #endif /* ARGSUSED */ int sys_ffclock_getestimate(struct thread *td, struct ffclock_getestimate_args *uap) { struct ffclock_estimate cest; int error; mtx_lock(&ffclock_mtx); memcpy(&cest, &ffclock_estimate, sizeof(struct ffclock_estimate)); mtx_unlock(&ffclock_mtx); error = copyout(&cest, uap->cest, sizeof(struct ffclock_estimate)); return (error); } #else /* !FFCLOCK */ int sys_ffclock_getcounter(struct thread *td, struct ffclock_getcounter_args *uap) { return (ENOSYS); } int sys_ffclock_setestimate(struct thread *td, struct ffclock_setestimate_args *uap) { return (ENOSYS); } int sys_ffclock_getestimate(struct thread *td, struct ffclock_getestimate_args *uap) { return (ENOSYS); } #endif /* FFCLOCK */ Index: head/sys/kern/kern_hhook.c =================================================================== --- head/sys/kern/kern_hhook.c (revision 326270) +++ head/sys/kern/kern_hhook.c (revision 326271) @@ -1,522 +1,524 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2010,2013 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed by Lawrence Stewart while studying at the Centre * for Advanced Internet Architectures, Swinburne University of Technology, * made possible in part by grants from the FreeBSD Foundation and Cisco * University Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by Lawrence Stewart under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include struct hhook { hhook_func_t hhk_func; struct helper *hhk_helper; void *hhk_udata; STAILQ_ENTRY(hhook) hhk_next; }; static MALLOC_DEFINE(M_HHOOK, "hhook", "Helper hooks are linked off hhook_head lists"); LIST_HEAD(hhookheadhead, hhook_head); struct hhookheadhead hhook_head_list; VNET_DEFINE(struct hhookheadhead, hhook_vhead_list); #define V_hhook_vhead_list VNET(hhook_vhead_list) static struct mtx hhook_head_list_lock; MTX_SYSINIT(hhookheadlistlock, &hhook_head_list_lock, "hhook_head list lock", MTX_DEF); /* Protected by hhook_head_list_lock. */ static uint32_t n_hhookheads; /* Private function prototypes. */ static void hhook_head_destroy(struct hhook_head *hhh); void khelp_new_hhook_registered(struct hhook_head *hhh, uint32_t flags); #define HHHLIST_LOCK() mtx_lock(&hhook_head_list_lock) #define HHHLIST_UNLOCK() mtx_unlock(&hhook_head_list_lock) #define HHHLIST_LOCK_ASSERT() mtx_assert(&hhook_head_list_lock, MA_OWNED) #define HHH_LOCK_INIT(hhh) rm_init(&(hhh)->hhh_lock, "hhook_head rm lock") #define HHH_LOCK_DESTROY(hhh) rm_destroy(&(hhh)->hhh_lock) #define HHH_WLOCK(hhh) rm_wlock(&(hhh)->hhh_lock) #define HHH_WUNLOCK(hhh) rm_wunlock(&(hhh)->hhh_lock) #define HHH_RLOCK(hhh, rmpt) rm_rlock(&(hhh)->hhh_lock, (rmpt)) #define HHH_RUNLOCK(hhh, rmpt) rm_runlock(&(hhh)->hhh_lock, (rmpt)) /* * Run all helper hook functions for a given hook point. */ void hhook_run_hooks(struct hhook_head *hhh, void *ctx_data, struct osd *hosd) { struct hhook *hhk; void *hdata; struct rm_priotracker rmpt; KASSERT(hhh->hhh_refcount > 0, ("hhook_head %p refcount is 0", hhh)); HHH_RLOCK(hhh, &rmpt); STAILQ_FOREACH(hhk, &hhh->hhh_hooks, hhk_next) { if (hhk->hhk_helper != NULL && hhk->hhk_helper->h_flags & HELPER_NEEDS_OSD) { hdata = osd_get(OSD_KHELP, hosd, hhk->hhk_helper->h_id); if (hdata == NULL) continue; } else hdata = NULL; /* * XXXLAS: We currently ignore the int returned by the hook, * but will likely want to handle it in future to allow hhook to * be used like pfil and effect changes at the hhook calling * site e.g. we could define a new hook type of HHOOK_TYPE_PFIL * and standardise what particular return values mean and set * the context data to pass exactly the same information as pfil * hooks currently receive, thus replicating pfil with hhook. 
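hhook_run_hooks() above is the hot path, so it takes hhh_lock in read mode; an rmlock makes concurrent hook runs cheap while registration, which takes the write lock, stays rare. A userland analogue of that read-side traversal, with a pthread rwlock standing in for the rmlock (types and names illustrative):

#include <pthread.h>
#include <stddef.h>

struct hook {
	void	(*fn)(void *udata, void *ctx);
	void	*udata;
	struct hook *next;
};

static pthread_rwlock_t hook_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct hook *hook_list;

/* Run every hook under the read lock; add/remove must take the write lock. */
void
run_hooks(void *ctx)
{
	struct hook *h;

	pthread_rwlock_rdlock(&hook_lock);
	for (h = hook_list; h != NULL; h = h->next)
		h->fn(h->udata, ctx);
	pthread_rwlock_unlock(&hook_lock);
}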
*/ hhk->hhk_func(hhh->hhh_type, hhh->hhh_id, hhk->hhk_udata, ctx_data, hdata, hosd); } HHH_RUNLOCK(hhh, &rmpt); } /* * Register a new helper hook function with a helper hook point. */ int hhook_add_hook(struct hhook_head *hhh, struct hookinfo *hki, uint32_t flags) { struct hhook *hhk, *tmp; int error; error = 0; if (hhh == NULL) return (ENOENT); hhk = malloc(sizeof(struct hhook), M_HHOOK, M_ZERO | ((flags & HHOOK_WAITOK) ? M_WAITOK : M_NOWAIT)); if (hhk == NULL) return (ENOMEM); hhk->hhk_helper = hki->hook_helper; hhk->hhk_func = hki->hook_func; hhk->hhk_udata = hki->hook_udata; HHH_WLOCK(hhh); STAILQ_FOREACH(tmp, &hhh->hhh_hooks, hhk_next) { if (tmp->hhk_func == hki->hook_func && tmp->hhk_udata == hki->hook_udata) { /* The helper hook function is already registered. */ error = EEXIST; break; } } if (!error) { STAILQ_INSERT_TAIL(&hhh->hhh_hooks, hhk, hhk_next); hhh->hhh_nhooks++; } else free(hhk, M_HHOOK); HHH_WUNLOCK(hhh); return (error); } /* * Register a helper hook function with a helper hook point (including all * virtual instances of the hook point if it is virtualised). * * The logic is unfortunately far more complex than for * hhook_remove_hook_lookup() because hhook_add_hook() can call malloc() with * M_WAITOK and thus we cannot call hhook_add_hook() with the * hhook_head_list_lock held. * * The logic assembles an array of hhook_head structs that correspond to the * helper hook point being hooked and bumps the refcount on each (all done with * the hhook_head_list_lock held). The hhook_head_list_lock is then dropped, and * hhook_add_hook() is called and the refcount dropped for each hhook_head * struct in the array. */ int hhook_add_hook_lookup(struct hookinfo *hki, uint32_t flags) { struct hhook_head **heads_to_hook, *hhh; int error, i, n_heads_to_hook; tryagain: error = i = 0; /* * Accessing n_hhookheads without hhook_head_list_lock held opens up a * race with hhook_head_register() which we are unlikely to lose, but * nonetheless have to cope with - hence the complex goto logic. */ n_heads_to_hook = n_hhookheads; heads_to_hook = malloc(n_heads_to_hook * sizeof(struct hhook_head *), M_HHOOK, flags & HHOOK_WAITOK ? M_WAITOK : M_NOWAIT); if (heads_to_hook == NULL) return (ENOMEM); HHHLIST_LOCK(); LIST_FOREACH(hhh, &hhook_head_list, hhh_next) { if (hhh->hhh_type == hki->hook_type && hhh->hhh_id == hki->hook_id) { if (i < n_heads_to_hook) { heads_to_hook[i] = hhh; refcount_acquire(&heads_to_hook[i]->hhh_refcount); i++; } else { /* * We raced with hhook_head_register() which * inserted a hhook_head that we need to hook * but did not malloc space for. Abort this run * and try again. */ for (i--; i >= 0; i--) refcount_release(&heads_to_hook[i]->hhh_refcount); free(heads_to_hook, M_HHOOK); HHHLIST_UNLOCK(); goto tryagain; } } } HHHLIST_UNLOCK(); for (i--; i >= 0; i--) { if (!error) error = hhook_add_hook(heads_to_hook[i], hki, flags); refcount_release(&heads_to_hook[i]->hhh_refcount); } free(heads_to_hook, M_HHOOK); return (error); } /* * Remove a helper hook function from a helper hook point. 
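The tryagain dance above reduces to: sample the list size without the lock (the malloc() that follows may sleep, so it cannot be done locked), then re-check under the lock and restart from scratch if a registration slipped in. A compressed userland sketch of that snapshot/recheck/retry shape (simplified to a count comparison; names illustrative):

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static int nheads;			/* may grow while we are unlocked */

int
collect_heads(void ***outv, int *outn)
{
	void **v;
	int n;

again:
	n = nheads;			/* racy sample; checked again below */
	v = malloc(n * sizeof(*v));	/* may sleep; no lock held */
	if (v == NULL)
		return (-1);
	pthread_mutex_lock(&list_lock);
	if (nheads > n) {		/* lost the race with a registration */
		pthread_mutex_unlock(&list_lock);
		free(v);
		goto again;		/* cf. the tryagain label above */
	}
	/* ... fill v[] from the list while the lock is held ... */
	pthread_mutex_unlock(&list_lock);
	*outv = v;
	*outn = n;
	return (0);
}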
*/ int hhook_remove_hook(struct hhook_head *hhh, struct hookinfo *hki) { struct hhook *tmp; if (hhh == NULL) return (ENOENT); HHH_WLOCK(hhh); STAILQ_FOREACH(tmp, &hhh->hhh_hooks, hhk_next) { if (tmp->hhk_func == hki->hook_func && tmp->hhk_udata == hki->hook_udata) { STAILQ_REMOVE(&hhh->hhh_hooks, tmp, hhook, hhk_next); free(tmp, M_HHOOK); hhh->hhh_nhooks--; break; } } HHH_WUNLOCK(hhh); return (0); } /* * Remove a helper hook function from a helper hook point (including all * virtual instances of the hook point if it is virtualised). */ int hhook_remove_hook_lookup(struct hookinfo *hki) { struct hhook_head *hhh; HHHLIST_LOCK(); LIST_FOREACH(hhh, &hhook_head_list, hhh_next) { if (hhh->hhh_type == hki->hook_type && hhh->hhh_id == hki->hook_id) hhook_remove_hook(hhh, hki); } HHHLIST_UNLOCK(); return (0); } /* * Register a new helper hook point. */ int hhook_head_register(int32_t hhook_type, int32_t hhook_id, struct hhook_head **hhh, uint32_t flags) { struct hhook_head *tmphhh; tmphhh = hhook_head_get(hhook_type, hhook_id); if (tmphhh != NULL) { /* Hook point previously registered. */ hhook_head_release(tmphhh); return (EEXIST); } tmphhh = malloc(sizeof(struct hhook_head), M_HHOOK, M_ZERO | ((flags & HHOOK_WAITOK) ? M_WAITOK : M_NOWAIT)); if (tmphhh == NULL) return (ENOMEM); tmphhh->hhh_type = hhook_type; tmphhh->hhh_id = hhook_id; tmphhh->hhh_nhooks = 0; STAILQ_INIT(&tmphhh->hhh_hooks); HHH_LOCK_INIT(tmphhh); refcount_init(&tmphhh->hhh_refcount, 1); HHHLIST_LOCK(); if (flags & HHOOK_HEADISINVNET) { tmphhh->hhh_flags |= HHH_ISINVNET; #ifdef VIMAGE KASSERT(curvnet != NULL, ("curvnet is NULL")); tmphhh->hhh_vid = (uintptr_t)curvnet; LIST_INSERT_HEAD(&V_hhook_vhead_list, tmphhh, hhh_vnext); #endif } LIST_INSERT_HEAD(&hhook_head_list, tmphhh, hhh_next); n_hhookheads++; HHHLIST_UNLOCK(); khelp_new_hhook_registered(tmphhh, flags); if (hhh != NULL) *hhh = tmphhh; else refcount_release(&tmphhh->hhh_refcount); return (0); } static void hhook_head_destroy(struct hhook_head *hhh) { struct hhook *tmp, *tmp2; HHHLIST_LOCK_ASSERT(); KASSERT(n_hhookheads > 0, ("n_hhookheads should be > 0")); LIST_REMOVE(hhh, hhh_next); #ifdef VIMAGE if (hhook_head_is_virtualised(hhh) == HHOOK_HEADISINVNET) LIST_REMOVE(hhh, hhh_vnext); #endif HHH_WLOCK(hhh); STAILQ_FOREACH_SAFE(tmp, &hhh->hhh_hooks, hhk_next, tmp2) free(tmp, M_HHOOK); HHH_WUNLOCK(hhh); HHH_LOCK_DESTROY(hhh); free(hhh, M_HHOOK); n_hhookheads--; } /* * Remove a helper hook point. */ int hhook_head_deregister(struct hhook_head *hhh) { int error; error = 0; HHHLIST_LOCK(); if (hhh == NULL) error = ENOENT; else if (hhh->hhh_refcount > 1) error = EBUSY; else hhook_head_destroy(hhh); HHHLIST_UNLOCK(); return (error); } /* * Remove a helper hook point via a hhook_head lookup. */ int hhook_head_deregister_lookup(int32_t hhook_type, int32_t hhook_id) { struct hhook_head *hhh; int error; hhh = hhook_head_get(hhook_type, hhook_id); error = hhook_head_deregister(hhh); if (error == EBUSY) hhook_head_release(hhh); return (error); } /* * Lookup and return the hhook_head struct associated with the specified type * and id, or NULL if not found. If found, the hhook_head's refcount is bumped. 
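The lookup that follows pins its result before the list lock is dropped: the reference is taken while the entry is still guaranteed alive, which is the standard way to hand a pointer out of a locked traversal. A userland sketch of the idiom (illustrative names):

#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

struct head {
	int	type, id;
	atomic_int refs;
	struct head *next;
};

static pthread_mutex_t head_list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct head *head_list;

struct head *
head_get(int type, int id)
{
	struct head *h;

	pthread_mutex_lock(&head_list_lock);
	for (h = head_list; h != NULL; h = h->next) {
		if (h->type == type && h->id == id) {
			atomic_fetch_add(&h->refs, 1);	/* pin before unlock */
			break;
		}
	}
	pthread_mutex_unlock(&head_list_lock);
	return (h);		/* NULL if not found */
}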
*/ struct hhook_head * hhook_head_get(int32_t hhook_type, int32_t hhook_id) { struct hhook_head *hhh; HHHLIST_LOCK(); LIST_FOREACH(hhh, &hhook_head_list, hhh_next) { if (hhh->hhh_type == hhook_type && hhh->hhh_id == hhook_id) { #ifdef VIMAGE if (hhook_head_is_virtualised(hhh) == HHOOK_HEADISINVNET) { KASSERT(curvnet != NULL, ("curvnet is NULL")); if (hhh->hhh_vid != (uintptr_t)curvnet) continue; } #endif refcount_acquire(&hhh->hhh_refcount); break; } } HHHLIST_UNLOCK(); return (hhh); } void hhook_head_release(struct hhook_head *hhh) { refcount_release(&hhh->hhh_refcount); } /* * Check the hhook_head private flags and return the appropriate public * representation of the flag to the caller. The function is implemented in a * way that allows us to cope with other subsystems becoming virtualised in the * future. */ uint32_t hhook_head_is_virtualised(struct hhook_head *hhh) { uint32_t ret; ret = 0; if (hhh != NULL) { if (hhh->hhh_flags & HHH_ISINVNET) ret = HHOOK_HEADISINVNET; } return (ret); } uint32_t hhook_head_is_virtualised_lookup(int32_t hook_type, int32_t hook_id) { struct hhook_head *hhh; uint32_t ret; hhh = hhook_head_get(hook_type, hook_id); if (hhh == NULL) return (0); ret = hhook_head_is_virtualised(hhh); hhook_head_release(hhh); return (ret); } /* * Vnet created and being initialised. */ static void hhook_vnet_init(const void *unused __unused) { LIST_INIT(&V_hhook_vhead_list); } /* * Vnet being torn down and destroyed. */ static void hhook_vnet_uninit(const void *unused __unused) { struct hhook_head *hhh, *tmphhh; /* * If subsystems which export helper hook points use the hhook KPI * correctly, the loop below should have no work to do because the * subsystem should have already called hhook_head_deregister(). */ HHHLIST_LOCK(); LIST_FOREACH_SAFE(hhh, &V_hhook_vhead_list, hhh_vnext, tmphhh) { printf("%s: hhook_head type=%d, id=%d cleanup required\n", __func__, hhh->hhh_type, hhh->hhh_id); hhook_head_destroy(hhh); } HHHLIST_UNLOCK(); } /* * When a vnet is created and being initialised, init the V_hhook_vhead_list. */ VNET_SYSINIT(hhook_vnet_init, SI_SUB_INIT_IF, SI_ORDER_FIRST, hhook_vnet_init, NULL); /* * The hhook KPI provides a mechanism for subsystems which export helper hook * points to clean up on vnet tear down, but in case the KPI is misused, * provide a function to clean up and free memory for a vnet being destroyed. */ VNET_SYSUNINIT(hhook_vnet_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST, hhook_vnet_uninit, NULL); Index: head/sys/kern/kern_idle.c =================================================================== --- head/sys/kern/kern_idle.c (revision 326270) +++ head/sys/kern/kern_idle.c (revision 326271) @@ -1,86 +1,88 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (C) 2000-2004 The FreeBSD Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif static void idle_setup(void *dummy); SYSINIT(idle_setup, SI_SUB_SCHED_IDLE, SI_ORDER_FIRST, idle_setup, NULL); /* * Set up per-cpu idle process contexts. The APs shouldn't be running or * accessing their idle processes at this point, so don't bother with * locking. */ static void idle_setup(void *dummy) { #ifdef SMP struct pcpu *pc; #endif struct proc *p; struct thread *td; int error; p = NULL; /* start with no idle process */ #ifdef SMP STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { #endif #ifdef SMP error = kproc_kthread_add(sched_idletd, NULL, &p, &td, RFSTOPPED | RFHIGHPID, 0, "idle", "idle: cpu%d", pc->pc_cpuid); pc->pc_idlethread = td; #else error = kproc_kthread_add(sched_idletd, NULL, &p, &td, RFSTOPPED | RFHIGHPID, 0, "idle", "idle"); PCPU_SET(idlethread, td); #endif if (error) panic("idle_setup: kproc_create error %d\n", error); thread_lock(td); TD_SET_CAN_RUN(td); td->td_flags |= TDF_IDLETD | TDF_NOLOAD; sched_class(td, PRI_IDLE); sched_prio(td, PRI_MAX_IDLE); thread_unlock(td); #ifdef SMP } #endif } Index: head/sys/kern/kern_intr.c =================================================================== --- head/sys/kern/kern_intr.c (revision 326270) +++ head/sys/kern/kern_intr.c (revision 326271) @@ -1,2008 +1,2010 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1997, Stefan Esser * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kstack_usage_prof.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #endif /* * Describe an interrupt thread. There is one of these per interrupt event. */ struct intr_thread { struct intr_event *it_event; struct thread *it_thread; /* Kernel thread. */ int it_flags; /* (j) IT_* flags. */ int it_need; /* Needs service. */ }; /* Interrupt thread flags kept in it_flags */ #define IT_DEAD 0x000001 /* Thread is waiting to exit. */ #define IT_WAIT 0x000002 /* Thread is waiting for completion. */ struct intr_entropy { struct thread *td; uintptr_t event; }; struct intr_event *clk_intr_event; struct intr_event *tty_intr_event; void *vm_ih; struct proc *intrproc; static MALLOC_DEFINE(M_ITHREAD, "ithread", "Interrupt Threads"); static int intr_storm_threshold = 1000; SYSCTL_INT(_hw, OID_AUTO, intr_storm_threshold, CTLFLAG_RWTUN, &intr_storm_threshold, 0, "Number of consecutive interrupts before storm protection is enabled"); static TAILQ_HEAD(, intr_event) event_list = TAILQ_HEAD_INITIALIZER(event_list); static struct mtx event_lock; MTX_SYSINIT(intr_event_list, &event_lock, "intr event list", MTX_DEF); static void intr_event_update(struct intr_event *ie); #ifdef INTR_FILTER static int intr_event_schedule_thread(struct intr_event *ie, struct intr_thread *ithd); static int intr_filter_loop(struct intr_event *ie, struct trapframe *frame, struct intr_thread **ithd); static struct intr_thread *ithread_create(const char *name, struct intr_handler *ih); #else static int intr_event_schedule_thread(struct intr_event *ie); static struct intr_thread *ithread_create(const char *name); #endif static void ithread_destroy(struct intr_thread *ithread); static void ithread_execute_handlers(struct proc *p, struct intr_event *ie); #ifdef INTR_FILTER static void priv_ithread_execute_handler(struct proc *p, struct intr_handler *ih); #endif static void ithread_loop(void *); static void ithread_update(struct intr_thread *ithd); static void start_softintr(void *); /* Map an interrupt type to an ithread priority. */ u_char intr_priority(enum intr_type flags) { u_char pri; flags &= (INTR_TYPE_TTY | INTR_TYPE_BIO | INTR_TYPE_NET | INTR_TYPE_CAM | INTR_TYPE_MISC | INTR_TYPE_CLK | INTR_TYPE_AV); switch (flags) { case INTR_TYPE_TTY: pri = PI_TTY; break; case INTR_TYPE_BIO: pri = PI_DISK; break; case INTR_TYPE_NET: pri = PI_NET; break; case INTR_TYPE_CAM: pri = PI_DISK; break; case INTR_TYPE_AV: pri = PI_AV; break; case INTR_TYPE_CLK: pri = PI_REALTIME; break; case INTR_TYPE_MISC: pri = PI_DULL; /* don't care */ break; default: /* We didn't specify an interrupt level. */ panic("intr_priority: no interrupt type in flags"); } return pri; } /* * Update an ithread based on the associated intr_event. */ static void ithread_update(struct intr_thread *ithd) { struct intr_event *ie; struct thread *td; u_char pri; ie = ithd->it_event; td = ithd->it_thread; /* Determine the overall priority of this event. */ if (TAILQ_EMPTY(&ie->ie_handlers)) pri = PRI_MAX_ITHD; else pri = TAILQ_FIRST(&ie->ie_handlers)->ih_pri; /* Update name and priority. 
*/ strlcpy(td->td_name, ie->ie_fullname, sizeof(td->td_name)); #ifdef KTR sched_clear_tdname(td); #endif thread_lock(td); sched_prio(td, pri); thread_unlock(td); } /* * Regenerate the full name of an interrupt event and update its priority. */ static void intr_event_update(struct intr_event *ie) { struct intr_handler *ih; char *last; int missed, space; /* Start off with no entropy and just the name of the event. */ mtx_assert(&ie->ie_lock, MA_OWNED); strlcpy(ie->ie_fullname, ie->ie_name, sizeof(ie->ie_fullname)); ie->ie_flags &= ~IE_ENTROPY; missed = 0; space = 1; /* Run through all the handlers updating values. */ TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) { if (strlen(ie->ie_fullname) + strlen(ih->ih_name) + 1 < sizeof(ie->ie_fullname)) { strcat(ie->ie_fullname, " "); strcat(ie->ie_fullname, ih->ih_name); space = 0; } else missed++; if (ih->ih_flags & IH_ENTROPY) ie->ie_flags |= IE_ENTROPY; } /* * If the handler names were too long, add +'s to indicate missing * names. If we run out of room and still have +'s to add, change * the last character from a + to a *. */ last = &ie->ie_fullname[sizeof(ie->ie_fullname) - 2]; while (missed-- > 0) { if (strlen(ie->ie_fullname) + 1 == sizeof(ie->ie_fullname)) { if (*last == '+') { *last = '*'; break; } else *last = '+'; } else if (space) { strcat(ie->ie_fullname, " +"); space = 0; } else strcat(ie->ie_fullname, "+"); } /* * If this event has an ithread, update its priority and * name. */ if (ie->ie_thread != NULL) ithread_update(ie->ie_thread); CTR2(KTR_INTR, "%s: updated %s", __func__, ie->ie_fullname); } int intr_event_create(struct intr_event **event, void *source, int flags, int irq, void (*pre_ithread)(void *), void (*post_ithread)(void *), void (*post_filter)(void *), int (*assign_cpu)(void *, int), const char *fmt, ...) { struct intr_event *ie; va_list ap; /* The only valid flag during creation is IE_SOFT. */ if ((flags & ~IE_SOFT) != 0) return (EINVAL); ie = malloc(sizeof(struct intr_event), M_ITHREAD, M_WAITOK | M_ZERO); ie->ie_source = source; ie->ie_pre_ithread = pre_ithread; ie->ie_post_ithread = post_ithread; ie->ie_post_filter = post_filter; ie->ie_assign_cpu = assign_cpu; ie->ie_flags = flags; ie->ie_irq = irq; ie->ie_cpu = NOCPU; TAILQ_INIT(&ie->ie_handlers); mtx_init(&ie->ie_lock, "intr event", NULL, MTX_DEF); va_start(ap, fmt); vsnprintf(ie->ie_name, sizeof(ie->ie_name), fmt, ap); va_end(ap); strlcpy(ie->ie_fullname, ie->ie_name, sizeof(ie->ie_fullname)); mtx_lock(&event_lock); TAILQ_INSERT_TAIL(&event_list, ie, ie_list); mtx_unlock(&event_lock); if (event != NULL) *event = ie; CTR2(KTR_INTR, "%s: created %s", __func__, ie->ie_name); return (0); } /* * Bind an interrupt event to the specified CPU. Note that not all * platforms support binding an interrupt to a CPU. For those * platforms this request will fail. Using a cpu id of NOCPU unbinds * the interrupt event. */ static int _intr_event_bind(struct intr_event *ie, int cpu, bool bindirq, bool bindithread) { lwpid_t id; int error; /* Need a CPU to bind to. */ if (cpu != NOCPU && CPU_ABSENT(cpu)) return (EINVAL); if (ie->ie_assign_cpu == NULL) return (EOPNOTSUPP); error = priv_check(curthread, PRIV_SCHED_CPUSET_INTR); if (error) return (error); /* * If we have any ithreads try to set their mask first to verify * permissions, etc.
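The '+'/'*' convention in intr_event_update() above is compact enough to restate: handler names are appended while they fit, each name that does not fit contributes one '+', and when even the markers reach the end of the buffer the last one degrades to '*'. A self-contained sketch of the same scheme (buffer size is a caller-supplied assumption, len >= 2; bookkeeping simplified):

#include <string.h>

void
build_fullname(char *buf, size_t len, const char **names, int n)
{
	int i, missed = 0;

	buf[0] = '\0';
	for (i = 0; i < n; i++) {
		/* +1 for the separating space; '<' leaves room for the NUL. */
		if (strlen(buf) + strlen(names[i]) + 1 < len) {
			strcat(buf, " ");
			strcat(buf, names[i]);
		} else
			missed++;
	}
	while (missed-- > 0) {
		if (strlen(buf) + 1 >= len) {	/* full: degrade '+' to '*' */
			buf[len - 2] = '*';
			break;
		}
		strcat(buf, "+");
	}
}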
*/ if (bindithread) { mtx_lock(&ie->ie_lock); if (ie->ie_thread != NULL) { id = ie->ie_thread->it_thread->td_tid; mtx_unlock(&ie->ie_lock); error = cpuset_setithread(id, cpu); if (error) return (error); } else mtx_unlock(&ie->ie_lock); } if (bindirq) error = ie->ie_assign_cpu(ie->ie_source, cpu); if (error) { if (bindithread) { mtx_lock(&ie->ie_lock); if (ie->ie_thread != NULL) { cpu = ie->ie_cpu; id = ie->ie_thread->it_thread->td_tid; mtx_unlock(&ie->ie_lock); (void)cpuset_setithread(id, cpu); } else mtx_unlock(&ie->ie_lock); } return (error); } if (bindirq) { mtx_lock(&ie->ie_lock); ie->ie_cpu = cpu; mtx_unlock(&ie->ie_lock); } return (error); } /* * Bind an interrupt event to the specified CPU. For supported platforms, any * associated ithreads as well as the primary interrupt context will be bound * to the specified CPU. */ int intr_event_bind(struct intr_event *ie, int cpu) { return (_intr_event_bind(ie, cpu, true, true)); } /* * Bind an interrupt event to the specified CPU, but do not bind associated * ithreads. */ int intr_event_bind_irqonly(struct intr_event *ie, int cpu) { return (_intr_event_bind(ie, cpu, true, false)); } /* * Bind an interrupt event's ithread to the specified CPU. */ int intr_event_bind_ithread(struct intr_event *ie, int cpu) { return (_intr_event_bind(ie, cpu, false, true)); } static struct intr_event * intr_lookup(int irq) { struct intr_event *ie; mtx_lock(&event_lock); TAILQ_FOREACH(ie, &event_list, ie_list) if (ie->ie_irq == irq && (ie->ie_flags & IE_SOFT) == 0 && TAILQ_FIRST(&ie->ie_handlers) != NULL) break; mtx_unlock(&event_lock); return (ie); } int intr_setaffinity(int irq, int mode, void *m) { struct intr_event *ie; cpuset_t *mask; int cpu, n; mask = m; cpu = NOCPU; /* * If we're setting all cpus we can unbind. Otherwise make sure * only one cpu is in the set.
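_intr_event_bind() above applies the change in two stages, ithread cpuset first (so permission problems surface before any hardware is touched), then the MD interrupt routing, and undoes the first stage if the second fails. That rollback-on-partial-failure skeleton, in isolation (names illustrative):

/* Apply two dependent settings; undo the first if the second fails. */
int
bind_two_stage(int newcpu, int *thread_cpu, int (*hw_bind)(int))
{
	int oldcpu = *thread_cpu;

	*thread_cpu = newcpu;		/* stage 1: thread affinity */
	if (hw_bind(newcpu) != 0) {	/* stage 2: interrupt routing */
		*thread_cpu = oldcpu;	/* roll back stage 1 */
		return (-1);
	}
	return (0);
}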
if (CPU_CMP(cpuset_root, mask)) { for (n = 0; n < CPU_SETSIZE; n++) { if (!CPU_ISSET(n, mask)) continue; if (cpu != NOCPU) return (EINVAL); cpu = n; } } ie = intr_lookup(irq); if (ie == NULL) return (ESRCH); switch (mode) { case CPU_WHICH_IRQ: return (intr_event_bind(ie, cpu)); case CPU_WHICH_INTRHANDLER: return (intr_event_bind_irqonly(ie, cpu)); case CPU_WHICH_ITHREAD: return (intr_event_bind_ithread(ie, cpu)); default: return (EINVAL); } } int intr_getaffinity(int irq, int mode, void *m) { struct intr_event *ie; struct thread *td; struct proc *p; cpuset_t *mask; lwpid_t id; int error; mask = m; ie = intr_lookup(irq); if (ie == NULL) return (ESRCH); error = 0; CPU_ZERO(mask); switch (mode) { case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: mtx_lock(&ie->ie_lock); if (ie->ie_cpu == NOCPU) CPU_COPY(cpuset_root, mask); else CPU_SET(ie->ie_cpu, mask); mtx_unlock(&ie->ie_lock); break; case CPU_WHICH_ITHREAD: mtx_lock(&ie->ie_lock); if (ie->ie_thread == NULL) { mtx_unlock(&ie->ie_lock); CPU_COPY(cpuset_root, mask); } else { id = ie->ie_thread->it_thread->td_tid; mtx_unlock(&ie->ie_lock); error = cpuset_which(CPU_WHICH_TID, id, &p, &td, NULL); if (error != 0) return (error); CPU_COPY(&td->td_cpuset->cs_mask, mask); PROC_UNLOCK(p); } break; default: return (EINVAL); } return (0); } int intr_event_destroy(struct intr_event *ie) { mtx_lock(&event_lock); mtx_lock(&ie->ie_lock); if (!TAILQ_EMPTY(&ie->ie_handlers)) { mtx_unlock(&ie->ie_lock); mtx_unlock(&event_lock); return (EBUSY); } TAILQ_REMOVE(&event_list, ie, ie_list); #ifndef notyet if (ie->ie_thread != NULL) { ithread_destroy(ie->ie_thread); ie->ie_thread = NULL; } #endif mtx_unlock(&ie->ie_lock); mtx_unlock(&event_lock); mtx_destroy(&ie->ie_lock); free(ie, M_ITHREAD); return (0); } #ifndef INTR_FILTER static struct intr_thread * ithread_create(const char *name) { struct intr_thread *ithd; struct thread *td; int error; ithd = malloc(sizeof(struct intr_thread), M_ITHREAD, M_WAITOK | M_ZERO); error = kproc_kthread_add(ithread_loop, ithd, &intrproc, &td, RFSTOPPED | RFHIGHPID, 0, "intr", "%s", name); if (error) panic("kproc_create() failed with %d", error); thread_lock(td); sched_class(td, PRI_ITHD); TD_SET_IWAIT(td); thread_unlock(td); td->td_pflags |= TDP_ITHREAD; ithd->it_thread = td; CTR2(KTR_INTR, "%s: created %s", __func__, name); return (ithd); } #else static struct intr_thread * ithread_create(const char *name, struct intr_handler *ih) { struct intr_thread *ithd; struct thread *td; int error; ithd = malloc(sizeof(struct intr_thread), M_ITHREAD, M_WAITOK | M_ZERO); error = kproc_kthread_add(ithread_loop, ih, &intrproc, &td, RFSTOPPED | RFHIGHPID, 0, "intr", "%s", name); if (error) panic("kproc_create() failed with %d", error); thread_lock(td); sched_class(td, PRI_ITHD); TD_SET_IWAIT(td); thread_unlock(td); td->td_pflags |= TDP_ITHREAD; ithd->it_thread = td; CTR2(KTR_INTR, "%s: created %s", __func__, name); return (ithd); } #endif static void ithread_destroy(struct intr_thread *ithread) { struct thread *td; CTR2(KTR_INTR, "%s: killing %s", __func__, ithread->it_event->ie_name); td = ithread->it_thread; thread_lock(td); ithread->it_flags |= IT_DEAD; if (TD_AWAITING_INTR(td)) { TD_CLR_IWAIT(td); sched_add(td, SRQ_INTR); } thread_unlock(td); } #ifndef INTR_FILTER int intr_event_add_handler(struct intr_event *ie, const char *name, driver_filter_t filter, driver_intr_t handler, void *arg, u_char pri, enum intr_type flags, void **cookiep) { struct intr_handler *ih, *temp_ih; struct intr_thread *it; if (ie == NULL || name == NULL || (handler == NULL &&
filter == NULL)) return (EINVAL); /* Allocate and populate an interrupt handler structure. */ ih = malloc(sizeof(struct intr_handler), M_ITHREAD, M_WAITOK | M_ZERO); ih->ih_filter = filter; ih->ih_handler = handler; ih->ih_argument = arg; strlcpy(ih->ih_name, name, sizeof(ih->ih_name)); ih->ih_event = ie; ih->ih_pri = pri; if (flags & INTR_EXCL) ih->ih_flags = IH_EXCLUSIVE; if (flags & INTR_MPSAFE) ih->ih_flags |= IH_MPSAFE; if (flags & INTR_ENTROPY) ih->ih_flags |= IH_ENTROPY; /* We can only have one exclusive handler in an event. */ mtx_lock(&ie->ie_lock); if (!TAILQ_EMPTY(&ie->ie_handlers)) { if ((flags & INTR_EXCL) || (TAILQ_FIRST(&ie->ie_handlers)->ih_flags & IH_EXCLUSIVE)) { mtx_unlock(&ie->ie_lock); free(ih, M_ITHREAD); return (EINVAL); } } /* Create a thread if we need one. */ while (ie->ie_thread == NULL && handler != NULL) { if (ie->ie_flags & IE_ADDING_THREAD) msleep(ie, &ie->ie_lock, 0, "ithread", 0); else { ie->ie_flags |= IE_ADDING_THREAD; mtx_unlock(&ie->ie_lock); it = ithread_create("intr: newborn"); mtx_lock(&ie->ie_lock); ie->ie_flags &= ~IE_ADDING_THREAD; ie->ie_thread = it; it->it_event = ie; ithread_update(it); wakeup(ie); } } /* Add the new handler to the event in priority order. */ TAILQ_FOREACH(temp_ih, &ie->ie_handlers, ih_next) { if (temp_ih->ih_pri > ih->ih_pri) break; } if (temp_ih == NULL) TAILQ_INSERT_TAIL(&ie->ie_handlers, ih, ih_next); else TAILQ_INSERT_BEFORE(temp_ih, ih, ih_next); intr_event_update(ie); CTR3(KTR_INTR, "%s: added %s to %s", __func__, ih->ih_name, ie->ie_name); mtx_unlock(&ie->ie_lock); if (cookiep != NULL) *cookiep = ih; return (0); } #else int intr_event_add_handler(struct intr_event *ie, const char *name, driver_filter_t filter, driver_intr_t handler, void *arg, u_char pri, enum intr_type flags, void **cookiep) { struct intr_handler *ih, *temp_ih; struct intr_thread *it; if (ie == NULL || name == NULL || (handler == NULL && filter == NULL)) return (EINVAL); /* Allocate and populate an interrupt handler structure. */ ih = malloc(sizeof(struct intr_handler), M_ITHREAD, M_WAITOK | M_ZERO); ih->ih_filter = filter; ih->ih_handler = handler; ih->ih_argument = arg; strlcpy(ih->ih_name, name, sizeof(ih->ih_name)); ih->ih_event = ie; ih->ih_pri = pri; if (flags & INTR_EXCL) ih->ih_flags = IH_EXCLUSIVE; if (flags & INTR_MPSAFE) ih->ih_flags |= IH_MPSAFE; if (flags & INTR_ENTROPY) ih->ih_flags |= IH_ENTROPY; /* We can only have one exclusive handler in an event. */ mtx_lock(&ie->ie_lock); if (!TAILQ_EMPTY(&ie->ie_handlers)) { if ((flags & INTR_EXCL) || (TAILQ_FIRST(&ie->ie_handlers)->ih_flags & IH_EXCLUSIVE)) { mtx_unlock(&ie->ie_lock); free(ih, M_ITHREAD); return (EINVAL); } } /* For filtered handlers, create a private ithread to run on. */ if (filter != NULL && handler != NULL) { mtx_unlock(&ie->ie_lock); it = ithread_create("intr: newborn", ih); mtx_lock(&ie->ie_lock); it->it_event = ie; ih->ih_thread = it; ithread_update(it); /* XXX - do we really need this?!?!? */ } else { /* Create the global per-event thread if we need one. */ while (ie->ie_thread == NULL && handler != NULL) { if (ie->ie_flags & IE_ADDING_THREAD) msleep(ie, &ie->ie_lock, 0, "ithread", 0); else { ie->ie_flags |= IE_ADDING_THREAD; mtx_unlock(&ie->ie_lock); it = ithread_create("intr: newborn", ih); mtx_lock(&ie->ie_lock); ie->ie_flags &= ~IE_ADDING_THREAD; ie->ie_thread = it; it->it_event = ie; ithread_update(it); wakeup(ie); } } } /* Add the new handler to the event in priority order.
*/ TAILQ_FOREACH(temp_ih, &ie->ie_handlers, ih_next) { if (temp_ih->ih_pri > ih->ih_pri) break; } if (temp_ih == NULL) TAILQ_INSERT_TAIL(&ie->ie_handlers, ih, ih_next); else TAILQ_INSERT_BEFORE(temp_ih, ih, ih_next); intr_event_update(ie); CTR3(KTR_INTR, "%s: added %s to %s", __func__, ih->ih_name, ie->ie_name); mtx_unlock(&ie->ie_lock); if (cookiep != NULL) *cookiep = ih; return (0); } #endif /* * Append a description preceded by a ':' to the name of the specified * interrupt handler. */ int intr_event_describe_handler(struct intr_event *ie, void *cookie, const char *descr) { struct intr_handler *ih; size_t space; char *start; mtx_lock(&ie->ie_lock); #ifdef INVARIANTS TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) { if (ih == cookie) break; } if (ih == NULL) { mtx_unlock(&ie->ie_lock); panic("handler %p not found in interrupt event %p", cookie, ie); } #endif ih = cookie; /* * Look for an existing description by checking for an * existing ":". This assumes device names do not include * colons. If one is found, prepare to insert the new * description at that point. If one is not found, find the * end of the name to use as the insertion point. */ start = strchr(ih->ih_name, ':'); if (start == NULL) start = strchr(ih->ih_name, 0); /* * See if there is enough remaining room in the string for the * description + ":". The "- 1" leaves room for the trailing * '\0'. The "+ 1" accounts for the colon. */ space = sizeof(ih->ih_name) - (start - ih->ih_name) - 1; if (strlen(descr) + 1 > space) { mtx_unlock(&ie->ie_lock); return (ENOSPC); } /* Append a colon followed by the description. */ *start = ':'; strcpy(start + 1, descr); intr_event_update(ie); mtx_unlock(&ie->ie_lock); return (0); } /* * Return the ie_source field from the intr_event an intr_handler is * associated with. */ void * intr_handler_source(void *cookie) { struct intr_handler *ih; struct intr_event *ie; ih = (struct intr_handler *)cookie; if (ih == NULL) return (NULL); ie = ih->ih_event; KASSERT(ie != NULL, ("interrupt handler \"%s\" has a NULL interrupt event", ih->ih_name)); return (ie->ie_source); } /* * Sleep until an ithread finishes executing an interrupt handler. * * XXX Doesn't currently handle interrupt filters or fast interrupt * handlers. This is intended for compatibility with linux drivers * only. Do not use in BSD code. */ void _intr_drain(int irq) { struct intr_event *ie; struct intr_thread *ithd; struct thread *td; ie = intr_lookup(irq); if (ie == NULL) return; if (ie->ie_thread == NULL) return; ithd = ie->ie_thread; td = ithd->it_thread; /* * We set the flag and wait for it to be cleared to avoid * long delays with potentially busy interrupt handlers * were we to only sample TD_AWAITING_INTR() every tick. 
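The insertion loop at the top of this block keeps ie_handlers sorted by ih_pri, walking to the first entry with a strictly larger value so that handlers of equal priority stay in registration order. The same walk on a plain singly linked list (illustrative types, not the TAILQ macros):

#include <stddef.h>

struct handler {
	int	pri;			/* lower value runs earlier */
	struct handler *next;
};

/* Insert nh after existing entries of equal priority, before larger ones. */
void
insert_by_pri(struct handler **head, struct handler *nh)
{
	struct handler **pp;

	for (pp = head; *pp != NULL && (*pp)->pri <= nh->pri; )
		pp = &(*pp)->next;
	nh->next = *pp;
	*pp = nh;
}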
*/ thread_lock(td); if (!TD_AWAITING_INTR(td)) { ithd->it_flags |= IT_WAIT; while (ithd->it_flags & IT_WAIT) { thread_unlock(td); pause("idrain", 1); thread_lock(td); } } thread_unlock(td); return; } #ifndef INTR_FILTER int intr_event_remove_handler(void *cookie) { struct intr_handler *handler = (struct intr_handler *)cookie; struct intr_event *ie; #ifdef INVARIANTS struct intr_handler *ih; #endif #ifdef notyet int dead; #endif if (handler == NULL) return (EINVAL); ie = handler->ih_event; KASSERT(ie != NULL, ("interrupt handler \"%s\" has a NULL interrupt event", handler->ih_name)); mtx_lock(&ie->ie_lock); CTR3(KTR_INTR, "%s: removing %s from %s", __func__, handler->ih_name, ie->ie_name); #ifdef INVARIANTS TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) if (ih == handler) goto ok; mtx_unlock(&ie->ie_lock); panic("interrupt handler \"%s\" not found in interrupt event \"%s\"", ih->ih_name, ie->ie_name); ok: #endif /* * If there is no ithread, then just remove the handler and return. * XXX: Note that an INTR_FAST handler might be running on another * CPU! */ if (ie->ie_thread == NULL) { TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next); mtx_unlock(&ie->ie_lock); free(handler, M_ITHREAD); return (0); } /* * If the interrupt thread is already running, then just mark this * handler as being dead and let the ithread do the actual removal. * * During a cold boot while cold is set, msleep() does not sleep, * so we have to remove the handler here rather than letting the * thread do it. */ thread_lock(ie->ie_thread->it_thread); if (!TD_AWAITING_INTR(ie->ie_thread->it_thread) && !cold) { handler->ih_flags |= IH_DEAD; /* * Ensure that the thread will process the handler list * again and remove this handler if it has already passed * it on the list. * * The release part of the following store ensures * that the update of ih_flags is ordered before the * it_need setting. See the comment before * atomic_cmpset_acq(&ithd->it_need, ...) operation in * the ithread_execute_handlers(). */ atomic_store_rel_int(&ie->ie_thread->it_need, 1); } else TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next); thread_unlock(ie->ie_thread->it_thread); while (handler->ih_flags & IH_DEAD) msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0); intr_event_update(ie); #ifdef notyet /* * XXX: This could be bad in the case of ppbus(8). Also, I think * this could lead to races of stale data when servicing an * interrupt. */ dead = 1; TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) { if (!(ih->ih_flags & IH_FAST)) { dead = 0; break; } } if (dead) { ithread_destroy(ie->ie_thread); ie->ie_thread = NULL; } #endif mtx_unlock(&ie->ie_lock); free(handler, M_ITHREAD); return (0); } static int intr_event_schedule_thread(struct intr_event *ie) { struct intr_entropy entropy; struct intr_thread *it; struct thread *td; struct thread *ctd; struct proc *p; /* * If no ithread or no handlers, then we have a stray interrupt. */ if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers) || ie->ie_thread == NULL) return (EINVAL); ctd = curthread; it = ie->ie_thread; td = it->it_thread; p = td->td_proc; /* * If any of the handlers for this ithread claim to be good * sources of entropy, then gather some. */ if (ie->ie_flags & IE_ENTROPY) { entropy.event = (uintptr_t)ie; entropy.td = ctd; random_harvest_queue(&entropy, sizeof(entropy), 2, RANDOM_INTERRUPT); } KASSERT(p != NULL, ("ithread %s has no process", ie->ie_name)); /* * Set it_need to tell the thread to keep running if it is already * running. Then, lock the thread and see if we actually need to * put it on the runqueue. 
* * Use store_rel to arrange that the store to ih_need in * swi_sched() is before the store to it_need and prepare for * transfer of this order to loads in the ithread. */ atomic_store_rel_int(&it->it_need, 1); thread_lock(td); if (TD_AWAITING_INTR(td)) { CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid, td->td_name); TD_CLR_IWAIT(td); sched_add(td, SRQ_INTR); } else { CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d", __func__, p->p_pid, td->td_name, it->it_need, td->td_state); } thread_unlock(td); return (0); } #else int intr_event_remove_handler(void *cookie) { struct intr_handler *handler = (struct intr_handler *)cookie; struct intr_event *ie; struct intr_thread *it; #ifdef INVARIANTS struct intr_handler *ih; #endif #ifdef notyet int dead; #endif if (handler == NULL) return (EINVAL); ie = handler->ih_event; KASSERT(ie != NULL, ("interrupt handler \"%s\" has a NULL interrupt event", handler->ih_name)); mtx_lock(&ie->ie_lock); CTR3(KTR_INTR, "%s: removing %s from %s", __func__, handler->ih_name, ie->ie_name); #ifdef INVARIANTS TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) if (ih == handler) goto ok; mtx_unlock(&ie->ie_lock); panic("interrupt handler \"%s\" not found in interrupt event \"%s\"", ih->ih_name, ie->ie_name); ok: #endif /* * If there are no ithreads (per event and per handler), then * just remove the handler and return. * XXX: Note that an INTR_FAST handler might be running on another CPU! */ if (ie->ie_thread == NULL && handler->ih_thread == NULL) { TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next); mtx_unlock(&ie->ie_lock); free(handler, M_ITHREAD); return (0); } /* Private or global ithread? */ it = (handler->ih_thread) ? handler->ih_thread : ie->ie_thread; /* * If the interrupt thread is already running, then just mark this * handler as being dead and let the ithread do the actual removal. * * During a cold boot while cold is set, msleep() does not sleep, * so we have to remove the handler here rather than letting the * thread do it. */ thread_lock(it->it_thread); if (!TD_AWAITING_INTR(it->it_thread) && !cold) { handler->ih_flags |= IH_DEAD; /* * Ensure that the thread will process the handler list * again and remove this handler if it has already passed * it on the list. * * The release part of the following store ensures * that the update of ih_flags is ordered before the * it_need setting. See the comment before * atomic_cmpset_acq(&ithd->it_need, ...) operation in * the ithread_execute_handlers(). */ atomic_store_rel_int(&it->it_need, 1); } else TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next); thread_unlock(it->it_thread); while (handler->ih_flags & IH_DEAD) msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0); /* * At this point, the handler has been disconnected from the event, * so we can kill the private ithread if any. */ if (handler->ih_thread) { ithread_destroy(handler->ih_thread); handler->ih_thread = NULL; } intr_event_update(ie); #ifdef notyet /* * XXX: This could be bad in the case of ppbus(8). Also, I think * this could lead to races of stale data when servicing an * interrupt. 
*/ dead = 1; TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) { if (handler != NULL) { dead = 0; break; } } if (dead) { ithread_destroy(ie->ie_thread); ie->ie_thread = NULL; } #endif mtx_unlock(&ie->ie_lock); free(handler, M_ITHREAD); return (0); } static int intr_event_schedule_thread(struct intr_event *ie, struct intr_thread *it) { struct intr_entropy entropy; struct thread *td; struct thread *ctd; struct proc *p; /* * If no ithread or no handlers, then we have a stray interrupt. */ if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers) || it == NULL) return (EINVAL); ctd = curthread; td = it->it_thread; p = td->td_proc; /* * If any of the handlers for this ithread claim to be good * sources of entropy, then gather some. */ if (ie->ie_flags & IE_ENTROPY) { entropy.event = (uintptr_t)ie; entropy.td = ctd; random_harvest_queue(&entropy, sizeof(entropy), 2, RANDOM_INTERRUPT); } KASSERT(p != NULL, ("ithread %s has no process", ie->ie_name)); /* * Set it_need to tell the thread to keep running if it is already * running. Then, lock the thread and see if we actually need to * put it on the runqueue. * * Use store_rel to arrange that the store to ih_need in * swi_sched() is before the store to it_need and prepare for * transfer of this order to loads in the ithread. */ atomic_store_rel_int(&it->it_need, 1); thread_lock(td); if (TD_AWAITING_INTR(td)) { CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid, td->td_name); TD_CLR_IWAIT(td); sched_add(td, SRQ_INTR); } else { CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d", __func__, p->p_pid, td->td_name, it->it_need, td->td_state); } thread_unlock(td); return (0); } #endif /* * Allow interrupt event binding for software interrupt handlers -- a no-op, * since interrupts are generated in software rather than being directed by * a PIC. */ static int swi_assign_cpu(void *arg, int cpu) { return (0); } /* * Add a software interrupt handler to a specified event. If a given event * is not specified, then a new event is created. */ int swi_add(struct intr_event **eventp, const char *name, driver_intr_t handler, void *arg, int pri, enum intr_type flags, void **cookiep) { struct intr_event *ie; int error; if (flags & INTR_ENTROPY) return (EINVAL); ie = (eventp != NULL) ? *eventp : NULL; if (ie != NULL) { if (!(ie->ie_flags & IE_SOFT)) return (EINVAL); } else { error = intr_event_create(&ie, NULL, IE_SOFT, 0, NULL, NULL, NULL, swi_assign_cpu, "swi%d:", pri); if (error) return (error); if (eventp != NULL) *eventp = ie; } error = intr_event_add_handler(ie, name, NULL, handler, arg, PI_SWI(pri), flags, cookiep); return (error); } /* * Schedule a software interrupt thread. */ void swi_sched(void *cookie, int flags) { struct intr_handler *ih = (struct intr_handler *)cookie; struct intr_event *ie = ih->ih_event; struct intr_entropy entropy; int error; CTR3(KTR_INTR, "swi_sched: %s %s need=%d", ie->ie_name, ih->ih_name, ih->ih_need); entropy.event = (uintptr_t)ih; entropy.td = curthread; random_harvest_queue(&entropy, sizeof(entropy), 1, RANDOM_SWI); /* * Set ih_need for this handler so that if the ithread is already * running it will execute this handler on the next pass. Otherwise, * it will execute it the next time it runs. */ ih->ih_need = 1; if (!(flags & SWI_DELAY)) { VM_CNT_INC(v_soft); #ifdef INTR_FILTER error = intr_event_schedule_thread(ie, ie->ie_thread); #else error = intr_event_schedule_thread(ie); #endif KASSERT(error == 0, ("stray software interrupt")); } } /* * Remove a software interrupt handler. 
Currently this code does not * remove the associated interrupt event if it becomes empty. Calling code * may do so manually via intr_event_destroy(), but that's not really * an optimal interface. */ int swi_remove(void *cookie) { return (intr_event_remove_handler(cookie)); } #ifdef INTR_FILTER static void priv_ithread_execute_handler(struct proc *p, struct intr_handler *ih) { struct intr_event *ie; ie = ih->ih_event; /* * If this handler is marked for death, remove it from * the list of handlers and wake up the sleeper. */ if (ih->ih_flags & IH_DEAD) { mtx_lock(&ie->ie_lock); TAILQ_REMOVE(&ie->ie_handlers, ih, ih_next); ih->ih_flags &= ~IH_DEAD; wakeup(ih); mtx_unlock(&ie->ie_lock); return; } /* Execute this handler. */ CTR6(KTR_INTR, "%s: pid %d exec %p(%p) for %s flg=%x", __func__, p->p_pid, (void *)ih->ih_handler, ih->ih_argument, ih->ih_name, ih->ih_flags); if (!(ih->ih_flags & IH_MPSAFE)) mtx_lock(&Giant); ih->ih_handler(ih->ih_argument); if (!(ih->ih_flags & IH_MPSAFE)) mtx_unlock(&Giant); } #endif /* * This is a public function for use by drivers that mux interrupt * handlers for child devices from their interrupt handler. */ void intr_event_execute_handlers(struct proc *p, struct intr_event *ie) { struct intr_handler *ih, *ihn; TAILQ_FOREACH_SAFE(ih, &ie->ie_handlers, ih_next, ihn) { /* * If this handler is marked for death, remove it from * the list of handlers and wake up the sleeper. */ if (ih->ih_flags & IH_DEAD) { mtx_lock(&ie->ie_lock); TAILQ_REMOVE(&ie->ie_handlers, ih, ih_next); ih->ih_flags &= ~IH_DEAD; wakeup(ih); mtx_unlock(&ie->ie_lock); continue; } /* Skip filter only handlers */ if (ih->ih_handler == NULL) continue; /* * For software interrupt threads, we only execute * handlers that have their need flag set. Hardware * interrupt threads always invoke all of their handlers. * * ih_need can only be 0 or 1. Failed cmpset below * means that there is no request to execute handlers, * so a retry of the cmpset is not needed. */ if ((ie->ie_flags & IE_SOFT) != 0 && atomic_cmpset_int(&ih->ih_need, 1, 0) == 0) continue; /* Execute this handler. */ CTR6(KTR_INTR, "%s: pid %d exec %p(%p) for %s flg=%x", __func__, p->p_pid, (void *)ih->ih_handler, ih->ih_argument, ih->ih_name, ih->ih_flags); if (!(ih->ih_flags & IH_MPSAFE)) mtx_lock(&Giant); ih->ih_handler(ih->ih_argument); if (!(ih->ih_flags & IH_MPSAFE)) mtx_unlock(&Giant); } } static void ithread_execute_handlers(struct proc *p, struct intr_event *ie) { /* Interrupt handlers should not sleep. */ if (!(ie->ie_flags & IE_SOFT)) THREAD_NO_SLEEPING(); intr_event_execute_handlers(p, ie); if (!(ie->ie_flags & IE_SOFT)) THREAD_SLEEPING_OK(); /* * Interrupt storm handling: * * If this interrupt source is currently storming, then throttle * it to only fire the handler once per clock tick. * * If this interrupt source is not currently storming, but the * number of back to back interrupts exceeds the storm threshold, * then enter storming mode. */ if (intr_storm_threshold != 0 && ie->ie_count >= intr_storm_threshold && !(ie->ie_flags & IE_SOFT)) { /* Report the message only once every second. */ if (ppsratecheck(&ie->ie_warntm, &ie->ie_warncnt, 1)) { printf( "interrupt storm detected on \"%s\"; throttling interrupt source\n", ie->ie_name); } pause("istorm", 1); } else ie->ie_count++; /* * Now that all the handlers have had a chance to run, reenable * the interrupt source. */ if (ie->ie_post_ithread != NULL) ie->ie_post_ithread(ie->ie_source); } #ifndef INTR_FILTER /* * This is the main code for interrupt threads. 
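The it_need handshake described in the comments above and consumed in ithread_loop() below maps directly onto C11 release/acquire atomics: the scheduler publishes handler state, then raises the flag with a release store; the ithread claims the flag with an acquire compare-and-swap before running handlers. A userland sketch of just that pairing (names illustrative; the real loop also handles IT_DEAD, IT_WAIT and sleeping):

#include <stdatomic.h>

static atomic_int it_need;	/* "handlers need another pass" */

/* Producer: make handler state visible, then raise the flag (release). */
void
schedule_pass(void)
{

	atomic_store_explicit(&it_need, 1, memory_order_release);
	/* ... wake the service thread if it is sleeping ... */
}

/* Consumer: claim the flag with an acquire CAS; rerun until it stays 0. */
void
service_passes(void)
{
	int one;

	for (;;) {
		one = 1;
		if (!atomic_compare_exchange_strong_explicit(&it_need, &one,
		    0, memory_order_acquire, memory_order_relaxed))
			break;	/* no pending request: back to sleep */
		/* ... run handlers; producer's writes are visible here ... */
	}
}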
*/ static void ithread_loop(void *arg) { struct intr_thread *ithd; struct intr_event *ie; struct thread *td; struct proc *p; int wake; td = curthread; p = td->td_proc; ithd = (struct intr_thread *)arg; KASSERT(ithd->it_thread == td, ("%s: ithread and proc linkage out of sync", __func__)); ie = ithd->it_event; ie->ie_count = 0; wake = 0; /* * As long as we have interrupts outstanding, go through the * list of handlers, giving each one a go at it. */ for (;;) { /* * If we are an orphaned thread, then just die. */ if (ithd->it_flags & IT_DEAD) { CTR3(KTR_INTR, "%s: pid %d (%s) exiting", __func__, p->p_pid, td->td_name); free(ithd, M_ITHREAD); kthread_exit(); } /* * Service interrupts. If another interrupt arrives while * we are running, it will set it_need to note that we * should make another pass. * * The load_acq part of the following cmpset ensures * that the load of ih_need in ithread_execute_handlers() * is ordered after the load of it_need here. */ while (atomic_cmpset_acq_int(&ithd->it_need, 1, 0) != 0) ithread_execute_handlers(p, ie); WITNESS_WARN(WARN_PANIC, NULL, "suspending ithread"); mtx_assert(&Giant, MA_NOTOWNED); /* * Processed all our interrupts. Now get the sched * lock. This may take a while and it_need may get * set again, so we have to check it again. */ thread_lock(td); if (atomic_load_acq_int(&ithd->it_need) == 0 && (ithd->it_flags & (IT_DEAD | IT_WAIT)) == 0) { TD_SET_IWAIT(td); ie->ie_count = 0; mi_switch(SW_VOL | SWT_IWAIT, NULL); } if (ithd->it_flags & IT_WAIT) { wake = 1; ithd->it_flags &= ~IT_WAIT; } thread_unlock(td); if (wake) { wakeup(ithd); wake = 0; } } } /* * Main interrupt handling body. * * Input: * o ie: the event connected to this interrupt. * o frame: some archs (e.g. i386) pass a frame to some handlers as their main argument. * Return value: * o 0: everything ok. * o EINVAL: stray interrupt. */ int intr_event_handle(struct intr_event *ie, struct trapframe *frame) { struct intr_handler *ih; struct trapframe *oldframe; struct thread *td; int error, ret, thread; td = curthread; #ifdef KSTACK_USAGE_PROF intr_prof_stack_use(td, frame); #endif /* An interrupt with no event or handlers is a stray interrupt. */ if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers)) return (EINVAL); /* * Execute fast interrupt handlers directly. * To support clock handlers, if a handler registers * with a NULL argument, then we pass it a pointer to * a trapframe as its argument. */ td->td_intr_nesting_level++; thread = 0; ret = 0; critical_enter(); oldframe = td->td_intr_frame; td->td_intr_frame = frame; TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) { if (ih->ih_filter == NULL) { thread = 1; continue; } CTR4(KTR_INTR, "%s: exec %p(%p) for %s", __func__, ih->ih_filter, ih->ih_argument == NULL ? frame : ih->ih_argument, ih->ih_name); if (ih->ih_argument == NULL) ret = ih->ih_filter(frame); else ret = ih->ih_filter(ih->ih_argument); KASSERT(ret == FILTER_STRAY || ((ret & (FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) != 0 && (ret & ~(FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) == 0), ("%s: incorrect return value %#x from %s", __func__, ret, ih->ih_name)); /* * Wrapper handler special handling: * * in some particular cases (like pccard and pccbb), * the _real_ device handler is wrapped in a couple of * functions - a filter wrapper and an ithread wrapper. * In this case (and just in this case), the filter wrapper * could ask the system to schedule the ithread and mask * the interrupt source if the wrapped handler is composed * of just an ithread handler.
* * TODO: write a generic wrapper to avoid people rolling * their own */ if (!thread) { if (ret == FILTER_SCHEDULE_THREAD) thread = 1; } } td->td_intr_frame = oldframe; if (thread) { if (ie->ie_pre_ithread != NULL) ie->ie_pre_ithread(ie->ie_source); } else { if (ie->ie_post_filter != NULL) ie->ie_post_filter(ie->ie_source); } /* Schedule the ithread if needed. */ if (thread) { error = intr_event_schedule_thread(ie); KASSERT(error == 0, ("bad stray interrupt")); } critical_exit(); td->td_intr_nesting_level--; return (0); } #else /* * This is the main code for interrupt threads. */ static void ithread_loop(void *arg) { struct intr_thread *ithd; struct intr_handler *ih; struct intr_event *ie; struct thread *td; struct proc *p; int priv; int wake; td = curthread; p = td->td_proc; ih = (struct intr_handler *)arg; priv = (ih->ih_thread != NULL) ? 1 : 0; ithd = (priv) ? ih->ih_thread : ih->ih_event->ie_thread; KASSERT(ithd->it_thread == td, ("%s: ithread and proc linkage out of sync", __func__)); ie = ithd->it_event; ie->ie_count = 0; wake = 0; /* * As long as we have interrupts outstanding, go through the * list of handlers, giving each one a go at it. */ for (;;) { /* * If we are an orphaned thread, then just die. */ if (ithd->it_flags & IT_DEAD) { CTR3(KTR_INTR, "%s: pid %d (%s) exiting", __func__, p->p_pid, td->td_name); free(ithd, M_ITHREAD); kthread_exit(); } /* * Service interrupts. If another interrupt arrives while * we are running, it will set it_need to note that we * should make another pass. * * The load_acq part of the following cmpset ensures * that the load of ih_need in ithread_execute_handlers() * is ordered after the load of it_need here. */ while (atomic_cmpset_acq_int(&ithd->it_need, 1, 0) != 0) { if (priv) priv_ithread_execute_handler(p, ih); else ithread_execute_handlers(p, ie); } WITNESS_WARN(WARN_PANIC, NULL, "suspending ithread"); mtx_assert(&Giant, MA_NOTOWNED); /* * Processed all our interrupts. Now get the sched * lock. This may take a while and it_need may get * set again, so we have to check it again. */ thread_lock(td); if (atomic_load_acq_int(&ithd->it_need) == 0 && (ithd->it_flags & (IT_DEAD | IT_WAIT)) == 0) { TD_SET_IWAIT(td); ie->ie_count = 0; mi_switch(SW_VOL | SWT_IWAIT, NULL); } if (ithd->it_flags & IT_WAIT) { wake = 1; ithd->it_flags &= ~IT_WAIT; } thread_unlock(td); if (wake) { wakeup(ithd); wake = 0; } } } /* * Main loop for interrupt filter. * * Some architectures (i386, amd64 and arm) require the optional frame * parameter, and use it as the main argument for fast handler execution * when ih_argument == NULL. * * Return value: * o FILTER_STRAY: No filter recognized the event, and no * filter-less handler is registered on this * line. * o FILTER_HANDLED: A filter claimed the event and served it. * o FILTER_SCHEDULE_THREAD: No filter claimed the event, but there's at * least one filter-less handler on this line. * o FILTER_HANDLED | * FILTER_SCHEDULE_THREAD: A filter claimed the event, and asked for * scheduling the per-handler ithread. * * In case an ithread has to be scheduled, in *ithd there will be a * pointer to a struct intr_thread containing the thread to be * scheduled. */ static int intr_filter_loop(struct intr_event *ie, struct trapframe *frame, struct intr_thread **ithd) { struct intr_handler *ih; void *arg; int ret, thread_only; ret = 0; thread_only = 0; TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) { /* * Execute fast interrupt handlers directly. 
* To support clock handlers, if a handler registers * with a NULL argument, then we pass it a pointer to * a trapframe as its argument. */ arg = ((ih->ih_argument == NULL) ? frame : ih->ih_argument); CTR5(KTR_INTR, "%s: exec %p/%p(%p) for %s", __func__, ih->ih_filter, ih->ih_handler, arg, ih->ih_name); if (ih->ih_filter != NULL) ret = ih->ih_filter(arg); else { thread_only = 1; continue; } KASSERT(ret == FILTER_STRAY || ((ret & (FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) != 0 && (ret & ~(FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) == 0), ("%s: incorrect return value %#x from %s", __func__, ret, ih->ih_name)); if (ret & FILTER_STRAY) continue; else { *ithd = ih->ih_thread; return (ret); } } /* * No filters handled the interrupt and we have at least * one handler without a filter. In this case, we schedule * all of the filter-less handlers to run in the ithread. */ if (thread_only) { *ithd = ie->ie_thread; return (FILTER_SCHEDULE_THREAD); } return (FILTER_STRAY); } /* * Main interrupt handling body. * * Input: * o ie: the event connected to this interrupt. * o frame: some archs (e.g. i386) pass a frame to some handlers as their main argument. * Return value: * o 0: everything ok. * o EINVAL: stray interrupt. */ int intr_event_handle(struct intr_event *ie, struct trapframe *frame) { struct intr_thread *ithd; struct trapframe *oldframe; struct thread *td; int thread; ithd = NULL; td = curthread; if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers)) return (EINVAL); td->td_intr_nesting_level++; thread = 0; critical_enter(); oldframe = td->td_intr_frame; td->td_intr_frame = frame; thread = intr_filter_loop(ie, frame, &ithd); if (thread & FILTER_HANDLED) { if (ie->ie_post_filter != NULL) ie->ie_post_filter(ie->ie_source); } else { if (ie->ie_pre_ithread != NULL) ie->ie_pre_ithread(ie->ie_source); } td->td_intr_frame = oldframe; critical_exit(); /* Interrupt storm logic */ if (thread & FILTER_STRAY) { ie->ie_count++; if (ie->ie_count < intr_storm_threshold) printf("Interrupt stray detection not present\n"); } /* Schedule an ithread if needed.
*/ if (thread & FILTER_SCHEDULE_THREAD) { if (intr_event_schedule_thread(ie, ithd) != 0) panic("%s: impossible stray interrupt", __func__); } td->td_intr_nesting_level--; return (0); } #endif #ifdef DDB /* * Dump details about an interrupt handler */ static void db_dump_intrhand(struct intr_handler *ih) { int comma; db_printf("\t%-10s ", ih->ih_name); switch (ih->ih_pri) { case PI_REALTIME: db_printf("CLK "); break; case PI_AV: db_printf("AV "); break; case PI_TTY: db_printf("TTY "); break; case PI_NET: db_printf("NET "); break; case PI_DISK: db_printf("DISK"); break; case PI_DULL: db_printf("DULL"); break; default: if (ih->ih_pri >= PI_SOFT) db_printf("SWI "); else db_printf("%4u", ih->ih_pri); break; } db_printf(" "); if (ih->ih_filter != NULL) { db_printf("[F]"); db_printsym((uintptr_t)ih->ih_filter, DB_STGY_PROC); } if (ih->ih_handler != NULL) { if (ih->ih_filter != NULL) db_printf(","); db_printf("[H]"); db_printsym((uintptr_t)ih->ih_handler, DB_STGY_PROC); } db_printf("(%p)", ih->ih_argument); if (ih->ih_need || (ih->ih_flags & (IH_EXCLUSIVE | IH_ENTROPY | IH_DEAD | IH_MPSAFE)) != 0) { db_printf(" {"); comma = 0; if (ih->ih_flags & IH_EXCLUSIVE) { if (comma) db_printf(", "); db_printf("EXCL"); comma = 1; } if (ih->ih_flags & IH_ENTROPY) { if (comma) db_printf(", "); db_printf("ENTROPY"); comma = 1; } if (ih->ih_flags & IH_DEAD) { if (comma) db_printf(", "); db_printf("DEAD"); comma = 1; } if (ih->ih_flags & IH_MPSAFE) { if (comma) db_printf(", "); db_printf("MPSAFE"); comma = 1; } if (ih->ih_need) { if (comma) db_printf(", "); db_printf("NEED"); } db_printf("}"); } db_printf("\n"); } /* * Dump details about an event. */ void db_dump_intr_event(struct intr_event *ie, int handlers) { struct intr_handler *ih; struct intr_thread *it; int comma; db_printf("%s ", ie->ie_fullname); it = ie->ie_thread; if (it != NULL) db_printf("(pid %d)", it->it_thread->td_proc->p_pid); else db_printf("(no thread)"); if ((ie->ie_flags & (IE_SOFT | IE_ENTROPY | IE_ADDING_THREAD)) != 0 || (it != NULL && it->it_need)) { db_printf(" {"); comma = 0; if (ie->ie_flags & IE_SOFT) { db_printf("SOFT"); comma = 1; } if (ie->ie_flags & IE_ENTROPY) { if (comma) db_printf(", "); db_printf("ENTROPY"); comma = 1; } if (ie->ie_flags & IE_ADDING_THREAD) { if (comma) db_printf(", "); db_printf("ADDING_THREAD"); comma = 1; } if (it != NULL && it->it_need) { if (comma) db_printf(", "); db_printf("NEED"); } db_printf("}"); } db_printf("\n"); if (handlers) TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) db_dump_intrhand(ih); } /* * Dump data about interrupt handlers */ DB_SHOW_COMMAND(intr, db_show_intr) { struct intr_event *ie; int all, verbose; verbose = strchr(modif, 'v') != NULL; all = strchr(modif, 'a') != NULL; TAILQ_FOREACH(ie, &event_list, ie_list) { if (!all && TAILQ_EMPTY(&ie->ie_handlers)) continue; db_dump_intr_event(ie, verbose); if (db_pager_quit) break; } } #endif /* DDB */ /* * Start standard software interrupt threads */ static void start_softintr(void *dummy) { if (swi_add(NULL, "vm", swi_vm, NULL, SWI_VM, INTR_MPSAFE, &vm_ih)) panic("died while creating vm swi ithread"); } SYSINIT(start_softintr, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softintr, NULL); /* * Sysctls used by systat and others: hw.intrnames and hw.intrcnt. * The data for this is machine dependent, and the declarations are in machine * dependent code. The layout of intrnames and intrcnt, however, is machine * independent. * * We do not know the length of intrcnt and intrnames at compile time, so * calculate things at run time.
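 *
 * A minimal userland consumer sizes its buffer at run time too; a
 * sketch (illustrative only, assuming just sysctlbyname(3), err(3)
 * and malloc(3)):
 *
 *	size_t len;
 *	u_long *cnt;
 *
 *	if (sysctlbyname("hw.intrcnt", NULL, &len, NULL, 0) == -1)
 *		err(1, "hw.intrcnt");
 *	if ((cnt = malloc(len)) == NULL)
 *		err(1, "malloc");
 *	if (sysctlbyname("hw.intrcnt", cnt, &len, NULL, 0) == -1)
 *		err(1, "hw.intrcnt");
 *
 * hw.intrnames is read the same way; it is a packed sequence of
 * nul-terminated strings parallel to the counter array.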
*/ static int sysctl_intrnames(SYSCTL_HANDLER_ARGS) { return (sysctl_handle_opaque(oidp, intrnames, sintrnames, req)); } SYSCTL_PROC(_hw, OID_AUTO, intrnames, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, sysctl_intrnames, "", "Interrupt Names"); static int sysctl_intrcnt(SYSCTL_HANDLER_ARGS) { #ifdef SCTL_MASK32 uint32_t *intrcnt32; unsigned i; int error; if (req->flags & SCTL_MASK32) { if (!req->oldptr) return (sysctl_handle_opaque(oidp, NULL, sintrcnt / 2, req)); intrcnt32 = malloc(sintrcnt / 2, M_TEMP, M_NOWAIT); if (intrcnt32 == NULL) return (ENOMEM); for (i = 0; i < sintrcnt / sizeof (u_long); i++) intrcnt32[i] = intrcnt[i]; error = sysctl_handle_opaque(oidp, intrcnt32, sintrcnt / 2, req); free(intrcnt32, M_TEMP); return (error); } #endif return (sysctl_handle_opaque(oidp, intrcnt, sintrcnt, req)); } SYSCTL_PROC(_hw, OID_AUTO, intrcnt, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, sysctl_intrcnt, "", "Interrupt Counts"); #ifdef DDB /* * DDB command to dump the interrupt statistics. */ DB_SHOW_COMMAND(intrcnt, db_show_intrcnt) { u_long *i; char *cp; u_int j; cp = intrnames; j = 0; for (i = intrcnt; j < (sintrcnt / sizeof(u_long)) && !db_pager_quit; i++, j++) { if (*cp == '\0') break; if (*i != 0) db_printf("%s\t%lu\n", cp, *i); cp += strlen(cp) + 1; } } #endif Index: head/sys/kern/kern_jail.c =================================================================== --- head/sys/kern/kern_jail.c (revision 326270) +++ head/sys/kern/kern_jail.c (revision 326271) @@ -1,4117 +1,4119 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1999 Poul-Henning Kamp. * Copyright (c) 2008 Bjoern A. Zeeb. * Copyright (c) 2009 James Gritton. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif /* DDB */ #include #define DEFAULT_HOSTUUID "00000000-0000-0000-0000-000000000000" MALLOC_DEFINE(M_PRISON, "prison", "Prison structures"); static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures"); /* Keep struct prison prison0 and some code in kern_jail_set() readable. */ #ifdef INET #ifdef INET6 #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL|PR_IP6_SADDRSEL #else #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL #endif #else /* !INET */ #ifdef INET6 #define _PR_IP_SADDRSEL PR_IP6_SADDRSEL #else #define _PR_IP_SADDRSEL 0 #endif #endif /* prison0 describes what is "real" about the system. */ struct prison prison0 = { .pr_id = 0, .pr_name = "0", .pr_ref = 1, .pr_uref = 1, .pr_path = "/", .pr_securelevel = -1, .pr_devfs_rsnum = 0, .pr_childmax = JAIL_MAX, .pr_hostuuid = DEFAULT_HOSTUUID, .pr_children = LIST_HEAD_INITIALIZER(prison0.pr_children), #ifdef VIMAGE .pr_flags = PR_HOST|PR_VNET|_PR_IP_SADDRSEL, #else .pr_flags = PR_HOST|_PR_IP_SADDRSEL, #endif .pr_allow = PR_ALLOW_ALL, }; MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF); /* allprison, allprison_racct and lastprid are protected by allprison_lock. */ struct sx allprison_lock; SX_SYSINIT(allprison_lock, &allprison_lock, "allprison"); struct prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison); LIST_HEAD(, prison_racct) allprison_racct; int lastprid = 0; static int do_jail_attach(struct thread *td, struct prison *pr); static void prison_complete(void *context, int pending); static void prison_deref(struct prison *pr, int flags); static char *prison_path(struct prison *pr1, struct prison *pr2); static void prison_remove_one(struct prison *pr); #ifdef RACCT static void prison_racct_attach(struct prison *pr); static void prison_racct_modify(struct prison *pr); static void prison_racct_detach(struct prison *pr); #endif /* Flags for prison_deref */ #define PD_DEREF 0x01 #define PD_DEUREF 0x02 #define PD_LOCKED 0x04 #define PD_LIST_SLOCKED 0x08 #define PD_LIST_XLOCKED 0x10 /* * Parameter names corresponding to PR_* flag values. Size values are for kvm * as we cannot figure out the size of a sparse array, or an array without a * terminating entry. 
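 *
 * A kvm(3) consumer cannot apply nitems() to an array it only knows
 * by symbol, so it reads the exported size first; a sketch (the
 * nlist setup and the N_* index are hypothetical):
 *
 *	size_t size;
 *	int nnames;
 *
 *	kvm_read(kd, nl[N_PR_FLAG_NAMES_SIZE].n_value, &size,
 *	    sizeof(size));
 *	nnames = size / sizeof(char *);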
*/ static char *pr_flag_names[] = { [0] = "persist", #ifdef INET [7] = "ip4.saddrsel", #endif #ifdef INET6 [8] = "ip6.saddrsel", #endif }; const size_t pr_flag_names_size = sizeof(pr_flag_names); static char *pr_flag_nonames[] = { [0] = "nopersist", #ifdef INET [7] = "ip4.nosaddrsel", #endif #ifdef INET6 [8] = "ip6.nosaddrsel", #endif }; const size_t pr_flag_nonames_size = sizeof(pr_flag_nonames); struct jailsys_flags { const char *name; unsigned disable; unsigned new; } pr_flag_jailsys[] = { { "host", 0, PR_HOST }, #ifdef VIMAGE { "vnet", 0, PR_VNET }, #endif #ifdef INET { "ip4", PR_IP4_USER, PR_IP4_USER }, #endif #ifdef INET6 { "ip6", PR_IP6_USER, PR_IP6_USER }, #endif }; const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys); static char *pr_allow_names[] = { "allow.set_hostname", "allow.sysvipc", "allow.raw_sockets", "allow.chflags", "allow.mount", "allow.quotas", "allow.socket_af", "allow.mount.devfs", "allow.mount.nullfs", "allow.mount.zfs", "allow.mount.procfs", "allow.mount.tmpfs", "allow.mount.fdescfs", "allow.mount.linprocfs", "allow.mount.linsysfs", "allow.reserved_ports", }; const size_t pr_allow_names_size = sizeof(pr_allow_names); static char *pr_allow_nonames[] = { "allow.noset_hostname", "allow.nosysvipc", "allow.noraw_sockets", "allow.nochflags", "allow.nomount", "allow.noquotas", "allow.nosocket_af", "allow.mount.nodevfs", "allow.mount.nonullfs", "allow.mount.nozfs", "allow.mount.noprocfs", "allow.mount.notmpfs", "allow.mount.nofdescfs", "allow.mount.nolinprocfs", "allow.mount.nolinsysfs", "allow.noreserved_ports", }; const size_t pr_allow_nonames_size = sizeof(pr_allow_nonames); #define JAIL_DEFAULT_ALLOW (PR_ALLOW_SET_HOSTNAME | PR_ALLOW_RESERVED_PORTS) #define JAIL_DEFAULT_ENFORCE_STATFS 2 #define JAIL_DEFAULT_DEVFS_RSNUM 0 static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW; static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS; static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM; #if defined(INET) || defined(INET6) static unsigned jail_max_af_ips = 255; #endif /* * Initialize the parts of prison0 that can't be static-initialized with * constants. This is called from proc0_init() after creating thread0 cpuset. */ void prison0_init(void) { prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset); prison0.pr_osreldate = osreldate; strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease)); } /* * struct jail_args { * struct jail *jail; * }; */ int sys_jail(struct thread *td, struct jail_args *uap) { uint32_t version; int error; struct jail j; error = copyin(uap->jail, &version, sizeof(uint32_t)); if (error) return (error); switch (version) { case 0: { struct jail_v0 j0; /* FreeBSD single IPv4 jails. */ bzero(&j, sizeof(struct jail)); error = copyin(uap->jail, &j0, sizeof(struct jail_v0)); if (error) return (error); j.version = j0.version; j.path = j0.path; j.hostname = j0.hostname; j.ip4s = htonl(j0.ip_number); /* jail_v0 is host order */ break; } case 1: /* * Version 1 was used by multi-IPv4 jail implementations * that never made it into the official kernel. */ return (EINVAL); case 2: /* JAIL_API_VERSION */ /* FreeBSD multi-IPv4/IPv6,noIP jails. */ error = copyin(uap->jail, &j, sizeof(struct jail)); if (error) return (error); break; default: /* Sci-Fi jails are not supported, sorry. 
*/ return (EINVAL); } return (kern_jail(td, &j)); } int kern_jail(struct thread *td, struct jail *j) { struct iovec optiov[2 * (4 + nitems(pr_allow_names) #ifdef INET + 1 #endif #ifdef INET6 + 1 #endif )]; struct uio opt; char *u_path, *u_hostname, *u_name; #ifdef INET uint32_t ip4s; struct in_addr *u_ip4; #endif #ifdef INET6 struct in6_addr *u_ip6; #endif size_t tmplen; int error, enforce_statfs, fi; bzero(&optiov, sizeof(optiov)); opt.uio_iov = optiov; opt.uio_iovcnt = 0; opt.uio_offset = -1; opt.uio_resid = -1; opt.uio_segflg = UIO_SYSSPACE; opt.uio_rw = UIO_READ; opt.uio_td = td; /* Set permissions for top-level jails from sysctls. */ if (!jailed(td->td_ucred)) { for (fi = 0; fi < nitems(pr_allow_names); fi++) { optiov[opt.uio_iovcnt].iov_base = (jail_default_allow & (1 << fi)) ? pr_allow_names[fi] : pr_allow_nonames[fi]; optiov[opt.uio_iovcnt].iov_len = strlen(optiov[opt.uio_iovcnt].iov_base) + 1; opt.uio_iovcnt += 2; } optiov[opt.uio_iovcnt].iov_base = "enforce_statfs"; optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs"); opt.uio_iovcnt++; enforce_statfs = jail_default_enforce_statfs; optiov[opt.uio_iovcnt].iov_base = &enforce_statfs; optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs); opt.uio_iovcnt++; } tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN; #ifdef INET ip4s = (j->version == 0) ? 1 : j->ip4s; if (ip4s > jail_max_af_ips) return (EINVAL); tmplen += ip4s * sizeof(struct in_addr); #else if (j->ip4s > 0) return (EINVAL); #endif #ifdef INET6 if (j->ip6s > jail_max_af_ips) return (EINVAL); tmplen += j->ip6s * sizeof(struct in6_addr); #else if (j->ip6s > 0) return (EINVAL); #endif u_path = malloc(tmplen, M_TEMP, M_WAITOK); u_hostname = u_path + MAXPATHLEN; u_name = u_hostname + MAXHOSTNAMELEN; #ifdef INET u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN); #endif #ifdef INET6 #ifdef INET u_ip6 = (struct in6_addr *)(u_ip4 + ip4s); #else u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN); #endif #endif optiov[opt.uio_iovcnt].iov_base = "path"; optiov[opt.uio_iovcnt].iov_len = sizeof("path"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_path; error = copyinstr(j->path, u_path, MAXPATHLEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = "host.hostname"; optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_hostname; error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; if (j->jailname != NULL) { optiov[opt.uio_iovcnt].iov_base = "name"; optiov[opt.uio_iovcnt].iov_len = sizeof("name"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_name; error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; } #ifdef INET optiov[opt.uio_iovcnt].iov_base = "ip4.addr"; optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_ip4; optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr); if (j->version == 0) u_ip4->s_addr = j->ip4s; else { error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } } opt.uio_iovcnt++; #endif #ifdef INET6 optiov[opt.uio_iovcnt].iov_base = "ip6.addr"; optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_ip6; 
optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr); error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; #endif KASSERT(opt.uio_iovcnt <= nitems(optiov), ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt)); error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH); free(u_path, M_TEMP); return (error); } /* * struct jail_set_args { * struct iovec *iovp; * unsigned int iovcnt; * int flags; * }; */ int sys_jail_set(struct thread *td, struct jail_set_args *uap) { struct uio *auio; int error; /* Check that we have an even number of iovecs. */ if (uap->iovcnt & 1) return (EINVAL); error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_jail_set(td, auio, uap->flags); free(auio, M_IOV); return (error); } int kern_jail_set(struct thread *td, struct uio *optuio, int flags) { struct nameidata nd; #ifdef INET struct in_addr *ip4; #endif #ifdef INET6 struct in6_addr *ip6; #endif struct vfsopt *opt; struct vfsoptlist *opts; struct prison *pr, *deadpr, *mypr, *ppr, *tpr; struct vnode *root; char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid; char *g_path, *osrelstr; #if defined(INET) || defined(INET6) struct prison *tppr; void *op; #endif unsigned long hid; size_t namelen, onamelen, pnamelen; int born, created, cuflags, descend, enforce; int error, errmsg_len, errmsg_pos; int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel; int fi, jid, jsys, len, level; int childmax, osreldt, rsnum, slevel; int fullpath_disabled; #if defined(INET) || defined(INET6) int ii, ij; #endif #ifdef INET int ip4s, redo_ip4; #endif #ifdef INET6 int ip6s, redo_ip6; #endif uint64_t pr_allow, ch_allow, pr_flags, ch_flags; unsigned tallow; char numbuf[12]; error = priv_check(td, PRIV_JAIL_SET); if (!error && (flags & JAIL_ATTACH)) error = priv_check(td, PRIV_JAIL_ATTACH); if (error) return (error); mypr = td->td_ucred->cr_prison; if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0) return (EPERM); if (flags & ~JAIL_SET_MASK) return (EINVAL); /* * Check all the parameters before committing to anything. Not all * errors can be caught early, but we may as well try. Also, this * takes care of some expensive stuff (path lookup) before getting * the allprison lock. * * XXX Jails are not filesystems, and jail parameters are not mount * options. But it makes more sense to re-use the vfsopt code * than duplicate it under a different name. 
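 *
 * The upshot is the convention jail_set(2) exposes: parameters
 * arrive as name/value iovec pairs, with boolean parameters carrying
 * a NULL value. An illustrative call from userland (the jail name is
 * made up; error handling omitted):
 *
 *	struct iovec iov[4];
 *	int jid;
 *
 *	iov[0].iov_base = __DECONST(char *, "name");
 *	iov[0].iov_len = sizeof("name");
 *	iov[1].iov_base = __DECONST(char *, "myjail");
 *	iov[1].iov_len = sizeof("myjail");
 *	iov[2].iov_base = __DECONST(char *, "persist");
 *	iov[2].iov_len = sizeof("persist");
 *	iov[3].iov_base = NULL;
 *	iov[3].iov_len = 0;
 *	jid = jail_set(iov, 4, JAIL_CREATE);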
*/ error = vfs_buildopts(optuio, &opts); if (error) return (error); #ifdef INET ip4 = NULL; #endif #ifdef INET6 ip6 = NULL; #endif g_path = NULL; cuflags = flags & (JAIL_CREATE | JAIL_UPDATE); if (!cuflags) { error = EINVAL; vfs_opterror(opts, "no valid operation (create or update)"); goto done_errmsg; } error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == ENOENT) jid = 0; else if (error != 0) goto done_free; error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel)); if (error == ENOENT) gotslevel = 0; else if (error != 0) goto done_free; else gotslevel = 1; error = vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax)); if (error == ENOENT) gotchildmax = 0; else if (error != 0) goto done_free; else gotchildmax = 1; error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce)); if (error == ENOENT) gotenforce = 0; else if (error != 0) goto done_free; else if (enforce < 0 || enforce > 2) { error = EINVAL; goto done_free; } else gotenforce = 1; error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum)); if (error == ENOENT) gotrsnum = 0; else if (error != 0) goto done_free; else gotrsnum = 1; pr_flags = ch_flags = 0; for (fi = 0; fi < nitems(pr_flag_names); fi++) { if (pr_flag_names[fi] == NULL) continue; vfs_flagopt(opts, pr_flag_names[fi], &pr_flags, 1 << fi); vfs_flagopt(opts, pr_flag_nonames[fi], &ch_flags, 1 << fi); } ch_flags |= pr_flags; for (fi = 0; fi < nitems(pr_flag_jailsys); fi++) { error = vfs_copyopt(opts, pr_flag_jailsys[fi].name, &jsys, sizeof(jsys)); if (error == ENOENT) continue; if (error != 0) goto done_free; switch (jsys) { case JAIL_SYS_DISABLE: if (!pr_flag_jailsys[fi].disable) { error = EINVAL; goto done_free; } pr_flags |= pr_flag_jailsys[fi].disable; break; case JAIL_SYS_NEW: pr_flags |= pr_flag_jailsys[fi].new; break; case JAIL_SYS_INHERIT: break; default: error = EINVAL; goto done_free; } ch_flags |= pr_flag_jailsys[fi].new | pr_flag_jailsys[fi].disable; } if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE && !(pr_flags & PR_PERSIST)) { error = EINVAL; vfs_opterror(opts, "new jail must persist or attach"); goto done_errmsg; } #ifdef VIMAGE if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) { error = EINVAL; vfs_opterror(opts, "vnet cannot be changed after creation"); goto done_errmsg; } #endif #ifdef INET if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) { error = EINVAL; vfs_opterror(opts, "ip4 cannot be changed after creation"); goto done_errmsg; } #endif #ifdef INET6 if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) { error = EINVAL; vfs_opterror(opts, "ip6 cannot be changed after creation"); goto done_errmsg; } #endif pr_allow = ch_allow = 0; for (fi = 0; fi < nitems(pr_allow_names); fi++) { vfs_flagopt(opts, pr_allow_names[fi], &pr_allow, 1 << fi); vfs_flagopt(opts, pr_allow_nonames[fi], &ch_allow, 1 << fi); } ch_allow |= pr_allow; error = vfs_getopt(opts, "name", (void **)&name, &len); if (error == ENOENT) name = NULL; else if (error != 0) goto done_free; else { if (len == 0 || name[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.hostname", (void **)&host, &len); if (error == ENOENT) host = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || host[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.domainname", (void 
**)&domain, &len); if (error == ENOENT) domain = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || domain[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len); if (error == ENOENT) uuid = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || uuid[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > HOSTUUIDLEN) { error = ENAMETOOLONG; goto done_free; } } #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { uint32_t hid32; error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32)); hid = hid32; } else #endif error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid)); if (error == ENOENT) gothid = 0; else if (error != 0) goto done_free; else { gothid = 1; ch_flags |= PR_HOST; pr_flags |= PR_HOST; } #ifdef INET error = vfs_getopt(opts, "ip4.addr", &op, &ip4s); if (error == ENOENT) ip4s = 0; else if (error != 0) goto done_free; else if (ip4s & (sizeof(*ip4) - 1)) { error = EINVAL; goto done_free; } else { ch_flags |= PR_IP4_USER; pr_flags |= PR_IP4_USER; if (ip4s > 0) { ip4s /= sizeof(*ip4); if (ip4s > jail_max_af_ips) { error = EINVAL; vfs_opterror(opts, "too many IPv4 addresses"); goto done_errmsg; } ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK); bcopy(op, ip4, ip4s * sizeof(*ip4)); /* * IP addresses are all sorted except ip[0] to preserve * the primary IP address as given from userland. * This special IP is used for unbound outgoing * connections as well as for "loopback" traffic in case * source address selection cannot find a more fitting * address to connect from. */ if (ip4s > 1) qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), prison_qcmp_v4); /* * Check for duplicate addresses and do some simple * zero and broadcast checks. If users give other bogus * addresses, it is their problem. * * We do not have to care about byte order for these * checks so we will do them in NBO.
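 *
 * For example, a list given from userland as { 10.0.0.9, 10.0.0.5,
 * 10.0.0.1 } is stored as { 10.0.0.9, 10.0.0.1, 10.0.0.5 }: the
 * primary address stays first and the rest are sorted, which is what
 * lets the checks below get away with comparing adjacent entries.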
*/ for (ii = 0; ii < ip4s; ii++) { if (ip4[ii].s_addr == INADDR_ANY || ip4[ii].s_addr == INADDR_BROADCAST) { error = EINVAL; goto done_free; } if ((ii+1) < ip4s && (ip4[0].s_addr == ip4[ii+1].s_addr || ip4[ii].s_addr == ip4[ii+1].s_addr)) { error = EINVAL; goto done_free; } } } } #endif #ifdef INET6 error = vfs_getopt(opts, "ip6.addr", &op, &ip6s); if (error == ENOENT) ip6s = 0; else if (error != 0) goto done_free; else if (ip6s & (sizeof(*ip6) - 1)) { error = EINVAL; goto done_free; } else { ch_flags |= PR_IP6_USER; pr_flags |= PR_IP6_USER; if (ip6s > 0) { ip6s /= sizeof(*ip6); if (ip6s > jail_max_af_ips) { error = EINVAL; vfs_opterror(opts, "too many IPv6 addresses"); goto done_errmsg; } ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK); bcopy(op, ip6, ip6s * sizeof(*ip6)); if (ip6s > 1) qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), prison_qcmp_v6); for (ii = 0; ii < ip6s; ii++) { if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) { error = EINVAL; goto done_free; } if ((ii+1) < ip6s && (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) || IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1]))) { error = EINVAL; goto done_free; } } } } #endif #if defined(VIMAGE) && (defined(INET) || defined(INET6)) if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { error = EINVAL; vfs_opterror(opts, "vnet jails cannot have IP address restrictions"); goto done_errmsg; } #endif error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len); if (error == ENOENT) osrelstr = NULL; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "osrelease cannot be changed after creation"); goto done_errmsg; } if (len == 0 || len >= OSRELEASELEN) { error = EINVAL; vfs_opterror(opts, "osrelease string must be 1-%d bytes long", OSRELEASELEN - 1); goto done_errmsg; } } error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt)); if (error == ENOENT) osreldt = 0; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "osreldate cannot be changed after creation"); goto done_errmsg; } if (osreldt == 0) { error = EINVAL; vfs_opterror(opts, "osreldate cannot be 0"); goto done_errmsg; } } fullpath_disabled = 0; root = NULL; error = vfs_getopt(opts, "path", (void **)&path, &len); if (error == ENOENT) path = NULL; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "path cannot be changed after creation"); goto done_errmsg; } if (len == 0 || path[len - 1] != '\0') { error = EINVAL; goto done_free; } NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, td); error = namei(&nd); if (error) goto done_free; root = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); strlcpy(g_path, path, MAXPATHLEN); error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN); if (error == 0) path = g_path; else if (error == ENODEV) { /* proceed if sysctl debug.disablefullpath == 1 */ fullpath_disabled = 1; if (len < 2 || (len == 2 && path[0] == '/')) path = NULL; } else { /* exit on other errors */ goto done_free; } if (root->v_type != VDIR) { error = ENOTDIR; vput(root); goto done_free; } VOP_UNLOCK(root, 0); if (fullpath_disabled) { /* Leave room for a real-root full pathname. */ if (len + (path[0] == '/' && strcmp(mypr->pr_path, "/") ? strlen(mypr->pr_path) : 0) > MAXPATHLEN) { error = ENAMETOOLONG; vrele(root); goto done_free; } } } /* * Find the specified jail, or at least its parent. * This abuses the file error codes ENOENT and EEXIST. 
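 *
 * For example, a JAIL_CREATE by the name "foo.17" implicitly
 * requests jid 17 under the parent "foo", while a non-numeric final
 * component leaves jid at 0 and lets the kernel pick one.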
*/ pr = NULL; ppr = mypr; if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) { namelc = strrchr(name, '.'); jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10); if (*p != '\0') jid = 0; } sx_xlock(&allprison_lock); if (jid != 0) { /* * See if a requested jid already exists. There is an * information leak here if the jid exists but is not within * the caller's jail hierarchy. Jail creators will get EEXIST * even though they cannot see the jail, and CREATE | UPDATE * will return ENOENT which is not normally a valid error. */ if (jid < 0) { error = EINVAL; vfs_opterror(opts, "negative jid"); goto done_unlock_list; } pr = prison_find(jid); if (pr != NULL) { ppr = pr->pr_parent; /* Create: jid must not exist. */ if (cuflags == JAIL_CREATE) { mtx_unlock(&pr->pr_mtx); error = EEXIST; vfs_opterror(opts, "jail %d already exists", jid); goto done_unlock_list; } if (!prison_ischild(mypr, pr)) { mtx_unlock(&pr->pr_mtx); pr = NULL; } else if (pr->pr_uref == 0) { if (!(flags & JAIL_DYING)) { mtx_unlock(&pr->pr_mtx); error = ENOENT; vfs_opterror(opts, "jail %d is dying", jid); goto done_unlock_list; } else if ((flags & JAIL_ATTACH) || (pr_flags & PR_PERSIST)) { /* * A dying jail might be resurrected * (via attach or persist), but first * it must determine if another jail * has claimed its name. Accomplish * this by implicitly re-setting the * name. */ if (name == NULL) name = prison_name(mypr, pr); } } } if (pr == NULL) { /* Update: jid must exist. */ if (cuflags == JAIL_UPDATE) { error = ENOENT; vfs_opterror(opts, "jail %d not found", jid); goto done_unlock_list; } } } /* * If the caller provided a name, look for a jail by that name. * This has different semantics for creates and updates keyed by jid * (where the name must not already exist in a different jail), * and updates keyed by the name itself (where the name must exist * because that is the jail being updated). */ namelc = NULL; if (name != NULL) { namelc = strrchr(name, '.'); if (namelc == NULL) namelc = name; else { /* * This is a hierarchical name. Split it into the * parent and child names, and make sure the parent * exists or matches an already found jail. */ if (pr != NULL) { if (strncmp(name, ppr->pr_name, namelc - name) || ppr->pr_name[namelc - name] != '\0') { mtx_unlock(&pr->pr_mtx); error = EINVAL; vfs_opterror(opts, "cannot change jail's parent"); goto done_unlock_list; } } else { *namelc = '\0'; ppr = prison_find_name(mypr, name); if (ppr == NULL) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_unlock_list; } mtx_unlock(&ppr->pr_mtx); *namelc = '.'; } namelc++; } if (namelc[0] != '\0') { pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; name_again: deadpr = NULL; FOREACH_PRISON_CHILD(ppr, tpr) { if (tpr != pr && tpr->pr_ref > 0 && !strcmp(tpr->pr_name + pnamelen, namelc)) { if (pr == NULL && cuflags != JAIL_CREATE) { mtx_lock(&tpr->pr_mtx); if (tpr->pr_ref > 0) { /* * Use this jail * for updates. */ if (tpr->pr_uref > 0) { pr = tpr; break; } deadpr = tpr; } mtx_unlock(&tpr->pr_mtx); } else if (tpr->pr_uref > 0) { /* * Create, or update(jid): * name must not exist in an * active sibling jail. */ error = EEXIST; if (pr != NULL) mtx_unlock(&pr->pr_mtx); vfs_opterror(opts, "jail \"%s\" already exists", name); goto done_unlock_list; } } } /* If no active jail is found, use a dying one. 
*/ if (deadpr != NULL && pr == NULL) { if (flags & JAIL_DYING) { mtx_lock(&deadpr->pr_mtx); if (deadpr->pr_ref == 0) { mtx_unlock(&deadpr->pr_mtx); goto name_again; } pr = deadpr; } else if (cuflags == JAIL_UPDATE) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" is dying", name); goto done_unlock_list; } } /* Update: name must exist if no jid. */ else if (cuflags == JAIL_UPDATE && pr == NULL) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_unlock_list; } } } /* Update: must provide a jid or name. */ else if (cuflags == JAIL_UPDATE && pr == NULL) { error = ENOENT; vfs_opterror(opts, "update specified no jail"); goto done_unlock_list; } /* If there's no prison to update, create a new one and link it in. */ if (pr == NULL) { for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent) if (tpr->pr_childcount >= tpr->pr_childmax) { error = EPERM; vfs_opterror(opts, "prison limit exceeded"); goto done_unlock_list; } created = 1; mtx_lock(&ppr->pr_mtx); if (ppr->pr_ref == 0) { mtx_unlock(&ppr->pr_mtx); error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", prison_name(mypr, ppr)); goto done_unlock_list; } ppr->pr_ref++; ppr->pr_uref++; mtx_unlock(&ppr->pr_mtx); pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO); if (jid == 0) { /* Find the next free jid. */ jid = lastprid + 1; findnext: if (jid == JAIL_MAX) jid = 1; TAILQ_FOREACH(tpr, &allprison, pr_list) { if (tpr->pr_id < jid) continue; if (tpr->pr_id > jid || tpr->pr_ref == 0) { TAILQ_INSERT_BEFORE(tpr, pr, pr_list); break; } if (jid == lastprid) { error = EAGAIN; vfs_opterror(opts, "no available jail IDs"); free(pr, M_PRISON); prison_deref(ppr, PD_DEREF | PD_DEUREF | PD_LIST_XLOCKED); goto done_releroot; } jid++; goto findnext; } lastprid = jid; } else { /* * The jail already has a jid (that did not yet exist), * so just find where to insert it. */ TAILQ_FOREACH(tpr, &allprison, pr_list) if (tpr->pr_id >= jid) { TAILQ_INSERT_BEFORE(tpr, pr, pr_list); break; } } if (tpr == NULL) TAILQ_INSERT_TAIL(&allprison, pr, pr_list); LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling); for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) tpr->pr_childcount++; pr->pr_parent = ppr; pr->pr_id = jid; /* Set some default values, and inherit some from the parent. */ if (namelc == NULL) namelc = ""; if (path == NULL) { path = "/"; root = mypr->pr_root; vref(root); } strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN); pr->pr_flags |= PR_HOST; #if defined(INET) || defined(INET6) #ifdef VIMAGE if (!(pr_flags & PR_VNET)) #endif { #ifdef INET if (!(ch_flags & PR_IP4_USER)) pr->pr_flags |= PR_IP4 | PR_IP4_USER; else if (!(pr_flags & PR_IP4_USER)) { pr->pr_flags |= ppr->pr_flags & PR_IP4; if (ppr->pr_ip4 != NULL) { pr->pr_ip4s = ppr->pr_ip4s; pr->pr_ip4 = malloc(pr->pr_ip4s * sizeof(struct in_addr), M_PRISON, M_WAITOK); bcopy(ppr->pr_ip4, pr->pr_ip4, pr->pr_ip4s * sizeof(*pr->pr_ip4)); } } #endif #ifdef INET6 if (!(ch_flags & PR_IP6_USER)) pr->pr_flags |= PR_IP6 | PR_IP6_USER; else if (!(pr_flags & PR_IP6_USER)) { pr->pr_flags |= ppr->pr_flags & PR_IP6; if (ppr->pr_ip6 != NULL) { pr->pr_ip6s = ppr->pr_ip6s; pr->pr_ip6 = malloc(pr->pr_ip6s * sizeof(struct in6_addr), M_PRISON, M_WAITOK); bcopy(ppr->pr_ip6, pr->pr_ip6, pr->pr_ip6s * sizeof(*pr->pr_ip6)); } } #endif } #endif /* Source address selection is always on by default. 
*/ pr->pr_flags |= _PR_IP_SADDRSEL; pr->pr_securelevel = ppr->pr_securelevel; pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow; pr->pr_enforce_statfs = jail_default_enforce_statfs; pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum; pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate; if (osrelstr == NULL) strcpy(pr->pr_osrelease, ppr->pr_osrelease); else strcpy(pr->pr_osrelease, osrelstr); LIST_INIT(&pr->pr_children); mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK); TASK_INIT(&pr->pr_task, 0, prison_complete, pr); #ifdef VIMAGE /* Allocate a new vnet if specified. */ pr->pr_vnet = (pr_flags & PR_VNET) ? vnet_alloc() : ppr->pr_vnet; #endif /* * Allocate a dedicated cpuset for each jail. * Unlike other initial settings, this may return an error. */ error = cpuset_create_root(ppr, &pr->pr_cpuset); if (error) { prison_deref(pr, PD_LIST_XLOCKED); goto done_releroot; } mtx_lock(&pr->pr_mtx); /* * New prisons do not yet have a reference, because we do not * want others to see the incomplete prison once the * allprison_lock is downgraded. */ } else { created = 0; /* * Grab a reference for existing prisons, to ensure they * continue to exist for the duration of the call. */ pr->pr_ref++; #if defined(VIMAGE) && (defined(INET) || defined(INET6)) if ((pr->pr_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { error = EINVAL; vfs_opterror(opts, "vnet jails cannot have IP address restrictions"); goto done_deref_locked; } #endif #ifdef INET if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) { error = EINVAL; vfs_opterror(opts, "ip4 cannot be changed after creation"); goto done_deref_locked; } #endif #ifdef INET6 if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) { error = EINVAL; vfs_opterror(opts, "ip6 cannot be changed after creation"); goto done_deref_locked; } #endif } /* Do final error checking before setting anything. */ if (gotslevel) { if (slevel < ppr->pr_securelevel) { error = EPERM; goto done_deref_locked; } } if (gotchildmax) { if (childmax >= ppr->pr_childmax) { error = EPERM; goto done_deref_locked; } } if (gotenforce) { if (enforce < ppr->pr_enforce_statfs) { error = EPERM; goto done_deref_locked; } } if (gotrsnum) { /* * devfs_rsnum is a uint16_t */ if (rsnum < 0 || rsnum > 65535) { error = EINVAL; goto done_deref_locked; } /* * Nested jails always inherit parent's devfs ruleset */ if (jailed(td->td_ucred)) { if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) { error = EPERM; goto done_deref_locked; } else rsnum = ppr->pr_devfs_rsnum; } } #ifdef INET if (ip4s > 0) { if (ppr->pr_flags & PR_IP4) { /* * Make sure the new set of IP addresses is a * subset of the parent's list. Don't worry * about the parent being unlocked, as any * setting is done with allprison_lock held. */ for (ij = 0; ij < ppr->pr_ip4s; ij++) if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr) break; if (ij == ppr->pr_ip4s) { error = EPERM; goto done_deref_locked; } if (ip4s > 1) { for (ii = ij = 1; ii < ip4s; ii++) { if (ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) continue; for (; ij < ppr->pr_ip4s; ij++) if (ip4[ii].s_addr == ppr->pr_ip4[ij].s_addr) break; if (ij == ppr->pr_ip4s) break; } if (ij == ppr->pr_ip4s) { error = EPERM; goto done_deref_locked; } } } /* * Check for conflicting IP addresses. We permit them * if there is no more than one IP on each jail. If * there is a duplicate on a jail with more than one * IP, stop checking and return an error.
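 *
 * For example, two sibling jails may both hold the single address
 * 192.0.2.1, but as soon as either side carries a second address
 * the shared one counts as a clash and the request fails with
 * EADDRINUSE.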
*/ tppr = ppr; #ifdef VIMAGE for (; tppr != &prison0; tppr = tppr->pr_parent) if (tppr->pr_flags & PR_VNET) break; #endif FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) { if (tpr == pr || #ifdef VIMAGE (tpr != tppr && (tpr->pr_flags & PR_VNET)) || #endif tpr->pr_uref == 0) { descend = 0; continue; } if (!(tpr->pr_flags & PR_IP4_USER)) continue; descend = 0; if (tpr->pr_ip4 == NULL || (ip4s == 1 && tpr->pr_ip4s == 1)) continue; for (ii = 0; ii < ip4s; ii++) { if (prison_check_ip4_locked(tpr, &ip4[ii]) == 0) { error = EADDRINUSE; vfs_opterror(opts, "IPv4 addresses clash"); goto done_deref_locked; } } } } #endif #ifdef INET6 if (ip6s > 0) { if (ppr->pr_flags & PR_IP6) { /* * Make sure the new set of IP addresses is a * subset of the parent's list. */ for (ij = 0; ij < ppr->pr_ip6s; ij++) if (IN6_ARE_ADDR_EQUAL(&ip6[0], &ppr->pr_ip6[ij])) break; if (ij == ppr->pr_ip6s) { error = EPERM; goto done_deref_locked; } if (ip6s > 1) { for (ii = ij = 1; ii < ip6s; ii++) { if (IN6_ARE_ADDR_EQUAL(&ip6[ii], &ppr->pr_ip6[0])) continue; for (; ij < ppr->pr_ip6s; ij++) if (IN6_ARE_ADDR_EQUAL( &ip6[ii], &ppr->pr_ip6[ij])) break; if (ij == ppr->pr_ip6s) break; } if (ij == ppr->pr_ip6s) { error = EPERM; goto done_deref_locked; } } } /* Check for conflicting IP addresses. */ tppr = ppr; #ifdef VIMAGE for (; tppr != &prison0; tppr = tppr->pr_parent) if (tppr->pr_flags & PR_VNET) break; #endif FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) { if (tpr == pr || #ifdef VIMAGE (tpr != tppr && (tpr->pr_flags & PR_VNET)) || #endif tpr->pr_uref == 0) { descend = 0; continue; } if (!(tpr->pr_flags & PR_IP6_USER)) continue; descend = 0; if (tpr->pr_ip6 == NULL || (ip6s == 1 && tpr->pr_ip6s == 1)) continue; for (ii = 0; ii < ip6s; ii++) { if (prison_check_ip6_locked(tpr, &ip6[ii]) == 0) { error = EADDRINUSE; vfs_opterror(opts, "IPv6 addresses clash"); goto done_deref_locked; } } } } #endif onamelen = namelen = 0; if (namelc != NULL) { /* Give a default name of the jid. Also allow the name to be * explicitly the jid - but not any other number, and only in * normal form (no leading zero/etc). */ if (namelc[0] == '\0') snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid); else if ((strtoul(namelc, &p, 10) != jid || namelc[0] < '1' || namelc[0] > '9') && *p == '\0') { error = EINVAL; vfs_opterror(opts, "name cannot be numeric (unless it is the jid)"); goto done_deref_locked; } /* * Make sure the name isn't too long for the prison or its * children. */ pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; onamelen = strlen(pr->pr_name + pnamelen); namelen = strlen(namelc); if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) { error = ENAMETOOLONG; goto done_deref_locked; } FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { if (strlen(tpr->pr_name) + (namelen - onamelen) >= sizeof(pr->pr_name)) { error = ENAMETOOLONG; goto done_deref_locked; } } } if (pr_allow & ~ppr->pr_allow) { error = EPERM; goto done_deref_locked; } /* * Let modules check their parameters. This requires unlocking and * then re-locking the prison, but this is still a valid state as long * as allprison_lock remains xlocked. */ mtx_unlock(&pr->pr_mtx); error = osd_jail_call(pr, PR_METHOD_CHECK, opts); if (error != 0) { prison_deref(pr, created ? PD_LIST_XLOCKED : PD_DEREF | PD_LIST_XLOCKED); goto done_releroot; } mtx_lock(&pr->pr_mtx); /* At this point, all valid parameters should have been noted. 
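 *
 * e.g. a misspelled "allow.nount" was never consumed above, so the
 * loop below fails the whole call with EINVAL and an errmsg of
 * "unknown parameter: allow.nount".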
*/ TAILQ_FOREACH(opt, opts, link) { if (!opt->seen && strcmp(opt->name, "errmsg")) { error = EINVAL; vfs_opterror(opts, "unknown parameter: %s", opt->name); goto done_deref_locked; } } /* Set the parameters of the prison. */ #ifdef INET redo_ip4 = 0; if (pr_flags & PR_IP4_USER) { pr->pr_flags |= PR_IP4; free(pr->pr_ip4, M_PRISON); pr->pr_ip4s = ip4s; pr->pr_ip4 = ip4; ip4 = NULL; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip4(tpr, NULL)) { redo_ip4 = 1; descend = 0; } } } #endif #ifdef INET6 redo_ip6 = 0; if (pr_flags & PR_IP6_USER) { pr->pr_flags |= PR_IP6; free(pr->pr_ip6, M_PRISON); pr->pr_ip6s = ip6s; pr->pr_ip6 = ip6; ip6 = NULL; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip6(tpr, NULL)) { redo_ip6 = 1; descend = 0; } } } #endif if (gotslevel) { pr->pr_securelevel = slevel; /* Set all child jails to be at least this level. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) if (tpr->pr_securelevel < slevel) tpr->pr_securelevel = slevel; } if (gotchildmax) { pr->pr_childmax = childmax; /* Set all child jails to under this limit. */ FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level) if (tpr->pr_childmax > childmax - level) tpr->pr_childmax = childmax > level ? childmax - level : 0; } if (gotenforce) { pr->pr_enforce_statfs = enforce; /* Pass this restriction on to the children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) if (tpr->pr_enforce_statfs < enforce) tpr->pr_enforce_statfs = enforce; } if (gotrsnum) { pr->pr_devfs_rsnum = rsnum; /* Pass this restriction on to the children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) tpr->pr_devfs_rsnum = rsnum; } if (namelc != NULL) { if (ppr == &prison0) strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name)); else snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s", ppr->pr_name, namelc); /* Change this component of child names. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen, strlen(tpr->pr_name + onamelen) + 1); bcopy(pr->pr_name, tpr->pr_name, namelen); } } if (path != NULL) { /* Try to keep a real-rooted full pathname. */ if (fullpath_disabled && path[0] == '/' && strcmp(mypr->pr_path, "/")) snprintf(pr->pr_path, sizeof(pr->pr_path), "%s%s", mypr->pr_path, path); else strlcpy(pr->pr_path, path, sizeof(pr->pr_path)); pr->pr_root = root; } if (PR_HOST & ch_flags & ~pr_flags) { if (pr->pr_flags & PR_HOST) { /* * Copy the parent's host info. As with pr_ip4 above, * the lack of a lock on the parent is not a problem; * it is always set with allprison_lock at least * shared, and is held exclusively here. */ strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname, sizeof(pr->pr_hostname)); strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname, sizeof(pr->pr_domainname)); strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid, sizeof(pr->pr_hostuuid)); pr->pr_hostid = pr->pr_parent->pr_hostid; } } else if (host != NULL || domain != NULL || uuid != NULL || gothid) { /* Set this prison, and any descendants without PR_HOST. 
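 * e.g. a child created with host=inherit never sets PR_HOST, so an
 * update like this one on its parent is copied into it as well.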
*/ if (host != NULL) strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname)); if (domain != NULL) strlcpy(pr->pr_domainname, domain, sizeof(pr->pr_domainname)); if (uuid != NULL) strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid)); if (gothid) pr->pr_hostid = hid; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { if (tpr->pr_flags & PR_HOST) descend = 0; else { if (host != NULL) strlcpy(tpr->pr_hostname, pr->pr_hostname, sizeof(tpr->pr_hostname)); if (domain != NULL) strlcpy(tpr->pr_domainname, pr->pr_domainname, sizeof(tpr->pr_domainname)); if (uuid != NULL) strlcpy(tpr->pr_hostuuid, pr->pr_hostuuid, sizeof(tpr->pr_hostuuid)); if (gothid) tpr->pr_hostid = hid; } } } if ((tallow = ch_allow & ~pr_allow)) { /* Clear allow bits in all children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) tpr->pr_allow &= ~tallow; } pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow; /* * Persistent prisons get an extra reference, and prisons losing their * persist flag lose that reference. Only do this for existing prisons * for now, so new ones will remain unseen until after the module * handlers have completed. */ born = pr->pr_uref == 0; if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) { if (pr_flags & PR_PERSIST) { pr->pr_ref++; pr->pr_uref++; } else { pr->pr_ref--; pr->pr_uref--; } } pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags; mtx_unlock(&pr->pr_mtx); #ifdef RACCT if (racct_enable && created) prison_racct_attach(pr); #endif /* Locks may have prevented a complete restriction of child IP * addresses. If so, allocate some more memory and try again. */ #ifdef INET while (redo_ip4) { ip4s = pr->pr_ip4s; ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK); mtx_lock(&pr->pr_mtx); redo_ip4 = 0; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip4(tpr, ip4)) { if (ip4 != NULL) ip4 = NULL; else redo_ip4 = 1; } } mtx_unlock(&pr->pr_mtx); } #endif #ifdef INET6 while (redo_ip6) { ip6s = pr->pr_ip6s; ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK); mtx_lock(&pr->pr_mtx); redo_ip6 = 0; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip6(tpr, ip6)) { if (ip6 != NULL) ip6 = NULL; else redo_ip6 = 1; } } mtx_unlock(&pr->pr_mtx); } #endif /* Let the modules do their work. */ sx_downgrade(&allprison_lock); if (born) { error = osd_jail_call(pr, PR_METHOD_CREATE, opts); if (error) { (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); prison_deref(pr, created ? PD_LIST_SLOCKED : PD_DEREF | PD_LIST_SLOCKED); goto done_errmsg; } } error = osd_jail_call(pr, PR_METHOD_SET, opts); if (error) { if (born) (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); prison_deref(pr, created ? PD_LIST_SLOCKED : PD_DEREF | PD_LIST_SLOCKED); goto done_errmsg; } /* Attach this process to the prison if requested. */ if (flags & JAIL_ATTACH) { mtx_lock(&pr->pr_mtx); error = do_jail_attach(td, pr); if (error) { vfs_opterror(opts, "attach failed"); if (!created) prison_deref(pr, PD_DEREF); goto done_errmsg; } } #ifdef RACCT if (racct_enable && !created) { if (!(flags & JAIL_ATTACH)) sx_sunlock(&allprison_lock); prison_racct_modify(pr); if (!(flags & JAIL_ATTACH)) sx_slock(&allprison_lock); } #endif td->td_retval[0] = pr->pr_id; /* * Now that it is all there, drop the temporary reference from existing * prisons. 
Or add a reference to newly created persistent prisons * (which was not done earlier so that the prison would not be publicly * visible). */ if (!created) { prison_deref(pr, (flags & JAIL_ATTACH) ? PD_DEREF : PD_DEREF | PD_LIST_SLOCKED); } else { if (pr_flags & PR_PERSIST) { mtx_lock(&pr->pr_mtx); pr->pr_ref++; pr->pr_uref++; mtx_unlock(&pr->pr_mtx); } if (!(flags & JAIL_ATTACH)) sx_sunlock(&allprison_lock); } goto done_free; done_deref_locked: prison_deref(pr, created ? PD_LOCKED | PD_LIST_XLOCKED : PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED); goto done_releroot; done_unlock_list: sx_xunlock(&allprison_lock); done_releroot: if (root != NULL) vrele(root); done_errmsg: if (error) { if (vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len) == 0 && errmsg_len > 0) { errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1; if (optuio->uio_segflg == UIO_SYSSPACE) bcopy(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); else copyout(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); } } done_free: #ifdef INET free(ip4, M_PRISON); #endif #ifdef INET6 free(ip6, M_PRISON); #endif if (g_path != NULL) free(g_path, M_TEMP); vfs_freeopts(opts); return (error); } /* * struct jail_get_args { * struct iovec *iovp; * unsigned int iovcnt; * int flags; * }; */ int sys_jail_get(struct thread *td, struct jail_get_args *uap) { struct uio *auio; int error; /* Check that we have an even number of iovecs. */ if (uap->iovcnt & 1) return (EINVAL); error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_jail_get(td, auio, uap->flags); if (error == 0) error = copyout(auio->uio_iov, uap->iovp, uap->iovcnt * sizeof (struct iovec)); free(auio, M_IOV); return (error); } int kern_jail_get(struct thread *td, struct uio *optuio, int flags) { struct prison *pr, *mypr; struct vfsopt *opt; struct vfsoptlist *opts; char *errmsg, *name; int error, errmsg_len, errmsg_pos, fi, i, jid, len, locked, pos; if (flags & ~JAIL_GET_MASK) return (EINVAL); /* Get the parameter list. */ error = vfs_buildopts(optuio, &opts); if (error) return (error); errmsg_pos = vfs_getopt_pos(opts, "errmsg"); mypr = td->td_ucred->cr_prison; /* * Find the prison specified by one of: lastjid, jid, name. 
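 *
 * The lastjid form is what drives enumeration: userland calls
 * jail_get(2) repeatedly, feeding back the jid it just received,
 * until ENOENT. An illustrative sketch:
 *
 *	int jid, lastjid = 0;
 *	struct iovec iov[2];
 *
 *	iov[0].iov_base = __DECONST(char *, "lastjid");
 *	iov[0].iov_len = sizeof("lastjid");
 *	iov[1].iov_base = &lastjid;
 *	iov[1].iov_len = sizeof(lastjid);
 *	while ((jid = jail_get(iov, 2, 0)) >= 0)
 *		lastjid = jid;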
*/ sx_slock(&allprison_lock); error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid)); if (error == 0) { TAILQ_FOREACH(pr, &allprison, pr_list) { if (pr->pr_id > jid && prison_ischild(mypr, pr)) { mtx_lock(&pr->pr_mtx); if (pr->pr_ref > 0 && (pr->pr_uref > 0 || (flags & JAIL_DYING))) break; mtx_unlock(&pr->pr_mtx); } } if (pr != NULL) goto found_prison; error = ENOENT; vfs_opterror(opts, "no jail after %d", jid); goto done_unlock_list; } else if (error != ENOENT) goto done_unlock_list; error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == 0) { if (jid != 0) { pr = prison_find_child(mypr, jid); if (pr != NULL) { if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) { mtx_unlock(&pr->pr_mtx); error = ENOENT; vfs_opterror(opts, "jail %d is dying", jid); goto done_unlock_list; } goto found_prison; } error = ENOENT; vfs_opterror(opts, "jail %d not found", jid); goto done_unlock_list; } } else if (error != ENOENT) goto done_unlock_list; error = vfs_getopt(opts, "name", (void **)&name, &len); if (error == 0) { if (len == 0 || name[len - 1] != '\0') { error = EINVAL; goto done_unlock_list; } pr = prison_find_name(mypr, name); if (pr != NULL) { if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) { mtx_unlock(&pr->pr_mtx); error = ENOENT; vfs_opterror(opts, "jail \"%s\" is dying", name); goto done_unlock_list; } goto found_prison; } error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_unlock_list; } else if (error != ENOENT) goto done_unlock_list; vfs_opterror(opts, "no jail specified"); error = ENOENT; goto done_unlock_list; found_prison: /* Get the parameters of the prison. */ pr->pr_ref++; locked = PD_LOCKED; td->td_retval[0] = pr->pr_id; error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id)); if (error != 0 && error != ENOENT) goto done_deref; i = (pr->pr_parent == mypr) ? 
0 : pr->pr_parent->pr_id; error = vfs_setopt(opts, "parent", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "name", prison_name(mypr, pr)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id, sizeof(pr->pr_cpuset->cs_id)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "path", prison_path(mypr, pr)); if (error != 0 && error != ENOENT) goto done_deref; #ifdef INET error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4, pr->pr_ip4s * sizeof(*pr->pr_ip4)); if (error != 0 && error != ENOENT) goto done_deref; #endif #ifdef INET6 error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6, pr->pr_ip6s * sizeof(*pr->pr_ip6)); if (error != 0 && error != ENOENT) goto done_deref; #endif error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel, sizeof(pr->pr_securelevel)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "children.cur", &pr->pr_childcount, sizeof(pr->pr_childcount)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "children.max", &pr->pr_childmax, sizeof(pr->pr_childmax)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "host.hostname", pr->pr_hostname); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "host.domainname", pr->pr_domainname); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid); if (error != 0 && error != ENOENT) goto done_deref; #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { uint32_t hid32 = pr->pr_hostid; error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32)); } else #endif error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid, sizeof(pr->pr_hostid)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs, sizeof(pr->pr_enforce_statfs)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum, sizeof(pr->pr_devfs_rsnum)); if (error != 0 && error != ENOENT) goto done_deref; for (fi = 0; fi < nitems(pr_flag_names); fi++) { if (pr_flag_names[fi] == NULL) continue; i = (pr->pr_flags & (1 << fi)) ? 1 : 0; error = vfs_setopt(opts, pr_flag_names[fi], &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; i = !i; error = vfs_setopt(opts, pr_flag_nonames[fi], &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; } for (fi = 0; fi < nitems(pr_flag_jailsys); fi++) { i = pr->pr_flags & (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new); i = pr_flag_jailsys[fi].disable && (i == pr_flag_jailsys[fi].disable) ? JAIL_SYS_DISABLE : (i == pr_flag_jailsys[fi].new) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; error = vfs_setopt(opts, pr_flag_jailsys[fi].name, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; } for (fi = 0; fi < nitems(pr_allow_names); fi++) { if (pr_allow_names[fi] == NULL) continue; i = (pr->pr_allow & (1 << fi)) ? 
1 : 0; error = vfs_setopt(opts, pr_allow_names[fi], &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; i = !i; error = vfs_setopt(opts, pr_allow_nonames[fi], &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; } i = (pr->pr_uref == 0); error = vfs_setopt(opts, "dying", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; i = !i; error = vfs_setopt(opts, "nodying", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate, sizeof(pr->pr_osreldate)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "osrelease", pr->pr_osrelease); if (error != 0 && error != ENOENT) goto done_deref; /* Get the module parameters. */ mtx_unlock(&pr->pr_mtx); locked = 0; error = osd_jail_call(pr, PR_METHOD_GET, opts); if (error) goto done_deref; prison_deref(pr, PD_DEREF | PD_LIST_SLOCKED); /* By now, all parameters should have been noted. */ TAILQ_FOREACH(opt, opts, link) { if (!opt->seen && strcmp(opt->name, "errmsg")) { error = EINVAL; vfs_opterror(opts, "unknown parameter: %s", opt->name); goto done_errmsg; } } /* Write the fetched parameters back to userspace. */ error = 0; TAILQ_FOREACH(opt, opts, link) { if (opt->pos >= 0 && opt->pos != errmsg_pos) { pos = 2 * opt->pos + 1; optuio->uio_iov[pos].iov_len = opt->len; if (opt->value != NULL) { if (optuio->uio_segflg == UIO_SYSSPACE) { bcopy(opt->value, optuio->uio_iov[pos].iov_base, opt->len); } else { error = copyout(opt->value, optuio->uio_iov[pos].iov_base, opt->len); if (error) break; } } } } goto done_errmsg; done_deref: prison_deref(pr, locked | PD_DEREF | PD_LIST_SLOCKED); goto done_errmsg; done_unlock_list: sx_sunlock(&allprison_lock); done_errmsg: if (error && errmsg_pos >= 0) { vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len); errmsg_pos = 2 * errmsg_pos + 1; if (errmsg_len > 0) { if (optuio->uio_segflg == UIO_SYSSPACE) bcopy(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); else copyout(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); } } vfs_freeopts(opts); return (error); } /* * struct jail_remove_args { * int jid; * }; */ int sys_jail_remove(struct thread *td, struct jail_remove_args *uap) { struct prison *pr, *cpr, *lpr, *tpr; int descend, error; error = priv_check(td, PRIV_JAIL_REMOVE); if (error) return (error); sx_xlock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); if (pr == NULL) { sx_xunlock(&allprison_lock); return (EINVAL); } /* Remove all descendants of this prison, then remove this prison. */ pr->pr_ref++; if (!LIST_EMPTY(&pr->pr_children)) { mtx_unlock(&pr->pr_mtx); lpr = NULL; FOREACH_PRISON_DESCENDANT(pr, cpr, descend) { mtx_lock(&cpr->pr_mtx); if (cpr->pr_ref > 0) { tpr = cpr; cpr->pr_ref++; } else { /* Already removed - do not do it again. */ tpr = NULL; } mtx_unlock(&cpr->pr_mtx); if (lpr != NULL) { mtx_lock(&lpr->pr_mtx); prison_remove_one(lpr); sx_xlock(&allprison_lock); } lpr = tpr; } if (lpr != NULL) { mtx_lock(&lpr->pr_mtx); prison_remove_one(lpr); sx_xlock(&allprison_lock); } mtx_lock(&pr->pr_mtx); } prison_remove_one(pr); return (0); } static void prison_remove_one(struct prison *pr) { struct proc *p; int deuref; /* If the prison was persistent, it is not anymore. */ deuref = 0; if (pr->pr_flags & PR_PERSIST) { pr->pr_ref--; deuref = PD_DEUREF; pr->pr_flags &= ~PR_PERSIST; } /* * jail_remove added a reference. If that's the only one, remove * the prison now. 
*/ KASSERT(pr->pr_ref > 0, ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id)); if (pr->pr_ref == 1) { prison_deref(pr, deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED); return; } mtx_unlock(&pr->pr_mtx); sx_xunlock(&allprison_lock); /* * Kill all processes unfortunate enough to be attached to this prison. */ sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { PROC_LOCK(p); if (p->p_state != PRS_NEW && p->p_ucred && p->p_ucred->cr_prison == pr) kern_psignal(p, SIGKILL); PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); /* Remove the temporary reference added by jail_remove. */ prison_deref(pr, deuref | PD_DEREF); } /* * struct jail_attach_args { * int jid; * }; */ int sys_jail_attach(struct thread *td, struct jail_attach_args *uap) { struct prison *pr; int error; error = priv_check(td, PRIV_JAIL_ATTACH); if (error) return (error); /* * Start with exclusive hold on allprison_lock to ensure that a possible * PR_METHOD_REMOVE call isn't concurrent with jail_set or jail_remove. * But then immediately downgrade it since we don't need to stop * readers. */ sx_xlock(&allprison_lock); sx_downgrade(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); if (pr == NULL) { sx_sunlock(&allprison_lock); return (EINVAL); } /* * Do not allow a process to attach to a prison that is not * considered to be "alive". */ if (pr->pr_uref == 0) { mtx_unlock(&pr->pr_mtx); sx_sunlock(&allprison_lock); return (EINVAL); } return (do_jail_attach(td, pr)); } static int do_jail_attach(struct thread *td, struct prison *pr) { struct proc *p; struct ucred *newcred, *oldcred; int error; /* * XXX: Note that there is a slight race here if two threads * in the same privileged process attempt to attach to two * different jails at the same time. It is important for * user processes not to do this, or they might end up with * a process root from one prison, but attached to the jail * of another. */ pr->pr_ref++; pr->pr_uref++; mtx_unlock(&pr->pr_mtx); /* Let modules do whatever they need to prepare for attaching. */ error = osd_jail_call(pr, PR_METHOD_ATTACH, td); if (error) { prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED); return (error); } sx_sunlock(&allprison_lock); /* * Reparent the newly attached process to this jail. */ p = td->td_proc; error = cpuset_setproc_update_set(p, pr->pr_cpuset); if (error) goto e_revert_osd; vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY); if ((error = change_dir(pr->pr_root, td)) != 0) goto e_unlock; #ifdef MAC if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root))) goto e_unlock; #endif VOP_UNLOCK(pr->pr_root, 0); if ((error = pwd_chroot(td, pr->pr_root))) goto e_revert_osd; newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); newcred->cr_prison = pr; proc_set_cred(p, newcred); setsugid(p); PROC_UNLOCK(p); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); #endif prison_deref(oldcred->cr_prison, PD_DEREF | PD_DEUREF); crfree(oldcred); return (0); e_unlock: VOP_UNLOCK(pr->pr_root, 0); e_revert_osd: /* Tell modules this thread is still in its old jail after all. */ (void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td); prison_deref(pr, PD_DEREF | PD_DEUREF); return (error); } /* * Returns a locked prison instance, or NULL on failure. 
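*/

/*
 * The userland sequence that lands in do_jail_attach() above (an
 * illustrative sketch, not part of this file): resolve the jail name to a
 * jid with jail_getid(3) from libjail, then move the current process into
 * the jail with jail_attach(2), which also chroots it to the jail root.
 */
#include <sys/param.h>
#include <sys/jail.h>
#include <err.h>
#include <jail.h>

static void
enter_jail(const char *name)
{
	int jid;

	jid = jail_getid(name);
	if (jid == -1)
		errx(1, "unknown jail: %s", name);
	if (jail_attach(jid) == -1)
		err(1, "jail_attach");
}

/*
 * (prison_find(), documented above, follows:)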
*/ struct prison * prison_find(int prid) { struct prison *pr; sx_assert(&allprison_lock, SX_LOCKED); TAILQ_FOREACH(pr, &allprison, pr_list) { if (pr->pr_id == prid) { mtx_lock(&pr->pr_mtx); if (pr->pr_ref > 0) return (pr); mtx_unlock(&pr->pr_mtx); } } return (NULL); } /* * Find a prison that is a descendant of mypr. Returns a locked prison or NULL. */ struct prison * prison_find_child(struct prison *mypr, int prid) { struct prison *pr; int descend; sx_assert(&allprison_lock, SX_LOCKED); FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { if (pr->pr_id == prid) { mtx_lock(&pr->pr_mtx); if (pr->pr_ref > 0) return (pr); mtx_unlock(&pr->pr_mtx); } } return (NULL); } /* * Look for the name relative to mypr. Returns a locked prison or NULL. */ struct prison * prison_find_name(struct prison *mypr, const char *name) { struct prison *pr, *deadpr; size_t mylen; int descend; sx_assert(&allprison_lock, SX_LOCKED); mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1; again: deadpr = NULL; FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { if (!strcmp(pr->pr_name + mylen, name)) { mtx_lock(&pr->pr_mtx); if (pr->pr_ref > 0) { if (pr->pr_uref > 0) return (pr); deadpr = pr; } mtx_unlock(&pr->pr_mtx); } } /* There was no valid prison - perhaps there was a dying one. */ if (deadpr != NULL) { mtx_lock(&deadpr->pr_mtx); if (deadpr->pr_ref == 0) { mtx_unlock(&deadpr->pr_mtx); goto again; } } return (deadpr); } /* * See if a prison has a specific flag set. */ int prison_flag(struct ucred *cred, unsigned flag) { /* This is an atomic read, so no locking is necessary. */ return (cred->cr_prison->pr_flags & flag); } int prison_allow(struct ucred *cred, unsigned flag) { /* This is an atomic read, so no locking is necessary. */ return (cred->cr_prison->pr_allow & flag); } /* * Remove a prison reference. If that was the last reference, remove the * prison itself - but not in this context in case there are locks held. */ void prison_free_locked(struct prison *pr) { int ref; mtx_assert(&pr->pr_mtx, MA_OWNED); ref = --pr->pr_ref; mtx_unlock(&pr->pr_mtx); if (ref == 0) taskqueue_enqueue(taskqueue_thread, &pr->pr_task); } void prison_free(struct prison *pr) { mtx_lock(&pr->pr_mtx); prison_free_locked(pr); } /* * Complete a call to either prison_free or prison_proc_free. */ static void prison_complete(void *context, int pending) { struct prison *pr = context; sx_xlock(&allprison_lock); mtx_lock(&pr->pr_mtx); prison_deref(pr, pr->pr_uref ? PD_DEREF | PD_DEUREF | PD_LOCKED | PD_LIST_XLOCKED : PD_LOCKED | PD_LIST_XLOCKED); } /* * Remove a prison reference (usually). This internal version assumes no * mutexes are held, except perhaps the prison itself. If there are no more * references, release and delist the prison. On completion, the prison lock * and the allprison lock are both unlocked. */ static void prison_deref(struct prison *pr, int flags) { struct prison *ppr, *tpr; int ref, lasturef; if (!(flags & PD_LOCKED)) mtx_lock(&pr->pr_mtx); for (;;) { if (flags & PD_DEUREF) { KASSERT(pr->pr_uref > 0, ("prison_deref PD_DEUREF on a dead prison (jid=%d)", pr->pr_id)); pr->pr_uref--; lasturef = pr->pr_uref == 0; if (lasturef) pr->pr_ref++; KASSERT(prison0.pr_uref != 0, ("prison0 pr_uref=0")); } else lasturef = 0; if (flags & PD_DEREF) { KASSERT(pr->pr_ref > 0, ("prison_deref PD_DEREF on a dead prison (jid=%d)", pr->pr_id)); pr->pr_ref--; } ref = pr->pr_ref; mtx_unlock(&pr->pr_mtx); /* * Tell the modules if the last user reference was removed * (even if it sticks around in dying state).
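 * This is where the PR_METHOD_REMOVE methods that modules registered via
 * osd get to run, which is why the allprison lock may have to be taken
 * below before making the call.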
*/ if (lasturef) { if (!(flags & (PD_LIST_SLOCKED | PD_LIST_XLOCKED))) { sx_xlock(&allprison_lock); flags |= PD_LIST_XLOCKED; } (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); mtx_lock(&pr->pr_mtx); ref = --pr->pr_ref; mtx_unlock(&pr->pr_mtx); } /* If the prison still has references, nothing else to do. */ if (ref > 0) { if (flags & PD_LIST_SLOCKED) sx_sunlock(&allprison_lock); else if (flags & PD_LIST_XLOCKED) sx_xunlock(&allprison_lock); return; } if (flags & PD_LIST_SLOCKED) { if (!sx_try_upgrade(&allprison_lock)) { sx_sunlock(&allprison_lock); sx_xlock(&allprison_lock); } } else if (!(flags & PD_LIST_XLOCKED)) sx_xlock(&allprison_lock); TAILQ_REMOVE(&allprison, pr, pr_list); LIST_REMOVE(pr, pr_sibling); ppr = pr->pr_parent; for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) tpr->pr_childcount--; sx_xunlock(&allprison_lock); #ifdef VIMAGE if (pr->pr_vnet != ppr->pr_vnet) vnet_destroy(pr->pr_vnet); #endif if (pr->pr_root != NULL) vrele(pr->pr_root); mtx_destroy(&pr->pr_mtx); #ifdef INET free(pr->pr_ip4, M_PRISON); #endif #ifdef INET6 free(pr->pr_ip6, M_PRISON); #endif if (pr->pr_cpuset != NULL) cpuset_rel(pr->pr_cpuset); osd_jail_exit(pr); #ifdef RACCT if (racct_enable) prison_racct_detach(pr); #endif free(pr, M_PRISON); /* Removing a prison frees a reference on its parent. */ pr = ppr; mtx_lock(&pr->pr_mtx); flags = PD_DEREF | PD_DEUREF; } } void prison_hold_locked(struct prison *pr) { mtx_assert(&pr->pr_mtx, MA_OWNED); KASSERT(pr->pr_ref > 0, ("Trying to hold dead prison (jid=%d).", pr->pr_id)); pr->pr_ref++; } void prison_hold(struct prison *pr) { mtx_lock(&pr->pr_mtx); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); } void prison_proc_hold(struct prison *pr) { mtx_lock(&pr->pr_mtx); KASSERT(pr->pr_uref > 0, ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id)); pr->pr_uref++; mtx_unlock(&pr->pr_mtx); } void prison_proc_free(struct prison *pr) { mtx_lock(&pr->pr_mtx); KASSERT(pr->pr_uref > 0, ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id)); if (pr->pr_uref > 1) pr->pr_uref--; else { /* * Don't remove the last user reference in this context, which * is expected to be a process that is not only locked, but * also half dead. */ pr->pr_ref++; mtx_unlock(&pr->pr_mtx); taskqueue_enqueue(taskqueue_thread, &pr->pr_task); return; } mtx_unlock(&pr->pr_mtx); } /* * Check if a jail supports the given address family. * * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT * if not. */ int prison_check_af(struct ucred *cred, int af) { struct prison *pr; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); pr = cred->cr_prison; #ifdef VIMAGE /* Prisons with their own network stack are not limited. */ if (prison_owns_vnet(cred)) return (0); #endif error = 0; switch (af) { #ifdef INET case AF_INET: if (pr->pr_flags & PR_IP4) { mtx_lock(&pr->pr_mtx); if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL) error = EAFNOSUPPORT; mtx_unlock(&pr->pr_mtx); } break; #endif #ifdef INET6 case AF_INET6: if (pr->pr_flags & PR_IP6) { mtx_lock(&pr->pr_mtx); if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL) error = EAFNOSUPPORT; mtx_unlock(&pr->pr_mtx); } break; #endif case AF_LOCAL: case AF_ROUTE: break; default: if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF)) error = EAFNOSUPPORT; } return (error); } /* * Check if given address belongs to the jail referenced by cred (wrapper to * prison_check_ip[46]). 
* * Returns 0 if the jail doesn't restrict the address family or if the address * belongs to the jail, EADDRNOTAVAIL if the address doesn't belong, or * EAFNOSUPPORT if the jail doesn't allow the address family. The IPv4 address * is passed in network byte order (NBO). */ int prison_if(struct ucred *cred, struct sockaddr *sa) { #ifdef INET struct sockaddr_in *sai; #endif #ifdef INET6 struct sockaddr_in6 *sai6; #endif int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(sa != NULL, ("%s: sa is NULL", __func__)); #ifdef VIMAGE if (prison_owns_vnet(cred)) return (0); #endif error = 0; switch (sa->sa_family) { #ifdef INET case AF_INET: sai = (struct sockaddr_in *)sa; error = prison_check_ip4(cred, &sai->sin_addr); break; #endif #ifdef INET6 case AF_INET6: sai6 = (struct sockaddr_in6 *)sa; error = prison_check_ip6(cred, &sai6->sin6_addr); break; #endif default: if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF)) error = EAFNOSUPPORT; } return (error); } /* * Return 0 if jails permit p1 to frob p2, otherwise ESRCH. */ int prison_check(struct ucred *cred1, struct ucred *cred2) { return ((cred1->cr_prison == cred2->cr_prison || prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH); } /* * Return 1 if p2 is a child of p1, otherwise 0. */ int prison_ischild(struct prison *pr1, struct prison *pr2) { for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent) if (pr1 == pr2) return (1); return (0); } /* * Return 1 if the passed credential is in a jail, otherwise 0. */ int jailed(struct ucred *cred) { return (cred->cr_prison != &prison0); } /* * Return 1 if the passed credential is in a jail and that jail does not * have its own virtual network stack, otherwise 0. */ int jailed_without_vnet(struct ucred *cred) { if (!jailed(cred)) return (0); #ifdef VIMAGE if (prison_owns_vnet(cred)) return (0); #endif return (1); } /* * Return the correct hostname (domainname, et al) for the passed credential. */ void getcredhostname(struct ucred *cred, char *buf, size_t size) { struct prison *pr; /* * A NULL credential can be used to shortcut to the physical * system's hostname. */ pr = (cred != NULL) ? cred->cr_prison : &prison0; mtx_lock(&pr->pr_mtx); strlcpy(buf, pr->pr_hostname, size); mtx_unlock(&pr->pr_mtx); } void getcreddomainname(struct ucred *cred, char *buf, size_t size) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(buf, cred->cr_prison->pr_domainname, size); mtx_unlock(&cred->cr_prison->pr_mtx); } void getcredhostuuid(struct ucred *cred, char *buf, size_t size) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(buf, cred->cr_prison->pr_hostuuid, size); mtx_unlock(&cred->cr_prison->pr_mtx); } void getcredhostid(struct ucred *cred, unsigned long *hostid) { mtx_lock(&cred->cr_prison->pr_mtx); *hostid = cred->cr_prison->pr_hostid; mtx_unlock(&cred->cr_prison->pr_mtx); } #ifdef VIMAGE /* * Determine whether the prison represented by cred owns * its vnet rather than having it inherited. * * Returns 1 in case the prison owns the vnet, 0 otherwise. */ int prison_owns_vnet(struct ucred *cred) { /* * vnets cannot be added/removed after jail creation, * so no need to lock here. */ return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0); } #endif /* * Determine whether the subject represented by cred can "see" the * status of a mount point. * Returns: 0 for permitted, ENOENT otherwise. * XXX: This function should be called cr_canseemount() and should be * placed in kern_prot.c.
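*/

/*
 * jailed() above is what backs the security.jail.jailed sysctl defined
 * later in this file, so a userland process can ask the same question.
 * A minimal sketch:
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int injail;
	size_t len = sizeof(injail);

	if (sysctlbyname("security.jail.jailed", &injail, &len, NULL, 0) == -1)
		return (1);
	printf("%s\n", injail ? "jailed" : "not jailed");
	return (0);
}

/*
 * (prison_canseemount(), documented above, follows:)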
*/ int prison_canseemount(struct ucred *cred, struct mount *mp) { struct prison *pr; struct statfs *sp; size_t len; pr = cred->cr_prison; if (pr->pr_enforce_statfs == 0) return (0); if (pr->pr_root->v_mount == mp) return (0); if (pr->pr_enforce_statfs == 2) return (ENOENT); /* * If the jail's chroot directory is set to "/" we should be able to see * all mount-points from inside the jail. * This is an ugly check, but it is the only situation in which a jail's * directory ends with '/'. */ if (strcmp(pr->pr_path, "/") == 0) return (0); len = strlen(pr->pr_path); sp = &mp->mnt_stat; if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0) return (ENOENT); /* * Be sure that we don't have a situation where the jail's root directory * is "/some/path" and the mount point is "/some/pathpath". */ if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/') return (ENOENT); return (0); } void prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp) { char jpath[MAXPATHLEN]; struct prison *pr; size_t len; pr = cred->cr_prison; if (pr->pr_enforce_statfs == 0) return; if (prison_canseemount(cred, mp) != 0) { bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); strlcpy(sp->f_mntonname, "[restricted]", sizeof(sp->f_mntonname)); return; } if (pr->pr_root->v_mount == mp) { /* * Clear current buffer data, so we are sure nothing from * the valid path is left there. */ bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); *sp->f_mntonname = '/'; return; } /* * If the jail's chroot directory is set to "/" we should be able to see * all mount-points from inside the jail. */ if (strcmp(pr->pr_path, "/") == 0) return; len = strlen(pr->pr_path); strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath)); /* * Clear current buffer data, so we are sure nothing from * the valid path is left there. */ bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); if (*jpath == '\0') { /* Should never happen. */ *sp->f_mntonname = '/'; } else { strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname)); } } /* * Check whether a specific privilege is granted within jail. We * have a specific list of accepted privileges; the rest are denied. */ int prison_priv_check(struct ucred *cred, int priv) { if (!jailed(cred)) return (0); #ifdef VIMAGE /* * Privileges specific to prisons with a virtual network stack. * There might be a duplicate entry here in case the privilege * is only granted conditionally in the legacy jail case. */ switch (priv) { #ifdef notyet /* * NFS-specific privileges. */ case PRIV_NFS_DAEMON: case PRIV_NFS_LOCKD: #endif /* * Network stack privileges. */ case PRIV_NET_BRIDGE: case PRIV_NET_GRE: case PRIV_NET_BPF: case PRIV_NET_RAW: /* Dup, cond. in legacy jail case. */ case PRIV_NET_ROUTE: case PRIV_NET_TAP: case PRIV_NET_SETIFMTU: case PRIV_NET_SETIFFLAGS: case PRIV_NET_SETIFCAP: case PRIV_NET_SETIFDESCR: case PRIV_NET_SETIFNAME: case PRIV_NET_SETIFMETRIC: case PRIV_NET_SETIFPHYS: case PRIV_NET_SETIFMAC: case PRIV_NET_ADDMULTI: case PRIV_NET_DELMULTI: case PRIV_NET_HWIOCTL: case PRIV_NET_SETLLADDR: case PRIV_NET_ADDIFGROUP: case PRIV_NET_DELIFGROUP: case PRIV_NET_IFCREATE: case PRIV_NET_IFDESTROY: case PRIV_NET_ADDIFADDR: case PRIV_NET_DELIFADDR: case PRIV_NET_LAGG: case PRIV_NET_GIF: case PRIV_NET_SETIFVNET: case PRIV_NET_SETIFFIB: /* * 802.11-related privileges. */ case PRIV_NET80211_GETKEY: #ifdef notyet case PRIV_NET80211_MANAGE: /* XXX-BZ discuss with sam@ */ #endif #ifdef notyet /* * ATM privileges. */ case PRIV_NETATM_CFG: case PRIV_NETATM_ADD: case PRIV_NETATM_DEL: case PRIV_NETATM_SET: /* * Bluetooth privileges.
*/ case PRIV_NETBLUETOOTH_RAW: #endif /* * Netgraph and netgraph module privileges. */ case PRIV_NETGRAPH_CONTROL: #ifdef notyet case PRIV_NETGRAPH_TTY: #endif /* * IPv4 and IPv6 privileges. */ case PRIV_NETINET_IPFW: case PRIV_NETINET_DIVERT: case PRIV_NETINET_PF: case PRIV_NETINET_DUMMYNET: case PRIV_NETINET_CARP: case PRIV_NETINET_MROUTE: case PRIV_NETINET_RAW: case PRIV_NETINET_ADDRCTRL6: case PRIV_NETINET_ND6: case PRIV_NETINET_SCOPE6: case PRIV_NETINET_ALIFETIME6: case PRIV_NETINET_IPSEC: case PRIV_NETINET_BINDANY: #ifdef notyet /* * NCP privileges. */ case PRIV_NETNCP: /* * SMB privileges. */ case PRIV_NETSMB: #endif /* * No default: or deny here. * In case of no permit fall through to next switch(). */ if (cred->cr_prison->pr_flags & PR_VNET) return (0); } #endif /* VIMAGE */ switch (priv) { /* * Allow ktrace privileges for root in jail. */ case PRIV_KTRACE: #if 0 /* * Allow jailed processes to configure audit identity and * submit audit records (login, etc). In the future we may * want to further refine the relationship between audit and * jail. */ case PRIV_AUDIT_GETAUDIT: case PRIV_AUDIT_SETAUDIT: case PRIV_AUDIT_SUBMIT: #endif /* * Allow jailed processes to manipulate process UNIX * credentials in any way they see fit. */ case PRIV_CRED_SETUID: case PRIV_CRED_SETEUID: case PRIV_CRED_SETGID: case PRIV_CRED_SETEGID: case PRIV_CRED_SETGROUPS: case PRIV_CRED_SETREUID: case PRIV_CRED_SETREGID: case PRIV_CRED_SETRESUID: case PRIV_CRED_SETRESGID: /* * Jail implements visibility constraints already, so allow * jailed root to override uid/gid-based constraints. */ case PRIV_SEEOTHERGIDS: case PRIV_SEEOTHERUIDS: /* * Jail implements inter-process debugging limits already, so * allow jailed root various debugging privileges. */ case PRIV_DEBUG_DIFFCRED: case PRIV_DEBUG_SUGID: case PRIV_DEBUG_UNPRIV: /* * Allow jail to set various resource limits and login * properties, and for now, exceed process resource limits. */ case PRIV_PROC_LIMIT: case PRIV_PROC_SETLOGIN: case PRIV_PROC_SETRLIMIT: /* * System V and POSIX IPC privileges are granted in jail. */ case PRIV_IPC_READ: case PRIV_IPC_WRITE: case PRIV_IPC_ADMIN: case PRIV_IPC_MSGSIZE: case PRIV_MQ_ADMIN: /* * Jail operations within a jail work on child jails. */ case PRIV_JAIL_ATTACH: case PRIV_JAIL_SET: case PRIV_JAIL_REMOVE: /* * Jail implements its own inter-process limits, so allow * root processes in jail to change scheduling on other * processes in the same jail. Likewise for signalling. */ case PRIV_SCHED_DIFFCRED: case PRIV_SCHED_CPUSET: case PRIV_SIGNAL_DIFFCRED: case PRIV_SIGNAL_SUGID: /* * Allow jailed processes to write to sysctls marked as jail * writable. */ case PRIV_SYSCTL_WRITEJAIL: /* * Allow root in jail to manage a variety of quota * properties. These should likely be conditional on a * configuration option. */ case PRIV_VFS_GETQUOTA: case PRIV_VFS_SETQUOTA: /* * Since Jail relies on chroot() to implement file system * protections, grant many VFS privileges to root in jail. * Be careful to exclude mount-related and NFS-related * privileges. */ case PRIV_VFS_READ: case PRIV_VFS_WRITE: case PRIV_VFS_ADMIN: case PRIV_VFS_EXEC: case PRIV_VFS_LOOKUP: case PRIV_VFS_BLOCKRESERVE: /* XXXRW: Slightly surprising. 
*/ case PRIV_VFS_CHFLAGS_DEV: case PRIV_VFS_CHOWN: case PRIV_VFS_CHROOT: case PRIV_VFS_RETAINSUGID: case PRIV_VFS_FCHROOT: case PRIV_VFS_LINK: case PRIV_VFS_SETGID: case PRIV_VFS_STAT: case PRIV_VFS_STICKYFILE: /* * As in the non-jail case, non-root users are expected to be * able to read kernel/physical memory (provided /dev/[k]mem * exists in the jail and they have permission to access it). */ case PRIV_KMEM_READ: return (0); /* * Depending on the global setting, allow privilege of * setting system flags. */ case PRIV_VFS_SYSFLAGS: if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS) return (0); else return (EPERM); /* * Depending on the global setting, allow privilege of * mounting/unmounting file systems. */ case PRIV_VFS_MOUNT: case PRIV_VFS_UNMOUNT: case PRIV_VFS_MOUNT_NONUSER: case PRIV_VFS_MOUNT_OWNER: if (cred->cr_prison->pr_allow & PR_ALLOW_MOUNT && cred->cr_prison->pr_enforce_statfs < 2) return (0); else return (EPERM); /* * Conditionally allow jailed root to bind reserved ports. */ case PRIV_NETINET_RESERVEDPORT: if (cred->cr_prison->pr_allow & PR_ALLOW_RESERVED_PORTS) return (0); else return (EPERM); /* * Allow jailed root to reuse in-use ports. */ case PRIV_NETINET_REUSEPORT: return (0); /* * Allow jailed root to set certain IPv4/6 (option) headers. */ case PRIV_NETINET_SETHDROPTS: return (0); /* * Conditionally allow creating raw sockets in jail. */ case PRIV_NETINET_RAW: if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS) return (0); else return (EPERM); /* * Since jail implements its own visibility limits on netstat * sysctls, allow getcred. This allows identd to work in * jail. */ case PRIV_NETINET_GETCRED: return (0); /* * Allow jailed root to set loginclass. */ case PRIV_PROC_SETLOGINCLASS: return (0); default: /* * In all remaining cases, deny the privilege request. This * includes almost all network privileges and many system * configuration privileges. */ return (EPERM); } } /* * Return the part of pr2's name that is relative to pr1, or the whole name * if it does not directly follow. */ char * prison_name(struct prison *pr1, struct prison *pr2) { char *name; /* Jails see themselves as "0" (if they see themselves at all). */ if (pr1 == pr2) return "0"; name = pr2->pr_name; if (prison_ischild(pr1, pr2)) { /* * pr1 isn't locked (and allprison_lock may not be either) * so its length can't be counted on. But the number of dots * can be counted on - and counted. */ for (; pr1 != &prison0; pr1 = pr1->pr_parent) name = strchr(name, '.') + 1; } return (name); } /* * Return the part of pr2's path that is relative to pr1, or the whole path * if it does not directly follow. */ static char * prison_path(struct prison *pr1, struct prison *pr2) { char *path1, *path2; int len1; path1 = pr1->pr_path; path2 = pr2->pr_path; if (!strcmp(path1, "/")) return (path2); len1 = strlen(path1); if (strncmp(path1, path2, len1)) return (path2); if (path2[len1] == '\0') return "/"; if (path2[len1] == '/') return (path2 + len1); return (path2); } /* * Jail-related sysctls.
*/ static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0, "Jails"); static int sysctl_jail_list(SYSCTL_HANDLER_ARGS) { struct xprison *xp; struct prison *pr, *cpr; #ifdef INET struct in_addr *ip4 = NULL; int ip4s = 0; #endif #ifdef INET6 struct in6_addr *ip6 = NULL; int ip6s = 0; #endif int descend, error; xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK); pr = req->td->td_ucred->cr_prison; error = 0; sx_slock(&allprison_lock); FOREACH_PRISON_DESCENDANT(pr, cpr, descend) { #if defined(INET) || defined(INET6) again: #endif mtx_lock(&cpr->pr_mtx); #ifdef INET if (cpr->pr_ip4s > 0) { if (ip4s < cpr->pr_ip4s) { ip4s = cpr->pr_ip4s; mtx_unlock(&cpr->pr_mtx); ip4 = realloc(ip4, ip4s * sizeof(struct in_addr), M_TEMP, M_WAITOK); goto again; } bcopy(cpr->pr_ip4, ip4, cpr->pr_ip4s * sizeof(struct in_addr)); } #endif #ifdef INET6 if (cpr->pr_ip6s > 0) { if (ip6s < cpr->pr_ip6s) { ip6s = cpr->pr_ip6s; mtx_unlock(&cpr->pr_mtx); ip6 = realloc(ip6, ip6s * sizeof(struct in6_addr), M_TEMP, M_WAITOK); goto again; } bcopy(cpr->pr_ip6, ip6, cpr->pr_ip6s * sizeof(struct in6_addr)); } #endif if (cpr->pr_ref == 0) { mtx_unlock(&cpr->pr_mtx); continue; } bzero(xp, sizeof(*xp)); xp->pr_version = XPRISON_VERSION; xp->pr_id = cpr->pr_id; xp->pr_state = cpr->pr_uref > 0 ? PRISON_STATE_ALIVE : PRISON_STATE_DYING; strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path)); strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host)); strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name)); #ifdef INET xp->pr_ip4s = cpr->pr_ip4s; #endif #ifdef INET6 xp->pr_ip6s = cpr->pr_ip6s; #endif mtx_unlock(&cpr->pr_mtx); error = SYSCTL_OUT(req, xp, sizeof(*xp)); if (error) break; #ifdef INET if (xp->pr_ip4s > 0) { error = SYSCTL_OUT(req, ip4, xp->pr_ip4s * sizeof(struct in_addr)); if (error) break; } #endif #ifdef INET6 if (xp->pr_ip6s > 0) { error = SYSCTL_OUT(req, ip6, xp->pr_ip6s * sizeof(struct in6_addr)); if (error) break; } #endif } sx_sunlock(&allprison_lock); free(xp, M_TEMP); #ifdef INET free(ip4, M_TEMP); #endif #ifdef INET6 free(ip6, M_TEMP); #endif return (error); } SYSCTL_OID(_security_jail, OID_AUTO, list, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_list, "S", "List of active jails"); static int sysctl_jail_jailed(SYSCTL_HANDLER_ARGS) { int error, injail; injail = jailed(req->td->td_ucred); error = SYSCTL_OUT(req, &injail, sizeof(injail)); return (error); } SYSCTL_PROC(_security_jail, OID_AUTO, jailed, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_jailed, "I", "Process in jail?"); static int sysctl_jail_vnet(SYSCTL_HANDLER_ARGS) { int error, havevnet; #ifdef VIMAGE struct ucred *cred = req->td->td_ucred; havevnet = jailed(cred) && prison_owns_vnet(cred); #else havevnet = 0; #endif error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet)); return (error); } SYSCTL_PROC(_security_jail, OID_AUTO, vnet, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_vnet, "I", "Jail owns VNET?"); #if defined(INET) || defined(INET6) SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW, &jail_max_af_ips, 0, "Number of IP addresses a jail may have at most per address family (deprecated)"); #endif /* * Default parameters for jail(2) compatibility. For historical reasons, * the sysctl names have varying similarity to the parameter names. Prisons * just see their own parameters, and can't change them. */ static int sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS) { struct prison *pr; int allow, error, i; pr = req->td->td_ucred->cr_prison; allow = (pr == &prison0) ? 
jail_default_allow : pr->pr_allow; /* Get the current flag value, and convert it to a boolean. */ i = (allow & arg2) ? 1 : 0; if (arg1 != NULL) i = !i; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) return (error); i = i ? arg2 : 0; if (arg1 != NULL) i ^= arg2; /* * The sysctls don't have CTLFLAGS_PRISON, so assume prison0 * for writing. */ mtx_lock(&prison0.pr_mtx); jail_default_allow = (jail_default_allow & ~arg2) | i; mtx_unlock(&prison0.pr_mtx); return (0); } SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I", "Processes in jail can set their hostnames (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I", "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I", "Processes in jail can use System V IPC primitives (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I", "Prison root can create raw sockets (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I", "Processes in jail can alter system file flags (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I", "Processes in jail can mount/unmount jail-friendly file systems (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_devfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_DEVFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the devfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_fdescfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_FDESCFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the fdescfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_nullfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_NULLFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the nullfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_procfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_PROCFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the procfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_linprocfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_LINPROCFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the linprocfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_linsysfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_LINSYSFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the linsysfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_tmpfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_TMPFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the tmpfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_zfs_allowed, 
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_ZFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the zfs file system (deprecated)"); static int sysctl_jail_default_level(SYSCTL_HANDLER_ARGS) { struct prison *pr; int level, error; pr = req->td->td_ucred->cr_prison; level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2); error = sysctl_handle_int(oidp, &level, 0, req); if (error || !req->newptr) return (error); *(int *)arg1 = level; return (0); } SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs), sysctl_jail_default_level, "I", "Processes in jail cannot see all mounted file systems (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum), sysctl_jail_default_level, "I", "Ruleset for the devfs filesystem in jail (deprecated)"); /* * Nodes to describe jail parameters. Maximum length of string parameters * is returned in the string itself, and the other parameters exist merely * to make themselves and their types known. */ SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW, 0, "Jail parameters"); int sysctl_jail_param(SYSCTL_HANDLER_ARGS) { int i; long l; size_t s; char numbuf[12]; switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_LONG: case CTLTYPE_ULONG: l = 0; #ifdef SCTL_MASK32 if (!(req->flags & SCTL_MASK32)) #endif return (SYSCTL_OUT(req, &l, sizeof(l))); case CTLTYPE_INT: case CTLTYPE_UINT: i = 0; return (SYSCTL_OUT(req, &i, sizeof(i))); case CTLTYPE_STRING: snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2); return (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req)); case CTLTYPE_STRUCT: s = (size_t)arg2; return (SYSCTL_OUT(req, &s, sizeof(s))); } return (0); } /* * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at * jail creation time but cannot be changed in an existing jail. 
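*/

/*
 * Userland reads each parameter below through jail_get(2) as name/value
 * iovec pairs, which is what the vfs_setopt() calls earlier in this file
 * ultimately fill in. An illustrative sketch (the helper name and error
 * handling are assumptions):
 */
#include <sys/param.h>
#include <sys/jail.h>
#include <sys/uio.h>

static int
get_securelevel(int jid)
{
	struct iovec iov[4];
	int securelevel;

	iov[0].iov_base = __DECONST(char *, "jid");
	iov[0].iov_len = sizeof("jid");
	iov[1].iov_base = &jid;
	iov[1].iov_len = sizeof(jid);
	iov[2].iov_base = __DECONST(char *, "securelevel");
	iov[2].iov_len = sizeof("securelevel");
	iov[3].iov_base = &securelevel;
	iov[3].iov_len = sizeof(securelevel);
	if (jail_get(iov, nitems(iov), 0) == -1)
		return (-1);
	return (securelevel);
}

/*
 * (The parameter definitions follow:)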
*/ SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID"); SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID"); SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name"); SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path"); SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW, "I", "Jail secure level"); SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail value for kern.osreldate and uname -K"); SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN, "Jail value for kern.osrelease and uname -r"); SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW, "I", "Jail cannot see all mounted file systems"); SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW, "I", "Ruleset for in-jail devfs mounts"); SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail persistence"); #ifdef VIMAGE SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN, "E,jailsys", "Virtual network stack"); #endif SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD, "B", "Jail is in the process of shutting down"); SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails"); SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD, "I", "Current number of child jails"); SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW, "I", "Maximum number of child jails"); SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info"); SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail hostname"); SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail NIS domainname"); SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN, "Jail host UUID"); SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW, "LU", "Jail host ID"); SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset"); SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID"); #ifdef INET SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN, "Jail IPv4 address virtualization"); SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr), "S,in_addr,a", "Jail IPv4 addresses"); SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW, "B", "Do (not) use IPv4 source address selection rather than the " "primary jail IPv4 address."); #endif #ifdef INET6 SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN, "Jail IPv6 address virtualization"); SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr), "S,in6_addr,a", "Jail IPv6 addresses"); SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW, "B", "Do (not) use IPv6 source address selection rather than the " "primary jail IPv6 address."); #endif SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags"); SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set hostname"); SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may use SYSV IPC"); SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may create raw sockets"); SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may alter system file flags"); SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set file quotas"); SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route"); SYSCTL_JAIL_PARAM(_allow, reserved_ports, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may bind sockets to reserved ports"); SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission 
flags"); SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount/unmount jail-friendly file systems in general"); SYSCTL_JAIL_PARAM(_allow_mount, devfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the devfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, fdescfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the fdescfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, nullfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the nullfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, procfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the procfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, linprocfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the linprocfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, linsysfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the linsysfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, tmpfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the tmpfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, zfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the zfs file system"); #ifdef RACCT void prison_racct_foreach(void (*callback)(struct racct *racct, void *arg2, void *arg3), void (*pre)(void), void (*post)(void), void *arg2, void *arg3) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_slock(&allprison_lock); if (pre != NULL) (pre)(); LIST_FOREACH(prr, &allprison_racct, prr_next) (callback)(prr->prr_racct, arg2, arg3); if (post != NULL) (post)(); sx_sunlock(&allprison_lock); } static struct prison_racct * prison_racct_find_locked(const char *name) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN) return (NULL); LIST_FOREACH(prr, &allprison_racct, prr_next) { if (strcmp(name, prr->prr_name) != 0) continue; /* Found prison_racct with a matching name? */ prison_racct_hold(prr); return (prr); } /* Add new prison_racct. */ prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK); racct_create(&prr->prr_racct); strcpy(prr->prr_name, name); refcount_init(&prr->prr_refcount, 1); LIST_INSERT_HEAD(&allprison_racct, prr, prr_next); return (prr); } struct prison_racct * prison_racct_find(const char *name) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_xlock(&allprison_lock); prr = prison_racct_find_locked(name); sx_xunlock(&allprison_lock); return (prr); } void prison_racct_hold(struct prison_racct *prr) { ASSERT_RACCT_ENABLED(); refcount_acquire(&prr->prr_refcount); } static void prison_racct_free_locked(struct prison_racct *prr) { ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); if (refcount_release(&prr->prr_refcount)) { racct_destroy(&prr->prr_racct); LIST_REMOVE(prr, prr_next); free(prr, M_PRISON_RACCT); } } void prison_racct_free(struct prison_racct *prr) { int old; ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_UNLOCKED); old = prr->prr_refcount; if (old > 1 && atomic_cmpset_int(&prr->prr_refcount, old, old - 1)) return; sx_xlock(&allprison_lock); prison_racct_free_locked(prr); sx_xunlock(&allprison_lock); } static void prison_racct_attach(struct prison *pr) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); prr = prison_racct_find_locked(pr->pr_name); KASSERT(prr != NULL, ("cannot find prison_racct")); pr->pr_prison_racct = prr; } /* * Handle jail renaming. From the racct point of view, renaming means * moving from one prison_racct to another. 
*/ static void prison_racct_modify(struct prison *pr) { struct proc *p; struct ucred *cred; struct prison_racct *oldprr; ASSERT_RACCT_ENABLED(); sx_slock(&allproc_lock); sx_xlock(&allprison_lock); if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) { sx_xunlock(&allprison_lock); sx_sunlock(&allproc_lock); return; } oldprr = pr->pr_prison_racct; pr->pr_prison_racct = NULL; prison_racct_attach(pr); /* * Move resource utilisation records. */ racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct); /* * Force rctl to reattach rules to processes. */ FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); cred = crhold(p->p_ucred); PROC_UNLOCK(p); racct_proc_ucred_changed(p, cred, cred); crfree(cred); } sx_sunlock(&allproc_lock); prison_racct_free_locked(oldprr); sx_xunlock(&allprison_lock); } static void prison_racct_detach(struct prison *pr) { ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_UNLOCKED); if (pr->pr_prison_racct == NULL) return; prison_racct_free(pr->pr_prison_racct); pr->pr_prison_racct = NULL; } #endif /* RACCT */ #ifdef DDB static void db_show_prison(struct prison *pr) { int fi; #if defined(INET) || defined(INET6) int ii; #endif unsigned jsf; #ifdef INET char ip4buf[INET_ADDRSTRLEN]; #endif #ifdef INET6 char ip6buf[INET6_ADDRSTRLEN]; #endif db_printf("prison %p:\n", pr); db_printf(" jid = %d\n", pr->pr_id); db_printf(" name = %s\n", pr->pr_name); db_printf(" parent = %p\n", pr->pr_parent); db_printf(" ref = %d\n", pr->pr_ref); db_printf(" uref = %d\n", pr->pr_uref); db_printf(" path = %s\n", pr->pr_path); db_printf(" cpuset = %d\n", pr->pr_cpuset ? pr->pr_cpuset->cs_id : -1); #ifdef VIMAGE db_printf(" vnet = %p\n", pr->pr_vnet); #endif db_printf(" root = %p\n", pr->pr_root); db_printf(" securelevel = %d\n", pr->pr_securelevel); db_printf(" devfs_rsnum = %d\n", pr->pr_devfs_rsnum); db_printf(" children.max = %d\n", pr->pr_childmax); db_printf(" children.cur = %d\n", pr->pr_childcount); db_printf(" child = %p\n", LIST_FIRST(&pr->pr_children)); db_printf(" sibling = %p\n", LIST_NEXT(pr, pr_sibling)); db_printf(" flags = 0x%x", pr->pr_flags); for (fi = 0; fi < nitems(pr_flag_names); fi++) if (pr_flag_names[fi] != NULL && (pr->pr_flags & (1 << fi))) db_printf(" %s", pr_flag_names[fi]); for (fi = 0; fi < nitems(pr_flag_jailsys); fi++) { jsf = pr->pr_flags & (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new); db_printf(" %-16s= %s\n", pr_flag_jailsys[fi].name, pr_flag_jailsys[fi].disable && (jsf == pr_flag_jailsys[fi].disable) ? "disable" : (jsf == pr_flag_jailsys[fi].new) ? "new" : "inherit"); } db_printf(" allow = 0x%x", pr->pr_allow); for (fi = 0; fi < nitems(pr_allow_names); fi++) if (pr_allow_names[fi] != NULL && (pr->pr_allow & (1 << fi))) db_printf(" %s", pr_allow_names[fi]); db_printf("\n"); db_printf(" enforce_statfs = %d\n", pr->pr_enforce_statfs); db_printf(" host.hostname = %s\n", pr->pr_hostname); db_printf(" host.domainname = %s\n", pr->pr_domainname); db_printf(" host.hostuuid = %s\n", pr->pr_hostuuid); db_printf(" host.hostid = %lu\n", pr->pr_hostid); #ifdef INET db_printf(" ip4s = %d\n", pr->pr_ip4s); for (ii = 0; ii < pr->pr_ip4s; ii++) db_printf(" %s %s\n", ii == 0 ? "ip4.addr =" : " ", inet_ntoa_r(pr->pr_ip4[ii], ip4buf)); #endif #ifdef INET6 db_printf(" ip6s = %d\n", pr->pr_ip6s); for (ii = 0; ii < pr->pr_ip6s; ii++) db_printf(" %s %s\n", ii == 0 ? 
"ip6.addr =" : " ", ip6_sprintf(ip6buf, &pr->pr_ip6[ii])); #endif } DB_SHOW_COMMAND(prison, db_show_prison_command) { struct prison *pr; if (!have_addr) { /* * Show all prisons in the list, and prison0 which is not * listed. */ db_show_prison(&prison0); if (!db_pager_quit) { TAILQ_FOREACH(pr, &allprison, pr_list) { db_show_prison(pr); if (db_pager_quit) break; } } return; } if (addr == 0) pr = &prison0; else { /* Look for a prison with the ID and with references. */ TAILQ_FOREACH(pr, &allprison, pr_list) if (pr->pr_id == addr && pr->pr_ref > 0) break; if (pr == NULL) /* Look again, without requiring a reference. */ TAILQ_FOREACH(pr, &allprison, pr_list) if (pr->pr_id == addr) break; if (pr == NULL) /* Assume address points to a valid prison. */ pr = (struct prison *)addr; } db_show_prison(pr); } #endif /* DDB */ Index: head/sys/kern/kern_khelp.c =================================================================== --- head/sys/kern/kern_khelp.c (revision 326270) +++ head/sys/kern/kern_khelp.c (revision 326271) @@ -1,372 +1,374 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2010,2013 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed by Lawrence Stewart while studying at the Centre * for Advanced Internet Architectures, Swinburne University of Technology, * made possible in part by grants from the FreeBSD Foundation and Cisco * University Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by Lawrence Stewart under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include static struct rwlock khelp_list_lock; RW_SYSINIT(khelplistlock, &khelp_list_lock, "helper list lock"); static TAILQ_HEAD(helper_head, helper) helpers = TAILQ_HEAD_INITIALIZER(helpers); /* Private function prototypes. 
*/ static inline void khelp_remove_osd(struct helper *h, struct osd *hosd); void khelp_new_hhook_registered(struct hhook_head *hhh, uint32_t flags); #define KHELP_LIST_WLOCK() rw_wlock(&khelp_list_lock) #define KHELP_LIST_WUNLOCK() rw_wunlock(&khelp_list_lock) #define KHELP_LIST_RLOCK() rw_rlock(&khelp_list_lock) #define KHELP_LIST_RUNLOCK() rw_runlock(&khelp_list_lock) #define KHELP_LIST_LOCK_ASSERT() rw_assert(&khelp_list_lock, RA_LOCKED) int khelp_register_helper(struct helper *h) { struct helper *tmph; int error, i, inserted; error = inserted = 0; refcount_init(&h->h_refcount, 0); h->h_id = osd_register(OSD_KHELP, NULL, NULL); /* It's only safe to add the hooks after osd_register(). */ for (i = 0; i < h->h_nhooks && !error; i++) { /* We don't require the module to assign hook_helper. */ h->h_hooks[i].hook_helper = h; error = hhook_add_hook_lookup(&h->h_hooks[i], HHOOK_WAITOK); if (error) printf("%s: \"%s\" khelp module unable to " "hook type %d id %d due to error %d\n", __func__, h->h_name, h->h_hooks[i].hook_type, h->h_hooks[i].hook_id, error); } if (error) { for (i--; i >= 0; i--) hhook_remove_hook_lookup(&h->h_hooks[i]); osd_deregister(OSD_KHELP, h->h_id); } else { KHELP_LIST_WLOCK(); /* * Keep list of helpers sorted in descending h_id order. Due to * the way osd_set() works, a sorted list ensures * khelp_init_osd() will operate with improved efficiency. */ TAILQ_FOREACH(tmph, &helpers, h_next) { if (tmph->h_id < h->h_id) { TAILQ_INSERT_BEFORE(tmph, h, h_next); inserted = 1; break; } } if (!inserted) TAILQ_INSERT_TAIL(&helpers, h, h_next); KHELP_LIST_WUNLOCK(); } return (error); } int khelp_deregister_helper(struct helper *h) { struct helper *tmph; int error, i; KHELP_LIST_WLOCK(); if (h->h_refcount > 0) error = EBUSY; else { error = ENOENT; TAILQ_FOREACH(tmph, &helpers, h_next) { if (tmph == h) { TAILQ_REMOVE(&helpers, h, h_next); error = 0; break; } } } KHELP_LIST_WUNLOCK(); if (!error) { for (i = 0; i < h->h_nhooks; i++) hhook_remove_hook_lookup(&h->h_hooks[i]); osd_deregister(OSD_KHELP, h->h_id); } return (error); } int khelp_init_osd(uint32_t classes, struct osd *hosd) { struct helper *h; void *hdata; int error; KASSERT(hosd != NULL, ("struct osd not initialised!")); error = 0; KHELP_LIST_RLOCK(); TAILQ_FOREACH(h, &helpers, h_next) { /* If helper is correct class and needs to store OSD... */ if (h->h_classes & classes && h->h_flags & HELPER_NEEDS_OSD) { hdata = uma_zalloc(h->h_zone, M_NOWAIT); if (hdata == NULL) { error = ENOMEM; break; } osd_set(OSD_KHELP, hosd, h->h_id, hdata); refcount_acquire(&h->h_refcount); } } if (error) { /* Delete OSD that was assigned prior to the error. */ TAILQ_FOREACH(h, &helpers, h_next) { if (h->h_classes & classes) khelp_remove_osd(h, hosd); } } KHELP_LIST_RUNLOCK(); return (error); } int khelp_destroy_osd(struct osd *hosd) { struct helper *h; int error; KASSERT(hosd != NULL, ("struct osd not initialised!")); error = 0; KHELP_LIST_RLOCK(); /* * Clean up all khelp related OSD. * * XXXLAS: Would be nice to use something like osd_exit() here but it * doesn't have the right semantics for this purpose. */ TAILQ_FOREACH(h, &helpers, h_next) khelp_remove_osd(h, hosd); KHELP_LIST_RUNLOCK(); return (error); } static inline void khelp_remove_osd(struct helper *h, struct osd *hosd) { void *hdata; if (h->h_flags & HELPER_NEEDS_OSD) { /* * If the current helper uses OSD and calling osd_get() * on the helper's h_id returns non-NULL, the helper has * OSD attached to 'hosd' which needs to be cleaned up. 
*/ hdata = osd_get(OSD_KHELP, hosd, h->h_id); if (hdata != NULL) { uma_zfree(h->h_zone, hdata); osd_del(OSD_KHELP, hosd, h->h_id); refcount_release(&h->h_refcount); } } } void * khelp_get_osd(struct osd *hosd, int32_t id) { return (osd_get(OSD_KHELP, hosd, id)); } int32_t khelp_get_id(char *hname) { struct helper *h; int32_t id; id = -1; KHELP_LIST_RLOCK(); TAILQ_FOREACH(h, &helpers, h_next) { if (strncmp(h->h_name, hname, HELPER_NAME_MAXLEN) == 0) { id = h->h_id; break; } } KHELP_LIST_RUNLOCK(); return (id); } int khelp_add_hhook(struct hookinfo *hki, uint32_t flags) { int error; /* * XXXLAS: Should probably include the functionality to update the * helper's h_hooks struct member. */ error = hhook_add_hook_lookup(hki, flags); return (error); } int khelp_remove_hhook(struct hookinfo *hki) { int error; /* * XXXLAS: Should probably include the functionality to update the * helper's h_hooks struct member. */ error = hhook_remove_hook_lookup(hki); return (error); } /* * Private KPI between hhook and khelp that allows khelp modules to insert hook * functions into hhook points which register after the modules were loaded. */ void khelp_new_hhook_registered(struct hhook_head *hhh, uint32_t flags) { struct helper *h; int error, i; KHELP_LIST_RLOCK(); TAILQ_FOREACH(h, &helpers, h_next) { for (i = 0; i < h->h_nhooks; i++) { if (hhh->hhh_type != h->h_hooks[i].hook_type || hhh->hhh_id != h->h_hooks[i].hook_id) continue; error = hhook_add_hook(hhh, &h->h_hooks[i], flags); if (error) { printf("%s: \"%s\" khelp module unable to " "hook type %d id %d due to error %d\n", __func__, h->h_name, h->h_hooks[i].hook_type, h->h_hooks[i].hook_id, error); error = 0; } } } KHELP_LIST_RUNLOCK(); } int khelp_modevent(module_t mod, int event_type, void *data) { struct khelp_modevent_data *kmd; int error; kmd = (struct khelp_modevent_data *)data; error = 0; switch(event_type) { case MOD_LOAD: if (kmd->helper->h_flags & HELPER_NEEDS_OSD) { if (kmd->uma_zsize <= 0) { printf("Use KHELP_DECLARE_MOD_UMA() instead!\n"); error = EDOOFUS; break; } kmd->helper->h_zone = uma_zcreate(kmd->name, kmd->uma_zsize, kmd->umactor, kmd->umadtor, NULL, NULL, 0, 0); if (kmd->helper->h_zone == NULL) { error = ENOMEM; break; } } strlcpy(kmd->helper->h_name, kmd->name, HELPER_NAME_MAXLEN); kmd->helper->h_hooks = kmd->hooks; kmd->helper->h_nhooks = kmd->nhooks; if (kmd->helper->mod_init != NULL) error = kmd->helper->mod_init(); if (!error) error = khelp_register_helper(kmd->helper); break; case MOD_QUIESCE: case MOD_SHUTDOWN: case MOD_UNLOAD: error = khelp_deregister_helper(kmd->helper); if (!error) { if (kmd->helper->h_flags & HELPER_NEEDS_OSD) uma_zdestroy(kmd->helper->h_zone); if (kmd->helper->mod_destroy != NULL) kmd->helper->mod_destroy(); } else if (error == ENOENT) /* Do nothing and allow unload if helper not in list. */ error = 0; else if (error == EBUSY) printf("Khelp module \"%s\" can't unload until its " "refcount drops from %d to 0.\n", kmd->name, kmd->helper->h_refcount); break; default: error = EINVAL; break; } return (error); } Index: head/sys/kern/kern_kthread.c =================================================================== --- head/sys/kern/kern_kthread.c (revision 326270) +++ head/sys/kern/kern_kthread.c (revision 326271) @@ -1,470 +1,472 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1999 Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Start a kernel process. This is called after a fork() call in * mi_startup() in the file kern/init_main.c. * * This function is used to start "internal" daemons and intended * to be called from SYSINIT(). */ void kproc_start(const void *udata) { const struct kproc_desc *kp = udata; int error; error = kproc_create((void (*)(void *))kp->func, NULL, kp->global_procpp, 0, 0, "%s", kp->arg0); if (error) panic("kproc_start: %s: error %d", kp->arg0, error); } /* * Create a kernel process/thread/whatever. It shares its address space * with proc0 - ie: kernel only. * * func is the function to start. * arg is the parameter to pass to function on first startup. * newpp is the return value pointing to the thread's struct proc. * flags are flags to fork1 (in unistd.h) * fmt and following will be *printf'd into (*newpp)->p_comm (for ps, etc.). */ int kproc_create(void (*func)(void *), void *arg, struct proc **newpp, int flags, int pages, const char *fmt, ...) { struct fork_req fr; int error; va_list ap; struct thread *td; struct proc *p2; if (!proc0.p_stats) panic("kproc_create called too soon"); bzero(&fr, sizeof(fr)); fr.fr_flags = RFMEM | RFFDG | RFPROC | RFSTOPPED | flags; fr.fr_pages = pages; fr.fr_procp = &p2; error = fork1(&thread0, &fr); if (error) return error; /* save a global descriptor, if desired */ if (newpp != NULL) *newpp = p2; /* this is a non-swapped system process */ PROC_LOCK(p2); td = FIRST_THREAD_IN_PROC(p2); p2->p_flag |= P_SYSTEM | P_KPROC; td->td_pflags |= TDP_KTHREAD; mtx_lock(&p2->p_sigacts->ps_mtx); p2->p_sigacts->ps_flag |= PS_NOCLDWAIT; mtx_unlock(&p2->p_sigacts->ps_mtx); PROC_UNLOCK(p2); /* set up arg0 for 'ps', et al */ va_start(ap, fmt); vsnprintf(p2->p_comm, sizeof(p2->p_comm), fmt, ap); va_end(ap); /* set up arg0 for 'ps', et al */ va_start(ap, fmt); vsnprintf(td->td_name, sizeof(td->td_name), fmt, ap); va_end(ap); #ifdef KTR sched_clear_tdname(td); #endif /* call the processes' main()... */ cpu_fork_kthread_handler(td, func, arg); /* Avoid inheriting affinity from a random parent. */ cpuset_setthread(td->td_tid, cpuset_root); thread_lock(td); TD_SET_CAN_RUN(td); sched_prio(td, PVM); sched_user_prio(td, PUSER); /* Delay putting it on the run queue until now. 
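 * A caller that passed RFSTOPPED takes responsibility for making the new
 * process runnable itself (via sched_add()) once any extra setup is done.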
*/ if (!(flags & RFSTOPPED)) sched_add(td, SRQ_BORING); thread_unlock(td); return 0; } void kproc_exit(int ecode) { struct thread *td; struct proc *p; td = curthread; p = td->td_proc; /* * Reparent curthread from proc0 to init so that the zombie * is harvested. */ sx_xlock(&proctree_lock); PROC_LOCK(p); proc_reparent(p, initproc); PROC_UNLOCK(p); sx_xunlock(&proctree_lock); /* * Wakeup anyone waiting for us to exit. */ wakeup(p); /* Buh-bye! */ exit1(td, ecode, 0); } /* * Advise a kernel process to suspend (or resume) in its main loop. * Participation is voluntary. */ int kproc_suspend(struct proc *p, int timo) { /* * Make sure this is indeed a system process and we can safely * use the p_siglist field. */ PROC_LOCK(p); if ((p->p_flag & P_KPROC) == 0) { PROC_UNLOCK(p); return (EINVAL); } SIGADDSET(p->p_siglist, SIGSTOP); wakeup(p); return msleep(&p->p_siglist, &p->p_mtx, PPAUSE | PDROP, "suspkp", timo); } int kproc_resume(struct proc *p) { /* * Make sure this is indeed a system process and we can safely * use the p_siglist field. */ PROC_LOCK(p); if ((p->p_flag & P_KPROC) == 0) { PROC_UNLOCK(p); return (EINVAL); } SIGDELSET(p->p_siglist, SIGSTOP); PROC_UNLOCK(p); wakeup(&p->p_siglist); return (0); } void kproc_suspend_check(struct proc *p) { PROC_LOCK(p); while (SIGISMEMBER(p->p_siglist, SIGSTOP)) { wakeup(&p->p_siglist); msleep(&p->p_siglist, &p->p_mtx, PPAUSE, "kpsusp", 0); } PROC_UNLOCK(p); } /* * Start a kernel thread. * * This function is used to start "internal" daemons and intended * to be called from SYSINIT(). */ void kthread_start(const void *udata) { const struct kthread_desc *kp = udata; int error; error = kthread_add((void (*)(void *))kp->func, NULL, NULL, kp->global_threadpp, 0, 0, "%s", kp->arg0); if (error) panic("kthread_start: %s: error %d", kp->arg0, error); } /* * Create a kernel thread. It shares its address space * with proc0 - ie: kernel only. * * func is the function to start. * arg is the parameter to pass to function on first startup. * newtdp is the return value pointing to the thread's struct thread. * ** XXX fix this --> flags are flags to fork1 (in unistd.h) * fmt and following will be *printf'd into (*newtd)->td_name (for ps, etc.). */ int kthread_add(void (*func)(void *), void *arg, struct proc *p, struct thread **newtdp, int flags, int pages, const char *fmt, ...) { va_list ap; struct thread *newtd, *oldtd; if (!proc0.p_stats) panic("kthread_add called too soon"); /* If no process supplied, put it on proc0 */ if (p == NULL) p = &proc0; /* Initialize our new td */ newtd = thread_alloc(pages); if (newtd == NULL) return (ENOMEM); PROC_LOCK(p); oldtd = FIRST_THREAD_IN_PROC(p); bzero(&newtd->td_startzero, __rangeof(struct thread, td_startzero, td_endzero)); bcopy(&oldtd->td_startcopy, &newtd->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); /* set up arg0 for 'ps', et al */ va_start(ap, fmt); vsnprintf(newtd->td_name, sizeof(newtd->td_name), fmt, ap); va_end(ap); newtd->td_proc = p; /* needed for cpu_copy_thread */ /* might be further optimized for kthread */ cpu_copy_thread(newtd, oldtd); /* put the designated function(arg) as the resume context */ cpu_fork_kthread_handler(newtd, func, arg); newtd->td_pflags |= TDP_KTHREAD; thread_cow_get_proc(newtd, p); /* this code almost the same as create_thread() in kern_thr.c */ p->p_flag |= P_HADTHREADS; thread_link(newtd, p); thread_lock(oldtd); /* let the scheduler know about these things. 
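 * sched_fork_thread() seeds the new thread's scheduler state from oldtd,
 * and TD_SET_CAN_RUN() marks it eligible to run without queueing it yet.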
 */ sched_fork_thread(oldtd, newtd); TD_SET_CAN_RUN(newtd); thread_unlock(oldtd); PROC_UNLOCK(p); tidhash_add(newtd); /* Avoid inheriting affinity from a random parent. */ cpuset_setthread(newtd->td_tid, cpuset_root); /* Delay putting it on the run queue until now. */ if (!(flags & RFSTOPPED)) { thread_lock(newtd); sched_add(newtd, SRQ_BORING); thread_unlock(newtd); } if (newtdp) *newtdp = newtd; return 0; } void kthread_exit(void) { struct proc *p; struct thread *td; td = curthread; p = td->td_proc; /* A module may be waiting for us to exit. */ wakeup(td); /* * The last exiting thread in a kernel process must tear down * the whole process. */ rw_wlock(&tidhash_lock); PROC_LOCK(p); if (p->p_numthreads == 1) { PROC_UNLOCK(p); rw_wunlock(&tidhash_lock); kproc_exit(0); } LIST_REMOVE(td, td_hash); rw_wunlock(&tidhash_lock); umtx_thread_exit(td); tdsigcleanup(td); PROC_SLOCK(p); thread_exit(); } /* * Advise a kernel thread to suspend (or resume) in its main loop. * Participation is voluntary. */ int kthread_suspend(struct thread *td, int timo) { struct proc *p; p = td->td_proc; /* * td_pflags should not be read by any thread other than * curthread, but as long as this flag is invariant during the * thread's lifetime, it is OK to check its state. */ if ((td->td_pflags & TDP_KTHREAD) == 0) return (EINVAL); /* * The caller of the primitive should have already checked that the * thread is up and running, thus not being blocked by other * conditions. */ PROC_LOCK(p); thread_lock(td); td->td_flags |= TDF_KTH_SUSP; thread_unlock(td); return (msleep(&td->td_flags, &p->p_mtx, PPAUSE | PDROP, "suspkt", timo)); } /* * Resume a thread previously put asleep with kthread_suspend(). */ int kthread_resume(struct thread *td) { struct proc *p; p = td->td_proc; /* * td_pflags should not be read by any thread other than * curthread, but as long as this flag is invariant during the * thread's lifetime, it is OK to check its state. */ if ((td->td_pflags & TDP_KTHREAD) == 0) return (EINVAL); PROC_LOCK(p); thread_lock(td); td->td_flags &= ~TDF_KTH_SUSP; thread_unlock(td); wakeup(&td->td_flags); PROC_UNLOCK(p); return (0); } /* * Used by the thread to poll as to whether it should yield/sleep * and notify the caller that it has happened. */ void kthread_suspend_check(void) { struct proc *p; struct thread *td; td = curthread; p = td->td_proc; if ((td->td_pflags & TDP_KTHREAD) == 0) panic("%s: curthread is not a valid kthread", __func__); /* * As long as the double-lock protection is used when accessing the * TDF_KTH_SUSP flag, synchronizing the read operation via proc mutex * is fine. */ PROC_LOCK(p); while (td->td_flags & TDF_KTH_SUSP) { wakeup(&td->td_flags); msleep(&td->td_flags, &p->p_mtx, PPAUSE, "ktsusp", 0); } PROC_UNLOCK(p); } int kproc_kthread_add(void (*func)(void *), void *arg, struct proc **procptr, struct thread **tdptr, int flags, int pages, const char *procname, const char *fmt, ...) 
{ int error; va_list ap; char buf[100]; struct thread *td; if (*procptr == NULL) { error = kproc_create(func, arg, procptr, flags, pages, "%s", procname); if (error) return (error); td = FIRST_THREAD_IN_PROC(*procptr); if (tdptr) *tdptr = td; va_start(ap, fmt); vsnprintf(td->td_name, sizeof(td->td_name), fmt, ap); va_end(ap); #ifdef KTR sched_clear_tdname(td); #endif return (0); } va_start(ap, fmt); vsnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); error = kthread_add(func, arg, *procptr, tdptr, flags, pages, "%s", buf); return (error); } Index: head/sys/kern/kern_ktr.c =================================================================== --- head/sys/kern/kern_ktr.c (revision 326270) +++ head/sys/kern/kern_ktr.c (revision 326271) @@ -1,474 +1,476 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2000 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This module holds the global variables used by KTR and the ktr_tracepoint() * function that does the actual tracing. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ktr.h" #include "opt_alq.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #endif #ifndef KTR_BOOT_ENTRIES #define KTR_BOOT_ENTRIES 1024 #endif #ifndef KTR_ENTRIES #define KTR_ENTRIES 1024 #endif /* Limit the allocations to something manageable. 
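Since kproc_kthread_add() above both creates the process on first use and attaches further threads to it afterwards, the usual idiom looks roughly like this (a sketch only; the worker* names are hypothetical):

static struct proc *workerproc;

static void
worker_main(void *arg)
{

	for (;;)
		pause("worker", hz);	/* placeholder work loop */
}

static void
start_workers(int nworkers)
{
	int error, i;

	for (i = 0; i < nworkers; i++) {
		/* First call creates "workers"; later calls add threads. */
		error = kproc_kthread_add(worker_main, NULL, &workerproc,
		    NULL, 0, 0, "workers", "worker_%d", i);
		if (error != 0)
			panic("worker_%d: error %d", i, error);
	}
}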
*/ #define KTR_ENTRIES_MAX (8 * 1024 * 1024) #ifndef KTR_MASK #define KTR_MASK (0) #endif #ifndef KTR_CPUMASK #define KTR_CPUMASK CPUSET_FSET #endif #ifndef KTR_TIME #define KTR_TIME get_cyclecount() #endif #ifndef KTR_CPU #define KTR_CPU PCPU_GET(cpuid) #endif static MALLOC_DEFINE(M_KTR, "KTR", "KTR"); FEATURE(ktr, "Kernel support for KTR kernel tracing facility"); volatile int ktr_idx = 0; uint64_t ktr_mask = KTR_MASK; uint64_t ktr_compile = KTR_COMPILE; int ktr_entries = KTR_BOOT_ENTRIES; int ktr_version = KTR_VERSION; struct ktr_entry ktr_buf_init[KTR_BOOT_ENTRIES]; struct ktr_entry *ktr_buf = ktr_buf_init; cpuset_t ktr_cpumask = CPUSET_T_INITIALIZER(KTR_CPUMASK); static SYSCTL_NODE(_debug, OID_AUTO, ktr, CTLFLAG_RD, 0, "KTR options"); SYSCTL_INT(_debug_ktr, OID_AUTO, version, CTLFLAG_RD, &ktr_version, 0, "Version of the KTR interface"); SYSCTL_UQUAD(_debug_ktr, OID_AUTO, compile, CTLFLAG_RD, &ktr_compile, 0, "Bitmask of KTR event classes compiled into the kernel"); static int sysctl_debug_ktr_cpumask(SYSCTL_HANDLER_ARGS) { char lktr_cpumask_str[CPUSETBUFSIZ]; cpuset_t imask; int error; cpusetobj_strprint(lktr_cpumask_str, &ktr_cpumask); error = sysctl_handle_string(oidp, lktr_cpumask_str, sizeof(lktr_cpumask_str), req); if (error != 0 || req->newptr == NULL) return (error); if (cpusetobj_strscan(&imask, lktr_cpumask_str) == -1) return (EINVAL); CPU_COPY(&imask, &ktr_cpumask); return (error); } SYSCTL_PROC(_debug_ktr, OID_AUTO, cpumask, CTLFLAG_RWTUN | CTLFLAG_MPSAFE | CTLTYPE_STRING, NULL, 0, sysctl_debug_ktr_cpumask, "S", "Bitmask of CPUs on which KTR logging is enabled"); static int sysctl_debug_ktr_clear(SYSCTL_HANDLER_ARGS) { int clear, error; clear = 0; error = sysctl_handle_int(oidp, &clear, 0, req); if (error || !req->newptr) return (error); if (clear) { bzero(ktr_buf, sizeof(*ktr_buf) * ktr_entries); ktr_idx = 0; } return (error); } SYSCTL_PROC(_debug_ktr, OID_AUTO, clear, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_debug_ktr_clear, "I", "Clear KTR Buffer"); /* * This is a sysctl proc so that it is serialized as !MPSAFE along with * the other ktr sysctl procs. */ static int sysctl_debug_ktr_mask(SYSCTL_HANDLER_ARGS) { uint64_t mask; int error; mask = ktr_mask; error = sysctl_handle_64(oidp, &mask, 0, req); if (error || !req->newptr) return (error); ktr_mask = mask; return (error); } SYSCTL_PROC(_debug_ktr, OID_AUTO, mask, CTLTYPE_U64 | CTLFLAG_RWTUN, 0, 0, sysctl_debug_ktr_mask, "QU", "Bitmask of KTR event classes for which logging is enabled"); #if KTR_ENTRIES > KTR_BOOT_ENTRIES /* * A simplified version of sysctl_debug_ktr_entries. * No need to care about SMP, scheduling, etc. */ static void ktr_entries_initializer(void *dummy __unused) { uint64_t mask; /* Temporarily disable ktr in case malloc() is being traced. 
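For context, records enter this buffer through the CTR*() macros from <sys/ktr.h>; a sketch of a typical call site follows (the function and its arguments are hypothetical). The record is emitted only when the event class bit is set in debug.ktr.mask and the current CPU is in debug.ktr.cpumask:

#include <sys/param.h>
#include <sys/ktr.h>
#include <sys/proc.h>

static void
example_trace(struct proc *p, int state)
{

	/* Compiled in only if KTR_PROC is part of KTR_COMPILE. */
	CTR2(KTR_PROC, "example: pid %d -> state %d", p->p_pid, state);
}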
*/ mask = ktr_mask; ktr_mask = 0; ktr_buf = malloc(sizeof(*ktr_buf) * KTR_ENTRIES, M_KTR, M_WAITOK | M_ZERO); memcpy(ktr_buf, ktr_buf_init + ktr_idx, (KTR_BOOT_ENTRIES - ktr_idx) * sizeof(*ktr_buf)); if (ktr_idx != 0) { memcpy(ktr_buf + KTR_BOOT_ENTRIES - ktr_idx, ktr_buf_init, ktr_idx * sizeof(*ktr_buf)); ktr_idx = KTR_BOOT_ENTRIES; } ktr_entries = KTR_ENTRIES; ktr_mask = mask; } SYSINIT(ktr_entries_initializer, SI_SUB_KMEM, SI_ORDER_ANY, ktr_entries_initializer, NULL); #endif static int sysctl_debug_ktr_entries(SYSCTL_HANDLER_ARGS) { uint64_t mask; int entries, error; struct ktr_entry *buf, *oldbuf; entries = ktr_entries; error = sysctl_handle_int(oidp, &entries, 0, req); if (error || !req->newptr) return (error); if (entries > KTR_ENTRIES_MAX) return (ERANGE); /* Disable ktr temporarily. */ mask = ktr_mask; ktr_mask = 0; /* Wait for threads to go idle. */ if ((error = quiesce_all_cpus("ktrent", PCATCH)) != 0) { ktr_mask = mask; return (error); } if (ktr_buf != ktr_buf_init) oldbuf = ktr_buf; else oldbuf = NULL; /* Allocate a new buffer. */ buf = malloc(sizeof(*buf) * entries, M_KTR, M_WAITOK | M_ZERO); /* Install the new buffer and restart ktr. */ ktr_buf = buf; ktr_entries = entries; ktr_idx = 0; ktr_mask = mask; if (oldbuf != NULL) free(oldbuf, M_KTR); return (error); } SYSCTL_PROC(_debug_ktr, OID_AUTO, entries, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_debug_ktr_entries, "I", "Number of entries in the KTR buffer"); #ifdef KTR_VERBOSE int ktr_verbose = KTR_VERBOSE; TUNABLE_INT("debug.ktr.verbose", &ktr_verbose); SYSCTL_INT(_debug_ktr, OID_AUTO, verbose, CTLFLAG_RW, &ktr_verbose, 0, ""); #endif #ifdef KTR_ALQ struct alq *ktr_alq; char ktr_alq_file[MAXPATHLEN] = "/tmp/ktr.out"; int ktr_alq_cnt = 0; int ktr_alq_depth = KTR_ENTRIES; int ktr_alq_enabled = 0; int ktr_alq_failed = 0; int ktr_alq_max = 0; SYSCTL_INT(_debug_ktr, OID_AUTO, alq_max, CTLFLAG_RW, &ktr_alq_max, 0, "Maximum number of entries to write"); SYSCTL_INT(_debug_ktr, OID_AUTO, alq_cnt, CTLFLAG_RD, &ktr_alq_cnt, 0, "Current number of written entries"); SYSCTL_INT(_debug_ktr, OID_AUTO, alq_failed, CTLFLAG_RD, &ktr_alq_failed, 0, "Number of times we overran the buffer"); SYSCTL_INT(_debug_ktr, OID_AUTO, alq_depth, CTLFLAG_RW, &ktr_alq_depth, 0, "Number of items in the write buffer"); SYSCTL_STRING(_debug_ktr, OID_AUTO, alq_file, CTLFLAG_RW, ktr_alq_file, sizeof(ktr_alq_file), "KTR logging file"); static int sysctl_debug_ktr_alq_enable(SYSCTL_HANDLER_ARGS) { int error; int enable; enable = ktr_alq_enabled; error = sysctl_handle_int(oidp, &enable, 0, req); if (error || !req->newptr) return (error); if (enable) { if (ktr_alq_enabled) return (0); error = alq_open(&ktr_alq, (const char *)ktr_alq_file, req->td->td_ucred, ALQ_DEFAULT_CMODE, sizeof(struct ktr_entry), ktr_alq_depth); if (error == 0) { ktr_alq_cnt = 0; ktr_alq_failed = 0; ktr_alq_enabled = 1; } } else { if (ktr_alq_enabled == 0) return (0); ktr_alq_enabled = 0; alq_close(ktr_alq); ktr_alq = NULL; } return (error); } SYSCTL_PROC(_debug_ktr, OID_AUTO, alq_enable, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_debug_ktr_alq_enable, "I", "Enable KTR logging"); #endif void ktr_tracepoint(uint64_t mask, const char *file, int line, const char *format, u_long arg1, u_long arg2, u_long arg3, u_long arg4, u_long arg5, u_long arg6) { struct ktr_entry *entry; #ifdef KTR_ALQ struct ale *ale = NULL; #endif int newindex, saveindex; #if defined(KTR_VERBOSE) || defined(KTR_ALQ) struct thread *td; #endif int cpu; if (panicstr || kdb_active) return; if ((ktr_mask & mask) == 0 || ktr_buf == NULL) return; 
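/*
 * A note on the slot reservation below: it is deliberately lock-free so
 * that tracepoints are safe from almost any context.  Each tracer claims
 * a unique buffer index with a compare-and-set retry loop, conceptually:
 *
 *	do {
 *		old = ktr_idx;
 *		new = (old + 1) % ktr_entries;
 *	} while (atomic_cmpset_rel_int(&ktr_idx, old, new) == 0);
 *
 * so concurrent CPUs never hand out the same entry twice.
 */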
cpu = KTR_CPU; if (!CPU_ISSET(cpu, &ktr_cpumask)) return; #if defined(KTR_VERBOSE) || defined(KTR_ALQ) td = curthread; if (td->td_pflags & TDP_INKTR) return; td->td_pflags |= TDP_INKTR; #endif #ifdef KTR_ALQ if (ktr_alq_enabled) { if (td->td_critnest == 0 && (td->td_flags & TDF_IDLETD) == 0 && td != ald_thread) { if (ktr_alq_max && ktr_alq_cnt > ktr_alq_max) goto done; if ((ale = alq_get(ktr_alq, ALQ_NOWAIT)) == NULL) { ktr_alq_failed++; goto done; } ktr_alq_cnt++; entry = (struct ktr_entry *)ale->ae_data; } else { goto done; } } else #endif { do { saveindex = ktr_idx; newindex = (saveindex + 1) % ktr_entries; } while (atomic_cmpset_rel_int(&ktr_idx, saveindex, newindex) == 0); entry = &ktr_buf[saveindex]; } entry->ktr_timestamp = KTR_TIME; entry->ktr_cpu = cpu; entry->ktr_thread = curthread; if (file != NULL) while (strncmp(file, "../", 3) == 0) file += 3; entry->ktr_file = file; entry->ktr_line = line; #ifdef KTR_VERBOSE if (ktr_verbose) { #ifdef SMP printf("cpu%d ", cpu); #endif if (ktr_verbose > 1) { printf("%s.%d\t", entry->ktr_file, entry->ktr_line); } printf(format, arg1, arg2, arg3, arg4, arg5, arg6); printf("\n"); } #endif entry->ktr_desc = format; entry->ktr_parms[0] = arg1; entry->ktr_parms[1] = arg2; entry->ktr_parms[2] = arg3; entry->ktr_parms[3] = arg4; entry->ktr_parms[4] = arg5; entry->ktr_parms[5] = arg6; #ifdef KTR_ALQ if (ktr_alq_enabled && ale) alq_post(ktr_alq, ale); done: #endif #if defined(KTR_VERBOSE) || defined(KTR_ALQ) td->td_pflags &= ~TDP_INKTR; #endif } #ifdef DDB struct tstate { int cur; int first; }; static struct tstate tstate; static int db_ktr_verbose; static int db_mach_vtrace(void); DB_SHOW_COMMAND(ktr, db_ktr_all) { tstate.cur = (ktr_idx - 1) % ktr_entries; tstate.first = -1; db_ktr_verbose = 0; db_ktr_verbose |= (strchr(modif, 'v') != NULL) ? 2 : 0; db_ktr_verbose |= (strchr(modif, 'V') != NULL) ? 1 : 0; /* just timestamp please */ if (strchr(modif, 'a') != NULL) { db_disable_pager(); while (cncheckc() == -1) if (db_mach_vtrace() == 0) break; } else { while (!db_pager_quit) if (db_mach_vtrace() == 0) break; } } static int db_mach_vtrace(void) { struct ktr_entry *kp; if (tstate.cur == tstate.first || ktr_buf == NULL) { db_printf("--- End of trace buffer ---\n"); return (0); } kp = &ktr_buf[tstate.cur]; /* Skip over unused entries. */ if (kp->ktr_desc == NULL) { db_printf("--- End of trace buffer ---\n"); return (0); } db_printf("%d (%p", tstate.cur, kp->ktr_thread); #ifdef SMP db_printf(":cpu%d", kp->ktr_cpu); #endif db_printf(")"); if (db_ktr_verbose >= 1) { db_printf(" %10.10lld", (long long)kp->ktr_timestamp); } if (db_ktr_verbose >= 2) { db_printf(" %s.%d", kp->ktr_file, kp->ktr_line); } db_printf(": "); db_printf(kp->ktr_desc, kp->ktr_parms[0], kp->ktr_parms[1], kp->ktr_parms[2], kp->ktr_parms[3], kp->ktr_parms[4], kp->ktr_parms[5]); db_printf("\n"); if (tstate.first == -1) tstate.first = tstate.cur; if (--tstate.cur < 0) tstate.cur = ktr_entries - 1; return (1); } #endif /* DDB */ Index: head/sys/kern/kern_linker.c =================================================================== --- head/sys/kern/kern_linker.c (revision 326270) +++ head/sys/kern/kern_linker.c (revision 326271) @@ -1,2235 +1,2237 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1997-2000 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
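The DB_SHOW_COMMAND above is driven from the kernel debugger; usage is roughly as follows, matching the strchr() modifier checks in db_ktr_all():

	db> show ktr		dump entries, newest first, with paging
	db> show ktr /V		also print timestamps
	db> show ktr /v		also print timestamps and file:line
	db> show ktr /a		dump everything without paging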
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kld.h" #include "opt_hwpmc_hooks.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include "linker_if.h" #ifdef HWPMC_HOOKS #include #endif #ifdef KLD_DEBUG int kld_debug = 0; SYSCTL_INT(_debug, OID_AUTO, kld_debug, CTLFLAG_RWTUN, &kld_debug, 0, "Set various levels of KLD debug"); #endif /* These variables are used by kernel debuggers to enumerate loaded files. */ const int kld_off_address = offsetof(struct linker_file, address); const int kld_off_filename = offsetof(struct linker_file, filename); const int kld_off_pathname = offsetof(struct linker_file, pathname); const int kld_off_next = offsetof(struct linker_file, link.tqe_next); /* * static char *linker_search_path(const char *name, struct mod_depend * *verinfo); */ static const char *linker_basename(const char *path); /* * Find a currently loaded file given its filename. */ static linker_file_t linker_find_file_by_name(const char* _filename); /* * Find a currently loaded file given its file id. */ static linker_file_t linker_find_file_by_id(int _fileid); /* Metadata from the static kernel */ SET_DECLARE(modmetadata_set, struct mod_metadata); MALLOC_DEFINE(M_LINKER, "linker", "kernel linker"); linker_file_t linker_kernel_file; static struct sx kld_sx; /* kernel linker lock */ /* * Load counter used by clients to determine if a linker file has been * re-loaded. This counter is incremented for each file load. 
*/ static int loadcnt; static linker_class_list_t classes; static linker_file_list_t linker_files; static int next_file_id = 1; static int linker_no_more_classes = 0; #define LINKER_GET_NEXT_FILE_ID(a) do { \ linker_file_t lftmp; \ \ if (!cold) \ sx_assert(&kld_sx, SA_XLOCKED); \ retry: \ TAILQ_FOREACH(lftmp, &linker_files, link) { \ if (next_file_id == lftmp->id) { \ next_file_id++; \ goto retry; \ } \ } \ (a) = next_file_id; \ } while(0) /* XXX wrong name; we're looking at version provision tags here, not modules */ typedef TAILQ_HEAD(, modlist) modlisthead_t; struct modlist { TAILQ_ENTRY(modlist) link; /* chain together all modules */ linker_file_t container; const char *name; int version; }; typedef struct modlist *modlist_t; static modlisthead_t found_modules; static int linker_file_add_dependency(linker_file_t file, linker_file_t dep); static caddr_t linker_file_lookup_symbol_internal(linker_file_t file, const char* name, int deps); static int linker_load_module(const char *kldname, const char *modname, struct linker_file *parent, const struct mod_depend *verinfo, struct linker_file **lfpp); static modlist_t modlist_lookup2(const char *name, const struct mod_depend *verinfo); static void linker_init(void *arg) { sx_init(&kld_sx, "kernel linker"); TAILQ_INIT(&classes); TAILQ_INIT(&linker_files); } SYSINIT(linker, SI_SUB_KLD, SI_ORDER_FIRST, linker_init, 0); static void linker_stop_class_add(void *arg) { linker_no_more_classes = 1; } SYSINIT(linker_class, SI_SUB_KLD, SI_ORDER_ANY, linker_stop_class_add, NULL); int linker_add_class(linker_class_t lc) { /* * We disallow any class registration past SI_ORDER_ANY * of SI_SUB_KLD. We bump the reference count to keep the * ops from being freed. */ if (linker_no_more_classes == 1) return (EPERM); kobj_class_compile((kobj_class_t) lc); ((kobj_class_t)lc)->refs++; /* XXX: kobj_mtx */ TAILQ_INSERT_TAIL(&classes, lc, link); return (0); } static void linker_file_sysinit(linker_file_t lf) { struct sysinit **start, **stop, **sipp, **xipp, *save; KLD_DPF(FILE, ("linker_file_sysinit: calling SYSINITs for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysinit_set", &start, &stop, NULL) != 0) return; /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). * * Since some things care about execution order, this is the operation * which ensures continued function. */ for (sipp = start; sipp < stop; sipp++) { for (xipp = sipp + 1; xipp < stop; xipp++) { if ((*sipp)->subsystem < (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order <= (*xipp)->order)) continue; /* skip */ save = *sipp; *sipp = *xipp; *xipp = save; } } /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. 
*/ sx_xunlock(&kld_sx); mtx_lock(&Giant); for (sipp = start; sipp < stop; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s) */ /* Call function */ (*((*sipp)->func)) ((*sipp)->udata); } mtx_unlock(&Giant); sx_xlock(&kld_sx); } static void linker_file_sysuninit(linker_file_t lf) { struct sysinit **start, **stop, **sipp, **xipp, *save; KLD_DPF(FILE, ("linker_file_sysuninit: calling SYSUNINITs for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysuninit_set", &start, &stop, NULL) != 0) return; /* * Perform a reverse bubble sort of the system initialization objects * by their subsystem (primary key) and order (secondary key). * * Since some things care about execution order, this is the operation * which ensures continued function. */ for (sipp = start; sipp < stop; sipp++) { for (xipp = sipp + 1; xipp < stop; xipp++) { if ((*sipp)->subsystem > (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order >= (*xipp)->order)) continue; /* skip */ save = *sipp; *sipp = *xipp; *xipp = save; } } /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. */ sx_xunlock(&kld_sx); mtx_lock(&Giant); for (sipp = start; sipp < stop; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s) */ /* Call function */ (*((*sipp)->func)) ((*sipp)->udata); } mtx_unlock(&Giant); sx_xlock(&kld_sx); } static void linker_file_register_sysctls(linker_file_t lf, bool enable) { struct sysctl_oid **start, **stop, **oidp; KLD_DPF(FILE, ("linker_file_register_sysctls: registering SYSCTLs for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0) return; sx_xunlock(&kld_sx); sysctl_wlock(); for (oidp = start; oidp < stop; oidp++) { if (enable) sysctl_register_oid(*oidp); else sysctl_register_disabled_oid(*oidp); } sysctl_wunlock(); sx_xlock(&kld_sx); } static void linker_file_enable_sysctls(linker_file_t lf) { struct sysctl_oid **start, **stop, **oidp; KLD_DPF(FILE, ("linker_file_enable_sysctls: enable SYSCTLs for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0) return; sx_xunlock(&kld_sx); sysctl_wlock(); for (oidp = start; oidp < stop; oidp++) sysctl_enable_oid(*oidp); sysctl_wunlock(); sx_xlock(&kld_sx); } static void linker_file_unregister_sysctls(linker_file_t lf) { struct sysctl_oid **start, **stop, **oidp; KLD_DPF(FILE, ("linker_file_unregister_sysctls: unregistering SYSCTLs" " for %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0) return; sx_xunlock(&kld_sx); sysctl_wlock(); for (oidp = start; oidp < stop; oidp++) sysctl_unregister_oid(*oidp); sysctl_wunlock(); sx_xlock(&kld_sx); } static int linker_file_register_modules(linker_file_t lf) { struct mod_metadata **start, **stop, **mdp; const moduledata_t *moddata; int first_error, error; KLD_DPF(FILE, ("linker_file_register_modules: registering modules" " in %s\n", lf->filename)); sx_assert(&kld_sx, SA_XLOCKED); if (linker_file_lookup_set(lf, "modmetadata_set", &start, &stop, NULL) != 0) { /* * This fallback should be unnecessary, but if we get booted * from boot2 instead of loader and we are missing our * metadata then we have to try the best we can. 
 */ if (lf == linker_kernel_file) { start = SET_BEGIN(modmetadata_set); stop = SET_LIMIT(modmetadata_set); } else return (0); } first_error = 0; for (mdp = start; mdp < stop; mdp++) { if ((*mdp)->md_type != MDT_MODULE) continue; moddata = (*mdp)->md_data; KLD_DPF(FILE, ("Registering module %s in %s\n", moddata->name, lf->filename)); error = module_register(moddata, lf); if (error) { printf("Module %s failed to register: %d\n", moddata->name, error); if (first_error == 0) first_error = error; } } return (first_error); } static void linker_init_kernel_modules(void) { sx_xlock(&kld_sx); linker_file_register_modules(linker_kernel_file); sx_xunlock(&kld_sx); } SYSINIT(linker_kernel, SI_SUB_KLD, SI_ORDER_ANY, linker_init_kernel_modules, 0); static int linker_load_file(const char *filename, linker_file_t *result) { linker_class_t lc; linker_file_t lf; int foundfile, error, modules; /* Refuse to load modules if securelevel raised */ if (prison0.pr_securelevel > 0) return (EPERM); sx_assert(&kld_sx, SA_XLOCKED); lf = linker_find_file_by_name(filename); if (lf) { KLD_DPF(FILE, ("linker_load_file: file %s is already loaded," " incrementing refs\n", filename)); *result = lf; lf->refs++; return (0); } foundfile = 0; error = 0; /* * We do not need to protect (lock) classes here because there is * no class registration past startup (SI_SUB_KLD, SI_ORDER_ANY) * and there is no class deregistration mechanism at this time. */ TAILQ_FOREACH(lc, &classes, link) { KLD_DPF(FILE, ("linker_load_file: trying to load %s\n", filename)); error = LINKER_LOAD_FILE(lc, filename, &lf); /* * If we got something other than ENOENT, then it exists but * we cannot load it for some other reason. */ if (error != ENOENT) foundfile = 1; if (lf) { error = linker_file_register_modules(lf); if (error == EEXIST) { linker_file_unload(lf, LINKER_UNLOAD_FORCE); return (error); } modules = !TAILQ_EMPTY(&lf->modules); linker_file_register_sysctls(lf, false); linker_file_sysinit(lf); lf->flags |= LINKER_FILE_LINKED; /* * If all of the modules in this file failed * to load, unload the file and return an * error of ENOEXEC. */ if (modules && TAILQ_EMPTY(&lf->modules)) { linker_file_unload(lf, LINKER_UNLOAD_FORCE); return (ENOEXEC); } linker_file_enable_sysctls(lf); EVENTHANDLER_INVOKE(kld_load, lf); *result = lf; return (0); } } /* * Less than ideal, but tells the user whether it failed to load or * the module was not found. */ if (foundfile) { /* * If the file type was not recognized by the last try, * print a message before failing. */ if (error == ENOSYS) printf("%s: %s - unsupported file type\n", __func__, filename); /* * Format not recognized or otherwise unloadable. * When loading a module that is statically built into * the kernel EEXIST percolates back up as the return * value. Preserve this so that apps like sysinstall * can recognize this special case and not post bogus * dialog boxes. 
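The records walked here come from the module declaration macros; a sketch of what a KLD typically supplies is shown below (a hypothetical "foo" module; foo_modevent is assumed to exist). DECLARE_MODULE() emits the MDT_MODULE record consumed by linker_file_register_modules(), while MODULE_VERSION()/MODULE_DEPEND() emit the MDT_VERSION/MDT_DEPEND records the dependency code walks:

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/module.h>

static moduledata_t foo_mod = {
	"foo",		/* name, recorded in an MDT_MODULE entry */
	foo_modevent,	/* event handler (hypothetical) */
	NULL		/* extra data */
};
DECLARE_MODULE(foo, foo_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
MODULE_VERSION(foo, 1);			/* emits an MDT_VERSION record */
MODULE_DEPEND(foo, bar, 1, 1, 1);	/* emits an MDT_DEPEND on "bar" */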
*/ if (error != EEXIST) error = ENOEXEC; } else error = ENOENT; /* Nothing found */ return (error); } int linker_reference_module(const char *modname, struct mod_depend *verinfo, linker_file_t *result) { modlist_t mod; int error; sx_xlock(&kld_sx); if ((mod = modlist_lookup2(modname, verinfo)) != NULL) { *result = mod->container; (*result)->refs++; sx_xunlock(&kld_sx); return (0); } error = linker_load_module(NULL, modname, NULL, verinfo, result); sx_xunlock(&kld_sx); return (error); } int linker_release_module(const char *modname, struct mod_depend *verinfo, linker_file_t lf) { modlist_t mod; int error; sx_xlock(&kld_sx); if (lf == NULL) { KASSERT(modname != NULL, ("linker_release_module: no file or name")); mod = modlist_lookup2(modname, verinfo); if (mod == NULL) { sx_xunlock(&kld_sx); return (ESRCH); } lf = mod->container; } else KASSERT(modname == NULL && verinfo == NULL, ("linker_release_module: both file and name")); error = linker_file_unload(lf, LINKER_UNLOAD_NORMAL); sx_xunlock(&kld_sx); return (error); } static linker_file_t linker_find_file_by_name(const char *filename) { linker_file_t lf; char *koname; koname = malloc(strlen(filename) + 4, M_LINKER, M_WAITOK); sprintf(koname, "%s.ko", filename); sx_assert(&kld_sx, SA_XLOCKED); TAILQ_FOREACH(lf, &linker_files, link) { if (strcmp(lf->filename, koname) == 0) break; if (strcmp(lf->filename, filename) == 0) break; } free(koname, M_LINKER); return (lf); } static linker_file_t linker_find_file_by_id(int fileid) { linker_file_t lf; sx_assert(&kld_sx, SA_XLOCKED); TAILQ_FOREACH(lf, &linker_files, link) if (lf->id == fileid && lf->flags & LINKER_FILE_LINKED) break; return (lf); } int linker_file_foreach(linker_predicate_t *predicate, void *context) { linker_file_t lf; int retval = 0; sx_xlock(&kld_sx); TAILQ_FOREACH(lf, &linker_files, link) { retval = predicate(lf, context); if (retval != 0) break; } sx_xunlock(&kld_sx); return (retval); } linker_file_t linker_make_file(const char *pathname, linker_class_t lc) { linker_file_t lf; const char *filename; if (!cold) sx_assert(&kld_sx, SA_XLOCKED); filename = linker_basename(pathname); KLD_DPF(FILE, ("linker_make_file: new file, filename='%s' for pathname='%s'\n", filename, pathname)); lf = (linker_file_t)kobj_create((kobj_class_t)lc, M_LINKER, M_WAITOK); if (lf == NULL) return (NULL); lf->ctors_addr = 0; lf->ctors_size = 0; lf->refs = 1; lf->userrefs = 0; lf->flags = 0; lf->filename = strdup(filename, M_LINKER); lf->pathname = strdup(pathname, M_LINKER); LINKER_GET_NEXT_FILE_ID(lf->id); lf->ndeps = 0; lf->deps = NULL; lf->loadcnt = ++loadcnt; STAILQ_INIT(&lf->common); TAILQ_INIT(&lf->modules); TAILQ_INSERT_TAIL(&linker_files, lf, link); return (lf); } int linker_file_unload(linker_file_t file, int flags) { module_t mod, next; modlist_t ml, nextml; struct common_symbol *cp; int error, i; /* Refuse to unload modules if securelevel raised. */ if (prison0.pr_securelevel > 0) return (EPERM); sx_assert(&kld_sx, SA_XLOCKED); KLD_DPF(FILE, ("linker_file_unload: lf->refs=%d\n", file->refs)); /* Easy case of just dropping a reference. */ if (file->refs > 1) { file->refs--; return (0); } /* Give eventhandlers a chance to prevent the unload. */ error = 0; EVENTHANDLER_INVOKE(kld_unload_try, file, &error); if (error != 0) return (EBUSY); KLD_DPF(FILE, ("linker_file_unload: file is unloading," " informing modules\n")); /* * Quiesce all the modules to give them a chance to veto the unload. 
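linker_file_foreach() above iterates over the file list with the lock held until the predicate returns non-zero; a minimal sketch of a consumer (hypothetical names):

static int
count_files_cb(linker_file_t lf, void *ctx)
{

	(*(int *)ctx)++;
	return (0);	/* zero means keep iterating */
}

static int
count_linker_files(void)
{
	int count;

	count = 0;
	linker_file_foreach(count_files_cb, &count);
	return (count);
}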
*/ MOD_SLOCK; for (mod = TAILQ_FIRST(&file->modules); mod; mod = module_getfnext(mod)) { error = module_quiesce(mod); if (error != 0 && flags != LINKER_UNLOAD_FORCE) { KLD_DPF(FILE, ("linker_file_unload: module %s" " vetoed unload\n", module_getname(mod))); /* * XXX: Do we need to tell all the quiesced modules * that they can resume work now via a new module * event? */ MOD_SUNLOCK; return (error); } } MOD_SUNLOCK; /* * Inform any modules associated with this file that they are * being unloaded. */ MOD_XLOCK; for (mod = TAILQ_FIRST(&file->modules); mod; mod = next) { next = module_getfnext(mod); MOD_XUNLOCK; /* * Give the module a chance to veto the unload. */ if ((error = module_unload(mod)) != 0) { #ifdef KLD_DEBUG MOD_SLOCK; KLD_DPF(FILE, ("linker_file_unload: module %s" " failed unload\n", module_getname(mod))); MOD_SUNLOCK; #endif return (error); } MOD_XLOCK; module_release(mod); } MOD_XUNLOCK; TAILQ_FOREACH_SAFE(ml, &found_modules, link, nextml) { if (ml->container == file) { TAILQ_REMOVE(&found_modules, ml, link); free(ml, M_LINKER); } } /* * Don't try to run SYSUNINITs if we are unloaded due to a * link error. */ if (file->flags & LINKER_FILE_LINKED) { file->flags &= ~LINKER_FILE_LINKED; linker_file_unregister_sysctls(file); linker_file_sysuninit(file); } TAILQ_REMOVE(&linker_files, file, link); if (file->deps) { for (i = 0; i < file->ndeps; i++) linker_file_unload(file->deps[i], flags); free(file->deps, M_LINKER); file->deps = NULL; } while ((cp = STAILQ_FIRST(&file->common)) != NULL) { STAILQ_REMOVE_HEAD(&file->common, link); free(cp, M_LINKER); } LINKER_UNLOAD(file); EVENTHANDLER_INVOKE(kld_unload, file->filename, file->address, file->size); if (file->filename) { free(file->filename, M_LINKER); file->filename = NULL; } if (file->pathname) { free(file->pathname, M_LINKER); file->pathname = NULL; } kobj_delete((kobj_t) file, M_LINKER); return (0); } int linker_ctf_get(linker_file_t file, linker_ctf_t *lc) { return (LINKER_CTF_GET(file, lc)); } static int linker_file_add_dependency(linker_file_t file, linker_file_t dep) { linker_file_t *newdeps; sx_assert(&kld_sx, SA_XLOCKED); file->deps = realloc(file->deps, (file->ndeps + 1) * sizeof(*newdeps), M_LINKER, M_WAITOK | M_ZERO); file->deps[file->ndeps] = dep; file->ndeps++; KLD_DPF(FILE, ("linker_file_add_dependency:" " adding %s as dependency for %s\n", dep->filename, file->filename)); return (0); } /* * Locate a linker set and its contents. This is a helper function to avoid * linker_if.h exposure elsewhere. Note: firstp and lastp are really void **. * This function is used in this file so we can avoid having lots of (void **) * casts. */ int linker_file_lookup_set(linker_file_t file, const char *name, void *firstp, void *lastp, int *countp) { sx_assert(&kld_sx, SA_LOCKED); return (LINKER_LOOKUP_SET(file, name, firstp, lastp, countp)); } /* * List all functions in a file. 
*/ int linker_file_function_listall(linker_file_t lf, linker_function_nameval_callback_t callback_func, void *arg) { return (LINKER_EACH_FUNCTION_NAMEVAL(lf, callback_func, arg)); } caddr_t linker_file_lookup_symbol(linker_file_t file, const char *name, int deps) { caddr_t sym; int locked; locked = sx_xlocked(&kld_sx); if (!locked) sx_xlock(&kld_sx); sym = linker_file_lookup_symbol_internal(file, name, deps); if (!locked) sx_xunlock(&kld_sx); return (sym); } static caddr_t linker_file_lookup_symbol_internal(linker_file_t file, const char *name, int deps) { c_linker_sym_t sym; linker_symval_t symval; caddr_t address; size_t common_size = 0; int i; sx_assert(&kld_sx, SA_XLOCKED); KLD_DPF(SYM, ("linker_file_lookup_symbol: file=%p, name=%s, deps=%d\n", file, name, deps)); if (LINKER_LOOKUP_SYMBOL(file, name, &sym) == 0) { LINKER_SYMBOL_VALUES(file, sym, &symval); if (symval.value == 0) /* * For commons, first look them up in the * dependencies and only allocate space if not found * there. */ common_size = symval.size; else { KLD_DPF(SYM, ("linker_file_lookup_symbol: symbol" ".value=%p\n", symval.value)); return (symval.value); } } if (deps) { for (i = 0; i < file->ndeps; i++) { address = linker_file_lookup_symbol_internal( file->deps[i], name, 0); if (address) { KLD_DPF(SYM, ("linker_file_lookup_symbol:" " deps value=%p\n", address)); return (address); } } } if (common_size > 0) { /* * This is a common symbol which was not found in the * dependencies. We maintain a simple common symbol table in * the file object. */ struct common_symbol *cp; STAILQ_FOREACH(cp, &file->common, link) { if (strcmp(cp->name, name) == 0) { KLD_DPF(SYM, ("linker_file_lookup_symbol:" " old common value=%p\n", cp->address)); return (cp->address); } } /* * Round the symbol size up to align. */ common_size = (common_size + sizeof(int) - 1) & -sizeof(int); cp = malloc(sizeof(struct common_symbol) + common_size + strlen(name) + 1, M_LINKER, M_WAITOK | M_ZERO); cp->address = (caddr_t)(cp + 1); cp->name = cp->address + common_size; strcpy(cp->name, name); bzero(cp->address, common_size); STAILQ_INSERT_TAIL(&file->common, cp, link); KLD_DPF(SYM, ("linker_file_lookup_symbol: new common" " value=%p\n", cp->address)); return (cp->address); } KLD_DPF(SYM, ("linker_file_lookup_symbol: fail\n")); return (0); } /* * Both DDB and stack(9) rely on the kernel linker to provide forward and * backward lookup of symbols. However, DDB and sometimes stack(9) need to * do this in a lockfree manner. We provide a set of internal helper * routines to perform these operations without locks, and then wrappers that * optionally lock. * * linker_debug_lookup() is ifdef DDB as currently it's only used by DDB. 
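A sketch of how a caller might resolve an optional symbol through linker_file_lookup_symbol(), including the file's dependencies (the symbol name "foo_hook" is hypothetical):

static void
call_optional_hook(linker_file_t lf)
{
	void (*hook)(void);

	/* With deps != 0, search lf and everything it depends on. */
	hook = (void (*)(void))linker_file_lookup_symbol(lf, "foo_hook", 1);
	if (hook != NULL)
		hook();
}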
#ifdef DDB static int linker_debug_lookup(const char *symstr, c_linker_sym_t *sym) { linker_file_t lf; TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_LOOKUP_SYMBOL(lf, symstr, sym) == 0) return (0); } return (ENOENT); } #endif static int linker_debug_search_symbol(caddr_t value, c_linker_sym_t *sym, long *diffp) { linker_file_t lf; c_linker_sym_t best, es; u_long diff, bestdiff, off; best = 0; off = (uintptr_t)value; bestdiff = off; TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_SEARCH_SYMBOL(lf, value, &es, &diff) != 0) continue; if (es != 0 && diff < bestdiff) { best = es; bestdiff = diff; } if (bestdiff == 0) break; } if (best) { *sym = best; *diffp = bestdiff; return (0); } else { *sym = 0; *diffp = off; return (ENOENT); } } static int linker_debug_symbol_values(c_linker_sym_t sym, linker_symval_t *symval) { linker_file_t lf; TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_SYMBOL_VALUES(lf, sym, symval) == 0) return (0); } return (ENOENT); } static int linker_debug_search_symbol_name(caddr_t value, char *buf, u_int buflen, long *offset) { linker_symval_t symval; c_linker_sym_t sym; int error; *offset = 0; error = linker_debug_search_symbol(value, &sym, offset); if (error) return (error); error = linker_debug_symbol_values(sym, &symval); if (error) return (error); strlcpy(buf, symval.name, buflen); return (0); } /* * DDB Helpers. DDB has to look across multiple files with their own symbol * tables and string tables. * * Note that we do not obey list locking protocols here. We really don't need * DDB to hang because somebody's got the lock held. We'll take the chance * that the files list is inconsistent instead. */ #ifdef DDB int linker_ddb_lookup(const char *symstr, c_linker_sym_t *sym) { return (linker_debug_lookup(symstr, sym)); } #endif int linker_ddb_search_symbol(caddr_t value, c_linker_sym_t *sym, long *diffp) { return (linker_debug_search_symbol(value, sym, diffp)); } int linker_ddb_symbol_values(c_linker_sym_t sym, linker_symval_t *symval) { return (linker_debug_symbol_values(sym, symval)); } int linker_ddb_search_symbol_name(caddr_t value, char *buf, u_int buflen, long *offset) { return (linker_debug_search_symbol_name(value, buf, buflen, offset)); } /* * stack(9) helper for non-debugging environments. Unlike DDB helpers, we do * obey locking protocols, and offer a significantly less complex interface. */ int linker_search_symbol_name(caddr_t value, char *buf, u_int buflen, long *offset) { int error; sx_slock(&kld_sx); error = linker_debug_search_symbol_name(value, buf, buflen, offset); sx_sunlock(&kld_sx); return (error); } /* * Syscalls. */ int kern_kldload(struct thread *td, const char *file, int *fileid) { const char *kldname, *modname; linker_file_t lf; int error; if ((error = securelevel_gt(td->td_ucred, 0)) != 0) return (error); if ((error = priv_check(td, PRIV_KLD_LOAD)) != 0) return (error); /* * It is possible that a kldloaded module will attach a new ifnet, * so the vnet context must be set when this occurs. */ CURVNET_SET(TD_TO_VNET(td)); /* * If file does not contain a qualified name or any dot in it * (kldname.ko, or kldname.ver.ko) treat it as an interface * name. 
*/ if (strchr(file, '/') || strchr(file, '.')) { kldname = file; modname = NULL; } else { kldname = NULL; modname = file; } sx_xlock(&kld_sx); error = linker_load_module(kldname, modname, NULL, NULL, &lf); if (error) { sx_xunlock(&kld_sx); goto done; } lf->userrefs++; if (fileid != NULL) *fileid = lf->id; sx_xunlock(&kld_sx); done: CURVNET_RESTORE(); return (error); } int sys_kldload(struct thread *td, struct kldload_args *uap) { char *pathname = NULL; int error, fileid; td->td_retval[0] = -1; pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); error = copyinstr(uap->file, pathname, MAXPATHLEN, NULL); if (error == 0) { error = kern_kldload(td, pathname, &fileid); if (error == 0) td->td_retval[0] = fileid; } free(pathname, M_TEMP); return (error); } int kern_kldunload(struct thread *td, int fileid, int flags) { linker_file_t lf; int error = 0; if ((error = securelevel_gt(td->td_ucred, 0)) != 0) return (error); if ((error = priv_check(td, PRIV_KLD_UNLOAD)) != 0) return (error); CURVNET_SET(TD_TO_VNET(td)); sx_xlock(&kld_sx); lf = linker_find_file_by_id(fileid); if (lf) { KLD_DPF(FILE, ("kldunload: lf->userrefs=%d\n", lf->userrefs)); if (lf->userrefs == 0) { /* * XXX: maybe LINKER_UNLOAD_FORCE should override ? */ printf("kldunload: attempt to unload file that was" " loaded by the kernel\n"); error = EBUSY; } else { lf->userrefs--; error = linker_file_unload(lf, flags); if (error) lf->userrefs++; } } else error = ENOENT; sx_xunlock(&kld_sx); CURVNET_RESTORE(); return (error); } int sys_kldunload(struct thread *td, struct kldunload_args *uap) { return (kern_kldunload(td, uap->fileid, LINKER_UNLOAD_NORMAL)); } int sys_kldunloadf(struct thread *td, struct kldunloadf_args *uap) { if (uap->flags != LINKER_UNLOAD_NORMAL && uap->flags != LINKER_UNLOAD_FORCE) return (EINVAL); return (kern_kldunload(td, uap->fileid, uap->flags)); } int sys_kldfind(struct thread *td, struct kldfind_args *uap) { char *pathname; const char *filename; linker_file_t lf; int error; #ifdef MAC error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif td->td_retval[0] = -1; pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); if ((error = copyinstr(uap->file, pathname, MAXPATHLEN, NULL)) != 0) goto out; filename = linker_basename(pathname); sx_xlock(&kld_sx); lf = linker_find_file_by_name(filename); if (lf) td->td_retval[0] = lf->id; else error = ENOENT; sx_xunlock(&kld_sx); out: free(pathname, M_TEMP); return (error); } int sys_kldnext(struct thread *td, struct kldnext_args *uap) { linker_file_t lf; int error = 0; #ifdef MAC error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif sx_xlock(&kld_sx); if (uap->fileid == 0) lf = TAILQ_FIRST(&linker_files); else { lf = linker_find_file_by_id(uap->fileid); if (lf == NULL) { error = ENOENT; goto out; } lf = TAILQ_NEXT(lf, link); } /* Skip partially loaded files. */ while (lf != NULL && !(lf->flags & LINKER_FILE_LINKED)) lf = TAILQ_NEXT(lf, link); if (lf) td->td_retval[0] = lf->id; else td->td_retval[0] = 0; out: sx_xunlock(&kld_sx); return (error); } int sys_kldstat(struct thread *td, struct kldstat_args *uap) { struct kld_file_stat *stat; int error, version; /* * Check the version of the user's structure. 
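From userland these entry points are reached through the kldload(2)/kldunload(2) wrappers; a minimal sketch (the module name "foo.ko" is hypothetical):

#include <sys/param.h>
#include <sys/linker.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	int fileid;

	if ((fileid = kldload("foo.ko")) == -1)
		err(1, "kldload(foo.ko)");
	printf("loaded as id %d\n", fileid);
	if (kldunload(fileid) == -1)
		err(1, "kldunload");
	return (0);
}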
*/ if ((error = copyin(&uap->stat->version, &version, sizeof(version))) != 0) return (error); if (version != sizeof(struct kld_file_stat_1) && version != sizeof(struct kld_file_stat)) return (EINVAL); stat = malloc(sizeof(*stat), M_TEMP, M_WAITOK | M_ZERO); error = kern_kldstat(td, uap->fileid, stat); if (error == 0) error = copyout(stat, uap->stat, version); free(stat, M_TEMP); return (error); } int kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat) { linker_file_t lf; int namelen; #ifdef MAC int error; error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif sx_xlock(&kld_sx); lf = linker_find_file_by_id(fileid); if (lf == NULL) { sx_xunlock(&kld_sx); return (ENOENT); } /* Version 1 fields: */ namelen = strlen(lf->filename) + 1; if (namelen > sizeof(stat->name)) namelen = sizeof(stat->name); bcopy(lf->filename, &stat->name[0], namelen); stat->refs = lf->refs; stat->id = lf->id; stat->address = lf->address; stat->size = lf->size; /* Version 2 fields: */ namelen = strlen(lf->pathname) + 1; if (namelen > sizeof(stat->pathname)) namelen = sizeof(stat->pathname); bcopy(lf->pathname, &stat->pathname[0], namelen); sx_xunlock(&kld_sx); td->td_retval[0] = 0; return (0); } #ifdef DDB DB_COMMAND(kldstat, db_kldstat) { linker_file_t lf; #define POINTER_WIDTH ((int)(sizeof(void *) * 2 + 2)) db_printf("Id Refs Address%*c Size Name\n", POINTER_WIDTH - 7, ' '); #undef POINTER_WIDTH TAILQ_FOREACH(lf, &linker_files, link) { if (db_pager_quit) return; db_printf("%2d %4d %p %-8zx %s\n", lf->id, lf->refs, lf->address, lf->size, lf->filename); } } #endif /* DDB */ int sys_kldfirstmod(struct thread *td, struct kldfirstmod_args *uap) { linker_file_t lf; module_t mp; int error = 0; #ifdef MAC error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif sx_xlock(&kld_sx); lf = linker_find_file_by_id(uap->fileid); if (lf) { MOD_SLOCK; mp = TAILQ_FIRST(&lf->modules); if (mp != NULL) td->td_retval[0] = module_getid(mp); else td->td_retval[0] = 0; MOD_SUNLOCK; } else error = ENOENT; sx_xunlock(&kld_sx); return (error); } int sys_kldsym(struct thread *td, struct kldsym_args *uap) { char *symstr = NULL; c_linker_sym_t sym; linker_symval_t symval; linker_file_t lf; struct kld_sym_lookup lookup; int error = 0; #ifdef MAC error = mac_kld_check_stat(td->td_ucred); if (error) return (error); #endif if ((error = copyin(uap->data, &lookup, sizeof(lookup))) != 0) return (error); if (lookup.version != sizeof(lookup) || uap->cmd != KLDSYM_LOOKUP) return (EINVAL); symstr = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); if ((error = copyinstr(lookup.symname, symstr, MAXPATHLEN, NULL)) != 0) goto out; sx_xlock(&kld_sx); if (uap->fileid != 0) { lf = linker_find_file_by_id(uap->fileid); if (lf == NULL) error = ENOENT; else if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 && LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) { lookup.symvalue = (uintptr_t) symval.value; lookup.symsize = symval.size; error = copyout(&lookup, uap->data, sizeof(lookup)); } else error = ENOENT; } else { TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 && LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) { lookup.symvalue = (uintptr_t)symval.value; lookup.symsize = symval.size; error = copyout(&lookup, uap->data, sizeof(lookup)); break; } } if (lf == NULL) error = ENOENT; } sx_xunlock(&kld_sx); out: free(symstr, M_TEMP); return (error); } /* * Preloaded module support */ static modlist_t modlist_lookup(const char *name, int ver) { modlist_t mod; TAILQ_FOREACH(mod, &found_modules, 
link) { if (strcmp(mod->name, name) == 0 && (ver == 0 || mod->version == ver)) return (mod); } return (NULL); } static modlist_t modlist_lookup2(const char *name, const struct mod_depend *verinfo) { modlist_t mod, bestmod; int ver; if (verinfo == NULL) return (modlist_lookup(name, 0)); bestmod = NULL; TAILQ_FOREACH(mod, &found_modules, link) { if (strcmp(mod->name, name) != 0) continue; ver = mod->version; if (ver == verinfo->md_ver_preferred) return (mod); if (ver >= verinfo->md_ver_minimum && ver <= verinfo->md_ver_maximum && (bestmod == NULL || ver > bestmod->version)) bestmod = mod; } return (bestmod); } static modlist_t modlist_newmodule(const char *modname, int version, linker_file_t container) { modlist_t mod; mod = malloc(sizeof(struct modlist), M_LINKER, M_NOWAIT | M_ZERO); if (mod == NULL) panic("no memory for module list"); mod->container = container; mod->name = modname; mod->version = version; TAILQ_INSERT_TAIL(&found_modules, mod, link); return (mod); } static void linker_addmodules(linker_file_t lf, struct mod_metadata **start, struct mod_metadata **stop, int preload) { struct mod_metadata *mp, **mdp; const char *modname; int ver; for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_VERSION) continue; modname = mp->md_cval; ver = ((const struct mod_version *)mp->md_data)->mv_version; if (modlist_lookup(modname, ver) != NULL) { printf("module %s already present!\n", modname); /* XXX what can we do? this is a build error. :-( */ continue; } modlist_newmodule(modname, ver, lf); } } static void linker_preload(void *arg) { caddr_t modptr; const char *modname, *nmodname; char *modtype; linker_file_t lf, nlf; linker_class_t lc; int error; linker_file_list_t loaded_files; linker_file_list_t depended_files; struct mod_metadata *mp, *nmp; struct mod_metadata **start, **stop, **mdp, **nmdp; const struct mod_depend *verinfo; int nver; int resolves; modlist_t mod; struct sysinit **si_start, **si_stop; TAILQ_INIT(&loaded_files); TAILQ_INIT(&depended_files); TAILQ_INIT(&found_modules); error = 0; modptr = NULL; sx_xlock(&kld_sx); while ((modptr = preload_search_next_name(modptr)) != NULL) { modname = (char *)preload_search_info(modptr, MODINFO_NAME); modtype = (char *)preload_search_info(modptr, MODINFO_TYPE); if (modname == NULL) { printf("Preloaded module at %p does not have a" " name!\n", modptr); continue; } if (modtype == NULL) { printf("Preloaded module at %p does not have a type!\n", modptr); continue; } if (bootverbose) printf("Preloaded %s \"%s\" at %p.\n", modtype, modname, modptr); lf = NULL; TAILQ_FOREACH(lc, &classes, link) { error = LINKER_LINK_PRELOAD(lc, modname, &lf); if (!error) break; lf = NULL; } if (lf) TAILQ_INSERT_TAIL(&loaded_files, lf, loaded); } /* * First get a list of stuff in the kernel. */ if (linker_file_lookup_set(linker_kernel_file, MDT_SETNAME, &start, &stop, NULL) == 0) linker_addmodules(linker_kernel_file, start, stop, 1); /* * This is a once-off kinky bubble sort to resolve relocation * dependency requirements. */ restart: TAILQ_FOREACH(lf, &loaded_files, loaded) { error = linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop, NULL); /* * First, look to see if we would successfully link with this * stuff. 
*/ resolves = 1; /* unless we know otherwise */ if (!error) { for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_DEPEND) continue; modname = mp->md_cval; verinfo = mp->md_data; for (nmdp = start; nmdp < stop; nmdp++) { nmp = *nmdp; if (nmp->md_type != MDT_VERSION) continue; nmodname = nmp->md_cval; if (strcmp(modname, nmodname) == 0) break; } if (nmdp < stop) /* it's a self reference */ continue; /* * ok, the module isn't here yet, we * are not finished */ if (modlist_lookup2(modname, verinfo) == NULL) resolves = 0; } } /* * OK, if we found our modules, we can link. So, "provide" * the modules inside and add it to the end of the link order * list. */ if (resolves) { if (!error) { for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_VERSION) continue; modname = mp->md_cval; nver = ((const struct mod_version *) mp->md_data)->mv_version; if (modlist_lookup(modname, nver) != NULL) { printf("module %s already" " present!\n", modname); TAILQ_REMOVE(&loaded_files, lf, loaded); linker_file_unload(lf, LINKER_UNLOAD_FORCE); /* we changed tailq next ptr */ goto restart; } modlist_newmodule(modname, nver, lf); } } TAILQ_REMOVE(&loaded_files, lf, loaded); TAILQ_INSERT_TAIL(&depended_files, lf, loaded); /* * Since we provided modules, we need to restart the * sort so that the previous files that depend on us * have a chance. Also, we've busted the tailq next * pointer with the REMOVE. */ goto restart; } } /* * At this point, we check to see what could not be resolved.. */ while ((lf = TAILQ_FIRST(&loaded_files)) != NULL) { TAILQ_REMOVE(&loaded_files, lf, loaded); printf("KLD file %s is missing dependencies\n", lf->filename); linker_file_unload(lf, LINKER_UNLOAD_FORCE); } /* * We made it. Finish off the linking in the order we determined. */ TAILQ_FOREACH_SAFE(lf, &depended_files, loaded, nlf) { if (linker_kernel_file) { linker_kernel_file->refs++; error = linker_file_add_dependency(lf, linker_kernel_file); if (error) panic("cannot add dependency"); } error = linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop, NULL); if (!error) { for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_DEPEND) continue; modname = mp->md_cval; verinfo = mp->md_data; mod = modlist_lookup2(modname, verinfo); if (mod == NULL) { printf("KLD file %s - cannot find " "dependency \"%s\"\n", lf->filename, modname); goto fail; } /* Don't count self-dependencies */ if (lf == mod->container) continue; mod->container->refs++; error = linker_file_add_dependency(lf, mod->container); if (error) panic("cannot add dependency"); } } /* * Now do relocation etc using the symbol search paths * established by the dependencies */ error = LINKER_LINK_PRELOAD_FINISH(lf); if (error) { printf("KLD file %s - could not finalize loading\n", lf->filename); goto fail; } linker_file_register_modules(lf); if (!TAILQ_EMPTY(&lf->modules)) lf->flags |= LINKER_FILE_MODULES; if (linker_file_lookup_set(lf, "sysinit_set", &si_start, &si_stop, NULL) == 0) sysinit_add(si_start, si_stop); linker_file_register_sysctls(lf, true); lf->flags |= LINKER_FILE_LINKED; continue; fail: TAILQ_REMOVE(&depended_files, lf, loaded); linker_file_unload(lf, LINKER_UNLOAD_FORCE); } sx_xunlock(&kld_sx); /* woohoo! we made it! */ } SYSINIT(preload, SI_SUB_KLD, SI_ORDER_MIDDLE, linker_preload, 0); /* * Handle preload files that failed to load any modules. 
static void linker_preload_finish(void *arg) { linker_file_t lf, nlf; sx_xlock(&kld_sx); TAILQ_FOREACH_SAFE(lf, &linker_files, link, nlf) { /* * If all of the modules in this file failed to load, unload * the file and return an error of ENOEXEC. (Parity with * linker_load_file.) */ if ((lf->flags & LINKER_FILE_MODULES) != 0 && TAILQ_EMPTY(&lf->modules)) { linker_file_unload(lf, LINKER_UNLOAD_FORCE); continue; } lf->flags &= ~LINKER_FILE_MODULES; lf->userrefs++; /* so we can (try to) kldunload it */ } sx_xunlock(&kld_sx); } /* * Attempt to run after all DECLARE_MODULE SYSINITs. Unfortunately they can be * scheduled at any subsystem and order, so run this as late as possible. init * becomes runnable in SI_SUB_KTHREAD_INIT, so go slightly before that. */ SYSINIT(preload_finish, SI_SUB_KTHREAD_INIT - 100, SI_ORDER_MIDDLE, linker_preload_finish, 0); /* * Search for a not-loaded module by name. * * Modules may be found in the following locations: * * - preloaded (result is just the module name) - on disk (result is full path * to module) * * If the module name is qualified in any way (contains path, etc.) then we * simply return a copy of it. * * The search path can be manipulated via sysctl. Note that we use the ';' * character as a separator to be consistent with the bootloader. */ static char linker_hintfile[] = "linker.hints"; static char linker_path[MAXPATHLEN] = "/boot/kernel;/boot/modules"; SYSCTL_STRING(_kern, OID_AUTO, module_path, CTLFLAG_RWTUN, linker_path, sizeof(linker_path), "module load search path"); TUNABLE_STR("module_path", linker_path, sizeof(linker_path)); static char *linker_ext_list[] = { "", ".ko", NULL }; /* * Check if file actually exists either with or without extension listed in * the linker_ext_list. (probably should be generic for the rest of the * kernel) */ static char * linker_lookup_file(const char *path, int pathlen, const char *name, int namelen, struct vattr *vap) { struct nameidata nd; struct thread *td = curthread; /* XXX */ char *result, **cpp, *sep; int error, len, extlen, reclen, flags; enum vtype type; extlen = 0; for (cpp = linker_ext_list; *cpp; cpp++) { len = strlen(*cpp); if (len > extlen) extlen = len; } extlen++; /* trailing '\0' */ sep = (path[pathlen - 1] != '/') ? "/" : ""; reclen = pathlen + strlen(sep) + namelen + extlen + 1; result = malloc(reclen, M_LINKER, M_WAITOK); for (cpp = linker_ext_list; *cpp; cpp++) { snprintf(result, reclen, "%.*s%s%.*s%s", pathlen, path, sep, namelen, name, *cpp); /* * Attempt to open the file, and return the path if * we succeed and it's a regular file. */ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, result, td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error == 0) { NDFREE(&nd, NDF_ONLY_PNBUF); type = nd.ni_vp->v_type; if (vap) VOP_GETATTR(nd.ni_vp, vap, td->td_ucred); VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); if (type == VREG) return (result); } } free(result, M_LINKER); return (NULL); } #define INT_ALIGN(base, ptr) ptr = \ (base) + roundup2((ptr) - (base), sizeof(int)) /* * Look up the KLD which contains the requested module in the "linker.hints" * file. If a version specification is available, then try to find the best * KLD. Otherwise just find the latest one. */ static char * linker_hints_lookup(const char *path, int pathlen, const char *modname, int modnamelen, const struct mod_depend *verinfo) { struct thread *td = curthread; /* XXX */ struct ucred *cred = td ? 
td->td_ucred : NULL; struct nameidata nd; struct vattr vattr, mattr; u_char *hints = NULL; u_char *cp, *recptr, *bufend, *result, *best, *pathbuf, *sep; int error, ival, bestver, *intp, found, flags, clen, blen; ssize_t reclen; result = NULL; bestver = found = 0; sep = (path[pathlen - 1] != '/') ? "/" : ""; reclen = imax(modnamelen, strlen(linker_hintfile)) + pathlen + strlen(sep) + 1; pathbuf = malloc(reclen, M_LINKER, M_WAITOK); snprintf(pathbuf, reclen, "%.*s%s%s", pathlen, path, sep, linker_hintfile); NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, pathbuf, td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error) goto bad; NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp->v_type != VREG) goto bad; best = cp = NULL; error = VOP_GETATTR(nd.ni_vp, &vattr, cred); if (error) goto bad; /* * XXX: we need to limit this number to some reasonable value */ if (vattr.va_size > LINKER_HINTS_MAX) { printf("hints file too large %ld\n", (long)vattr.va_size); goto bad; } hints = malloc(vattr.va_size, M_TEMP, M_WAITOK); error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)hints, vattr.va_size, 0, UIO_SYSSPACE, IO_NODELOCKED, cred, NOCRED, &reclen, td); if (error) goto bad; VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, cred, td); nd.ni_vp = NULL; if (reclen != 0) { printf("can't read %zd\n", reclen); goto bad; } intp = (int *)hints; ival = *intp++; if (ival != LINKER_HINTS_VERSION) { printf("hints file version mismatch %d\n", ival); goto bad; } bufend = hints + vattr.va_size; recptr = (u_char *)intp; clen = blen = 0; while (recptr < bufend && !found) { intp = (int *)recptr; reclen = *intp++; ival = *intp++; cp = (char *)intp; switch (ival) { case MDT_VERSION: clen = *cp++; if (clen != modnamelen || bcmp(cp, modname, clen) != 0) break; cp += clen; INT_ALIGN(hints, cp); ival = *(int *)cp; cp += sizeof(int); clen = *cp++; if (verinfo == NULL || ival == verinfo->md_ver_preferred) { found = 1; break; } if (ival >= verinfo->md_ver_minimum && ival <= verinfo->md_ver_maximum && ival > bestver) { bestver = ival; best = cp; blen = clen; } break; default: break; } recptr += reclen + sizeof(int); } /* * Finally, check that the KLD is actually in place. */ if (found) result = linker_lookup_file(path, pathlen, cp, clen, &mattr); else if (best) result = linker_lookup_file(path, pathlen, best, blen, &mattr); /* * The KLD is newer than the hints file. What should we do now? */ if (result && timespeccmp(&mattr.va_mtime, &vattr.va_mtime, >)) printf("warning: KLD '%s' is newer than the linker.hints" " file\n", result); bad: free(pathbuf, M_LINKER); if (hints) free(hints, M_TEMP); if (nd.ni_vp != NULL) { VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, cred, td); } /* * If nothing was found or the hints file is absent, fall back to the * old way by using "kldname[.ko]" as the module name. */ if (!found && !bestver && result == NULL) result = linker_lookup_file(path, pathlen, modname, modnamelen, NULL); return (result); } /* * Look up the KLD which contains the requested module in all directories. */ static char * linker_search_module(const char *modname, int modnamelen, const struct mod_depend *verinfo) { char *cp, *ep, *result; /* * traverse the linker path */ for (cp = linker_path; *cp; cp = ep + 1) { /* find the end of this component */ for (ep = cp; (*ep != 0) && (*ep != ';'); ep++); result = linker_hints_lookup(cp, ep - cp, modname, modnamelen, verinfo); if (result != NULL) return (result); if (*ep == 0) break; } return (NULL); } /* * Search for module in all directories listed in the linker_path. 
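The linker.hints file parsed by linker_hints_lookup() is generated by kldxref(8); as consumed above, its layout is approximately the following (a sketch inferred from the parsing code, not a formal specification):

	int	version;	/* must equal LINKER_HINTS_VERSION */
	/* then, repeated until end of file: */
	int	reclen;		/* length of the payload that follows */
	int	type;		/* e.g. MDT_VERSION */
	/* for MDT_VERSION records: */
	u_char	namelen; char name[namelen];	/* module name (counted) */
	int	modver;		/* int-aligned via INT_ALIGN() */
	u_char	filelen; char file[filelen];	/* KLD file name (counted) */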
*/ static char * linker_search_kld(const char *name) { char *cp, *ep, *result; int len; /* qualified at all? */ if (strchr(name, '/')) return (strdup(name, M_LINKER)); /* traverse the linker path */ len = strlen(name); for (ep = linker_path; *ep; ep++) { cp = ep; /* find the end of this component */ for (; *ep != 0 && *ep != ';'; ep++); result = linker_lookup_file(cp, ep - cp, name, len, NULL); if (result != NULL) return (result); } return (NULL); } static const char * linker_basename(const char *path) { const char *filename; filename = strrchr(path, '/'); if (filename == NULL) return path; if (filename[1]) filename++; return (filename); } #ifdef HWPMC_HOOKS /* * Inform hwpmc about the set of kernel modules currently loaded. */ void * linker_hwpmc_list_objects(void) { linker_file_t lf; struct pmckern_map_in *kobase; int i, nmappings; nmappings = 0; sx_slock(&kld_sx); TAILQ_FOREACH(lf, &linker_files, link) nmappings++; /* Allocate nmappings + 1 entries. */ kobase = malloc((nmappings + 1) * sizeof(struct pmckern_map_in), M_LINKER, M_WAITOK | M_ZERO); i = 0; TAILQ_FOREACH(lf, &linker_files, link) { /* Save the info for this linker file. */ kobase[i].pm_file = lf->filename; kobase[i].pm_address = (uintptr_t)lf->address; i++; } sx_sunlock(&kld_sx); KASSERT(i > 0, ("linker_hwpmc_list_objects: no kernel objects?")); /* The last entry of the malloced area consists of all zeros. */ KASSERT(kobase[i].pm_file == NULL, ("linker_hwpmc_list_objects: last object not NULL")); return ((void *)kobase); } #endif /* * Find a file which contains the given module and load it; if "parent" is * not NULL, register a reference to it. */ static int linker_load_module(const char *kldname, const char *modname, struct linker_file *parent, const struct mod_depend *verinfo, struct linker_file **lfpp) { linker_file_t lfdep; const char *filename; char *pathname; int error; sx_assert(&kld_sx, SA_XLOCKED); if (modname == NULL) { /* * We have to load a KLD */ KASSERT(verinfo == NULL, ("linker_load_module: verinfo" " is not NULL")); pathname = linker_search_kld(kldname); } else { if (modlist_lookup2(modname, verinfo) != NULL) return (EEXIST); if (kldname != NULL) pathname = strdup(kldname, M_LINKER); else if (rootvnode == NULL) pathname = NULL; else /* * Need to find a KLD with the required module */ pathname = linker_search_module(modname, strlen(modname), verinfo); } if (pathname == NULL) return (ENOENT); /* * Can't load more than one file with the same basename. XXX: * Actually it should be possible to have multiple KLDs with * the same basename but different path because they can * provide different versions of the same modules. */ filename = linker_basename(pathname); if (linker_find_file_by_name(filename)) error = EEXIST; else do { error = linker_load_file(pathname, &lfdep); if (error) break; if (modname && verinfo && modlist_lookup2(modname, verinfo) == NULL) { linker_file_unload(lfdep, LINKER_UNLOAD_FORCE); error = ENOENT; break; } if (parent) { error = linker_file_add_dependency(parent, lfdep); if (error) break; } if (lfpp) *lfpp = lfdep; } while (0); free(pathname, M_LINKER); return (error); } /* * This routine is responsible for finding dependencies of userland-initiated * kldload(2)'s of files. */ int linker_load_dependencies(linker_file_t lf) { linker_file_t lfdep; struct mod_metadata **start, **stop, **mdp, **nmdp; struct mod_metadata *mp, *nmp; const struct mod_depend *verinfo; modlist_t mod; const char *modname, *nmodname; int ver, error = 0, count; /* * All files are dependent on /kernel.
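 * Resolution then proceeds in three passes over the file's metadata (a summary of * the code below, not a normative spec): the kernel itself is registered as a * dependency first; next every MDT_VERSION record is checked against the already * loaded modules, so a duplicate interface fails early with EEXIST; finally each * MDT_DEPEND record is satisfied either by a module that is already loaded or by * loading the containing KLD through linker_load_module().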
*/ sx_assert(&kld_sx, SA_XLOCKED); if (linker_kernel_file) { linker_kernel_file->refs++; error = linker_file_add_dependency(lf, linker_kernel_file); if (error) return (error); } if (linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop, &count) != 0) return (0); for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_VERSION) continue; modname = mp->md_cval; ver = ((const struct mod_version *)mp->md_data)->mv_version; mod = modlist_lookup(modname, ver); if (mod != NULL) { printf("interface %s.%d already present in the KLD" " '%s'!\n", modname, ver, mod->container->filename); return (EEXIST); } } for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_DEPEND) continue; modname = mp->md_cval; verinfo = mp->md_data; nmodname = NULL; for (nmdp = start; nmdp < stop; nmdp++) { nmp = *nmdp; if (nmp->md_type != MDT_VERSION) continue; nmodname = nmp->md_cval; if (strcmp(modname, nmodname) == 0) break; } if (nmdp < stop)/* early exit, it's a self reference */ continue; mod = modlist_lookup2(modname, verinfo); if (mod) { /* woohoo, it's loaded already */ lfdep = mod->container; lfdep->refs++; error = linker_file_add_dependency(lf, lfdep); if (error) break; continue; } error = linker_load_module(NULL, modname, lf, verinfo, NULL); if (error) { printf("KLD %s: depends on %s - not available or" " version mismatch\n", lf->filename, modname); break; } } if (error) return (error); linker_addmodules(lf, start, stop, 0); return (error); } static int sysctl_kern_function_list_iterate(const char *name, void *opaque) { struct sysctl_req *req; req = opaque; return (SYSCTL_OUT(req, name, strlen(name) + 1)); } /* * Export a nul-separated, double-nul-terminated list of all function names * in the kernel. */ static int sysctl_kern_function_list(SYSCTL_HANDLER_ARGS) { linker_file_t lf; int error; #ifdef MAC error = mac_kld_check_stat(req->td->td_ucred); if (error) return (error); #endif error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sx_xlock(&kld_sx); TAILQ_FOREACH(lf, &linker_files, link) { error = LINKER_EACH_FUNCTION_NAME(lf, sysctl_kern_function_list_iterate, req); if (error) { sx_xunlock(&kld_sx); return (error); } } sx_xunlock(&kld_sx); return (SYSCTL_OUT(req, "", 1)); } SYSCTL_PROC(_kern, OID_AUTO, function_list, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, sysctl_kern_function_list, "", "kernel function list"); Index: head/sys/kern/kern_lock.c =================================================================== --- head/sys/kern/kern_lock.c (revision 326270) +++ head/sys/kern/kern_lock.c (revision 326271) @@ -1,1547 +1,1549 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2008 Attilio Rao * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #ifdef DEBUG_LOCKS #include #endif #include #include #include #ifdef DDB #include #endif #ifdef HWPMC_HOOKS #include PMC_SOFT_DECLARE( , , lock, failed); #endif CTASSERT(((LK_ADAPTIVE | LK_NOSHARE) & LO_CLASSFLAGS) == (LK_ADAPTIVE | LK_NOSHARE)); CTASSERT(LK_UNLOCKED == (LK_UNLOCKED & ~(LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS))); #define SQ_EXCLUSIVE_QUEUE 0 #define SQ_SHARED_QUEUE 1 #ifndef INVARIANTS #define _lockmgr_assert(lk, what, file, line) #endif #define TD_SLOCKS_INC(td) ((td)->td_lk_slocks++) #define TD_SLOCKS_DEC(td) ((td)->td_lk_slocks--) #ifndef DEBUG_LOCKS #define STACK_PRINT(lk) #define STACK_SAVE(lk) #define STACK_ZERO(lk) #else #define STACK_PRINT(lk) stack_print_ddb(&(lk)->lk_stack) #define STACK_SAVE(lk) stack_save(&(lk)->lk_stack) #define STACK_ZERO(lk) stack_zero(&(lk)->lk_stack) #endif #define LOCK_LOG2(lk, string, arg1, arg2) \ if (LOCK_LOG_TEST(&(lk)->lock_object, 0)) \ CTR2(KTR_LOCK, (string), (arg1), (arg2)) #define LOCK_LOG3(lk, string, arg1, arg2, arg3) \ if (LOCK_LOG_TEST(&(lk)->lock_object, 0)) \ CTR3(KTR_LOCK, (string), (arg1), (arg2), (arg3)) #define GIANT_DECLARE \ int _i = 0; \ WITNESS_SAVE_DECL(Giant) #define GIANT_RESTORE() do { \ if (_i > 0) { \ while (_i--) \ mtx_lock(&Giant); \ WITNESS_RESTORE(&Giant.lock_object, Giant); \ } \ } while (0) #define GIANT_SAVE() do { \ if (mtx_owned(&Giant)) { \ WITNESS_SAVE(&Giant.lock_object, Giant); \ while (mtx_owned(&Giant)) { \ _i++; \ mtx_unlock(&Giant); \ } \ } \ } while (0) #define LK_CAN_SHARE(x, flags) \ (((x) & LK_SHARE) && \ (((x) & (LK_EXCLUSIVE_WAITERS | LK_EXCLUSIVE_SPINNERS)) == 0 || \ (curthread->td_lk_slocks != 0 && !(flags & LK_NODDLKTREAT)) || \ (curthread->td_pflags & TDP_DEADLKTREAT))) #define LK_TRYOP(x) \ ((x) & LK_NOWAIT) #define LK_CAN_WITNESS(x) \ (((x) & LK_NOWITNESS) == 0 && !LK_TRYOP(x)) #define LK_TRYWIT(x) \ (LK_TRYOP(x) ? 
LOP_TRYLOCK : 0) #define LK_CAN_ADAPT(lk, f) \ (((lk)->lock_object.lo_flags & LK_ADAPTIVE) != 0 && \ ((f) & LK_SLEEPFAIL) == 0) #define lockmgr_disowned(lk) \ (((lk)->lk_lock & ~(LK_FLAGMASK & ~LK_SHARE)) == LK_KERNPROC) #define lockmgr_xlocked(lk) \ (((lk)->lk_lock & ~(LK_FLAGMASK & ~LK_SHARE)) == (uintptr_t)curthread) static void assert_lockmgr(const struct lock_object *lock, int how); #ifdef DDB static void db_show_lockmgr(const struct lock_object *lock); #endif static void lock_lockmgr(struct lock_object *lock, uintptr_t how); #ifdef KDTRACE_HOOKS static int owner_lockmgr(const struct lock_object *lock, struct thread **owner); #endif static uintptr_t unlock_lockmgr(struct lock_object *lock); struct lock_class lock_class_lockmgr = { .lc_name = "lockmgr", .lc_flags = LC_RECURSABLE | LC_SLEEPABLE | LC_SLEEPLOCK | LC_UPGRADABLE, .lc_assert = assert_lockmgr, #ifdef DDB .lc_ddb_show = db_show_lockmgr, #endif .lc_lock = lock_lockmgr, .lc_unlock = unlock_lockmgr, #ifdef KDTRACE_HOOKS .lc_owner = owner_lockmgr, #endif }; static bool __always_inline lockmgr_slock_try(struct lock *lk, uintptr_t *xp, int flags); static bool __always_inline lockmgr_sunlock_try(struct lock *lk, uintptr_t x); static void lockmgr_note_shared_acquire(struct lock *lk, int contested, uint64_t waittime, const char *file, int line, int flags) { lock_profile_obtain_lock_success(&lk->lock_object, contested, waittime, file, line); LOCK_LOG_LOCK("SLOCK", &lk->lock_object, 0, 0, file, line); WITNESS_LOCK(&lk->lock_object, LK_TRYWIT(flags), file, line); TD_LOCKS_INC(curthread); TD_SLOCKS_INC(curthread); STACK_SAVE(lk); } static void lockmgr_note_shared_release(struct lock *lk, const char *file, int line) { lock_profile_release_lock(&lk->lock_object); WITNESS_UNLOCK(&lk->lock_object, 0, file, line); LOCK_LOG_LOCK("SUNLOCK", &lk->lock_object, 0, 0, file, line); TD_LOCKS_DEC(curthread); TD_SLOCKS_DEC(curthread); } static void lockmgr_note_exclusive_acquire(struct lock *lk, int contested, uint64_t waittime, const char *file, int line, int flags) { lock_profile_obtain_lock_success(&lk->lock_object, contested, waittime, file, line); LOCK_LOG_LOCK("XLOCK", &lk->lock_object, 0, lk->lk_recurse, file, line); WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE | LK_TRYWIT(flags), file, line); TD_LOCKS_INC(curthread); STACK_SAVE(lk); } static void lockmgr_note_exclusive_release(struct lock *lk, const char *file, int line) { lock_profile_release_lock(&lk->lock_object); LOCK_LOG_LOCK("XUNLOCK", &lk->lock_object, 0, lk->lk_recurse, file, line); WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE, file, line); TD_LOCKS_DEC(curthread); } static void lockmgr_note_exclusive_upgrade(struct lock *lk, const char *file, int line, int flags) { LOCK_LOG_LOCK("XUPGRADE", &lk->lock_object, 0, 0, file, line); WITNESS_UPGRADE(&lk->lock_object, LOP_EXCLUSIVE | LK_TRYWIT(flags), file, line); TD_SLOCKS_DEC(curthread); } static __inline struct thread * lockmgr_xholder(const struct lock *lk) { uintptr_t x; x = lk->lk_lock; return ((x & LK_SHARE) ? NULL : (struct thread *)LK_HOLDER(x)); } /* * It assumes sleepq_lock held and returns with this one unheld. * It also assumes the generic interlock is sane and previously checked. * If LK_INTERLOCK is specified the interlock is not reacquired after the * sleep. */ static __inline int sleeplk(struct lock *lk, u_int flags, struct lock_object *ilk, const char *wmesg, int pri, int timo, int queue) { GIANT_DECLARE; struct lock_class *class; int catch, error; class = (flags & LK_INTERLOCK) ? 
LOCK_CLASS(ilk) : NULL; catch = pri & PCATCH; pri &= PRIMASK; error = 0; LOCK_LOG3(lk, "%s: %p blocking on the %s sleepqueue", __func__, lk, (queue == SQ_EXCLUSIVE_QUEUE) ? "exclusive" : "shared"); if (flags & LK_INTERLOCK) class->lc_unlock(ilk); if (queue == SQ_EXCLUSIVE_QUEUE && (flags & LK_SLEEPFAIL) != 0) lk->lk_exslpfail++; GIANT_SAVE(); sleepq_add(&lk->lock_object, NULL, wmesg, SLEEPQ_LK | (catch ? SLEEPQ_INTERRUPTIBLE : 0), queue); if ((flags & LK_TIMELOCK) && timo) sleepq_set_timeout(&lk->lock_object, timo); /* * Decide which sleep variant to use. */ if ((flags & LK_TIMELOCK) && timo && catch) error = sleepq_timedwait_sig(&lk->lock_object, pri); else if ((flags & LK_TIMELOCK) && timo) error = sleepq_timedwait(&lk->lock_object, pri); else if (catch) error = sleepq_wait_sig(&lk->lock_object, pri); else sleepq_wait(&lk->lock_object, pri); GIANT_RESTORE(); if ((flags & LK_SLEEPFAIL) && error == 0) error = ENOLCK; return (error); } static __inline int wakeupshlk(struct lock *lk, const char *file, int line) { uintptr_t v, x; u_int realexslp; int queue, wakeup_swapper; wakeup_swapper = 0; for (;;) { x = lk->lk_lock; if (lockmgr_sunlock_try(lk, x)) break; /* * We should have a sharer with waiters, so enter the hard * path in order to handle wakeups correctly. */ sleepq_lock(&lk->lock_object); x = lk->lk_lock & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS); v = LK_UNLOCKED; /* * If the lock has exclusive waiters, give them preference in * order to avoid deadlock with shared runners up. * If interruptible sleeps left the exclusive queue empty * avoid starvation of the threads sleeping on the shared * queue by giving them precedence and cleaning up the * exclusive waiters bit anyway. * Please note that lk_exslpfail count may be lying about * the real number of waiters with the LK_SLEEPFAIL flag on * because they may be used in conjunction with interruptible * sleeps so lk_exslpfail might be considered an 'upper limit' * bound, including the edge cases. */ realexslp = sleepq_sleepcnt(&lk->lock_object, SQ_EXCLUSIVE_QUEUE); if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) { if (lk->lk_exslpfail < realexslp) { lk->lk_exslpfail = 0; queue = SQ_EXCLUSIVE_QUEUE; v |= (x & LK_SHARED_WAITERS); } else { lk->lk_exslpfail = 0; LOCK_LOG2(lk, "%s: %p has only LK_SLEEPFAIL sleepers", __func__, lk); LOCK_LOG2(lk, "%s: %p waking up threads on the exclusive queue", __func__, lk); wakeup_swapper = sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE); queue = SQ_SHARED_QUEUE; } } else { /* * Exclusive waiters sleeping with LK_SLEEPFAIL on * and using interruptible sleeps/timeout may have * left spurious lk_exslpfail counts on, so clean * it up anyway. */ lk->lk_exslpfail = 0; queue = SQ_SHARED_QUEUE; } if (!atomic_cmpset_rel_ptr(&lk->lk_lock, LK_SHARERS_LOCK(1) | x, v)) { sleepq_release(&lk->lock_object); continue; } LOCK_LOG3(lk, "%s: %p waking up threads on the %s queue", __func__, lk, queue == SQ_SHARED_QUEUE ?
"shared" : "exclusive"); wakeup_swapper |= sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, 0, queue); sleepq_release(&lk->lock_object); break; } lockmgr_note_shared_release(lk, file, line); return (wakeup_swapper); } static void assert_lockmgr(const struct lock_object *lock, int what) { panic("lockmgr locks do not support assertions"); } static void lock_lockmgr(struct lock_object *lock, uintptr_t how) { panic("lockmgr locks do not support sleep interlocking"); } static uintptr_t unlock_lockmgr(struct lock_object *lock) { panic("lockmgr locks do not support sleep interlocking"); } #ifdef KDTRACE_HOOKS static int owner_lockmgr(const struct lock_object *lock, struct thread **owner) { panic("lockmgr locks do not support owner inquiring"); } #endif void lockinit(struct lock *lk, int pri, const char *wmesg, int timo, int flags) { int iflags; MPASS((flags & ~LK_INIT_MASK) == 0); ASSERT_ATOMIC_LOAD_PTR(lk->lk_lock, ("%s: lockmgr not aligned for %s: %p", __func__, wmesg, &lk->lk_lock)); iflags = LO_SLEEPABLE | LO_UPGRADABLE; if (flags & LK_CANRECURSE) iflags |= LO_RECURSABLE; if ((flags & LK_NODUP) == 0) iflags |= LO_DUPOK; if (flags & LK_NOPROFILE) iflags |= LO_NOPROFILE; if ((flags & LK_NOWITNESS) == 0) iflags |= LO_WITNESS; if (flags & LK_QUIET) iflags |= LO_QUIET; if (flags & LK_IS_VNODE) iflags |= LO_IS_VNODE; iflags |= flags & (LK_ADAPTIVE | LK_NOSHARE); lock_init(&lk->lock_object, &lock_class_lockmgr, wmesg, NULL, iflags); lk->lk_lock = LK_UNLOCKED; lk->lk_recurse = 0; lk->lk_exslpfail = 0; lk->lk_timo = timo; lk->lk_pri = pri; STACK_ZERO(lk); } /* * XXX: Gross hacks to manipulate external lock flags after * initialization. Used for certain vnode and buf locks. */ void lockallowshare(struct lock *lk) { lockmgr_assert(lk, KA_XLOCKED); lk->lock_object.lo_flags &= ~LK_NOSHARE; } void lockdisableshare(struct lock *lk) { lockmgr_assert(lk, KA_XLOCKED); lk->lock_object.lo_flags |= LK_NOSHARE; } void lockallowrecurse(struct lock *lk) { lockmgr_assert(lk, KA_XLOCKED); lk->lock_object.lo_flags |= LO_RECURSABLE; } void lockdisablerecurse(struct lock *lk) { lockmgr_assert(lk, KA_XLOCKED); lk->lock_object.lo_flags &= ~LO_RECURSABLE; } void lockdestroy(struct lock *lk) { KASSERT(lk->lk_lock == LK_UNLOCKED, ("lockmgr still held")); KASSERT(lk->lk_recurse == 0, ("lockmgr still recursed")); KASSERT(lk->lk_exslpfail == 0, ("lockmgr still exclusive waiters")); lock_destroy(&lk->lock_object); } static bool __always_inline lockmgr_slock_try(struct lock *lk, uintptr_t *xp, int flags) { /* * If no other thread has an exclusive lock, or * no exclusive waiter is present, bump the count of * sharers. Since we have to preserve the state of * waiters, if we fail to acquire the shared lock * loop back and retry. */ *xp = lk->lk_lock; while (LK_CAN_SHARE(*xp, flags)) { if (atomic_fcmpset_acq_ptr(&lk->lk_lock, xp, *xp + LK_ONE_SHARER)) { return (true); } } return (false); } static bool __always_inline lockmgr_sunlock_try(struct lock *lk, uintptr_t x) { for (;;) { /* * If there is more than one shared lock held, just drop one * and return. */ if (LK_SHARERS(x) > 1) { if (atomic_fcmpset_rel_ptr(&lk->lk_lock, &x, x - LK_ONE_SHARER)) return (true); continue; } /* * If there are not waiters on the exclusive queue, drop the * lock quickly. 
*/ if ((x & LK_ALL_WAITERS) == 0) { MPASS((x & ~LK_EXCLUSIVE_SPINNERS) == LK_SHARERS_LOCK(1)); if (atomic_fcmpset_rel_ptr(&lk->lk_lock, &x, LK_UNLOCKED)) return (true); continue; } break; } return (false); } int lockmgr_lock_fast_path(struct lock *lk, u_int flags, struct lock_object *ilk, const char *file, int line) { struct lock_class *class; uintptr_t x, v, tid; u_int op; bool locked; op = flags & LK_TYPE_MASK; locked = false; switch (op) { case LK_SHARED: if (LK_CAN_WITNESS(flags)) WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER, file, line, flags & LK_INTERLOCK ? ilk : NULL); if (__predict_false(lk->lock_object.lo_flags & LK_NOSHARE)) break; if (lockmgr_slock_try(lk, &x, flags)) { lockmgr_note_shared_acquire(lk, 0, 0, file, line, flags); locked = true; } break; case LK_EXCLUSIVE: if (LK_CAN_WITNESS(flags)) WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ? ilk : NULL); tid = (uintptr_t)curthread; if (lk->lk_lock == LK_UNLOCKED && atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED, tid)) { lockmgr_note_exclusive_acquire(lk, 0, 0, file, line, flags); locked = true; } break; case LK_UPGRADE: case LK_TRYUPGRADE: _lockmgr_assert(lk, KA_SLOCKED, file, line); tid = (uintptr_t)curthread; v = lk->lk_lock; x = v & LK_ALL_WAITERS; v &= LK_EXCLUSIVE_SPINNERS; if (atomic_cmpset_ptr(&lk->lk_lock, LK_SHARERS_LOCK(1) | x | v, tid | x)) { lockmgr_note_exclusive_upgrade(lk, file, line, flags); locked = true; } break; default: break; } if (__predict_true(locked)) { if (__predict_false(flags & LK_INTERLOCK)) { class = LOCK_CLASS(ilk); class->lc_unlock(ilk); } return (0); } else { return (__lockmgr_args(lk, flags, ilk, LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, file, line)); } } int lockmgr_unlock_fast_path(struct lock *lk, u_int flags, struct lock_object *ilk) { struct lock_class *class; uintptr_t x, tid; bool unlocked; const char *file; int line; file = __FILE__; line = __LINE__; _lockmgr_assert(lk, KA_LOCKED, file, line); unlocked = false; x = lk->lk_lock; if (__predict_true(x & LK_SHARE) != 0) { if (lockmgr_sunlock_try(lk, x)) { lockmgr_note_shared_release(lk, file, line); unlocked = true; } } else { tid = (uintptr_t)curthread; if (!lockmgr_recursed(lk) && atomic_cmpset_rel_ptr(&lk->lk_lock, tid, LK_UNLOCKED)) { lockmgr_note_exclusive_release(lk, file, line); unlocked = true; } } if (__predict_true(unlocked)) { if (__predict_false(flags & LK_INTERLOCK)) { class = LOCK_CLASS(ilk); class->lc_unlock(ilk); } return (0); } else { return (__lockmgr_args(lk, flags | LK_RELEASE, ilk, LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, LOCK_FILE, LOCK_LINE)); } } int __lockmgr_args(struct lock *lk, u_int flags, struct lock_object *ilk, const char *wmesg, int pri, int timo, const char *file, int line) { GIANT_DECLARE; struct lock_class *class; const char *iwmesg; uintptr_t tid, v, x; u_int op, realexslp; int error, ipri, itimo, queue, wakeup_swapper; #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif error = 0; tid = (uintptr_t)curthread; op = (flags & LK_TYPE_MASK); iwmesg = (wmesg == LK_WMESG_DEFAULT) ? lk->lock_object.lo_name : wmesg; ipri = (pri == LK_PRIO_DEFAULT) ? lk->lk_pri : pri; itimo = (timo == LK_TIMO_DEFAULT) ? 
lk->lk_timo : timo; MPASS((flags & ~LK_TOTAL_MASK) == 0); KASSERT((op & (op - 1)) == 0, ("%s: Invalid requested operation @ %s:%d", __func__, file, line)); KASSERT((flags & (LK_NOWAIT | LK_SLEEPFAIL)) == 0 || (op != LK_DOWNGRADE && op != LK_RELEASE), ("%s: Invalid flags in regard of the operation desired @ %s:%d", __func__, file, line)); KASSERT((flags & LK_INTERLOCK) == 0 || ilk != NULL, ("%s: LK_INTERLOCK passed without valid interlock @ %s:%d", __func__, file, line)); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("%s: idle thread %p on lockmgr %s @ %s:%d", __func__, curthread, lk->lock_object.lo_name, file, line)); class = (flags & LK_INTERLOCK) ? LOCK_CLASS(ilk) : NULL; if (panicstr != NULL) { if (flags & LK_INTERLOCK) class->lc_unlock(ilk); return (0); } if (lk->lock_object.lo_flags & LK_NOSHARE) { switch (op) { case LK_SHARED: op = LK_EXCLUSIVE; break; case LK_UPGRADE: case LK_TRYUPGRADE: case LK_DOWNGRADE: _lockmgr_assert(lk, KA_XLOCKED | KA_NOTRECURSED, file, line); if (flags & LK_INTERLOCK) class->lc_unlock(ilk); return (0); } } wakeup_swapper = 0; switch (op) { case LK_SHARED: if (LK_CAN_WITNESS(flags)) WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER, file, line, flags & LK_INTERLOCK ? ilk : NULL); for (;;) { if (lockmgr_slock_try(lk, &x, flags)) break; #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&lk->lock_object, &contested, &waittime); /* * If the lock is already held by curthread in * an exclusive way, avoid a deadlock. */ if (LK_HOLDER(x) == tid) { LOCK_LOG2(lk, "%s: %p already held in exclusive mode", __func__, lk); error = EDEADLK; break; } /* * If the operation is expected not to sleep, just * give up and return. */ if (LK_TRYOP(flags)) { LOCK_LOG2(lk, "%s: %p fails the try operation", __func__, lk); error = EBUSY; break; } /* * Acquire the sleepqueue chain lock because we * probably will need to manipulate waiters flags. */ sleepq_lock(&lk->lock_object); x = lk->lk_lock; /* * If the lock can be acquired in shared mode, try * again. */ if (LK_CAN_SHARE(x, flags)) { sleepq_release(&lk->lock_object); continue; } /* * Try to set the LK_SHARED_WAITERS flag. If we fail, * loop back and retry. */ if ((x & LK_SHARED_WAITERS) == 0) { if (!atomic_cmpset_acq_ptr(&lk->lk_lock, x, x | LK_SHARED_WAITERS)) { sleepq_release(&lk->lock_object); continue; } LOCK_LOG2(lk, "%s: %p set shared waiters flag", __func__, lk); } /* * Since we have been unable to acquire the * shared lock and the shared waiters flag is set, * we will sleep. */ error = sleeplk(lk, flags, ilk, iwmesg, ipri, itimo, SQ_SHARED_QUEUE); flags &= ~LK_INTERLOCK; if (error) { LOCK_LOG3(lk, "%s: interrupted sleep for %p with %d", __func__, lk, error); break; } LOCK_LOG2(lk, "%s: %p resuming from the sleep queue", __func__, lk); } if (error == 0) { #ifdef LOCK_PROFILING lockmgr_note_shared_acquire(lk, contested, waittime, file, line, flags); #else lockmgr_note_shared_acquire(lk, 0, 0, file, line, flags); #endif } break; case LK_UPGRADE: case LK_TRYUPGRADE: _lockmgr_assert(lk, KA_SLOCKED, file, line); v = lk->lk_lock; x = v & LK_ALL_WAITERS; v &= LK_EXCLUSIVE_SPINNERS; /* * Try to switch from one shared lock to an exclusive one. * We need to preserve waiters flags during the operation.
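 * (A sketch of the intended transition, under the lk_lock encoding used in this * file: the CAS below can only succeed while exactly one sharer is left, turning * LK_SHARERS_LOCK(1) | waiters into tid | waiters in a single step.)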
*/ if (atomic_cmpset_ptr(&lk->lk_lock, LK_SHARERS_LOCK(1) | x | v, tid | x)) { LOCK_LOG_LOCK("XUPGRADE", &lk->lock_object, 0, 0, file, line); WITNESS_UPGRADE(&lk->lock_object, LOP_EXCLUSIVE | LK_TRYWIT(flags), file, line); TD_SLOCKS_DEC(curthread); break; } /* * In LK_TRYUPGRADE mode, do not drop the lock, * returning EBUSY instead. */ if (op == LK_TRYUPGRADE) { LOCK_LOG2(lk, "%s: %p failed the nowait upgrade", __func__, lk); error = EBUSY; break; } /* * We have been unable to upgrade, so just * give up the shared lock. */ wakeup_swapper |= wakeupshlk(lk, file, line); /* FALLTHROUGH */ case LK_EXCLUSIVE: if (LK_CAN_WITNESS(flags)) WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ? ilk : NULL); /* * If curthread already holds the lock and this one is * allowed to recurse, simply recurse on it. */ if (lockmgr_xlocked(lk)) { if ((flags & LK_CANRECURSE) == 0 && (lk->lock_object.lo_flags & LO_RECURSABLE) == 0) { /* * If the operation is expected not to * panic, just give up and return. */ if (LK_TRYOP(flags)) { LOCK_LOG2(lk, "%s: %p fails the try operation", __func__, lk); error = EBUSY; break; } if (flags & LK_INTERLOCK) class->lc_unlock(ilk); panic("%s: recursing on non recursive lockmgr %s @ %s:%d\n", __func__, iwmesg, file, line); } lk->lk_recurse++; LOCK_LOG2(lk, "%s: %p recursing", __func__, lk); LOCK_LOG_LOCK("XLOCK", &lk->lock_object, 0, lk->lk_recurse, file, line); WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE | LK_TRYWIT(flags), file, line); TD_LOCKS_INC(curthread); break; } for (;;) { if (lk->lk_lock == LK_UNLOCKED && atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED, tid)) break; #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&lk->lock_object, &contested, &waittime); /* * If the operation is expected not to sleep, just * give up and return. */ if (LK_TRYOP(flags)) { LOCK_LOG2(lk, "%s: %p fails the try operation", __func__, lk); error = EBUSY; break; } /* * Acquire the sleepqueue chain lock because we * probably will need to manipulate waiters flags. */ sleepq_lock(&lk->lock_object); x = lk->lk_lock; /* * If the lock has been released while we spun on * the sleepqueue chain lock, just try again. */ if (x == LK_UNLOCKED) { sleepq_release(&lk->lock_object); continue; } /* * The lock can be in the state where there is a * pending queue of waiters, but still no owner. * This happens when the lock is contested and an * owner is going to claim the lock. * If curthread is the one successfully acquiring it, * claim lock ownership and return, preserving waiters * flags. */ v = x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS); if ((x & ~v) == LK_UNLOCKED) { v &= ~LK_EXCLUSIVE_SPINNERS; if (atomic_cmpset_acq_ptr(&lk->lk_lock, x, tid | v)) { sleepq_release(&lk->lock_object); LOCK_LOG2(lk, "%s: %p claimed by a new writer", __func__, lk); break; } sleepq_release(&lk->lock_object); continue; } /* * Try to set the LK_EXCLUSIVE_WAITERS flag. If we * fail, loop back and retry. */ if ((x & LK_EXCLUSIVE_WAITERS) == 0) { if (!atomic_cmpset_ptr(&lk->lk_lock, x, x | LK_EXCLUSIVE_WAITERS)) { sleepq_release(&lk->lock_object); continue; } LOCK_LOG2(lk, "%s: %p set excl waiters flag", __func__, lk); } /* * Since we have been unable to acquire the * exclusive lock and the exclusive waiters flag * is set, we will sleep.
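 * (The sleeplk() call below releases the sleepqueue lock for us; it returns 0 * after a normal wakeup so this loop retries, ENOLCK when LK_SLEEPFAIL was * requested, or the error from an interrupted or timed-out sleep.)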
*/ error = sleeplk(lk, flags, ilk, iwmesg, ipri, itimo, SQ_EXCLUSIVE_QUEUE); flags &= ~LK_INTERLOCK; if (error) { LOCK_LOG3(lk, "%s: interrupted sleep for %p with %d", __func__, lk, error); break; } LOCK_LOG2(lk, "%s: %p resuming from the sleep queue", __func__, lk); } if (error == 0) { #ifdef LOCK_PROFILING lockmgr_note_exclusive_acquire(lk, contested, waittime, file, line, flags); #else lockmgr_note_exclusive_acquire(lk, 0, 0, file, line, flags); #endif } break; case LK_DOWNGRADE: _lockmgr_assert(lk, KA_XLOCKED, file, line); LOCK_LOG_LOCK("XDOWNGRADE", &lk->lock_object, 0, 0, file, line); WITNESS_DOWNGRADE(&lk->lock_object, 0, file, line); /* * Panic if the lock is recursed. */ if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) { if (flags & LK_INTERLOCK) class->lc_unlock(ilk); panic("%s: downgrade a recursed lockmgr %s @ %s:%d\n", __func__, iwmesg, file, line); } TD_SLOCKS_INC(curthread); /* * In order to preserve waiters flags, just spin. */ for (;;) { x = lk->lk_lock; MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0); x &= LK_ALL_WAITERS; if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid | x, LK_SHARERS_LOCK(1) | x)) break; cpu_spinwait(); } break; case LK_RELEASE: _lockmgr_assert(lk, KA_LOCKED, file, line); x = lk->lk_lock; if ((x & LK_SHARE) == 0) { /* * As a first option, treat the lock as if it has * no waiters. * Fix up the tid var if the lock has been disowned. */ if (LK_HOLDER(x) == LK_KERNPROC) tid = LK_KERNPROC; else { WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE, file, line); TD_LOCKS_DEC(curthread); } LOCK_LOG_LOCK("XUNLOCK", &lk->lock_object, 0, lk->lk_recurse, file, line); /* * The lock is held in exclusive mode. * If the lock is recursed also, then unrecurse it. */ if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) { LOCK_LOG2(lk, "%s: %p unrecursing", __func__, lk); lk->lk_recurse--; break; } if (tid != LK_KERNPROC) lock_profile_release_lock(&lk->lock_object); if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid, LK_UNLOCKED)) break; sleepq_lock(&lk->lock_object); x = lk->lk_lock; v = LK_UNLOCKED; /* * If the lock has exclusive waiters, give them * preference in order to avoid deadlock with * shared runners up. * If interruptible sleeps left the exclusive queue * empty avoid starvation of the threads sleeping * on the shared queue by giving them precedence * and cleaning up the exclusive waiters bit anyway. * Please note that lk_exslpfail count may be lying * about the real number of waiters with the * LK_SLEEPFAIL flag on because they may be used in * conjunction with interruptible sleeps so * lk_exslpfail might be considered an 'upper limit' * bound, including the edge cases. */ MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0); realexslp = sleepq_sleepcnt(&lk->lock_object, SQ_EXCLUSIVE_QUEUE); if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) { if (lk->lk_exslpfail < realexslp) { lk->lk_exslpfail = 0; queue = SQ_EXCLUSIVE_QUEUE; v |= (x & LK_SHARED_WAITERS); } else { lk->lk_exslpfail = 0; LOCK_LOG2(lk, "%s: %p has only LK_SLEEPFAIL sleepers", __func__, lk); LOCK_LOG2(lk, "%s: %p waking up threads on the exclusive queue", __func__, lk); wakeup_swapper = sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE); queue = SQ_SHARED_QUEUE; } } else { /* * Exclusive waiters sleeping with LK_SLEEPFAIL * on and using interruptible sleeps/timeout * may have left spurious lk_exslpfail counts * on, so clean it up anyway. */ lk->lk_exslpfail = 0; queue = SQ_SHARED_QUEUE; } LOCK_LOG3(lk, "%s: %p waking up threads on the %s queue", __func__, lk, queue == SQ_SHARED_QUEUE ?
"shared" : "exclusive"); atomic_store_rel_ptr(&lk->lk_lock, v); wakeup_swapper |= sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, 0, queue); sleepq_release(&lk->lock_object); break; } else wakeup_swapper = wakeupshlk(lk, file, line); break; case LK_DRAIN: if (LK_CAN_WITNESS(flags)) WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ? ilk : NULL); /* * Trying to drain a lock we already own will result in a * deadlock. */ if (lockmgr_xlocked(lk)) { if (flags & LK_INTERLOCK) class->lc_unlock(ilk); panic("%s: draining %s with the lock held @ %s:%d\n", __func__, iwmesg, file, line); } for (;;) { if (lk->lk_lock == LK_UNLOCKED && atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED, tid)) break; #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&lk->lock_object, &contested, &waittime); /* * If the lock is expected to not sleep just give up * and return. */ if (LK_TRYOP(flags)) { LOCK_LOG2(lk, "%s: %p fails the try operation", __func__, lk); error = EBUSY; break; } /* * Acquire the sleepqueue chain lock because we * probabilly will need to manipulate waiters flags. */ sleepq_lock(&lk->lock_object); x = lk->lk_lock; /* * if the lock has been released while we spun on * the sleepqueue chain lock just try again. */ if (x == LK_UNLOCKED) { sleepq_release(&lk->lock_object); continue; } v = x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS); if ((x & ~v) == LK_UNLOCKED) { v = (x & ~LK_EXCLUSIVE_SPINNERS); /* * If interruptible sleeps left the exclusive * queue empty avoid a starvation for the * threads sleeping on the shared queue by * giving them precedence and cleaning up the * exclusive waiters bit anyway. * Please note that lk_exslpfail count may be * lying about the real number of waiters with * the LK_SLEEPFAIL flag on because they may * be used in conjunction with interruptible * sleeps so lk_exslpfail might be considered * an 'upper limit' bound, including the edge * cases. */ if (v & LK_EXCLUSIVE_WAITERS) { queue = SQ_EXCLUSIVE_QUEUE; v &= ~LK_EXCLUSIVE_WAITERS; } else { /* * Exclusive waiters sleeping with * LK_SLEEPFAIL on and using * interruptible sleeps/timeout may * have left spourious lk_exslpfail * counts on, so clean it up anyway. */ MPASS(v & LK_SHARED_WAITERS); lk->lk_exslpfail = 0; queue = SQ_SHARED_QUEUE; v &= ~LK_SHARED_WAITERS; } if (queue == SQ_EXCLUSIVE_QUEUE) { realexslp = sleepq_sleepcnt(&lk->lock_object, SQ_EXCLUSIVE_QUEUE); if (lk->lk_exslpfail >= realexslp) { lk->lk_exslpfail = 0; queue = SQ_SHARED_QUEUE; v &= ~LK_SHARED_WAITERS; if (realexslp != 0) { LOCK_LOG2(lk, "%s: %p has only LK_SLEEPFAIL sleepers", __func__, lk); LOCK_LOG2(lk, "%s: %p waking up threads on the exclusive queue", __func__, lk); wakeup_swapper = sleepq_broadcast( &lk->lock_object, SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE); } } else lk->lk_exslpfail = 0; } if (!atomic_cmpset_ptr(&lk->lk_lock, x, v)) { sleepq_release(&lk->lock_object); continue; } LOCK_LOG3(lk, "%s: %p waking up all threads on the %s queue", __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" : "exclusive"); wakeup_swapper |= sleepq_broadcast( &lk->lock_object, SLEEPQ_LK, 0, queue); /* * If shared waiters have been woken up we need * to wait for one of them to acquire the lock * before to set the exclusive waiters in * order to avoid a deadlock. */ if (queue == SQ_SHARED_QUEUE) { for (v = lk->lk_lock; (v & LK_SHARE) && !LK_SHARERS(v); v = lk->lk_lock) cpu_spinwait(); } } /* * Try to set the LK_EXCLUSIVE_WAITERS flag. If we * fail, loop back and retry. 
*/ if ((x & LK_EXCLUSIVE_WAITERS) == 0) { if (!atomic_cmpset_ptr(&lk->lk_lock, x, x | LK_EXCLUSIVE_WAITERS)) { sleepq_release(&lk->lock_object); continue; } LOCK_LOG2(lk, "%s: %p set drain waiters flag", __func__, lk); } /* * Since we have been unable to acquire the * exclusive lock and the exclusive waiters flag * is set, we will sleep. */ if (flags & LK_INTERLOCK) { class->lc_unlock(ilk); flags &= ~LK_INTERLOCK; } GIANT_SAVE(); sleepq_add(&lk->lock_object, NULL, iwmesg, SLEEPQ_LK, SQ_EXCLUSIVE_QUEUE); sleepq_wait(&lk->lock_object, ipri & PRIMASK); GIANT_RESTORE(); LOCK_LOG2(lk, "%s: %p resuming from the sleep queue", __func__, lk); } if (error == 0) { lock_profile_obtain_lock_success(&lk->lock_object, contested, waittime, file, line); LOCK_LOG_LOCK("DRAIN", &lk->lock_object, 0, lk->lk_recurse, file, line); WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE | LK_TRYWIT(flags), file, line); TD_LOCKS_INC(curthread); STACK_SAVE(lk); } break; default: if (flags & LK_INTERLOCK) class->lc_unlock(ilk); panic("%s: unknown lockmgr request 0x%x\n", __func__, op); } if (flags & LK_INTERLOCK) class->lc_unlock(ilk); if (wakeup_swapper) kick_proc0(); return (error); } void _lockmgr_disown(struct lock *lk, const char *file, int line) { uintptr_t tid, x; if (SCHEDULER_STOPPED()) return; tid = (uintptr_t)curthread; _lockmgr_assert(lk, KA_XLOCKED, file, line); /* * Panic if the lock is recursed. */ if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) panic("%s: disown a recursed lockmgr @ %s:%d\n", __func__, file, line); /* * If the owner is already LK_KERNPROC, just skip the whole operation. */ if (LK_HOLDER(lk->lk_lock) != tid) return; lock_profile_release_lock(&lk->lock_object); LOCK_LOG_LOCK("XDISOWN", &lk->lock_object, 0, 0, file, line); WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE, file, line); TD_LOCKS_DEC(curthread); STACK_SAVE(lk); /* * In order to preserve waiters flags, just spin.
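 * (A sketch of the transition performed below, assuming the encoding used * throughout this file: tid | waiters becomes LK_KERNPROC | waiters through a * release CAS, retried until no concurrent update of the waiter bits interferes.)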
*/ for (;;) { x = lk->lk_lock; MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0); x &= LK_ALL_WAITERS; if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid | x, LK_KERNPROC | x)) return; cpu_spinwait(); } } void lockmgr_printinfo(const struct lock *lk) { struct thread *td; uintptr_t x; if (lk->lk_lock == LK_UNLOCKED) printf("lock type %s: UNLOCKED\n", lk->lock_object.lo_name); else if (lk->lk_lock & LK_SHARE) printf("lock type %s: SHARED (count %ju)\n", lk->lock_object.lo_name, (uintmax_t)LK_SHARERS(lk->lk_lock)); else { td = lockmgr_xholder(lk); if (td == (struct thread *)LK_KERNPROC) printf("lock type %s: EXCL by KERNPROC\n", lk->lock_object.lo_name); else printf("lock type %s: EXCL by thread %p " "(pid %d, %s, tid %d)\n", lk->lock_object.lo_name, td, td->td_proc->p_pid, td->td_proc->p_comm, td->td_tid); } x = lk->lk_lock; if (x & LK_EXCLUSIVE_WAITERS) printf(" with exclusive waiters pending\n"); if (x & LK_SHARED_WAITERS) printf(" with shared waiters pending\n"); if (x & LK_EXCLUSIVE_SPINNERS) printf(" with exclusive spinners pending\n"); STACK_PRINT(lk); } int lockstatus(const struct lock *lk) { uintptr_t v, x; int ret; ret = LK_SHARED; x = lk->lk_lock; v = LK_HOLDER(x); if ((x & LK_SHARE) == 0) { if (v == (uintptr_t)curthread || v == LK_KERNPROC) ret = LK_EXCLUSIVE; else ret = LK_EXCLOTHER; } else if (x == LK_UNLOCKED) ret = 0; return (ret); } #ifdef INVARIANT_SUPPORT FEATURE(invariant_support, "Support for modules compiled with INVARIANTS option"); #ifndef INVARIANTS #undef _lockmgr_assert #endif void _lockmgr_assert(const struct lock *lk, int what, const char *file, int line) { int slocked = 0; if (panicstr != NULL) return; switch (what) { case KA_SLOCKED: case KA_SLOCKED | KA_NOTRECURSED: case KA_SLOCKED | KA_RECURSED: slocked = 1; case KA_LOCKED: case KA_LOCKED | KA_NOTRECURSED: case KA_LOCKED | KA_RECURSED: #ifdef WITNESS /* * We cannot trust WITNESS if the lock is held in exclusive * mode and a call to lockmgr_disown() happened. * Work around this by skipping the check if the lock is held * in exclusive mode, even for the KA_LOCKED case. */ if (slocked || (lk->lk_lock & LK_SHARE)) { witness_assert(&lk->lock_object, what, file, line); break; } #endif if (lk->lk_lock == LK_UNLOCKED || ((lk->lk_lock & LK_SHARE) == 0 && (slocked || (!lockmgr_xlocked(lk) && !lockmgr_disowned(lk))))) panic("Lock %s not %slocked @ %s:%d\n", lk->lock_object.lo_name, slocked ?
"share" : "", file, line); if ((lk->lk_lock & LK_SHARE) == 0) { if (lockmgr_recursed(lk)) { if (what & KA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", lk->lock_object.lo_name, file, line); } else if (what & KA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", lk->lock_object.lo_name, file, line); } break; case KA_XLOCKED: case KA_XLOCKED | KA_NOTRECURSED: case KA_XLOCKED | KA_RECURSED: if (!lockmgr_xlocked(lk) && !lockmgr_disowned(lk)) panic("Lock %s not exclusively locked @ %s:%d\n", lk->lock_object.lo_name, file, line); if (lockmgr_recursed(lk)) { if (what & KA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", lk->lock_object.lo_name, file, line); } else if (what & KA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", lk->lock_object.lo_name, file, line); break; case KA_UNLOCKED: if (lockmgr_xlocked(lk) || lockmgr_disowned(lk)) panic("Lock %s exclusively locked @ %s:%d\n", lk->lock_object.lo_name, file, line); break; default: panic("Unknown lockmgr assertion: %d @ %s:%d\n", what, file, line); } } #endif #ifdef DDB int lockmgr_chain(struct thread *td, struct thread **ownerp) { struct lock *lk; lk = td->td_wchan; if (LOCK_CLASS(&lk->lock_object) != &lock_class_lockmgr) return (0); db_printf("blocked on lockmgr %s", lk->lock_object.lo_name); if (lk->lk_lock & LK_SHARE) db_printf("SHARED (count %ju)\n", (uintmax_t)LK_SHARERS(lk->lk_lock)); else db_printf("EXCL\n"); *ownerp = lockmgr_xholder(lk); return (1); } static void db_show_lockmgr(const struct lock_object *lock) { struct thread *td; const struct lock *lk; lk = (const struct lock *)lock; db_printf(" state: "); if (lk->lk_lock == LK_UNLOCKED) db_printf("UNLOCKED\n"); else if (lk->lk_lock & LK_SHARE) db_printf("SLOCK: %ju\n", (uintmax_t)LK_SHARERS(lk->lk_lock)); else { td = lockmgr_xholder(lk); if (td == (struct thread *)LK_KERNPROC) db_printf("XLOCK: LK_KERNPROC\n"); else db_printf("XLOCK: %p (tid %d, pid %d, \"%s\")\n", td, td->td_tid, td->td_proc->p_pid, td->td_proc->p_comm); if (lockmgr_recursed(lk)) db_printf(" recursed: %d\n", lk->lk_recurse); } db_printf(" waiters: "); switch (lk->lk_lock & LK_ALL_WAITERS) { case LK_SHARED_WAITERS: db_printf("shared\n"); break; case LK_EXCLUSIVE_WAITERS: db_printf("exclusive\n"); break; case LK_ALL_WAITERS: db_printf("shared and exclusive\n"); break; default: db_printf("none\n"); } db_printf(" spinners: "); if (lk->lk_lock & LK_EXCLUSIVE_SPINNERS) db_printf("exclusive\n"); else db_printf("none\n"); } #endif Index: head/sys/kern/kern_lockstat.c =================================================================== --- head/sys/kern/kern_lockstat.c (revision 326270) +++ head/sys/kern/kern_lockstat.c (revision 326271) @@ -1,82 +1,84 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright 2008-2009 Stacey Son * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include SDT_PROVIDER_DEFINE(lockstat); SDT_PROBE_DEFINE1(lockstat, , , adaptive__acquire, "struct mtx *"); SDT_PROBE_DEFINE1(lockstat, , , adaptive__release, "struct mtx *"); SDT_PROBE_DEFINE2(lockstat, , , adaptive__spin, "struct mtx *", "uint64_t"); SDT_PROBE_DEFINE2(lockstat, , , adaptive__block, "struct mtx *", "uint64_t"); SDT_PROBE_DEFINE1(lockstat, , , spin__acquire, "struct mtx *"); SDT_PROBE_DEFINE1(lockstat, , , spin__release, "struct mtx *"); SDT_PROBE_DEFINE2(lockstat, , , spin__spin, "struct mtx *", "uint64_t"); SDT_PROBE_DEFINE2(lockstat, , , rw__acquire, "struct rwlock *", "int"); SDT_PROBE_DEFINE2(lockstat, , , rw__release, "struct rwlock *", "int"); SDT_PROBE_DEFINE5(lockstat, , , rw__block, "struct rwlock *", "uint64_t", "int", "int", "int"); SDT_PROBE_DEFINE2(lockstat, , , rw__spin, "struct rwlock *", "uint64_t"); SDT_PROBE_DEFINE1(lockstat, , , rw__upgrade, "struct rwlock *"); SDT_PROBE_DEFINE1(lockstat, , , rw__downgrade, "struct rwlock *"); SDT_PROBE_DEFINE2(lockstat, , , sx__acquire, "struct sx *", "int"); SDT_PROBE_DEFINE2(lockstat, , , sx__release, "struct sx *", "int"); SDT_PROBE_DEFINE5(lockstat, , , sx__block, "struct sx *", "uint64_t", "int", "int", "int"); SDT_PROBE_DEFINE2(lockstat, , , sx__spin, "struct sx *", "uint64_t"); SDT_PROBE_DEFINE1(lockstat, , , sx__upgrade, "struct sx *"); SDT_PROBE_DEFINE1(lockstat, , , sx__downgrade, "struct sx *"); SDT_PROBE_DEFINE2(lockstat, , , thread__spin, "struct mtx *", "uint64_t"); volatile int __read_frequently lockstat_enabled; uint64_t lockstat_nsecs(struct lock_object *lo) { struct bintime bt; uint64_t ns; if (!lockstat_enabled) return (0); if ((lo->lo_flags & LO_NOPROFILE) != 0) return (0); binuptime(&bt); ns = bt.sec * (uint64_t)1000000000; ns += ((uint64_t)1000000000 * (uint32_t)(bt.frac >> 32)) >> 32; return (ns); } Index: head/sys/kern/kern_loginclass.c =================================================================== --- head/sys/kern/kern_loginclass.c (revision 326270) +++ head/sys/kern/kern_loginclass.c (revision 326271) @@ -1,256 +1,258 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 The FreeBSD Foundation * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Processes may set login class name using setloginclass(2). This * is usually done through call to setusercontext(3), by programs * such as login(1), based on information from master.passwd(5). Kernel * uses this information to enforce per-class resource limits. Current * login class can be determined using id(1). Login class is inherited * from the parent process during fork(2). If not set, it defaults * to "default". * * Code in this file implements setloginclass(2) and getloginclass(2) * system calls, and maintains class name storage and retrieval. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_LOGINCLASS, "loginclass", "loginclass structures"); LIST_HEAD(, loginclass) loginclasses; /* * Lock protecting loginclasses list. */ static struct rwlock loginclasses_lock; RW_SYSINIT(loginclasses_init, &loginclasses_lock, "loginclasses lock"); void loginclass_hold(struct loginclass *lc) { refcount_acquire(&lc->lc_refcount); } void loginclass_free(struct loginclass *lc) { int old; old = lc->lc_refcount; if (old > 1 && atomic_cmpset_int(&lc->lc_refcount, old, old - 1)) return; rw_wlock(&loginclasses_lock); if (!refcount_release(&lc->lc_refcount)) { rw_wunlock(&loginclasses_lock); return; } racct_destroy(&lc->lc_racct); LIST_REMOVE(lc, lc_next); rw_wunlock(&loginclasses_lock); free(lc, M_LOGINCLASS); } /* * Look up a loginclass struct for the parameter name. * loginclasses_lock must be locked. * Increase refcount on loginclass struct returned. */ static struct loginclass * loginclass_lookup(const char *name) { struct loginclass *lc; rw_assert(&loginclasses_lock, RA_LOCKED); LIST_FOREACH(lc, &loginclasses, lc_next) if (strcmp(name, lc->lc_name) == 0) { loginclass_hold(lc); break; } return (lc); } /* * Return loginclass structure with a corresponding name. Not * performance critical, as it's used mainly by setloginclass(2), * which happens once per login session. Caller has to use * loginclass_free() on the returned value when it's no longer * needed. 
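 * * A minimal usage sketch (mirroring what sys_setloginclass() below does; "staff" * is only an example class name): * * lc = loginclass_find("staff"); * if (lc == NULL) * return (EINVAL); * ... store lc in a ucred ... * loginclass_free(lc);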
*/ struct loginclass * loginclass_find(const char *name) { struct loginclass *lc, *new_lc; if (name[0] == '\0' || strlen(name) >= MAXLOGNAME) return (NULL); lc = curthread->td_ucred->cr_loginclass; if (strcmp(name, lc->lc_name) == 0) { loginclass_hold(lc); return (lc); } rw_rlock(&loginclasses_lock); lc = loginclass_lookup(name); rw_runlock(&loginclasses_lock); if (lc != NULL) return (lc); new_lc = malloc(sizeof(*new_lc), M_LOGINCLASS, M_ZERO | M_WAITOK); racct_create(&new_lc->lc_racct); refcount_init(&new_lc->lc_refcount, 1); strcpy(new_lc->lc_name, name); rw_wlock(&loginclasses_lock); /* * There's a chance someone created our loginclass while we * were in malloc and not holding the lock, so we have to * make sure we don't insert a duplicate loginclass. */ if ((lc = loginclass_lookup(name)) == NULL) { LIST_INSERT_HEAD(&loginclasses, new_lc, lc_next); rw_wunlock(&loginclasses_lock); lc = new_lc; } else { rw_wunlock(&loginclasses_lock); racct_destroy(&new_lc->lc_racct); free(new_lc, M_LOGINCLASS); } return (lc); } /* * Get login class name. */ #ifndef _SYS_SYSPROTO_H_ struct getloginclass_args { char *namebuf; size_t namelen; }; #endif /* ARGSUSED */ int sys_getloginclass(struct thread *td, struct getloginclass_args *uap) { struct loginclass *lc; size_t lcnamelen; lc = td->td_ucred->cr_loginclass; lcnamelen = strlen(lc->lc_name) + 1; if (lcnamelen > uap->namelen) return (ERANGE); return (copyout(lc->lc_name, uap->namebuf, lcnamelen)); } /* * Set login class name. */ #ifndef _SYS_SYSPROTO_H_ struct setloginclass_args { const char *namebuf; }; #endif /* ARGSUSED */ int sys_setloginclass(struct thread *td, struct setloginclass_args *uap) { struct proc *p = td->td_proc; int error; char lcname[MAXLOGNAME]; struct loginclass *newlc; struct ucred *newcred, *oldcred; error = priv_check(td, PRIV_PROC_SETLOGINCLASS); if (error != 0) return (error); error = copyinstr(uap->namebuf, lcname, sizeof(lcname), NULL); if (error != 0) return (error); newlc = loginclass_find(lcname); if (newlc == NULL) return (EINVAL); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); newcred->cr_loginclass = newlc; proc_set_cred(p, newcred); PROC_UNLOCK(p); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); #endif loginclass_free(oldcred->cr_loginclass); crfree(oldcred); return (0); } void loginclass_racct_foreach(void (*callback)(struct racct *racct, void *arg2, void *arg3), void (*pre)(void), void (*post)(void), void *arg2, void *arg3) { struct loginclass *lc; rw_rlock(&loginclasses_lock); if (pre != NULL) (pre)(); LIST_FOREACH(lc, &loginclasses, lc_next) (callback)(lc->lc_racct, arg2, arg3); if (post != NULL) (post)(); rw_runlock(&loginclasses_lock); } Index: head/sys/kern/kern_mbuf.c =================================================================== --- head/sys/kern/kern_mbuf.c (revision 326270) +++ head/sys/kern/kern_mbuf.c (revision 326271) @@ -1,944 +1,946 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004, 2005, * Bosko Milekic . All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_param.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA * Zones. * * Mbuf Clusters (2K, contiguous) are allocated from the Cluster * Zone. The Zone can be capped at kern.ipc.nmbclusters, if the * administrator so desires. * * Mbufs are allocated from a UMA Master Zone called the Mbuf * Zone. * * Additionally, FreeBSD provides a Packet Zone, which it * configures as a Secondary Zone to the Mbuf Master Zone, * thus sharing backend Slab kegs with the Mbuf Master Zone. * * Thus common-case allocations and locking are simplified: * * m_clget() m_getcl() * | | * | .------------>[(Packet Cache)] m_get(), m_gethdr() * | | [ Packet ] | * [(Cluster Cache)] [ Secondary ] [ (Mbuf Cache) ] * [ Cluster Zone ] [ Zone ] [ Mbuf Master Zone ] * | \________ | * [ Cluster Keg ] \ / * | [ Mbuf Keg ] * [ Cluster Slabs ] | * | [ Mbuf Slabs ] * \____________(VM)_________________/ * * * Whenever an object is allocated with uma_zalloc() out of * one of the Zones its _ctor_ function is executed. The same * for any deallocation through uma_zfree() the _dtor_ function * is executed. * * Caches are per-CPU and are filled from the Master Zone. * * Whenever an object is allocated from the underlying global * memory pool it gets pre-initialized with the _zinit_ functions. * When the Keg's are overfull objects get decommissioned with * _zfini_ functions and free'd back to the global memory pool. * */ int nmbufs; /* limits number of mbufs */ int nmbclusters; /* limits number of mbuf clusters */ int nmbjumbop; /* limits number of page size jumbo clusters */ int nmbjumbo9; /* limits number of 9k jumbo clusters */ int nmbjumbo16; /* limits number of 16k jumbo clusters */ static quad_t maxmbufmem; /* overall real memory limit for all mbufs */ SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxmbufmem, 0, "Maximum real memory allocatable to various mbuf types"); /* * tunable_mbinit() has to be run before any mbuf allocations are done. */ static void tunable_mbinit(void *dummy) { quad_t realmem; /* * The default limit for all mbuf related memory is 1/2 of all * available kernel memory (physical or kmem). * At most it can be 3/4 of available kernel memory. 
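 * For a worked example (illustrative numbers only): with 8 GB of usable kernel * memory, maxmbufmem defaults to 4 GB, and a kern.ipc.maxmbufmem tunable is * clamped to at most 6 GB; nmbclusters, if left at zero, then defaults to * maxmbufmem / MCLBYTES / 4, i.e. 4 GB / 2 KB / 4 = 524288 standard clusters.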
*/ realmem = qmin((quad_t)physmem * PAGE_SIZE, vm_kmem_size); maxmbufmem = realmem / 2; TUNABLE_QUAD_FETCH("kern.ipc.maxmbufmem", &maxmbufmem); if (maxmbufmem > realmem / 4 * 3) maxmbufmem = realmem / 4 * 3; TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters); if (nmbclusters == 0) nmbclusters = maxmbufmem / MCLBYTES / 4; TUNABLE_INT_FETCH("kern.ipc.nmbjumbop", &nmbjumbop); if (nmbjumbop == 0) nmbjumbop = maxmbufmem / MJUMPAGESIZE / 4; TUNABLE_INT_FETCH("kern.ipc.nmbjumbo9", &nmbjumbo9); if (nmbjumbo9 == 0) nmbjumbo9 = maxmbufmem / MJUM9BYTES / 6; TUNABLE_INT_FETCH("kern.ipc.nmbjumbo16", &nmbjumbo16); if (nmbjumbo16 == 0) nmbjumbo16 = maxmbufmem / MJUM16BYTES / 6; /* * We need at least as many mbufs as we have clusters of * the various types added together. */ TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs); if (nmbufs < nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) nmbufs = lmax(maxmbufmem / MSIZE / 5, nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16); } SYSINIT(tunable_mbinit, SI_SUB_KMEM, SI_ORDER_MIDDLE, tunable_mbinit, NULL); static int sysctl_nmbclusters(SYSCTL_HANDLER_ARGS) { int error, newnmbclusters; newnmbclusters = nmbclusters; error = sysctl_handle_int(oidp, &newnmbclusters, 0, req); if (error == 0 && req->newptr && newnmbclusters != nmbclusters) { if (newnmbclusters > nmbclusters && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbclusters = newnmbclusters; nmbclusters = uma_zone_set_max(zone_clust, nmbclusters); EVENTHANDLER_INVOKE(nmbclusters_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters, CTLTYPE_INT|CTLFLAG_RW, &nmbclusters, 0, sysctl_nmbclusters, "IU", "Maximum number of mbuf clusters allowed"); static int sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbop; newnmbjumbop = nmbjumbop; error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req); if (error == 0 && req->newptr && newnmbjumbop != nmbjumbop) { if (newnmbjumbop > nmbjumbop && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbjumbop = newnmbjumbop; nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop, CTLTYPE_INT|CTLFLAG_RW, &nmbjumbop, 0, sysctl_nmbjumbop, "IU", "Maximum number of mbuf page size jumbo clusters allowed"); static int sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbo9; newnmbjumbo9 = nmbjumbo9; error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req); if (error == 0 && req->newptr && newnmbjumbo9 != nmbjumbo9) { if (newnmbjumbo9 > nmbjumbo9 && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbjumbo9 = newnmbjumbo9; nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9, CTLTYPE_INT|CTLFLAG_RW, &nmbjumbo9, 0, sysctl_nmbjumbo9, "IU", "Maximum number of mbuf 9k jumbo clusters allowed"); static int sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbo16; newnmbjumbo16 = nmbjumbo16; error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req); if (error == 0 && req->newptr && newnmbjumbo16 != nmbjumbo16) { if (newnmbjumbo16 > nmbjumbo16 && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbjumbo16 = newnmbjumbo16; nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16, CTLTYPE_INT|CTLFLAG_RW, &nmbjumbo16, 0, sysctl_nmbjumbo16, "IU", "Maximum number of mbuf 16k jumbo clusters allowed"); static int 
sysctl_nmbufs(SYSCTL_HANDLER_ARGS) { int error, newnmbufs; newnmbufs = nmbufs; error = sysctl_handle_int(oidp, &newnmbufs, 0, req); if (error == 0 && req->newptr && newnmbufs != nmbufs) { if (newnmbufs > nmbufs) { nmbufs = newnmbufs; nmbufs = uma_zone_set_max(zone_mbuf, nmbufs); EVENTHANDLER_INVOKE(nmbufs_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs, CTLTYPE_INT|CTLFLAG_RW, &nmbufs, 0, sysctl_nmbufs, "IU", "Maximum number of mbufs allowed"); /* * Zones from which we allocate. */ uma_zone_t zone_mbuf; uma_zone_t zone_clust; uma_zone_t zone_pack; uma_zone_t zone_jumbop; uma_zone_t zone_jumbo9; uma_zone_t zone_jumbo16; /* * Local prototypes. */ static int mb_ctor_mbuf(void *, int, void *, int); static int mb_ctor_clust(void *, int, void *, int); static int mb_ctor_pack(void *, int, void *, int); static void mb_dtor_mbuf(void *, int, void *); static void mb_dtor_pack(void *, int, void *); static int mb_zinit_pack(void *, int, int); static void mb_zfini_pack(void *, int); static void mb_reclaim(uma_zone_t, int); static void *mbuf_jumbo_alloc(uma_zone_t, vm_size_t, uint8_t *, int); /* Ensure that MSIZE is a power of 2. */ CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE); /* * Initialize FreeBSD Network buffer allocation. */ static void mbuf_init(void *dummy) { /* * Configure UMA zones for Mbufs, Clusters, and Packets. */ zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE, mb_ctor_mbuf, mb_dtor_mbuf, #ifdef INVARIANTS trash_init, trash_fini, #else NULL, NULL, #endif MSIZE - 1, UMA_ZONE_MAXBUCKET); if (nmbufs > 0) nmbufs = uma_zone_set_max(zone_mbuf, nmbufs); uma_zone_set_warning(zone_mbuf, "kern.ipc.nmbufs limit reached"); uma_zone_set_maxaction(zone_mbuf, mb_reclaim); zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES, mb_ctor_clust, #ifdef INVARIANTS trash_dtor, trash_init, trash_fini, #else NULL, NULL, NULL, #endif UMA_ALIGN_PTR, 0); if (nmbclusters > 0) nmbclusters = uma_zone_set_max(zone_clust, nmbclusters); uma_zone_set_warning(zone_clust, "kern.ipc.nmbclusters limit reached"); uma_zone_set_maxaction(zone_clust, mb_reclaim); zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack, mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf); /* Make jumbo frame zone too. Page size, 9k and 16k. 
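 *
 * Each jumbo zone is created with the same call shape already used
 * for zone_clust above; a minimal sketch of the recurring pattern,
 * using the names from this file (the trash_* hooks are NULL unless
 * INVARIANTS is compiled in):
 *
 *	zone = uma_zcreate(name, size, mb_ctor_clust,
 *	    trash_dtor, trash_init, trash_fini, UMA_ALIGN_PTR, 0);
 *	uma_zone_set_allocf(zone, mbuf_jumbo_alloc);  (9k and 16k only)
 *	uma_zone_set_max(zone, limit);                (if a limit is set)
 *	uma_zone_set_warning(zone, "kern.ipc.<tunable> limit reached");
 *	uma_zone_set_maxaction(zone, mb_reclaim);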
*/ zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE, mb_ctor_clust, #ifdef INVARIANTS trash_dtor, trash_init, trash_fini, #else NULL, NULL, NULL, #endif UMA_ALIGN_PTR, 0); if (nmbjumbop > 0) nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop); uma_zone_set_warning(zone_jumbop, "kern.ipc.nmbjumbop limit reached"); uma_zone_set_maxaction(zone_jumbop, mb_reclaim); zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES, mb_ctor_clust, #ifdef INVARIANTS trash_dtor, trash_init, trash_fini, #else NULL, NULL, NULL, #endif UMA_ALIGN_PTR, 0); uma_zone_set_allocf(zone_jumbo9, mbuf_jumbo_alloc); if (nmbjumbo9 > 0) nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9); uma_zone_set_warning(zone_jumbo9, "kern.ipc.nmbjumbo9 limit reached"); uma_zone_set_maxaction(zone_jumbo9, mb_reclaim); zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES, mb_ctor_clust, #ifdef INVARIANTS trash_dtor, trash_init, trash_fini, #else NULL, NULL, NULL, #endif UMA_ALIGN_PTR, 0); uma_zone_set_allocf(zone_jumbo16, mbuf_jumbo_alloc); if (nmbjumbo16 > 0) nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16); uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached"); uma_zone_set_maxaction(zone_jumbo16, mb_reclaim); /* * Hook event handler for low-memory situation, used to * drain protocols and push data back to the caches (UMA * later pushes it back to VM). */ EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL, EVENTHANDLER_PRI_FIRST); } SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL); /* * UMA backend page allocator for the jumbo frame zones. * * Allocates kernel virtual memory that is backed by contiguous physical * pages. */ static void * mbuf_jumbo_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait) { /* Inform UMA that this allocator uses kernel_map/object. */ *flags = UMA_SLAB_KERNEL; return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, (vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT)); } /* * Constructor for Mbuf master zone. * * The 'arg' pointer points to a mb_args structure which * contains call-specific information required to support the * mbuf allocation API. See mbuf.h. */ static int mb_ctor_mbuf(void *mem, int size, void *arg, int how) { struct mbuf *m; struct mb_args *args; int error; int flags; short type; #ifdef INVARIANTS trash_ctor(mem, size, arg, how); #endif args = (struct mb_args *)arg; type = args->type; /* * The mbuf is initialized later. The caller has the * responsibility to set up any MAC labels too. */ if (type == MT_NOINIT) return (0); m = (struct mbuf *)mem; flags = args->flags; MPASS((flags & M_NOFREE) == 0); error = m_init(m, how, type, flags); return (error); } /* * The Mbuf master zone destructor. */ static void mb_dtor_mbuf(void *mem, int size, void *arg) { struct mbuf *m; unsigned long flags; m = (struct mbuf *)mem; flags = (unsigned long)arg; KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__)); if (!(flags & MB_DTOR_SKIP) && (m->m_flags & M_PKTHDR) && !SLIST_EMPTY(&m->m_pkthdr.tags)) m_tag_delete_chain(m, NULL); #ifdef INVARIANTS trash_dtor(mem, size, arg); #endif } /* * The Mbuf Packet zone destructor. */ static void mb_dtor_pack(void *mem, int size, void *arg) { struct mbuf *m; m = (struct mbuf *)mem; if ((m->m_flags & M_PKTHDR) != 0) m_tag_delete_chain(m, NULL); /* Make sure we've got a clean cluster back. 
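 * (Recall the lifecycle sketched at the top of the file: this
 * destructor runs on every uma_zfree(zone_pack, m), while the cluster
 * stays attached to the mbuf for as long as the object sits in the
 * packet zone cache; only mb_zfini_pack() below detaches it on the
 * transition back to the keg slab.)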
*/ KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__)); KASSERT(m->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__)); KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__)); KASSERT(m->m_ext.ext_arg1 == NULL, ("%s: ext_arg1 != NULL", __func__)); KASSERT(m->m_ext.ext_arg2 == NULL, ("%s: ext_arg2 != NULL", __func__)); KASSERT(m->m_ext.ext_size == MCLBYTES, ("%s: ext_size != MCLBYTES", __func__)); KASSERT(m->m_ext.ext_type == EXT_PACKET, ("%s: ext_type != EXT_PACKET", __func__)); #ifdef INVARIANTS trash_dtor(m->m_ext.ext_buf, MCLBYTES, arg); #endif /* * If there are processes blocked on zone_clust, waiting for pages * to be freed up, * cause them to be woken up by draining the * packet zone. We are exposed to a race here * (in the check for * the UMA_ZFLAG_FULL) where we might miss the flag set, but that * is deliberate. We don't want to acquire the zone lock for every * mbuf free. */ if (uma_zone_exhausted_nolock(zone_clust)) zone_drain(zone_pack); } /* * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor. * * Here the 'arg' pointer points to the Mbuf which we * are configuring cluster storage for. If 'arg' is * empty we allocate just the cluster without attaching * it to an mbuf. See mbuf.h. */ static int mb_ctor_clust(void *mem, int size, void *arg, int how) { struct mbuf *m; #ifdef INVARIANTS trash_ctor(mem, size, arg, how); #endif m = (struct mbuf *)arg; if (m != NULL) { m->m_ext.ext_buf = (char *)mem; m->m_data = m->m_ext.ext_buf; m->m_flags |= M_EXT; m->m_ext.ext_free = NULL; m->m_ext.ext_arg1 = NULL; m->m_ext.ext_arg2 = NULL; m->m_ext.ext_size = size; m->m_ext.ext_type = m_gettype(size); m->m_ext.ext_flags = EXT_FLAG_EMBREF; m->m_ext.ext_count = 1; } return (0); } /* * The Packet secondary zone's init routine, executed on the * object's transition from mbuf keg slab to zone cache. */ static int mb_zinit_pack(void *mem, int size, int how) { struct mbuf *m; m = (struct mbuf *)mem; /* m is virgin. */ if (uma_zalloc_arg(zone_clust, m, how) == NULL || m->m_ext.ext_buf == NULL) return (ENOMEM); m->m_ext.ext_type = EXT_PACKET; /* Override. */ #ifdef INVARIANTS trash_init(m->m_ext.ext_buf, MCLBYTES, how); #endif return (0); } /* * The Packet secondary zone's fini routine, executed on the * object's transition from zone cache to keg slab. */ static void mb_zfini_pack(void *mem, int size) { struct mbuf *m; m = (struct mbuf *)mem; #ifdef INVARIANTS trash_fini(m->m_ext.ext_buf, MCLBYTES); #endif uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL); #ifdef INVARIANTS trash_dtor(mem, size, NULL); #endif } /* * The "packet" keg constructor. */ static int mb_ctor_pack(void *mem, int size, void *arg, int how) { struct mbuf *m; struct mb_args *args; int error, flags; short type; m = (struct mbuf *)mem; args = (struct mb_args *)arg; flags = args->flags; type = args->type; MPASS((flags & M_NOFREE) == 0); #ifdef INVARIANTS trash_ctor(m->m_ext.ext_buf, MCLBYTES, arg, how); #endif error = m_init(m, how, type, flags); /* m_ext is already initialized. */ m->m_data = m->m_ext.ext_buf; m->m_flags = (flags | M_EXT); return (error); } /* * This is the protocol drain routine. Called by UMA whenever any of the * mbuf zones is close to its limit. * * No locks should be held when this is called. The drain routines have to * presently acquire some locks which raises the possibility of lock order * reversal.
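 * Concretely, mb_reclaim() below walks every registered domain and
 * invokes each protocol's pr_drain hook (e.g. tcp_drain()), so it
 * must not be entered while holding any lock those hooks might also
 * take.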
*/ static void mb_reclaim(uma_zone_t zone __unused, int pending __unused) { struct domain *dp; struct protosw *pr; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL, __func__); for (dp = domains; dp != NULL; dp = dp->dom_next) for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_drain != NULL) (*pr->pr_drain)(); } /* * Clean up after mbufs with M_EXT storage attached to them if the * reference count hits 1. */ void mb_free_ext(struct mbuf *m) { volatile u_int *refcnt; struct mbuf *mref; int freembuf; KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m)); /* See if this is the mbuf that holds the embedded refcount. */ if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { refcnt = &m->m_ext.ext_count; mref = m; } else { KASSERT(m->m_ext.ext_cnt != NULL, ("%s: no refcounting pointer on %p", __func__, m)); refcnt = m->m_ext.ext_cnt; mref = __containerof(refcnt, struct mbuf, m_ext.ext_count); } /* * Check if the header is embedded in the cluster. It is * important that we can't touch any of the mbuf fields * after we have freed the external storage, since mbuf * could have been embedded in it. For now, the mbufs * embedded into the cluster are always of type EXT_EXTREF, * and for this type we won't free the mref. */ if (m->m_flags & M_NOFREE) { freembuf = 0; KASSERT(m->m_ext.ext_type == EXT_EXTREF, ("%s: no-free mbuf %p has wrong type", __func__, m)); } else freembuf = 1; /* Free attached storage if this mbuf is the only reference to it. */ if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) { switch (m->m_ext.ext_type) { case EXT_PACKET: /* The packet zone is special. */ if (*refcnt == 0) *refcnt = 1; uma_zfree(zone_pack, mref); break; case EXT_CLUSTER: uma_zfree(zone_clust, m->m_ext.ext_buf); uma_zfree(zone_mbuf, mref); break; case EXT_JUMBOP: uma_zfree(zone_jumbop, m->m_ext.ext_buf); uma_zfree(zone_mbuf, mref); break; case EXT_JUMBO9: uma_zfree(zone_jumbo9, m->m_ext.ext_buf); uma_zfree(zone_mbuf, mref); break; case EXT_JUMBO16: uma_zfree(zone_jumbo16, m->m_ext.ext_buf); uma_zfree(zone_mbuf, mref); break; case EXT_SFBUF: case EXT_NET_DRV: case EXT_MOD_TYPE: case EXT_DISPOSABLE: KASSERT(mref->m_ext.ext_free != NULL, ("%s: ext_free not set", __func__)); mref->m_ext.ext_free(mref); uma_zfree(zone_mbuf, mref); break; case EXT_EXTREF: KASSERT(m->m_ext.ext_free != NULL, ("%s: ext_free not set", __func__)); m->m_ext.ext_free(m); break; default: KASSERT(m->m_ext.ext_type == 0, ("%s: unknown ext_type", __func__)); } } if (freembuf && m != mref) uma_zfree(zone_mbuf, m); } /* * Official mbuf(9) allocation KPI for stack and drivers: * * m_get() - a single mbuf without any attachments, sys/mbuf.h. * m_gethdr() - a single mbuf initialized as M_PKTHDR, sys/mbuf.h. * m_getcl() - an mbuf + 2k cluster, sys/mbuf.h. * m_clget() - attach cluster to already allocated mbuf. * m_cljget() - attach jumbo cluster to already allocated mbuf. * m_get2() - allocate minimum mbuf that would fit size argument. * m_getm2() - allocate a chain of mbufs/clusters. * m_extadd() - attach external cluster to mbuf. * * m_free() - free single mbuf with its tags and ext, sys/mbuf.h. * m_freem() - free chain of mbufs. */ int m_clget(struct mbuf *m, int how) { KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT", __func__, m)); m->m_ext.ext_buf = (char *)NULL; uma_zalloc_arg(zone_clust, m, how); /* * On a cluster allocation failure, drain the packet zone and retry, * we might be able to loosen a few clusters up on the drain. 
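 *
 * A typical caller-side pattern, as a hedged sketch (hypothetical
 * driver code, not taken from this file):
 *
 *	m = m_gethdr(M_NOWAIT, MT_DATA);
 *	if (m != NULL && !m_clget(m, M_NOWAIT)) {
 *		m_freem(m);		(no cluster could be attached)
 *		m = NULL;
 *	}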
*/ if ((how & M_NOWAIT) && (m->m_ext.ext_buf == NULL)) { zone_drain(zone_pack); uma_zalloc_arg(zone_clust, m, how); } MBUF_PROBE2(m__clget, m, how); return (m->m_flags & M_EXT); } /* * m_cljget() is different from m_clget() as it can allocate clusters without * attaching them to an mbuf. In that case the return value is the pointer * to the cluster of the requested size. If an mbuf was specified, it gets * the cluster attached to it and the return value can be safely ignored. * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES. */ void * m_cljget(struct mbuf *m, int how, int size) { uma_zone_t zone; void *retval; if (m != NULL) { KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT", __func__, m)); m->m_ext.ext_buf = NULL; } zone = m_getzone(size); retval = uma_zalloc_arg(zone, m, how); MBUF_PROBE4(m__cljget, m, how, size, retval); return (retval); } /* * m_get2() allocates minimum mbuf that would fit "size" argument. */ struct mbuf * m_get2(int size, int how, short type, int flags) { struct mb_args args; struct mbuf *m, *n; args.flags = flags; args.type = type; if (size <= MHLEN || (size <= MLEN && (flags & M_PKTHDR) == 0)) return (uma_zalloc_arg(zone_mbuf, &args, how)); if (size <= MCLBYTES) return (uma_zalloc_arg(zone_pack, &args, how)); if (size > MJUMPAGESIZE) return (NULL); m = uma_zalloc_arg(zone_mbuf, &args, how); if (m == NULL) return (NULL); n = uma_zalloc_arg(zone_jumbop, m, how); if (n == NULL) { uma_zfree(zone_mbuf, m); return (NULL); } return (m); } /* * m_getjcl() returns an mbuf with a cluster of the specified size attached. * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES. */ struct mbuf * m_getjcl(int how, short type, int flags, int size) { struct mb_args args; struct mbuf *m, *n; uma_zone_t zone; if (size == MCLBYTES) return m_getcl(how, type, flags); args.flags = flags; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); if (m == NULL) return (NULL); zone = m_getzone(size); n = uma_zalloc_arg(zone, m, how); if (n == NULL) { uma_zfree(zone_mbuf, m); return (NULL); } return (m); } /* * Allocate a given length worth of mbufs and/or clusters (whatever fits * best) and return a pointer to the top of the allocated chain. If an * existing mbuf chain is provided, then we will append the new chain * to the existing one but still return the top of the newly allocated * chain. */ struct mbuf * m_getm2(struct mbuf *m, int len, int how, short type, int flags) { struct mbuf *mb, *nm = NULL, *mtail = NULL; KASSERT(len >= 0, ("%s: len is < 0", __func__)); /* Validate flags. */ flags &= (M_PKTHDR | M_EOR); /* Packet header mbuf must be first in chain. */ if ((flags & M_PKTHDR) && m != NULL) flags &= ~M_PKTHDR; /* Loop and append maximum sized mbufs to the chain tail. */ while (len > 0) { if (len > MCLBYTES) mb = m_getjcl(how, type, (flags & M_PKTHDR), MJUMPAGESIZE); else if (len >= MINCLSIZE) mb = m_getcl(how, type, (flags & M_PKTHDR)); else if (flags & M_PKTHDR) mb = m_gethdr(how, type); else mb = m_get(how, type); /* Fail the whole operation if one mbuf can't be allocated. */ if (mb == NULL) { if (nm != NULL) m_freem(nm); return (NULL); } /* Book keeping. */ len -= M_SIZE(mb); if (mtail != NULL) mtail->m_next = mb; else nm = mb; mtail = mb; flags &= ~M_PKTHDR; /* Only valid on the first mbuf. */ } if (flags & M_EOR) mtail->m_flags |= M_EOR; /* Only valid on the last mbuf. */ /* If mbuf was supplied, append new chain to the end of it. 
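 *
 * As an illustrative worked example (assuming 4k pages, numbers not
 * from the source): m_getm2(NULL, 6000, M_WAITOK, MT_DATA, M_PKTHDR)
 * first allocates an MJUMPAGESIZE mbuf (6000 > MCLBYTES), leaving
 * 6000 - 4096 = 1904 bytes, which is >= MINCLSIZE, so a 2k cluster
 * mbuf follows; the result is a two-mbuf chain whose head carries
 * the packet header.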
*/ if (m != NULL) { for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next) ; mtail->m_next = nm; mtail->m_flags &= ~M_EOR; } else m = nm; return (m); } /*- * Configure a provided mbuf to refer to the provided external storage * buffer and setup a reference count for said buffer. * * Arguments: * mb The existing mbuf to which to attach the provided buffer. * buf The address of the provided external storage buffer. * size The size of the provided buffer. * freef A pointer to a routine that is responsible for freeing the * provided external storage buffer. * args A pointer to an argument structure (of any type) to be passed * to the provided freef routine (may be NULL). * flags Any other flags to be passed to the provided mbuf. * type The type that the external storage buffer should be * labeled with. * * Returns: * Nothing. */ void m_extadd(struct mbuf *mb, char *buf, u_int size, m_ext_free_t freef, void *arg1, void *arg2, int flags, int type) { KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__)); mb->m_flags |= (M_EXT | flags); mb->m_ext.ext_buf = buf; mb->m_data = mb->m_ext.ext_buf; mb->m_ext.ext_size = size; mb->m_ext.ext_free = freef; mb->m_ext.ext_arg1 = arg1; mb->m_ext.ext_arg2 = arg2; mb->m_ext.ext_type = type; if (type != EXT_EXTREF) { mb->m_ext.ext_count = 1; mb->m_ext.ext_flags = EXT_FLAG_EMBREF; } else mb->m_ext.ext_flags = 0; } /* * Free an entire chain of mbufs and associated external buffers, if * applicable. */ void m_freem(struct mbuf *mb) { MBUF_PROBE1(m__freem, mb); while (mb != NULL) mb = m_free(mb); } Index: head/sys/kern/kern_module.c =================================================================== --- head/sys/kern/kern_module.c (revision 326270) +++ head/sys/kern/kern_module.c (revision 326271) @@ -1,519 +1,521 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1997 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include "opt_compat.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_MODULE, "module", "module data structures"); struct module { TAILQ_ENTRY(module) link; /* chain together all modules */ TAILQ_ENTRY(module) flink; /* all modules in a file */ struct linker_file *file; /* file which contains this module */ int refs; /* reference count */ int id; /* unique id number */ char *name; /* module name */ modeventhand_t handler; /* event handler */ void *arg; /* argument for handler */ modspecific_t data; /* module specific data */ }; #define MOD_EVENT(mod, type) (mod)->handler((mod), (type), (mod)->arg) static TAILQ_HEAD(modulelist, module) modules; struct sx modules_sx; static int nextid = 1; static void module_shutdown(void *, int); static int modevent_nop(module_t mod, int what, void *arg) { switch(what) { case MOD_LOAD: return (0); case MOD_UNLOAD: return (EBUSY); default: return (EOPNOTSUPP); } } static void module_init(void *arg) { sx_init(&modules_sx, "module subsystem sx lock"); TAILQ_INIT(&modules); EVENTHANDLER_REGISTER(shutdown_final, module_shutdown, NULL, SHUTDOWN_PRI_DEFAULT); } SYSINIT(module, SI_SUB_KLD, SI_ORDER_FIRST, module_init, 0); static void module_shutdown(void *arg1, int arg2) { module_t mod; if (arg2 & RB_NOSYNC) return; mtx_lock(&Giant); MOD_SLOCK; TAILQ_FOREACH_REVERSE(mod, &modules, modulelist, link) MOD_EVENT(mod, MOD_SHUTDOWN); MOD_SUNLOCK; mtx_unlock(&Giant); } void module_register_init(const void *arg) { const moduledata_t *data = (const moduledata_t *)arg; int error; module_t mod; mtx_lock(&Giant); MOD_SLOCK; mod = module_lookupbyname(data->name); if (mod == NULL) panic("module_register_init: module named %s not found\n", data->name); MOD_SUNLOCK; error = MOD_EVENT(mod, MOD_LOAD); if (error) { MOD_EVENT(mod, MOD_UNLOAD); MOD_XLOCK; module_release(mod); MOD_XUNLOCK; printf("module_register_init: MOD_LOAD (%s, %p, %p) error" " %d\n", data->name, (void *)data->evhand, data->priv, error); } else { MOD_XLOCK; if (mod->file) { /* * Once a module is successfully loaded, move * it to the head of the module list for this * linker file. This resorts the list so that * when the kernel linker iterates over the * modules to unload them, it will unload them * in the reverse order they were loaded. */ TAILQ_REMOVE(&mod->file->modules, mod, flink); TAILQ_INSERT_HEAD(&mod->file->modules, mod, flink); } MOD_XUNLOCK; } mtx_unlock(&Giant); } int module_register(const moduledata_t *data, linker_file_t container) { size_t namelen; module_t newmod; MOD_XLOCK; newmod = module_lookupbyname(data->name); if (newmod != NULL) { MOD_XUNLOCK; printf("%s: cannot register %s from %s; already loaded from %s\n", __func__, data->name, container->filename, newmod->file->filename); return (EEXIST); } namelen = strlen(data->name) + 1; newmod = malloc(sizeof(struct module) + namelen, M_MODULE, M_WAITOK); newmod->refs = 1; newmod->id = nextid++; newmod->name = (char *)(newmod + 1); strcpy(newmod->name, data->name); newmod->handler = data->evhand ? 
data->evhand : modevent_nop; newmod->arg = data->priv; bzero(&newmod->data, sizeof(newmod->data)); TAILQ_INSERT_TAIL(&modules, newmod, link); if (container) TAILQ_INSERT_TAIL(&container->modules, newmod, flink); newmod->file = container; MOD_XUNLOCK; return (0); } void module_reference(module_t mod) { MOD_XLOCK_ASSERT; MOD_DPF(REFS, ("module_reference: before, refs=%d\n", mod->refs)); mod->refs++; } void module_release(module_t mod) { MOD_XLOCK_ASSERT; if (mod->refs <= 0) panic("module_release: bad reference count"); MOD_DPF(REFS, ("module_release: before, refs=%d\n", mod->refs)); mod->refs--; if (mod->refs == 0) { TAILQ_REMOVE(&modules, mod, link); if (mod->file) TAILQ_REMOVE(&mod->file->modules, mod, flink); free(mod, M_MODULE); } } module_t module_lookupbyname(const char *name) { module_t mod; int err; MOD_LOCK_ASSERT; TAILQ_FOREACH(mod, &modules, link) { err = strcmp(mod->name, name); if (err == 0) return (mod); } return (NULL); } module_t module_lookupbyid(int modid) { module_t mod; MOD_LOCK_ASSERT; TAILQ_FOREACH(mod, &modules, link) if (mod->id == modid) return(mod); return (NULL); } int module_quiesce(module_t mod) { int error; mtx_lock(&Giant); error = MOD_EVENT(mod, MOD_QUIESCE); mtx_unlock(&Giant); if (error == EOPNOTSUPP || error == EINVAL) error = 0; return (error); } int module_unload(module_t mod) { int error; mtx_lock(&Giant); error = MOD_EVENT(mod, MOD_UNLOAD); mtx_unlock(&Giant); return (error); } int module_getid(module_t mod) { MOD_LOCK_ASSERT; return (mod->id); } module_t module_getfnext(module_t mod) { MOD_LOCK_ASSERT; return (TAILQ_NEXT(mod, flink)); } const char * module_getname(module_t mod) { MOD_LOCK_ASSERT; return (mod->name); } void module_setspecific(module_t mod, modspecific_t *datap) { MOD_XLOCK_ASSERT; mod->data = *datap; } linker_file_t module_file(module_t mod) { return (mod->file); } /* * Syscalls. */ int sys_modnext(struct thread *td, struct modnext_args *uap) { module_t mod; int error = 0; td->td_retval[0] = -1; MOD_SLOCK; if (uap->modid == 0) { mod = TAILQ_FIRST(&modules); if (mod) td->td_retval[0] = mod->id; else error = ENOENT; goto done2; } mod = module_lookupbyid(uap->modid); if (mod == NULL) { error = ENOENT; goto done2; } if (TAILQ_NEXT(mod, link)) td->td_retval[0] = TAILQ_NEXT(mod, link)->id; else td->td_retval[0] = 0; done2: MOD_SUNLOCK; return (error); } int sys_modfnext(struct thread *td, struct modfnext_args *uap) { module_t mod; int error; td->td_retval[0] = -1; MOD_SLOCK; mod = module_lookupbyid(uap->modid); if (mod == NULL) { error = ENOENT; } else { error = 0; if (TAILQ_NEXT(mod, flink)) td->td_retval[0] = TAILQ_NEXT(mod, flink)->id; else td->td_retval[0] = 0; } MOD_SUNLOCK; return (error); } struct module_stat_v1 { int version; /* set to sizeof(struct module_stat) */ char name[MAXMODNAME]; int refs; int id; }; int sys_modstat(struct thread *td, struct modstat_args *uap) { module_t mod; modspecific_t data; int error = 0; int id, namelen, refs, version; struct module_stat *stat; char *name; MOD_SLOCK; mod = module_lookupbyid(uap->modid); if (mod == NULL) { MOD_SUNLOCK; return (ENOENT); } id = mod->id; refs = mod->refs; name = mod->name; data = mod->data; MOD_SUNLOCK; stat = uap->stat; /* * Check the version of the user's structure. 
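 * The version field holds the size of the structure the caller was
 * compiled against (struct module_stat_v1 above notes it is "set to
 * sizeof(struct module_stat)"), so comparing it against
 * sizeof(struct module_stat_v1) and sizeof(struct module_stat)
 * distinguishes old binaries from new ones without a separate
 * version numbering scheme.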
*/ if ((error = copyin(&stat->version, &version, sizeof(version))) != 0) return (error); if (version != sizeof(struct module_stat_v1) && version != sizeof(struct module_stat)) return (EINVAL); namelen = strlen(mod->name) + 1; if (namelen > MAXMODNAME) namelen = MAXMODNAME; if ((error = copyout(name, &stat->name[0], namelen)) != 0) return (error); if ((error = copyout(&refs, &stat->refs, sizeof(int))) != 0) return (error); if ((error = copyout(&id, &stat->id, sizeof(int))) != 0) return (error); /* * >v1 stat includes module data. */ if (version == sizeof(struct module_stat)) if ((error = copyout(&data, &stat->data, sizeof(data))) != 0) return (error); td->td_retval[0] = 0; return (error); } int sys_modfind(struct thread *td, struct modfind_args *uap) { int error = 0; char name[MAXMODNAME]; module_t mod; if ((error = copyinstr(uap->name, name, sizeof name, 0)) != 0) return (error); MOD_SLOCK; mod = module_lookupbyname(name); if (mod == NULL) error = ENOENT; else td->td_retval[0] = module_getid(mod); MOD_SUNLOCK; return (error); } MODULE_VERSION(kernel, __FreeBSD_version); #ifdef COMPAT_FREEBSD32 #include #include #include #include #include typedef union modspecific32 { int intval; uint32_t uintval; int longval; uint32_t ulongval; } modspecific32_t; struct module_stat32 { int version; char name[MAXMODNAME]; int refs; int id; modspecific32_t data; }; int freebsd32_modstat(struct thread *td, struct freebsd32_modstat_args *uap) { module_t mod; modspecific32_t data32; int error = 0; int id, namelen, refs, version; struct module_stat32 *stat32; char *name; MOD_SLOCK; mod = module_lookupbyid(uap->modid); if (mod == NULL) { MOD_SUNLOCK; return (ENOENT); } id = mod->id; refs = mod->refs; name = mod->name; CP(mod->data, data32, intval); CP(mod->data, data32, uintval); CP(mod->data, data32, longval); CP(mod->data, data32, ulongval); MOD_SUNLOCK; stat32 = uap->stat; if ((error = copyin(&stat32->version, &version, sizeof(version))) != 0) return (error); if (version != sizeof(struct module_stat_v1) && version != sizeof(struct module_stat32)) return (EINVAL); namelen = strlen(mod->name) + 1; if (namelen > MAXMODNAME) namelen = MAXMODNAME; if ((error = copyout(name, &stat32->name[0], namelen)) != 0) return (error); if ((error = copyout(&refs, &stat32->refs, sizeof(int))) != 0) return (error); if ((error = copyout(&id, &stat32->id, sizeof(int))) != 0) return (error); /* * >v1 stat includes module data. */ if (version == sizeof(struct module_stat32)) if ((error = copyout(&data32, &stat32->data, sizeof(data32))) != 0) return (error); td->td_retval[0] = 0; return (error); } #endif Index: head/sys/kern/kern_mtxpool.c =================================================================== --- head/sys/kern/kern_mtxpool.c (revision 326270) +++ head/sys/kern/kern_mtxpool.c (revision 326271) @@ -1,188 +1,190 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2001 Matthew Dillon. All Rights Reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* Mutex pool routines. These routines are designed to be used as short * term leaf mutexes (e.g. the last mutex you might acquire other than * calling msleep()). They operate using a shared pool. A mutex is chosen * from the pool based on the supplied pointer (which may or may not be * valid). * * Advantages: * - no structural overhead. Mutexes can be associated with structures * without adding bloat to the structures. * - mutexes can be obtained for invalid pointers, useful when using * mutexes to interlock destructor ops. * - no initialization/destructor overhead. * - can be used with msleep. * * Disadvantages: * - should generally only be used as leaf mutexes. * - pool/pool dependency ordering cannot be depended on. * - possible L1 cache mastership contention between cpus. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_MTXPOOL, "mtx_pool", "mutex pool"); /* Pool sizes must be a power of two */ #ifndef MTX_POOL_SLEEP_SIZE #define MTX_POOL_SLEEP_SIZE 128 #endif struct mtxpool_header { int mtxpool_size; int mtxpool_mask; int mtxpool_shift; int mtxpool_next; }; struct mtx_pool { struct mtxpool_header mtx_pool_header; struct mtx mtx_pool_ary[1]; }; #define mtx_pool_size mtx_pool_header.mtxpool_size #define mtx_pool_mask mtx_pool_header.mtxpool_mask #define mtx_pool_shift mtx_pool_header.mtxpool_shift #define mtx_pool_next mtx_pool_header.mtxpool_next struct mtx_pool *mtxpool_sleep; #if UINTPTR_MAX == UINT64_MAX /* 64 bits */ # define POINTER_BITS 64 # define HASH_MULTIPLIER 11400714819323198485u /* (2^64)*(sqrt(5)-1)/2 */ #else /* assume 32 bits */ # define POINTER_BITS 32 # define HASH_MULTIPLIER 2654435769u /* (2^32)*(sqrt(5)-1)/2 */ #endif /* * Return the (shared) pool mutex associated with the specified address. * The returned mutex is a leaf level mutex, meaning that if you obtain it * you cannot obtain any other mutexes until you release it. You can * legally msleep() on the mutex.
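 *
 * A typical use, as a hedged sketch (the softc pointer "sc" is
 * hypothetical):
 *
 *	struct mtx *mp;
 *
 *	mp = mtx_pool_find(mtxpool_sleep, sc);
 *	mtx_lock(mp);
 *	(short leaf critical section, possibly msleep(sc, mp, ...))
 *	mtx_unlock(mp);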
*/ struct mtx * mtx_pool_find(struct mtx_pool *pool, void *ptr) { int p; KASSERT(pool != NULL, ("_mtx_pool_find(): null pool")); /* * Fibonacci hash, see Knuth's * _Art of Computer Programming, Volume 3 / Sorting and Searching_ */ p = ((HASH_MULTIPLIER * (uintptr_t)ptr) >> pool->mtx_pool_shift) & pool->mtx_pool_mask; return (&pool->mtx_pool_ary[p]); } static void mtx_pool_initialize(struct mtx_pool *pool, const char *mtx_name, int pool_size, int opts) { int i, maskbits; pool->mtx_pool_size = pool_size; pool->mtx_pool_mask = pool_size - 1; for (i = 1, maskbits = 0; (i & pool_size) == 0; i = i << 1) maskbits++; pool->mtx_pool_shift = POINTER_BITS - maskbits; pool->mtx_pool_next = 0; for (i = 0; i < pool_size; ++i) mtx_init(&pool->mtx_pool_ary[i], mtx_name, NULL, opts); } struct mtx_pool * mtx_pool_create(const char *mtx_name, int pool_size, int opts) { struct mtx_pool *pool; if (pool_size <= 0 || !powerof2(pool_size)) { printf("WARNING: %s pool size is not a power of 2.\n", mtx_name); pool_size = 128; } pool = malloc(sizeof (struct mtx_pool) + ((pool_size - 1) * sizeof (struct mtx)), M_MTXPOOL, M_WAITOK | M_ZERO); mtx_pool_initialize(pool, mtx_name, pool_size, opts); return pool; } void mtx_pool_destroy(struct mtx_pool **poolp) { int i; struct mtx_pool *pool = *poolp; for (i = pool->mtx_pool_size - 1; i >= 0; --i) mtx_destroy(&pool->mtx_pool_ary[i]); free(pool, M_MTXPOOL); *poolp = NULL; } static void mtx_pool_setup_dynamic(void *dummy __unused) { mtxpool_sleep = mtx_pool_create("sleep mtxpool", MTX_POOL_SLEEP_SIZE, MTX_DEF); } /* * Obtain a (shared) mutex from the pool. The returned mutex is a leaf * level mutex, meaning that if you obtain it you cannot obtain any other * mutexes until you release it. You can legally msleep() on the mutex. */ struct mtx * mtx_pool_alloc(struct mtx_pool *pool) { int i; KASSERT(pool != NULL, ("mtx_pool_alloc(): null pool")); /* * mtx_pool_next is unprotected against multiple accesses, * but simultaneous access by two CPUs should not be very * harmful. */ i = pool->mtx_pool_next; pool->mtx_pool_next = (i + 1) & pool->mtx_pool_mask; return (&pool->mtx_pool_ary[i]); } SYSINIT(mtxpooli2, SI_SUB_MTX_POOL_DYNAMIC, SI_ORDER_FIRST, mtx_pool_setup_dynamic, NULL); Index: head/sys/kern/kern_mutex.c =================================================================== --- head/sys/kern/kern_mutex.c (revision 326270) +++ head/sys/kern/kern_mutex.c (revision 326271) @@ -1,1270 +1,1272 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ */ /* * Machine independent bits of mutex implementation. */ #include __FBSDID("$FreeBSD$"); #include "opt_adaptive_mutexes.h" #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(SMP) && !defined(NO_ADAPTIVE_MUTEXES) #define ADAPTIVE_MUTEXES #endif #ifdef HWPMC_HOOKS #include PMC_SOFT_DEFINE( , , lock, failed); #endif /* * Return the mutex address when the lock cookie address is provided. * This functionality assumes that struct mtx* have a member named mtx_lock. */ #define mtxlock2mtx(c) (__containerof(c, struct mtx, mtx_lock)) /* * Internal utility macros. */ #define mtx_unowned(m) ((m)->mtx_lock == MTX_UNOWNED) #define mtx_destroyed(m) ((m)->mtx_lock == MTX_DESTROYED) static void assert_mtx(const struct lock_object *lock, int what); #ifdef DDB static void db_show_mtx(const struct lock_object *lock); #endif static void lock_mtx(struct lock_object *lock, uintptr_t how); static void lock_spin(struct lock_object *lock, uintptr_t how); #ifdef KDTRACE_HOOKS static int owner_mtx(const struct lock_object *lock, struct thread **owner); #endif static uintptr_t unlock_mtx(struct lock_object *lock); static uintptr_t unlock_spin(struct lock_object *lock); /* * Lock classes for sleep and spin mutexes. 
*/ struct lock_class lock_class_mtx_sleep = { .lc_name = "sleep mutex", .lc_flags = LC_SLEEPLOCK | LC_RECURSABLE, .lc_assert = assert_mtx, #ifdef DDB .lc_ddb_show = db_show_mtx, #endif .lc_lock = lock_mtx, .lc_unlock = unlock_mtx, #ifdef KDTRACE_HOOKS .lc_owner = owner_mtx, #endif }; struct lock_class lock_class_mtx_spin = { .lc_name = "spin mutex", .lc_flags = LC_SPINLOCK | LC_RECURSABLE, .lc_assert = assert_mtx, #ifdef DDB .lc_ddb_show = db_show_mtx, #endif .lc_lock = lock_spin, .lc_unlock = unlock_spin, #ifdef KDTRACE_HOOKS .lc_owner = owner_mtx, #endif }; #ifdef ADAPTIVE_MUTEXES static SYSCTL_NODE(_debug, OID_AUTO, mtx, CTLFLAG_RD, NULL, "mtx debugging"); static struct lock_delay_config __read_frequently mtx_delay; SYSCTL_INT(_debug_mtx, OID_AUTO, delay_base, CTLFLAG_RW, &mtx_delay.base, 0, ""); SYSCTL_INT(_debug_mtx, OID_AUTO, delay_max, CTLFLAG_RW, &mtx_delay.max, 0, ""); LOCK_DELAY_SYSINIT_DEFAULT(mtx_delay); #endif static SYSCTL_NODE(_debug, OID_AUTO, mtx_spin, CTLFLAG_RD, NULL, "mtx spin debugging"); static struct lock_delay_config __read_frequently mtx_spin_delay; SYSCTL_INT(_debug_mtx_spin, OID_AUTO, delay_base, CTLFLAG_RW, &mtx_spin_delay.base, 0, ""); SYSCTL_INT(_debug_mtx_spin, OID_AUTO, delay_max, CTLFLAG_RW, &mtx_spin_delay.max, 0, ""); LOCK_DELAY_SYSINIT_DEFAULT(mtx_spin_delay); /* * System-wide mutexes */ struct mtx blocked_lock; struct mtx __exclusive_cache_line Giant; void assert_mtx(const struct lock_object *lock, int what) { mtx_assert((const struct mtx *)lock, what); } void lock_mtx(struct lock_object *lock, uintptr_t how) { mtx_lock((struct mtx *)lock); } void lock_spin(struct lock_object *lock, uintptr_t how) { panic("spin locks can only use msleep_spin"); } uintptr_t unlock_mtx(struct lock_object *lock) { struct mtx *m; m = (struct mtx *)lock; mtx_assert(m, MA_OWNED | MA_NOTRECURSED); mtx_unlock(m); return (0); } uintptr_t unlock_spin(struct lock_object *lock) { panic("spin locks can only use msleep_spin"); } #ifdef KDTRACE_HOOKS int owner_mtx(const struct lock_object *lock, struct thread **owner) { const struct mtx *m; uintptr_t x; m = (const struct mtx *)lock; x = m->mtx_lock; *owner = (struct thread *)(x & ~MTX_FLAGMASK); return (*owner != NULL); } #endif /* * Function versions of the inlined __mtx_* macros. These are used by * modules and can also be called from assembly language if needed. 
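 * For instance, when the inline fast path in sys/mutex.h cannot be
 * used (e.g. from a kernel module), mtx_lock(m) is expected to expand
 * through mtx_lock_flags() down to __mtx_lock_flags(&(m)->mtx_lock, 0,
 * file, line); the unlock and spin variants map onto their
 * __mtx_*_flags counterparts the same way.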
*/ void __mtx_lock_flags(volatile uintptr_t *c, int opts, const char *file, int line) { struct mtx *m; uintptr_t tid, v; m = mtxlock2mtx(c); KASSERT(kdb_active != 0 || SCHEDULER_STOPPED() || !TD_IS_IDLETHREAD(curthread), ("mtx_lock() by idle thread %p on sleep mutex %s @ %s:%d", curthread, m->lock_object.lo_name, file, line)); KASSERT(m->mtx_lock != MTX_DESTROYED, ("mtx_lock() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep, ("mtx_lock() of spin mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); WITNESS_CHECKORDER(&m->lock_object, (opts & ~MTX_RECURSE) | LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); tid = (uintptr_t)curthread; v = MTX_UNOWNED; if (!_mtx_obtain_lock_fetch(m, &v, tid)) _mtx_lock_sleep(m, v, opts, file, line); else LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(adaptive__acquire, m, 0, 0, file, line); LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file, line); WITNESS_LOCK(&m->lock_object, (opts & ~MTX_RECURSE) | LOP_EXCLUSIVE, file, line); TD_LOCKS_INC(curthread); } void __mtx_unlock_flags(volatile uintptr_t *c, int opts, const char *file, int line) { struct mtx *m; m = mtxlock2mtx(c); KASSERT(m->mtx_lock != MTX_DESTROYED, ("mtx_unlock() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep, ("mtx_unlock() of spin mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); WITNESS_UNLOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line); LOCK_LOG_LOCK("UNLOCK", &m->lock_object, opts, m->mtx_recurse, file, line); mtx_assert(m, MA_OWNED); #ifdef LOCK_PROFILING __mtx_unlock_sleep(c, (uintptr_t)curthread, opts, file, line); #else __mtx_unlock(m, curthread, opts, file, line); #endif TD_LOCKS_DEC(curthread); } void __mtx_lock_spin_flags(volatile uintptr_t *c, int opts, const char *file, int line) { struct mtx *m; #ifdef SMP uintptr_t tid, v; #endif m = mtxlock2mtx(c); KASSERT(m->mtx_lock != MTX_DESTROYED, ("mtx_lock_spin() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin, ("mtx_lock_spin() of sleep mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); if (mtx_owned(m)) KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0 || (opts & MTX_RECURSE) != 0, ("mtx_lock_spin: recursed on non-recursive mutex %s @ %s:%d\n", m->lock_object.lo_name, file, line)); opts &= ~MTX_RECURSE; WITNESS_CHECKORDER(&m->lock_object, opts | LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); #ifdef SMP spinlock_enter(); tid = (uintptr_t)curthread; v = MTX_UNOWNED; if (!_mtx_obtain_lock_fetch(m, &v, tid)) _mtx_lock_spin(m, v, opts, file, line); else LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(spin__acquire, m, 0, 0, file, line); #else __mtx_lock_spin(m, curthread, opts, file, line); #endif LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file, line); WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line); } int __mtx_trylock_spin_flags(volatile uintptr_t *c, int opts, const char *file, int line) { struct mtx *m; if (SCHEDULER_STOPPED()) return (1); m = mtxlock2mtx(c); KASSERT(m->mtx_lock != MTX_DESTROYED, ("mtx_trylock_spin() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin, ("mtx_trylock_spin() of sleep mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); KASSERT((opts & MTX_RECURSE) == 0, ("mtx_trylock_spin: unsupp. 
opt MTX_RECURSE on mutex %s @ %s:%d\n", m->lock_object.lo_name, file, line)); if (__mtx_trylock_spin(m, curthread, opts, file, line)) { LOCK_LOG_TRY("LOCK", &m->lock_object, opts, 1, file, line); WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line); return (1); } LOCK_LOG_TRY("LOCK", &m->lock_object, opts, 0, file, line); return (0); } void __mtx_unlock_spin_flags(volatile uintptr_t *c, int opts, const char *file, int line) { struct mtx *m; m = mtxlock2mtx(c); KASSERT(m->mtx_lock != MTX_DESTROYED, ("mtx_unlock_spin() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin, ("mtx_unlock_spin() of sleep mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); WITNESS_UNLOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line); LOCK_LOG_LOCK("UNLOCK", &m->lock_object, opts, m->mtx_recurse, file, line); mtx_assert(m, MA_OWNED); __mtx_unlock_spin(m); } /* * The important part of mtx_trylock{,_flags}() * Tries to acquire lock `m.' If this function is called on a mutex that * is already owned, it will recursively acquire the lock. */ int _mtx_trylock_flags_int(struct mtx *m, int opts LOCK_FILE_LINE_ARG_DEF) { struct thread *td; uintptr_t tid, v; #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif int rval; bool recursed; td = curthread; tid = (uintptr_t)td; if (SCHEDULER_STOPPED_TD(td)) return (1); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(td), ("mtx_trylock() by idle thread %p on sleep mutex %s @ %s:%d", curthread, m->lock_object.lo_name, file, line)); KASSERT(m->mtx_lock != MTX_DESTROYED, ("mtx_trylock() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep, ("mtx_trylock() of spin mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); rval = 1; recursed = false; v = MTX_UNOWNED; for (;;) { if (_mtx_obtain_lock_fetch(m, &v, tid)) break; if (v == MTX_UNOWNED) continue; if (v == tid && ((m->lock_object.lo_flags & LO_RECURSABLE) != 0 || (opts & MTX_RECURSE) != 0)) { m->mtx_recurse++; atomic_set_ptr(&m->mtx_lock, MTX_RECURSED); recursed = true; break; } rval = 0; break; } opts &= ~MTX_RECURSE; LOCK_LOG_TRY("LOCK", &m->lock_object, opts, rval, file, line); if (rval) { WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE | LOP_TRYLOCK, file, line); TD_LOCKS_INC(curthread); if (!recursed) LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(adaptive__acquire, m, contested, waittime, file, line); } return (rval); } int _mtx_trylock_flags_(volatile uintptr_t *c, int opts, const char *file, int line) { struct mtx *m; m = mtxlock2mtx(c); return (_mtx_trylock_flags_int(m, opts LOCK_FILE_LINE_ARG)); } /* * __mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock. * * We call this if the lock is either contested (i.e. we need to go to * sleep waiting for it), or if we need to recurse on it. 
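 * Under ADAPTIVE_MUTEXES the loop below first spins (via lock_delay())
 * for as long as the lock owner is running on another CPU, and only
 * blocks on a turnstile once the owner is off-CPU and the re-checks
 * of the lock state while setting up the turnstile come up empty.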
*/ #if LOCK_DEBUG > 0 void __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t v, int opts, const char *file, int line) #else void __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t v) #endif { struct thread *td; struct mtx *m; struct turnstile *ts; uintptr_t tid; struct thread *owner; #ifdef KTR int cont_logged = 0; #endif #ifdef LOCK_PROFILING int contested = 0; uint64_t waittime = 0; #endif #if defined(ADAPTIVE_MUTEXES) || defined(KDTRACE_HOOKS) struct lock_delay_arg lda; #endif #ifdef KDTRACE_HOOKS u_int sleep_cnt = 0; int64_t sleep_time = 0; int64_t all_time = 0; #endif #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) int doing_lockprof; #endif td = curthread; tid = (uintptr_t)td; if (SCHEDULER_STOPPED_TD(td)) return; #if defined(ADAPTIVE_MUTEXES) lock_delay_arg_init(&lda, &mtx_delay); #elif defined(KDTRACE_HOOKS) lock_delay_arg_init(&lda, NULL); #endif m = mtxlock2mtx(c); if (__predict_false(v == MTX_UNOWNED)) v = MTX_READ_VALUE(m); if (__predict_false(lv_mtx_owner(v) == td)) { KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0 || (opts & MTX_RECURSE) != 0, ("_mtx_lock_sleep: recursed on non-recursive mutex %s @ %s:%d\n", m->lock_object.lo_name, file, line)); #if LOCK_DEBUG > 0 opts &= ~MTX_RECURSE; #endif m->mtx_recurse++; atomic_set_ptr(&m->mtx_lock, MTX_RECURSED); if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_lock_sleep: %p recursing", m); return; } #if LOCK_DEBUG > 0 opts &= ~MTX_RECURSE; #endif #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&m->lock_object, &contested, &waittime); if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR4(KTR_LOCK, "_mtx_lock_sleep: %s contested (lock=%p) at %s:%d", m->lock_object.lo_name, (void *)m->mtx_lock, file, line); #ifdef LOCK_PROFILING doing_lockprof = 1; #elif defined(KDTRACE_HOOKS) doing_lockprof = lockstat_enabled; if (__predict_false(doing_lockprof)) all_time -= lockstat_nsecs(&m->lock_object); #endif for (;;) { if (v == MTX_UNOWNED) { if (_mtx_obtain_lock_fetch(m, &v, tid)) break; continue; } #ifdef KDTRACE_HOOKS lda.spin_cnt++; #endif #ifdef ADAPTIVE_MUTEXES /* * If the owner is running on another CPU, spin until the * owner stops running or the state of the lock changes. */ owner = lv_mtx_owner(v); if (TD_IS_RUNNING(owner)) { if (LOCK_LOG_TEST(&m->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, m, owner); KTR_STATE1(KTR_SCHED, "thread", sched_tdname((struct thread *)tid), "spinning", "lockname:\"%s\"", m->lock_object.lo_name); do { lock_delay(&lda); v = MTX_READ_VALUE(m); owner = lv_mtx_owner(v); } while (v != MTX_UNOWNED && TD_IS_RUNNING(owner)); KTR_STATE0(KTR_SCHED, "thread", sched_tdname((struct thread *)tid), "running"); continue; } #endif ts = turnstile_trywait(&m->lock_object); v = MTX_READ_VALUE(m); /* * Check if the lock has been released while spinning for * the turnstile chain lock. */ if (v == MTX_UNOWNED) { turnstile_cancel(ts); continue; } #ifdef ADAPTIVE_MUTEXES /* * The current lock owner might have started executing * on another CPU (or the lock could have changed * owners) while we were waiting on the turnstile * chain lock. If so, drop the turnstile lock and try * again. */ owner = lv_mtx_owner(v); if (TD_IS_RUNNING(owner)) { turnstile_cancel(ts); continue; } #endif /* * If the mutex isn't already contested and a failure occurs * setting the contested bit, the mutex was either released * or the state of the MTX_RECURSED bit changed. 
*/ if ((v & MTX_CONTESTED) == 0 && !atomic_cmpset_ptr(&m->mtx_lock, v, v | MTX_CONTESTED)) { turnstile_cancel(ts); v = MTX_READ_VALUE(m); continue; } /* * We definitely must sleep for this lock. */ mtx_assert(m, MA_NOTOWNED); #ifdef KTR if (!cont_logged) { CTR6(KTR_CONTENTION, "contention: %p at %s:%d wants %s, taken by %s:%d", (void *)tid, file, line, m->lock_object.lo_name, WITNESS_FILE(&m->lock_object), WITNESS_LINE(&m->lock_object)); cont_logged = 1; } #endif /* * Block on the turnstile. */ #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&m->lock_object); #endif #ifndef ADAPTIVE_MUTEXES owner = mtx_owner(m); #endif MPASS(owner == mtx_owner(m)); turnstile_wait(ts, owner, TS_EXCLUSIVE_QUEUE); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&m->lock_object); sleep_cnt++; #endif v = MTX_READ_VALUE(m); } #ifdef KTR if (cont_logged) { CTR4(KTR_CONTENTION, "contention end: %s acquired by %p at %s:%d", m->lock_object.lo_name, (void *)tid, file, line); } #endif #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) if (__predict_true(!doing_lockprof)) return; #endif #ifdef KDTRACE_HOOKS all_time += lockstat_nsecs(&m->lock_object); #endif LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(adaptive__acquire, m, contested, waittime, file, line); #ifdef KDTRACE_HOOKS if (sleep_time) LOCKSTAT_RECORD1(adaptive__block, m, sleep_time); /* * Only record the loops spinning and not sleeping. */ if (lda.spin_cnt > sleep_cnt) LOCKSTAT_RECORD1(adaptive__spin, m, all_time - sleep_time); #endif } static void _mtx_lock_spin_failed(struct mtx *m) { struct thread *td; td = mtx_owner(m); /* If the mutex is unlocked, try again. */ if (td == NULL) return; printf( "spin lock %p (%s) held by %p (tid %d) too long\n", m, m->lock_object.lo_name, td, td->td_tid); #ifdef WITNESS witness_display_spinlock(&m->lock_object, td, printf); #endif panic("spin lock held too long"); } #ifdef SMP /* * _mtx_lock_spin_cookie: the tougher part of acquiring an MTX_SPIN lock. * * This is only called if we need to actually spin for the lock. Recursion * is handled inline. */ #if LOCK_DEBUG > 0 void _mtx_lock_spin_cookie(volatile uintptr_t *c, uintptr_t v, int opts, const char *file, int line) #else void _mtx_lock_spin_cookie(volatile uintptr_t *c, uintptr_t v) #endif { struct mtx *m; struct lock_delay_arg lda; uintptr_t tid; #ifdef LOCK_PROFILING int contested = 0; uint64_t waittime = 0; #endif #ifdef KDTRACE_HOOKS int64_t spin_time = 0; #endif #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) int doing_lockprof; #endif tid = (uintptr_t)curthread; m = mtxlock2mtx(c); if (__predict_false(v == MTX_UNOWNED)) v = MTX_READ_VALUE(m); if (__predict_false(v == tid)) { m->mtx_recurse++; return; } if (SCHEDULER_STOPPED()) return; lock_delay_arg_init(&lda, &mtx_spin_delay); if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_lock_spin: %p spinning", m); KTR_STATE1(KTR_SCHED, "thread", sched_tdname((struct thread *)tid), "spinning", "lockname:\"%s\"", m->lock_object.lo_name); #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&m->lock_object, &contested, &waittime); #ifdef LOCK_PROFILING doing_lockprof = 1; #elif defined(KDTRACE_HOOKS) doing_lockprof = lockstat_enabled; if (__predict_false(doing_lockprof)) spin_time -= lockstat_nsecs(&m->lock_object); #endif for (;;) { if (v == MTX_UNOWNED) { if (_mtx_obtain_lock_fetch(m, &v, tid)) break; continue; } /* Give interrupts a chance while we spin. 
*/ spinlock_exit(); do { if (lda.spin_cnt < 10000000) { lock_delay(&lda); } else { lda.spin_cnt++; if (lda.spin_cnt < 60000000 || kdb_active || panicstr != NULL) DELAY(1); else _mtx_lock_spin_failed(m); cpu_spinwait(); } v = MTX_READ_VALUE(m); } while (v != MTX_UNOWNED); spinlock_enter(); } if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_lock_spin: %p spin done", m); KTR_STATE0(KTR_SCHED, "thread", sched_tdname((struct thread *)tid), "running"); #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) if (__predict_true(!doing_lockprof)) return; #endif #ifdef KDTRACE_HOOKS spin_time += lockstat_nsecs(&m->lock_object); #endif LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(spin__acquire, m, contested, waittime, file, line); #ifdef KDTRACE_HOOKS if (lda.spin_cnt != 0) LOCKSTAT_RECORD1(spin__spin, m, spin_time); #endif } #endif /* SMP */ #ifdef INVARIANTS static void thread_lock_validate(struct mtx *m, int opts, const char *file, int line) { KASSERT(m->mtx_lock != MTX_DESTROYED, ("thread_lock() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin, ("thread_lock() of sleep mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); if (mtx_owned(m)) KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0, ("thread_lock: recursed on non-recursive mutex %s @ %s:%d\n", m->lock_object.lo_name, file, line)); WITNESS_CHECKORDER(&m->lock_object, opts | LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); } #else #define thread_lock_validate(m, opts, file, line) do { } while (0) #endif #ifndef LOCK_PROFILING #if LOCK_DEBUG > 0 void _thread_lock(struct thread *td, int opts, const char *file, int line) #else void _thread_lock(struct thread *td) #endif { struct mtx *m; uintptr_t tid, v; tid = (uintptr_t)curthread; if (__predict_false(LOCKSTAT_PROFILE_ENABLED(spin__acquire))) goto slowpath_noirq; spinlock_enter(); m = td->td_lock; thread_lock_validate(m, 0, file, line); v = MTX_READ_VALUE(m); if (__predict_true(v == MTX_UNOWNED)) { if (__predict_false(!_mtx_obtain_lock(m, tid))) goto slowpath_unlocked; } else if (v == tid) { m->mtx_recurse++; } else goto slowpath_unlocked; if (__predict_true(m == td->td_lock)) { WITNESS_LOCK(&m->lock_object, LOP_EXCLUSIVE, file, line); return; } if (m->mtx_recurse != 0) m->mtx_recurse--; else _mtx_release_lock_quick(m); slowpath_unlocked: spinlock_exit(); slowpath_noirq: #if LOCK_DEBUG > 0 thread_lock_flags_(td, opts, file, line); #else thread_lock_flags_(td, 0, 0, 0); #endif } #endif void thread_lock_flags_(struct thread *td, int opts, const char *file, int line) { struct mtx *m; uintptr_t tid, v; struct lock_delay_arg lda; #ifdef LOCK_PROFILING int contested = 0; uint64_t waittime = 0; #endif #ifdef KDTRACE_HOOKS int64_t spin_time = 0; #endif #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) int doing_lockprof = 1; #endif tid = (uintptr_t)curthread; if (SCHEDULER_STOPPED()) { /* * Ensure that spinlock sections are balanced even when the * scheduler is stopped, since we may otherwise inadvertently * re-enable interrupts while dumping core. 
*/ spinlock_enter(); return; } lock_delay_arg_init(&lda, &mtx_spin_delay); #ifdef LOCK_PROFILING doing_lockprof = 1; #elif defined(KDTRACE_HOOKS) doing_lockprof = lockstat_enabled; if (__predict_false(doing_lockprof)) spin_time -= lockstat_nsecs(&td->td_lock->lock_object); #endif for (;;) { retry: v = MTX_UNOWNED; spinlock_enter(); m = td->td_lock; thread_lock_validate(m, opts, file, line); for (;;) { if (_mtx_obtain_lock_fetch(m, &v, tid)) break; if (v == MTX_UNOWNED) continue; if (v == tid) { m->mtx_recurse++; break; } #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&m->lock_object, &contested, &waittime); /* Give interrupts a chance while we spin. */ spinlock_exit(); do { if (lda.spin_cnt < 10000000) { lock_delay(&lda); } else { lda.spin_cnt++; if (lda.spin_cnt < 60000000 || kdb_active || panicstr != NULL) DELAY(1); else _mtx_lock_spin_failed(m); cpu_spinwait(); } if (m != td->td_lock) goto retry; v = MTX_READ_VALUE(m); } while (v != MTX_UNOWNED); spinlock_enter(); } if (m == td->td_lock) break; __mtx_unlock_spin(m); /* does spinlock_exit() */ } LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file, line); WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line); #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) if (__predict_true(!doing_lockprof)) return; #endif #ifdef KDTRACE_HOOKS spin_time += lockstat_nsecs(&m->lock_object); #endif if (m->mtx_recurse == 0) LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(spin__acquire, m, contested, waittime, file, line); #ifdef KDTRACE_HOOKS if (lda.spin_cnt != 0) LOCKSTAT_RECORD1(thread__spin, m, spin_time); #endif } struct mtx * thread_lock_block(struct thread *td) { struct mtx *lock; THREAD_LOCK_ASSERT(td, MA_OWNED); lock = td->td_lock; td->td_lock = &blocked_lock; mtx_unlock_spin(lock); return (lock); } void thread_lock_unblock(struct thread *td, struct mtx *new) { mtx_assert(new, MA_OWNED); MPASS(td->td_lock == &blocked_lock); atomic_store_rel_ptr((volatile void *)&td->td_lock, (uintptr_t)new); } void thread_lock_set(struct thread *td, struct mtx *new) { struct mtx *lock; mtx_assert(new, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); lock = td->td_lock; td->td_lock = new; mtx_unlock_spin(lock); } /* * __mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock. * * We are only called here if the lock is recursed, contested (i.e. we * need to wake up a blocked thread) or lockstat probe is active. */ #if LOCK_DEBUG > 0 void __mtx_unlock_sleep(volatile uintptr_t *c, uintptr_t v, int opts, const char *file, int line) #else void __mtx_unlock_sleep(volatile uintptr_t *c, uintptr_t v) #endif { struct mtx *m; struct turnstile *ts; uintptr_t tid; if (SCHEDULER_STOPPED()) return; tid = (uintptr_t)curthread; m = mtxlock2mtx(c); if (__predict_false(v == tid)) v = MTX_READ_VALUE(m); if (__predict_false(v & MTX_RECURSED)) { if (--(m->mtx_recurse) == 0) atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED); if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p unrecurse", m); return; } LOCKSTAT_PROFILE_RELEASE_LOCK(adaptive__release, m); if (v == tid && _mtx_release_lock(m, tid)) return; /* * We have to lock the chain before the turnstile so this turnstile * can be removed from the hash list if it is empty. 
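 *
 * One more note on thread_lock_flags_() above before the unlock path:
 * its retry loop exists because td_lock is a pointer that
 * thread_lock_block()/thread_lock_unblock() may switch while we spin,
 * so the lock must be re-validated after it is acquired. The generic
 * shape of that idiom, as a C11 sketch (all sk_* names hypothetical):
 */

#if 0	/* illustrative sketch, not compiled into the kernel */
#include <stdatomic.h>

struct sk_lock;
void	sk_lock_acquire(struct sk_lock *);
void	sk_lock_release(struct sk_lock *);

struct sk_obj {
	_Atomic(struct sk_lock *) lockp;	/* may be swapped at any time */
};

static void
sketch_lock_obj(struct sk_obj *o)
{
	struct sk_lock *l;

	for (;;) {
		l = atomic_load(&o->lockp);
		sk_lock_acquire(l);
		/* Now that l is held: is it still the lock protecting o? */
		if (l == atomic_load(&o->lockp))
			return;			/* yes, we are done */
		sk_lock_release(l);		/* no, it moved; retry */
	}
}
#endif

/*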
*/ turnstile_chain_lock(&m->lock_object); _mtx_release_lock_quick(m); ts = turnstile_lookup(&m->lock_object); MPASS(ts != NULL); if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m); turnstile_broadcast(ts, TS_EXCLUSIVE_QUEUE); /* * This turnstile is now no longer associated with the mutex. We can * unlock the chain lock so a new turnstile may take its place. */ turnstile_unpend(ts, TS_EXCLUSIVE_LOCK); turnstile_chain_unlock(&m->lock_object); } /* * All the unlocking of MTX_SPIN locks is done inline. * See the __mtx_unlock_spin() macro for the details. */ /* * The backing function for the INVARIANTS-enabled mtx_assert() */ #ifdef INVARIANT_SUPPORT void __mtx_assert(const volatile uintptr_t *c, int what, const char *file, int line) { const struct mtx *m; if (panicstr != NULL || dumping || SCHEDULER_STOPPED()) return; m = mtxlock2mtx(c); switch (what) { case MA_OWNED: case MA_OWNED | MA_RECURSED: case MA_OWNED | MA_NOTRECURSED: if (!mtx_owned(m)) panic("mutex %s not owned at %s:%d", m->lock_object.lo_name, file, line); if (mtx_recursed(m)) { if ((what & MA_NOTRECURSED) != 0) panic("mutex %s recursed at %s:%d", m->lock_object.lo_name, file, line); } else if ((what & MA_RECURSED) != 0) { panic("mutex %s unrecursed at %s:%d", m->lock_object.lo_name, file, line); } break; case MA_NOTOWNED: if (mtx_owned(m)) panic("mutex %s owned at %s:%d", m->lock_object.lo_name, file, line); break; default: panic("unknown mtx_assert at %s:%d", file, line); } } #endif /* * General init routine used by the MTX_SYSINIT() macro. */ void mtx_sysinit(void *arg) { struct mtx_args *margs = arg; mtx_init((struct mtx *)margs->ma_mtx, margs->ma_desc, NULL, margs->ma_opts); } /* * Mutex initialization routine; initialize lock `m' with the options * contained in `opts' and name `name.' The optional * lock type `type' is used as a general lock category name for use with * witness. */ void _mtx_init(volatile uintptr_t *c, const char *name, const char *type, int opts) { struct mtx *m; struct lock_class *class; int flags; m = mtxlock2mtx(c); MPASS((opts & ~(MTX_SPIN | MTX_QUIET | MTX_RECURSE | MTX_NOWITNESS | MTX_DUPOK | MTX_NOPROFILE | MTX_NEW)) == 0); ASSERT_ATOMIC_LOAD_PTR(m->mtx_lock, ("%s: mtx_lock not aligned for %s: %p", __func__, name, &m->mtx_lock)); /* Determine lock class and lock flags. */ if (opts & MTX_SPIN) class = &lock_class_mtx_spin; else class = &lock_class_mtx_sleep; flags = 0; if (opts & MTX_QUIET) flags |= LO_QUIET; if (opts & MTX_RECURSE) flags |= LO_RECURSABLE; if ((opts & MTX_NOWITNESS) == 0) flags |= LO_WITNESS; if (opts & MTX_DUPOK) flags |= LO_DUPOK; if (opts & MTX_NOPROFILE) flags |= LO_NOPROFILE; if (opts & MTX_NEW) flags |= LO_NEW; /* Initialize mutex. */ lock_init(&m->lock_object, class, name, type, flags); m->mtx_lock = MTX_UNOWNED; m->mtx_recurse = 0; } /* * Remove lock `m' from all_mtx queue. We don't allow MTX_QUIET to be * passed in as a flag here because if the corresponding mtx_init() was * called with MTX_QUIET set, then it will already be set in the mutex's * flags. */ void _mtx_destroy(volatile uintptr_t *c) { struct mtx *m; m = mtxlock2mtx(c); if (!mtx_owned(m)) MPASS(mtx_unowned(m)); else { MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0); /* Perform the non-mtx related part of mtx_unlock_spin(). */ if (LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin) spinlock_exit(); else TD_LOCKS_DEC(curthread); lock_profile_release_lock(&m->lock_object); /* Tell witness this isn't locked to make it happy.
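 *
 * For reference, a typical consumer of the init/lock/unlock API above
 * looks like the sketch below (the example_* names are invented for
 * this illustration; MTX_SYSINIT() runs the initialization at boot):
 */

#if 0	/* illustrative sketch, not part of this file */
static struct mtx example_mtx;
MTX_SYSINIT(example_mtx, &example_mtx, "example", MTX_DEF);

static int example_counter;

static void
example_bump(void)
{
	mtx_lock(&example_mtx);
	example_counter++;		/* protected by example_mtx */
	mtx_unlock(&example_mtx);
}
#endif

/*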
*/ WITNESS_UNLOCK(&m->lock_object, LOP_EXCLUSIVE, __FILE__, __LINE__); } m->mtx_lock = MTX_DESTROYED; lock_destroy(&m->lock_object); } /* * Initialize the mutex code and system mutexes. This is called from the MD * startup code prior to mi_startup(). The per-CPU data space needs to be * set up before this is called. */ void mutex_init(void) { /* Setup turnstiles so that sleep mutexes work. */ init_turnstiles(); /* * Initialize mutexes. */ mtx_init(&Giant, "Giant", NULL, MTX_DEF | MTX_RECURSE); mtx_init(&blocked_lock, "blocked lock", NULL, MTX_SPIN); blocked_lock.mtx_lock = 0xdeadc0de; /* Always blocked. */ mtx_init(&proc0.p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK); mtx_init(&proc0.p_slock, "process slock", NULL, MTX_SPIN); mtx_init(&proc0.p_statmtx, "pstatl", NULL, MTX_SPIN); mtx_init(&proc0.p_itimmtx, "pitiml", NULL, MTX_SPIN); mtx_init(&proc0.p_profmtx, "pprofl", NULL, MTX_SPIN); mtx_init(&devmtx, "cdev", NULL, MTX_DEF); mtx_lock(&Giant); } #ifdef DDB void db_show_mtx(const struct lock_object *lock) { struct thread *td; const struct mtx *m; m = (const struct mtx *)lock; db_printf(" flags: {"); if (LOCK_CLASS(lock) == &lock_class_mtx_spin) db_printf("SPIN"); else db_printf("DEF"); if (m->lock_object.lo_flags & LO_RECURSABLE) db_printf(", RECURSE"); if (m->lock_object.lo_flags & LO_DUPOK) db_printf(", DUPOK"); db_printf("}\n"); db_printf(" state: {"); if (mtx_unowned(m)) db_printf("UNOWNED"); else if (mtx_destroyed(m)) db_printf("DESTROYED"); else { db_printf("OWNED"); if (m->mtx_lock & MTX_CONTESTED) db_printf(", CONTESTED"); if (m->mtx_lock & MTX_RECURSED) db_printf(", RECURSED"); } db_printf("}\n"); if (!mtx_unowned(m) && !mtx_destroyed(m)) { td = mtx_owner(m); db_printf(" owner: %p (tid %d, pid %d, \"%s\")\n", td, td->td_tid, td->td_proc->p_pid, td->td_name); if (mtx_recursed(m)) db_printf(" recursed: %d\n", m->mtx_recurse); } } #endif Index: head/sys/kern/kern_osd.c =================================================================== --- head/sys/kern/kern_osd.c (revision 326270) +++ head/sys/kern/kern_osd.c (revision 326271) @@ -1,443 +1,445 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* OSD (Object Specific Data) */ /* * Lock key: * (m) osd_module_lock * (o) osd_object_lock * (l) osd_list_lock */ struct osd_master { struct sx osd_module_lock; struct rmlock osd_object_lock; struct mtx osd_list_lock; LIST_HEAD(, osd) osd_list; /* (l) */ osd_destructor_t *osd_destructors; /* (o) */ osd_method_t *osd_methods; /* (m) */ u_int osd_ntslots; /* (m) */ const u_int osd_nmethods; }; static MALLOC_DEFINE(M_OSD, "osd", "Object Specific Data"); static int osd_debug = 0; SYSCTL_INT(_debug, OID_AUTO, osd, CTLFLAG_RWTUN, &osd_debug, 0, "OSD debug level"); #define OSD_DEBUG(...) do { \ if (osd_debug) { \ printf("OSD (%s:%u): ", __func__, __LINE__); \ printf(__VA_ARGS__); \ printf("\n"); \ } \ } while (0) static void do_osd_del(u_int type, struct osd *osd, u_int slot, int list_locked); /* * List of objects with OSD. */ struct osd_master osdm[OSD_LAST + 1] = { [OSD_JAIL] = { .osd_nmethods = PR_MAXMETHOD }, }; static void osd_default_destructor(void *value __unused) { /* Do nothing. */ } int osd_register(u_int type, osd_destructor_t destructor, osd_method_t *methods) { void *newptr; u_int i, m; KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type.")); /* * If no destructor is given, use default one. We need to use some * destructor, because NULL destructor means unused slot. */ if (destructor == NULL) destructor = osd_default_destructor; sx_xlock(&osdm[type].osd_module_lock); /* * First, we try to find unused slot. */ for (i = 0; i < osdm[type].osd_ntslots; i++) { if (osdm[type].osd_destructors[i] == NULL) { OSD_DEBUG("Unused slot found (type=%u, slot=%u).", type, i); break; } } /* * If no unused slot was found, allocate one. */ if (i == osdm[type].osd_ntslots) { osdm[type].osd_ntslots++; if (osdm[type].osd_nmethods != 0) osdm[type].osd_methods = realloc(osdm[type].osd_methods, sizeof(osd_method_t) * osdm[type].osd_ntslots * osdm[type].osd_nmethods, M_OSD, M_WAITOK); newptr = malloc(sizeof(osd_destructor_t) * osdm[type].osd_ntslots, M_OSD, M_WAITOK); rm_wlock(&osdm[type].osd_object_lock); bcopy(osdm[type].osd_destructors, newptr, sizeof(osd_destructor_t) * i); free(osdm[type].osd_destructors, M_OSD); osdm[type].osd_destructors = newptr; rm_wunlock(&osdm[type].osd_object_lock); OSD_DEBUG("New slot allocated (type=%u, slot=%u).", type, i + 1); } osdm[type].osd_destructors[i] = destructor; if (osdm[type].osd_nmethods != 0) { for (m = 0; m < osdm[type].osd_nmethods; m++) osdm[type].osd_methods[i * osdm[type].osd_nmethods + m] = methods != NULL ? methods[m] : NULL; } sx_xunlock(&osdm[type].osd_module_lock); return (i + 1); } void osd_deregister(u_int type, u_int slot) { struct osd *osd, *tosd; KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type.")); KASSERT(slot > 0, ("Invalid slot.")); KASSERT(osdm[type].osd_destructors[slot - 1] != NULL, ("Unused slot.")); sx_xlock(&osdm[type].osd_module_lock); rm_wlock(&osdm[type].osd_object_lock); /* * Free all OSD for the given slot. */ mtx_lock(&osdm[type].osd_list_lock); LIST_FOREACH_SAFE(osd, &osdm[type].osd_list, osd_next, tosd) do_osd_del(type, osd, slot, 1); mtx_unlock(&osdm[type].osd_list_lock); /* * Set destructor to NULL to free the slot. 
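 *
 * Seen from a consumer, the slot returned by osd_register() above is
 * used roughly as follows (sketch only; the my_* names are invented,
 * and OSD_THREAD is just one of the valid types):
 */

#if 0	/* illustrative sketch, not part of this file */
static u_int	my_slot;

static void
my_dtor(void *value)
{
	free(value, M_TEMP);	/* destructor runs when the data is deleted */
}

static void
my_init(void)
{
	my_slot = osd_register(OSD_THREAD, my_dtor, NULL);
}

static int
my_attach(struct osd *osd, void *value)
{
	/* May allocate the slot array and can thus fail with ENOMEM. */
	return (osd_set(OSD_THREAD, osd, my_slot, value));
}
#endif

/*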
*/ osdm[type].osd_destructors[slot - 1] = NULL; if (slot == osdm[type].osd_ntslots) { osdm[type].osd_ntslots--; osdm[type].osd_destructors = realloc(osdm[type].osd_destructors, sizeof(osd_destructor_t) * osdm[type].osd_ntslots, M_OSD, M_NOWAIT | M_ZERO); if (osdm[type].osd_nmethods != 0) osdm[type].osd_methods = realloc(osdm[type].osd_methods, sizeof(osd_method_t) * osdm[type].osd_ntslots * osdm[type].osd_nmethods, M_OSD, M_NOWAIT | M_ZERO); /* * We always reallocate to smaller size, so we assume it will * always succeed. */ KASSERT(osdm[type].osd_destructors != NULL && (osdm[type].osd_nmethods == 0 || osdm[type].osd_methods != NULL), ("realloc() failed")); OSD_DEBUG("Deregistration of the last slot (type=%u, slot=%u).", type, slot); } else { OSD_DEBUG("Slot deregistration (type=%u, slot=%u).", type, slot); } rm_wunlock(&osdm[type].osd_object_lock); sx_xunlock(&osdm[type].osd_module_lock); } int osd_set(u_int type, struct osd *osd, u_int slot, void *value) { return (osd_set_reserved(type, osd, slot, NULL, value)); } void ** osd_reserve(u_int slot) { KASSERT(slot > 0, ("Invalid slot.")); OSD_DEBUG("Reserving slot array (slot=%u).", slot); return (malloc(sizeof(void *) * slot, M_OSD, M_WAITOK | M_ZERO)); } int osd_set_reserved(u_int type, struct osd *osd, u_int slot, void **rsv, void *value) { struct rm_priotracker tracker; KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type.")); KASSERT(slot > 0, ("Invalid slot.")); KASSERT(osdm[type].osd_destructors[slot - 1] != NULL, ("Unused slot.")); rm_rlock(&osdm[type].osd_object_lock, &tracker); if (slot > osd->osd_nslots) { void **newptr; if (value == NULL) { OSD_DEBUG( "Not allocating null slot (type=%u, slot=%u).", type, slot); rm_runlock(&osdm[type].osd_object_lock, &tracker); if (rsv) osd_free_reserved(rsv); return (0); } /* * Too few slots allocated here, so we need to extend or create * the array. */ if (rsv) { /* * Use the reserve passed in (assumed to be * the right size). */ newptr = rsv; if (osd->osd_nslots != 0) { memcpy(newptr, osd->osd_slots, sizeof(void *) * osd->osd_nslots); free(osd->osd_slots, M_OSD); } } else { newptr = realloc(osd->osd_slots, sizeof(void *) * slot, M_OSD, M_NOWAIT | M_ZERO); if (newptr == NULL) { rm_runlock(&osdm[type].osd_object_lock, &tracker); return (ENOMEM); } } if (osd->osd_nslots == 0) { /* * First OSD for this object, so we need to put it * onto the list. 
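 *
 * The rsv argument handled above enables a two-step idiom: reserve the
 * slot array with osd_reserve() (which may sleep) while sleeping is
 * still allowed, then complete the assignment in a context that must
 * not sleep. A sketch under that reading (my_* names invented):
 */

#if 0	/* illustrative sketch, not part of this file */
static int
my_set_nosleep(struct osd *osd, u_int slot, void *value)
{
	void **rsv;

	rsv = osd_reserve(slot);	/* M_WAITOK allocation, may sleep */
	/* ... enter the region where sleeping is forbidden ... */
	return (osd_set_reserved(OSD_THREAD, osd, slot, rsv, value));
	/* osd_set_reserved() consumes or frees rsv on every path. */
}
#endif

/*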
*/ mtx_lock(&osdm[type].osd_list_lock); LIST_INSERT_HEAD(&osdm[type].osd_list, osd, osd_next); mtx_unlock(&osdm[type].osd_list_lock); OSD_DEBUG("Setting first slot (type=%u).", type); } else OSD_DEBUG("Growing slots array (type=%u).", type); osd->osd_slots = newptr; osd->osd_nslots = slot; } else if (rsv) osd_free_reserved(rsv); OSD_DEBUG("Setting slot value (type=%u, slot=%u, value=%p).", type, slot, value); osd->osd_slots[slot - 1] = value; rm_runlock(&osdm[type].osd_object_lock, &tracker); return (0); } void osd_free_reserved(void **rsv) { OSD_DEBUG("Discarding reserved slot array."); free(rsv, M_OSD); } void * osd_get(u_int type, struct osd *osd, u_int slot) { struct rm_priotracker tracker; void *value; KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type.")); KASSERT(slot > 0, ("Invalid slot.")); KASSERT(osdm[type].osd_destructors[slot - 1] != NULL, ("Unused slot.")); rm_rlock(&osdm[type].osd_object_lock, &tracker); if (slot > osd->osd_nslots) { value = NULL; OSD_DEBUG("Slot doesn't exist (type=%u, slot=%u).", type, slot); } else { value = osd->osd_slots[slot - 1]; OSD_DEBUG("Returning slot value (type=%u, slot=%u, value=%p).", type, slot, value); } rm_runlock(&osdm[type].osd_object_lock, &tracker); return (value); } void osd_del(u_int type, struct osd *osd, u_int slot) { struct rm_priotracker tracker; rm_rlock(&osdm[type].osd_object_lock, &tracker); do_osd_del(type, osd, slot, 0); rm_runlock(&osdm[type].osd_object_lock, &tracker); } static void do_osd_del(u_int type, struct osd *osd, u_int slot, int list_locked) { int i; KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type.")); KASSERT(slot > 0, ("Invalid slot.")); KASSERT(osdm[type].osd_destructors[slot - 1] != NULL, ("Unused slot.")); OSD_DEBUG("Deleting slot (type=%u, slot=%u).", type, slot); if (slot > osd->osd_nslots) { OSD_DEBUG("Slot doesn't exist (type=%u, slot=%u).", type, slot); return; } if (osd->osd_slots[slot - 1] != NULL) { osdm[type].osd_destructors[slot - 1](osd->osd_slots[slot - 1]); osd->osd_slots[slot - 1] = NULL; } for (i = osd->osd_nslots - 1; i >= 0; i--) { if (osd->osd_slots[i] != NULL) { OSD_DEBUG("Slot still has a value (type=%u, slot=%u).", type, i + 1); break; } } if (i == -1) { /* No values left for this object. */ OSD_DEBUG("No more slots left (type=%u).", type); if (!list_locked) mtx_lock(&osdm[type].osd_list_lock); LIST_REMOVE(osd, osd_next); if (!list_locked) mtx_unlock(&osdm[type].osd_list_lock); free(osd->osd_slots, M_OSD); osd->osd_slots = NULL; osd->osd_nslots = 0; } else if (slot == osd->osd_nslots) { /* This was the last slot. */ osd->osd_slots = realloc(osd->osd_slots, sizeof(void *) * (i + 1), M_OSD, M_NOWAIT | M_ZERO); /* * We always reallocate to smaller size, so we assume it will * always succeed. */ KASSERT(osd->osd_slots != NULL, ("realloc() failed")); osd->osd_nslots = i + 1; OSD_DEBUG("Reducing slots array to %u (type=%u).", osd->osd_nslots, type); } } int osd_call(u_int type, u_int method, void *obj, void *data) { osd_method_t methodfun; int error, i; KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type.")); KASSERT(method < osdm[type].osd_nmethods, ("Invalid method.")); /* * Call this method for every slot that defines it, stopping if an * error is encountered. 
*/ error = 0; sx_slock(&osdm[type].osd_module_lock); for (i = 0; i < osdm[type].osd_ntslots; i++) { methodfun = osdm[type].osd_methods[i * osdm[type].osd_nmethods + method]; if (methodfun != NULL && (error = methodfun(obj, data)) != 0) break; } sx_sunlock(&osdm[type].osd_module_lock); return (error); } void osd_exit(u_int type, struct osd *osd) { struct rm_priotracker tracker; u_int i; KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type.")); if (osd->osd_nslots == 0) { KASSERT(osd->osd_slots == NULL, ("Non-null osd_slots.")); /* No OSD attached, just leave. */ return; } rm_rlock(&osdm[type].osd_object_lock, &tracker); for (i = 1; i <= osd->osd_nslots; i++) { if (osdm[type].osd_destructors[i - 1] != NULL) do_osd_del(type, osd, i, 0); else OSD_DEBUG("Unused slot (type=%u, slot=%u).", type, i); } rm_runlock(&osdm[type].osd_object_lock, &tracker); OSD_DEBUG("Object exit (type=%u).", type); } static void osd_init(void *arg __unused) { u_int i; for (i = OSD_FIRST; i <= OSD_LAST; i++) { sx_init(&osdm[i].osd_module_lock, "osd_module"); rm_init(&osdm[i].osd_object_lock, "osd_object"); mtx_init(&osdm[i].osd_list_lock, "osd_list", NULL, MTX_DEF); LIST_INIT(&osdm[i].osd_list); osdm[i].osd_destructors = NULL; osdm[i].osd_ntslots = 0; osdm[i].osd_methods = NULL; } } SYSINIT(osd, SI_SUB_LOCK, SI_ORDER_ANY, osd_init, NULL); Index: head/sys/kern/kern_physio.c =================================================================== --- head/sys/kern/kern_physio.c (revision 326270) +++ head/sys/kern/kern_physio.c (revision 326271) @@ -1,225 +1,227 @@ /*- + * SPDX-License-Identifier: BSD-4-Clause + * * Copyright (c) 1994 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. Modifications may be freely made to this file if the above conditions * are met. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include int physio(struct cdev *dev, struct uio *uio, int ioflag) { struct cdevsw *csw; struct buf *pbuf; struct bio *bp; struct vm_page **pages; caddr_t sa; u_int iolen, poff; int error, i, npages, maxpages; vm_prot_t prot; csw = dev->si_devsw; /* check if character device is being destroyed */ if (csw == NULL) return (ENXIO); /* XXX: sanity check */ if(dev->si_iosize_max < PAGE_SIZE) { printf("WARNING: %s si_iosize_max=%d, using DFLTPHYS.\n", devtoname(dev), dev->si_iosize_max); dev->si_iosize_max = DFLTPHYS; } /* * If the driver does not want I/O to be split, that means that we * need to reject any requests that will not fit into one buffer. */ if (dev->si_flags & SI_NOSPLIT && (uio->uio_resid > dev->si_iosize_max || uio->uio_resid > MAXPHYS || uio->uio_iovcnt > 1)) { /* * Tell the user why his I/O was rejected. 
*/ if (uio->uio_resid > dev->si_iosize_max) uprintf("%s: request size=%zd > si_iosize_max=%d; " "cannot split request\n", devtoname(dev), uio->uio_resid, dev->si_iosize_max); if (uio->uio_resid > MAXPHYS) uprintf("%s: request size=%zd > MAXPHYS=%d; " "cannot split request\n", devtoname(dev), uio->uio_resid, MAXPHYS); if (uio->uio_iovcnt > 1) uprintf("%s: request vectors=%d > 1; " "cannot split request\n", devtoname(dev), uio->uio_iovcnt); return (EFBIG); } /* * Keep the process UPAGES from being swapped. Processes swapped * out while holding pbufs, used by swapper, may lead to deadlock. */ PHOLD(curproc); bp = g_alloc_bio(); if (uio->uio_segflg != UIO_USERSPACE) { pbuf = NULL; pages = NULL; } else if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) { pbuf = NULL; maxpages = btoc(MIN(uio->uio_resid, MAXPHYS)) + 1; pages = malloc(sizeof(*pages) * maxpages, M_DEVBUF, M_WAITOK); } else { pbuf = getpbuf(NULL); sa = pbuf->b_data; maxpages = btoc(MAXPHYS); pages = pbuf->b_pages; } prot = VM_PROT_READ; if (uio->uio_rw == UIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ error = 0; for (i = 0; i < uio->uio_iovcnt; i++) { #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); if (uio->uio_rw == UIO_READ) { racct_add_force(curproc, RACCT_READBPS, uio->uio_iov[i].iov_len); racct_add_force(curproc, RACCT_READIOPS, 1); } else { racct_add_force(curproc, RACCT_WRITEBPS, uio->uio_iov[i].iov_len); racct_add_force(curproc, RACCT_WRITEIOPS, 1); } PROC_UNLOCK(curproc); } #endif /* RACCT */ while (uio->uio_iov[i].iov_len) { g_reset_bio(bp); if (uio->uio_rw == UIO_READ) { bp->bio_cmd = BIO_READ; curthread->td_ru.ru_inblock++; } else { bp->bio_cmd = BIO_WRITE; curthread->td_ru.ru_oublock++; } bp->bio_offset = uio->uio_offset; bp->bio_data = uio->uio_iov[i].iov_base; bp->bio_length = uio->uio_iov[i].iov_len; if (bp->bio_length > dev->si_iosize_max) bp->bio_length = dev->si_iosize_max; if (bp->bio_length > MAXPHYS) bp->bio_length = MAXPHYS; /* * Make sure the pbuf can map the request. * The pbuf has kvasize = MAXPHYS, so a request * larger than MAXPHYS - PAGE_SIZE must be * page aligned or it will be fragmented. 
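 *
 * In numbers, assuming MAXPHYS = 128KB and PAGE_SIZE = 4KB: a request
 * that starts in the middle of a page (poff != 0) can map at most
 * 128KB - 4KB = 124KB into one pbuf. The trim below, restated as a
 * stand-alone sketch:
 */

#if 0	/* illustrative sketch, not part of this file */
static size_t
sketch_trim_length(size_t length, size_t poff, size_t kvasize)
{
	if (length + poff > kvasize) {
		length = kvasize;
		if (poff != 0)
			length -= PAGE_SIZE;	/* keep mappings page aligned */
	}
	return (length);
}
#endif

/*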
*/ poff = (vm_offset_t)bp->bio_data & PAGE_MASK; if (pbuf && bp->bio_length + poff > pbuf->b_kvasize) { if (dev->si_flags & SI_NOSPLIT) { uprintf("%s: request ptr %p is not " "on a page boundary; cannot split " "request\n", devtoname(dev), bp->bio_data); error = EFBIG; goto doerror; } bp->bio_length = pbuf->b_kvasize; if (poff != 0) bp->bio_length -= PAGE_SIZE; } bp->bio_bcount = bp->bio_length; bp->bio_dev = dev; if (pages) { if ((npages = vm_fault_quick_hold_pages( &curproc->p_vmspace->vm_map, (vm_offset_t)bp->bio_data, bp->bio_length, prot, pages, maxpages)) < 0) { error = EFAULT; goto doerror; } if (pbuf) { pmap_qenter((vm_offset_t)sa, pages, npages); bp->bio_data = sa + poff; } else { bp->bio_ma = pages; bp->bio_ma_n = npages; bp->bio_ma_offset = poff; bp->bio_data = unmapped_buf; bp->bio_flags |= BIO_UNMAPPED; } } csw->d_strategy(bp); if (uio->uio_rw == UIO_READ) biowait(bp, "physrd"); else biowait(bp, "physwr"); if (pages) { if (pbuf) pmap_qremove((vm_offset_t)sa, npages); vm_page_unhold_pages(pages, npages); } iolen = bp->bio_length - bp->bio_resid; if (iolen == 0 && !(bp->bio_flags & BIO_ERROR)) goto doerror; /* EOF */ uio->uio_iov[i].iov_len -= iolen; uio->uio_iov[i].iov_base = (char *)uio->uio_iov[i].iov_base + iolen; uio->uio_resid -= iolen; uio->uio_offset += iolen; if (bp->bio_flags & BIO_ERROR) { error = bp->bio_error; goto doerror; } } } doerror: if (pbuf) relpbuf(pbuf, NULL); else if (pages) free(pages, M_DEVBUF); g_destroy_bio(bp); PRELE(curproc); return (error); } Index: head/sys/kern/kern_pmc.c =================================================================== --- head/sys/kern/kern_pmc.c (revision 326270) +++ head/sys/kern/kern_pmc.c (revision 326271) @@ -1,344 +1,346 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2003-2008 Joseph Koshy * Copyright (c) 2007 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS FEATURE(hwpmc_hooks, "Kernel support for HW PMC"); #define PMC_KERNEL_VERSION PMC_VERSION #else #define PMC_KERNEL_VERSION 0 #endif MALLOC_DECLARE(M_PMCHOOKS); MALLOC_DEFINE(M_PMCHOOKS, "pmchooks", "Memory space for PMC hooks"); const int pmc_kernel_version = PMC_KERNEL_VERSION; /* Hook variable. */ int __read_mostly (*pmc_hook)(struct thread *td, int function, void *arg) = NULL; /* Interrupt handler */ int __read_mostly (*pmc_intr)(int cpu, struct trapframe *tf) = NULL; /* Bitmask of CPUs requiring servicing at hardclock time */ volatile cpuset_t pmc_cpumask; /* * A global count of SS mode PMCs. When non-zero, this means that * we have processes that are sampling the system as a whole. */ volatile int pmc_ss_count; /* * Since PMC(4) may not be loaded in the current kernel, the * convention followed is that a non-NULL value of 'pmc_hook' implies * the presence of this kernel module. * * This requires us to protect 'pmc_hook' with a * shared (sx) lock -- thus making the process of calling into PMC(4) * somewhat more expensive than a simple 'if' check and indirect call. */ struct sx pmc_sx; /* * PMC Soft per cpu trapframe. */ struct trapframe pmc_tf[MAXCPU]; /* * PMC Soft use a global table to store registered events. */ SYSCTL_NODE(_kern, OID_AUTO, hwpmc, CTLFLAG_RW, 0, "HWPMC parameters"); static int pmc_softevents = 16; SYSCTL_INT(_kern_hwpmc, OID_AUTO, softevents, CTLFLAG_RDTUN, &pmc_softevents, 0, "maximum number of soft events"); struct mtx pmc_softs_mtx; int pmc_softs_count; struct pmc_soft **pmc_softs; MTX_SYSINIT(pmc_soft_mtx, &pmc_softs_mtx, "pmc-softs", MTX_SPIN); static void pmc_init_sx(void) { sx_init_flags(&pmc_sx, "pmc-sx", SX_NOWITNESS); } SYSINIT(pmcsx, SI_SUB_LOCK, SI_ORDER_MIDDLE, pmc_init_sx, NULL); /* * Helper functions. */ /* * A note on the CPU numbering scheme used by the hwpmc(4) driver. * * CPUs are denoted using numbers in the range 0..[pmc_cpu_max()-1]. * CPUs could be numbered "sparsely" in this range; the predicate * `pmc_cpu_is_present()' is used to test whether a given CPU is * physically present. * * Further, a CPU that is physically present may be administratively * disabled or otherwise unavailable for use by hwpmc(4). The * `pmc_cpu_is_active()' predicate tests for CPU usability. An * "active" CPU participates in thread scheduling and can field * interrupts raised by PMC hardware. * * On systems with hyperthreaded CPUs, multiple logical CPUs may share * PMC hardware resources. For such processors one logical CPU is * denoted as the primary owner of the in-CPU PMC resources. The * pmc_cpu_is_primary() predicate is used to distinguish this primary * CPU from the others. */ int pmc_cpu_is_active(int cpu) { #ifdef SMP return (pmc_cpu_is_present(cpu) && !CPU_ISSET(cpu, &hlt_cpus_mask)); #else return (1); #endif } /* Deprecated. */ int pmc_cpu_is_disabled(int cpu) { return (!pmc_cpu_is_active(cpu)); } int pmc_cpu_is_present(int cpu) { #ifdef SMP return (!CPU_ABSENT(cpu)); #else return (1); #endif } int pmc_cpu_is_primary(int cpu) { #ifdef SMP return (!CPU_ISSET(cpu, &logical_cpus_mask)); #else return (1); #endif } /* * Return the maximum CPU number supported by the system. The return * value is used for scaling internal data structures and for runtime * checks. 
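 *
 * Tying the predicates together, a caller that wants to visit every
 * schedulable CPU would follow this hedged pattern (sketch_* invented):
 */

#if 0	/* illustrative sketch, not part of this file */
static void
sketch_foreach_active_cpu(void (*fn)(int))
{
	int cpu;

	for (cpu = 0; cpu < (int)pmc_cpu_max(); cpu++) {
		if (!pmc_cpu_is_active(cpu))
			continue;	/* absent, halted or disabled CPU */
		fn(cpu);
	}
}
#endif

/*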
*/ unsigned int pmc_cpu_max(void) { #ifdef SMP return (mp_maxid+1); #else return (1); #endif } #ifdef INVARIANTS /* * Return the count of CPUs in the `active' state in the system. */ int pmc_cpu_max_active(void) { #ifdef SMP /* * When support for CPU hot-plugging is added to the kernel, * this function would change to return the current number * of "active" CPUs. */ return (mp_ncpus); #else return (1); #endif } #endif /* * Cleanup event name: * - remove duplicate '_' * - all uppercase */ static void pmc_soft_namecleanup(char *name) { char *p, *q; p = q = name; for ( ; *p == '_' ; p++) ; for ( ; *p ; p++) { if (*p == '_' && (*(p + 1) == '_' || *(p + 1) == '\0')) continue; else *q++ = toupper(*p); } *q = '\0'; } void pmc_soft_ev_register(struct pmc_soft *ps) { static int warned = 0; int n; ps->ps_running = 0; ps->ps_ev.pm_ev_code = 0; /* invalid */ pmc_soft_namecleanup(ps->ps_ev.pm_ev_name); mtx_lock_spin(&pmc_softs_mtx); if (pmc_softs_count >= pmc_softevents) { /* * XXX Reusing events can enter a race condition where * new allocated event will be used as an old one. */ for (n = 0; n < pmc_softevents; n++) if (pmc_softs[n] == NULL) break; if (n == pmc_softevents) { mtx_unlock_spin(&pmc_softs_mtx); if (!warned) { printf("hwpmc: too many soft events, " "increase kern.hwpmc.softevents tunable\n"); warned = 1; } return; } ps->ps_ev.pm_ev_code = PMC_EV_SOFT_FIRST + n; pmc_softs[n] = ps; } else { ps->ps_ev.pm_ev_code = PMC_EV_SOFT_FIRST + pmc_softs_count; pmc_softs[pmc_softs_count++] = ps; } mtx_unlock_spin(&pmc_softs_mtx); } void pmc_soft_ev_deregister(struct pmc_soft *ps) { KASSERT(ps != NULL, ("pmc_soft_deregister: called with NULL")); mtx_lock_spin(&pmc_softs_mtx); if (ps->ps_ev.pm_ev_code != 0 && (ps->ps_ev.pm_ev_code - PMC_EV_SOFT_FIRST) < pmc_softevents) { KASSERT((int)ps->ps_ev.pm_ev_code >= PMC_EV_SOFT_FIRST && (int)ps->ps_ev.pm_ev_code <= PMC_EV_SOFT_LAST, ("pmc_soft_deregister: invalid event value")); pmc_softs[ps->ps_ev.pm_ev_code - PMC_EV_SOFT_FIRST] = NULL; } mtx_unlock_spin(&pmc_softs_mtx); } struct pmc_soft * pmc_soft_ev_acquire(enum pmc_event ev) { struct pmc_soft *ps; if (ev == 0 || (ev - PMC_EV_SOFT_FIRST) >= pmc_softevents) return NULL; KASSERT((int)ev >= PMC_EV_SOFT_FIRST && (int)ev <= PMC_EV_SOFT_LAST, ("event out of range")); mtx_lock_spin(&pmc_softs_mtx); ps = pmc_softs[ev - PMC_EV_SOFT_FIRST]; if (ps == NULL) mtx_unlock_spin(&pmc_softs_mtx); return ps; } void pmc_soft_ev_release(struct pmc_soft *ps) { mtx_unlock_spin(&pmc_softs_mtx); } /* * Initialise hwpmc. */ static void init_hwpmc(void *dummy __unused) { if (pmc_softevents <= 0 || pmc_softevents > PMC_EV_DYN_COUNT) { (void) printf("hwpmc: tunable \"softevents\"=%d out of " "range.\n", pmc_softevents); pmc_softevents = PMC_EV_DYN_COUNT; } pmc_softs = malloc(pmc_softevents * sizeof(struct pmc_soft *), M_PMCHOOKS, M_NOWAIT|M_ZERO); KASSERT(pmc_softs != NULL, ("cannot allocate soft events table")); } SYSINIT(hwpmc, SI_SUB_KDTRACE, SI_ORDER_FIRST, init_hwpmc, NULL); Index: head/sys/kern/kern_poll.c =================================================================== --- head/sys/kern/kern_poll.c (revision 326270) +++ head/sys/kern/kern_poll.c (revision 326271) @@ -1,574 +1,576 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2001-2002 Luigi Rizzo * * Supported by: the Xorp Project (www.xorp.org) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_device_polling.h" #include #include #include #include #include #include #include #include /* needed by net/if.h */ #include #include #include #include #include #include /* for NETISR_POLL */ #include void hardclock_device_poll(void); /* hook from hardclock */ static struct mtx poll_mtx; /* * Polling support for [network] device drivers. * * Drivers which support this feature can register with the * polling code. * * If registration is successful, the driver must disable interrupts, * and further I/O is performed through the handler, which is invoked * (at least once per clock tick) with 3 arguments: the "arg" passed at * register time (a struct ifnet pointer), a command, and a "count" limit. * * The command can be one of the following: * POLL_ONLY: quick move of "count" packets from input/output queues. * POLL_AND_CHECK_STATUS: as above, plus check status registers or do * other more expensive operations. This command is issued periodically * but less frequently than POLL_ONLY. * * The count limit specifies how much work the handler can do during the * call -- typically this is the number of packets to be received, or * transmitted, etc. (drivers are free to interpret this number, as long * as the max time spent in the function grows roughly linearly with the * count). * * Polling is enabled and disabled via setting the IFCAP_POLLING flag on * the interface. The driver ioctl handler should register the interface * with polling and disable interrupts, if registration was successful. * * A second variable controls the sharing of CPU between polling/kernel * network processing, and other activities (typically userlevel tasks): * kern.polling.user_frac (between 0 and 100, default 50) sets the share * of CPU allocated to user tasks. CPU is allocated proportionally to the * shares, by dynamically adjusting the "count" (poll_burst). * * Other parameters should be left to their default values.
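 *
 * As an illustration of the handler contract just described, a
 * driver-side handler might look like the sketch below; every mydrv_*
 * name is invented, and the return value is taken to be the number of
 * packets processed:
 */

#if 0	/* illustrative sketch, not part of this file */
static int
mydrv_poll(if_t ifp, enum poll_cmd cmd, int count)
{
	int rx_done;

	rx_done = mydrv_rxeof(ifp, count);	/* at most "count" packets */
	mydrv_txeof(ifp);
	if (cmd == POLL_AND_CHECK_STATUS)
		mydrv_check_status(ifp);	/* the expensive, rare work */
	return (rx_done);
}

/*
 * The driver's ioctl path would pair this with ether_poll_register()
 * when IFCAP_POLLING is switched on, and only then mask its interrupts.
 */
#endif

/*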
* The following constraints hold * * 1 <= poll_each_burst <= poll_burst <= poll_burst_max * MIN_POLL_BURST_MAX <= poll_burst_max <= MAX_POLL_BURST_MAX */ #define MIN_POLL_BURST_MAX 10 #define MAX_POLL_BURST_MAX 20000 static uint32_t poll_burst = 5; static uint32_t poll_burst_max = 150; /* good for 100Mbit net and HZ=1000 */ static uint32_t poll_each_burst = 5; static SYSCTL_NODE(_kern, OID_AUTO, polling, CTLFLAG_RW, 0, "Device polling parameters"); SYSCTL_UINT(_kern_polling, OID_AUTO, burst, CTLFLAG_RD, &poll_burst, 0, "Current polling burst size"); static int netisr_poll_scheduled; static int netisr_pollmore_scheduled; static int poll_shutting_down; static int poll_burst_max_sysctl(SYSCTL_HANDLER_ARGS) { uint32_t val = poll_burst_max; int error; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); if (val < MIN_POLL_BURST_MAX || val > MAX_POLL_BURST_MAX) return (EINVAL); mtx_lock(&poll_mtx); poll_burst_max = val; if (poll_burst > poll_burst_max) poll_burst = poll_burst_max; if (poll_each_burst > poll_burst_max) poll_each_burst = MIN_POLL_BURST_MAX; mtx_unlock(&poll_mtx); return (0); } SYSCTL_PROC(_kern_polling, OID_AUTO, burst_max, CTLTYPE_UINT | CTLFLAG_RW, 0, sizeof(uint32_t), poll_burst_max_sysctl, "I", "Max Polling burst size"); static int poll_each_burst_sysctl(SYSCTL_HANDLER_ARGS) { uint32_t val = poll_each_burst; int error; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); if (val < 1) return (EINVAL); mtx_lock(&poll_mtx); if (val > poll_burst_max) { mtx_unlock(&poll_mtx); return (EINVAL); } poll_each_burst = val; mtx_unlock(&poll_mtx); return (0); } SYSCTL_PROC(_kern_polling, OID_AUTO, each_burst, CTLTYPE_UINT | CTLFLAG_RW, 0, sizeof(uint32_t), poll_each_burst_sysctl, "I", "Max size of each burst"); static uint32_t poll_in_idle_loop=0; /* do we poll in idle loop ? 
*/ SYSCTL_UINT(_kern_polling, OID_AUTO, idle_poll, CTLFLAG_RW, &poll_in_idle_loop, 0, "Enable device polling in idle loop"); static uint32_t user_frac = 50; static int user_frac_sysctl(SYSCTL_HANDLER_ARGS) { uint32_t val = user_frac; int error; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); if (val > 99) return (EINVAL); mtx_lock(&poll_mtx); user_frac = val; mtx_unlock(&poll_mtx); return (0); } SYSCTL_PROC(_kern_polling, OID_AUTO, user_frac, CTLTYPE_UINT | CTLFLAG_RW, 0, sizeof(uint32_t), user_frac_sysctl, "I", "Desired user fraction of cpu time"); static uint32_t reg_frac_count = 0; static uint32_t reg_frac = 20 ; static int reg_frac_sysctl(SYSCTL_HANDLER_ARGS) { uint32_t val = reg_frac; int error; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); if (val < 1 || val > hz) return (EINVAL); mtx_lock(&poll_mtx); reg_frac = val; if (reg_frac_count >= reg_frac) reg_frac_count = 0; mtx_unlock(&poll_mtx); return (0); } SYSCTL_PROC(_kern_polling, OID_AUTO, reg_frac, CTLTYPE_UINT | CTLFLAG_RW, 0, sizeof(uint32_t), reg_frac_sysctl, "I", "Every this many cycles check registers"); static uint32_t short_ticks; SYSCTL_UINT(_kern_polling, OID_AUTO, short_ticks, CTLFLAG_RD, &short_ticks, 0, "Hardclock ticks shorter than they should be"); static uint32_t lost_polls; SYSCTL_UINT(_kern_polling, OID_AUTO, lost_polls, CTLFLAG_RD, &lost_polls, 0, "How many times we would have lost a poll tick"); static uint32_t pending_polls; SYSCTL_UINT(_kern_polling, OID_AUTO, pending_polls, CTLFLAG_RD, &pending_polls, 0, "Do we need to poll again"); static int residual_burst = 0; SYSCTL_INT(_kern_polling, OID_AUTO, residual_burst, CTLFLAG_RD, &residual_burst, 0, "# of residual cycles in burst"); static uint32_t poll_handlers; /* next free entry in pr[]. */ SYSCTL_UINT(_kern_polling, OID_AUTO, handlers, CTLFLAG_RD, &poll_handlers, 0, "Number of registered poll handlers"); static uint32_t phase; SYSCTL_UINT(_kern_polling, OID_AUTO, phase, CTLFLAG_RD, &phase, 0, "Polling phase"); static uint32_t suspect; SYSCTL_UINT(_kern_polling, OID_AUTO, suspect, CTLFLAG_RD, &suspect, 0, "suspect event"); static uint32_t stalled; SYSCTL_UINT(_kern_polling, OID_AUTO, stalled, CTLFLAG_RD, &stalled, 0, "potential stalls"); static uint32_t idlepoll_sleeping; /* idlepoll is sleeping */ SYSCTL_UINT(_kern_polling, OID_AUTO, idlepoll_sleeping, CTLFLAG_RD, &idlepoll_sleeping, 0, "idlepoll is sleeping"); #define POLL_LIST_LEN 128 struct pollrec { poll_handler_t *handler; struct ifnet *ifp; }; static struct pollrec pr[POLL_LIST_LEN]; static void poll_shutdown(void *arg, int howto) { poll_shutting_down = 1; } static void init_device_poll(void) { mtx_init(&poll_mtx, "polling", NULL, MTX_DEF); EVENTHANDLER_REGISTER(shutdown_post_sync, poll_shutdown, NULL, SHUTDOWN_PRI_LAST); } SYSINIT(device_poll, SI_SUB_SOFTINTR, SI_ORDER_MIDDLE, init_device_poll, NULL); /* * Hook from hardclock. Tries to schedule a netisr, but keeps track * of lost ticks due to the previous handler taking too long. * Normally, this should not happen, because polling handler should * run for a short time. However, in some cases (e.g. when there are * changes in link status etc.) the drivers take a very long time * (even in the order of milliseconds) to reset and reconfigure the * device, causing apparent lost polls. 
* * The first part of the code is just for debugging purposes, and tries * to count how often hardclock ticks are shorter than they should, * meaning either stray interrupts or delayed events. */ void hardclock_device_poll(void) { static struct timeval prev_t, t; int delta; if (poll_handlers == 0 || poll_shutting_down) return; microuptime(&t); delta = (t.tv_usec - prev_t.tv_usec) + (t.tv_sec - prev_t.tv_sec)*1000000; if (delta * hz < 500000) short_ticks++; else prev_t = t; if (pending_polls > 100) { /* * Too much, assume it has stalled (not always true * see comment above). */ stalled++; pending_polls = 0; phase = 0; } if (phase <= 2) { if (phase != 0) suspect++; phase = 1; netisr_poll_scheduled = 1; netisr_pollmore_scheduled = 1; netisr_sched_poll(); phase = 2; } if (pending_polls++ > 0) lost_polls++; } /* * ether_poll is called from the idle loop. */ static void ether_poll(int count) { int i; mtx_lock(&poll_mtx); if (count > poll_each_burst) count = poll_each_burst; for (i = 0 ; i < poll_handlers ; i++) pr[i].handler(pr[i].ifp, POLL_ONLY, count); mtx_unlock(&poll_mtx); } /* * netisr_pollmore is called after other netisr's, possibly scheduling * another NETISR_POLL call, or adapting the burst size for the next cycle. * * It is very bad to fetch large bursts of packets from a single card at once, * because the burst could take a long time to be completely processed, or * could saturate the intermediate queue (ipintrq or similar) leading to * losses or unfairness. To reduce the problem, and also to account better for * time spent in network-related processing, we split the burst in smaller * chunks of fixed size, giving control to the other netisr's between chunks. * This helps in improving the fairness, reducing livelock (because we * emulate more closely the "process to completion" that we have with * fastforwarding) and accounting for the work performed in low level * handling and forwarding. */ static struct timeval poll_start_t; void netisr_pollmore() { struct timeval t; int kern_load; if (poll_handlers == 0) return; mtx_lock(&poll_mtx); if (!netisr_pollmore_scheduled) { mtx_unlock(&poll_mtx); return; } netisr_pollmore_scheduled = 0; phase = 5; if (residual_burst > 0) { netisr_poll_scheduled = 1; netisr_pollmore_scheduled = 1; netisr_sched_poll(); mtx_unlock(&poll_mtx); /* will run immediately on return, followed by netisrs */ return; } /* here we can account time spent in netisr's in this tick */ microuptime(&t); kern_load = (t.tv_usec - poll_start_t.tv_usec) + (t.tv_sec - poll_start_t.tv_sec)*1000000; /* us */ kern_load = (kern_load * hz) / 10000; /* 0..100 */ if (kern_load > (100 - user_frac)) { /* try decrease ticks */ if (poll_burst > 1) poll_burst--; } else { if (poll_burst < poll_burst_max) poll_burst++; } pending_polls--; if (pending_polls == 0) /* we are done */ phase = 0; else { /* * Last cycle was long and caused us to miss one or more * hardclock ticks. Restart processing again, but slightly * reduce the burst size to prevent that this happens again. */ poll_burst -= (poll_burst / 8); if (poll_burst < 1) poll_burst = 1; netisr_poll_scheduled = 1; netisr_pollmore_scheduled = 1; netisr_sched_poll(); phase = 6; } mtx_unlock(&poll_mtx); } /* * netisr_poll is typically scheduled once per tick. 
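 *
 * A worked instance of the kern_load arithmetic in netisr_pollmore()
 * above: with hz = 1000 one tick is 1000us, so 300us spent in the
 * netisrs gives kern_load = 300 * 1000 / 10000 = 30 (percent). With
 * the default user_frac = 50 that is not above 100 - 50, so poll_burst
 * may grow; at 600us it would be 60 and the burst would shrink. Sketch:
 */

#if 0	/* illustrative sketch, not part of this file */
static int
sketch_kern_load(int us_in_netisrs, int hz_value)
{
	return (us_in_netisrs * hz_value / 10000);	/* 0..100 scale */
}
#endif

/*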
*/ void netisr_poll(void) { int i, cycles; enum poll_cmd arg = POLL_ONLY; if (poll_handlers == 0) return; mtx_lock(&poll_mtx); if (!netisr_poll_scheduled) { mtx_unlock(&poll_mtx); return; } netisr_poll_scheduled = 0; phase = 3; if (residual_burst == 0) { /* first call in this tick */ microuptime(&poll_start_t); if (++reg_frac_count == reg_frac) { arg = POLL_AND_CHECK_STATUS; reg_frac_count = 0; } residual_burst = poll_burst; } cycles = (residual_burst < poll_each_burst) ? residual_burst : poll_each_burst; residual_burst -= cycles; for (i = 0 ; i < poll_handlers ; i++) pr[i].handler(pr[i].ifp, arg, cycles); phase = 4; mtx_unlock(&poll_mtx); } /* * Try to register routine for polling. Returns 0 if successful * (and polling should be enabled), error code otherwise. * A device is not supposed to register itself multiple times. * * This is called from within the *_ioctl() functions. */ int ether_poll_register(poll_handler_t *h, if_t ifp) { int i; KASSERT(h != NULL, ("%s: handler is NULL", __func__)); KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__)); mtx_lock(&poll_mtx); if (poll_handlers >= POLL_LIST_LEN) { /* * List full, cannot register more entries. * This should never happen; if it does, it is probably a * broken driver trying to register multiple times. Checking * this at runtime is expensive, and won't solve the problem * anyways, so just report a few times and then give up. */ static int verbose = 10 ; if (verbose >0) { log(LOG_ERR, "poll handlers list full, " "maybe a broken driver ?\n"); verbose--; } mtx_unlock(&poll_mtx); return (ENOMEM); /* no polling for you */ } for (i = 0 ; i < poll_handlers ; i++) if (pr[i].ifp == ifp && pr[i].handler != NULL) { mtx_unlock(&poll_mtx); log(LOG_DEBUG, "ether_poll_register: %s: handler" " already registered\n", ifp->if_xname); return (EEXIST); } pr[poll_handlers].handler = h; pr[poll_handlers].ifp = ifp; poll_handlers++; mtx_unlock(&poll_mtx); if (idlepoll_sleeping) wakeup(&idlepoll_sleeping); return (0); } /* * Remove interface from the polling list. Called from *_ioctl(), too. */ int ether_poll_deregister(if_t ifp) { int i; KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__)); mtx_lock(&poll_mtx); for (i = 0 ; i < poll_handlers ; i++) if (pr[i].ifp == ifp) /* found it */ break; if (i == poll_handlers) { log(LOG_DEBUG, "ether_poll_deregister: %s: not found!\n", ifp->if_xname); mtx_unlock(&poll_mtx); return (ENOENT); } poll_handlers--; if (i < poll_handlers) { /* Last entry replaces this one. 
*/ pr[i].handler = pr[poll_handlers].handler; pr[i].ifp = pr[poll_handlers].ifp; } mtx_unlock(&poll_mtx); return (0); } static void poll_idle(void) { struct thread *td = curthread; struct rtprio rtp; rtp.prio = RTP_PRIO_MAX; /* lowest priority */ rtp.type = RTP_PRIO_IDLE; PROC_SLOCK(td->td_proc); rtp_to_pri(&rtp, td); PROC_SUNLOCK(td->td_proc); for (;;) { if (poll_in_idle_loop && poll_handlers > 0) { idlepoll_sleeping = 0; ether_poll(poll_each_burst); thread_lock(td); mi_switch(SW_VOL, NULL); thread_unlock(td); } else { idlepoll_sleeping = 1; tsleep(&idlepoll_sleeping, 0, "pollid", hz * 3); } } } static struct proc *idlepoll; static struct kproc_desc idlepoll_kp = { "idlepoll", poll_idle, &idlepoll }; SYSINIT(idlepoll, SI_SUB_KTHREAD_VM, SI_ORDER_ANY, kproc_start, &idlepoll_kp); Index: head/sys/kern/kern_priv.c =================================================================== --- head/sys/kern/kern_priv.c (revision 326270) +++ head/sys/kern/kern_priv.c (revision 326271) @@ -1,181 +1,183 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2006 nCircle Network Security, Inc. * Copyright (c) 2009 Robert N. M. Watson * All rights reserved. * * This software was developed by Robert N. M. Watson for the TrustedBSD * Project under contract to nCircle Network Security, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR, NCIRCLE NETWORK SECURITY, * INC., OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include /* * `suser_enabled' (which can be set by the security.bsd.suser_enabled * sysctl) determines whether the system 'super-user' policy is in effect. If * it is nonzero, an effective uid of 0 connotes special privilege, * overriding many mandatory and discretionary protections. If it is zero, * uid 0 is offered no special privilege in the kernel security policy. * Setting it to zero may seriously impact the functionality of many existing * userland programs, and should not be done without careful consideration of * the consequences. 
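 *
 * Call sites never test for uid 0 themselves; they name a specific
 * privilege and let the policy chain below decide. A hedged sketch of
 * the canonical usage (the function name is invented; PRIV_VM_MLOCK is
 * one of the real privilege values):
 */

#if 0	/* illustrative sketch, not part of this file */
static int
sketch_privileged_op(struct thread *td)
{
	int error;

	error = priv_check(td, PRIV_VM_MLOCK);
	if (error != 0)
		return (error);	/* EPERM unless some policy granted it */
	/* ... the privileged work would go here ... */
	return (0);
}
#endif

/*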
*/ static int suser_enabled = 1; SYSCTL_INT(_security_bsd, OID_AUTO, suser_enabled, CTLFLAG_RWTUN, &suser_enabled, 0, "processes with uid 0 have privilege"); static int unprivileged_mlock = 1; SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_mlock, CTLFLAG_RWTUN, &unprivileged_mlock, 0, "Allow non-root users to call mlock(2)"); SDT_PROVIDER_DEFINE(priv); SDT_PROBE_DEFINE1(priv, kernel, priv_check, priv__ok, "int"); SDT_PROBE_DEFINE1(priv, kernel, priv_check, priv__err, "int"); /* * Check a credential for privilege. Lots of good reasons to deny privilege; * only a few to grant it. */ int priv_check_cred(struct ucred *cred, int priv, int flags) { int error; KASSERT(PRIV_VALID(priv), ("priv_check_cred: invalid privilege %d", priv)); /* * We first evaluate policies that may deny the granting of * privilege unilaterally. */ #ifdef MAC error = mac_priv_check(cred, priv); if (error) goto out; #endif /* * Jail policy will restrict certain privileges that may otherwise * be granted. */ error = prison_priv_check(cred, priv); if (error) goto out; if (unprivileged_mlock) { /* * Allow unprivileged users to call mlock(2)/munlock(2) and * mlockall(2)/munlockall(2). */ switch (priv) { case PRIV_VM_MLOCK: case PRIV_VM_MUNLOCK: error = 0; goto out; } } /* * Having determined if privilege is restricted by various policies, * now determine if privilege is granted. At this point, any policy * may grant privilege. For now, we allow short-circuit boolean * evaluation, so we may not call all policies. Perhaps we should. * * Superuser policy grants privilege based on the effective (or in * the case of specific privileges, real) uid being 0. We allow the * superuser policy to be globally disabled, although this is * currently of limited utility. */ if (suser_enabled) { switch (priv) { case PRIV_MAXFILES: case PRIV_MAXPROC: case PRIV_PROC_LIMIT: if (cred->cr_ruid == 0) { error = 0; goto out; } break; default: if (cred->cr_uid == 0) { error = 0; goto out; } break; } } /* * Writes to kernel/physical memory are a typical root-only operation, * but non-root users are expected to be able to read it (provided they * have permission to access /dev/[k]mem). */ if (priv == PRIV_KMEM_READ) { error = 0; goto out; } /* * Now check with MAC, if enabled, to see if a policy module grants * privilege. */ #ifdef MAC if (mac_priv_grant(cred, priv) == 0) { error = 0; goto out; } #endif /* * The default is deny, so if no policies have granted it, reject * with a privilege error here. */ error = EPERM; out: if (error) SDT_PROBE1(priv, kernel, priv_check, priv__err, priv); else SDT_PROBE1(priv, kernel, priv_check, priv__ok, priv); return (error); } int priv_check(struct thread *td, int priv) { KASSERT(td == curthread, ("priv_check: td != curthread")); return (priv_check_cred(td->td_ucred, priv, 0)); } Index: head/sys/kern/kern_racct.c =================================================================== --- head/sys/kern/kern_racct.c (revision 326270) +++ head/sys/kern/kern_racct.c (revision 326271) @@ -1,1342 +1,1344 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RCTL #include #endif #ifdef RACCT FEATURE(racct, "Resource Accounting"); /* * Do not block processes that have their %cpu usage <= pcpu_threshold. */ static int pcpu_threshold = 1; #ifdef RACCT_DEFAULT_TO_DISABLED int racct_enable = 0; #else int racct_enable = 1; #endif SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW, 0, "Resource Accounting"); SYSCTL_UINT(_kern_racct, OID_AUTO, enable, CTLFLAG_RDTUN, &racct_enable, 0, "Enable RACCT/RCTL"); SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold, 0, "Processes with higher %cpu usage than this value can be throttled."); /* * How many seconds it takes to use the scheduler %cpu calculations. When a * process starts, we compute its %cpu usage by dividing its runtime by the * process wall clock time. After RACCT_PCPU_SECS pass, we use the value * provided by the scheduler. 
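 *
 * Before that point the estimate is plain arithmetic: runtime divided
 * by wall clock time. Assuming RACCT_PCTCPU's RACCT_IN_MILLIONS flag
 * means "percent scaled by 10^6" (an assumption of this sketch, as is
 * the helper name), the early-life value would be:
 */

#if 0	/* illustrative sketch, not part of this file */
static uint64_t
sketch_early_pctcpu(uint64_t runtime_us, uint64_t walltime_us)
{
	if (walltime_us == 0)
		return (0);
	/* Overflow is ignored here; this is only an illustration. */
	return (runtime_us * 100 * 1000000 / walltime_us);
}
#endif

/*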
*/ #define RACCT_PCPU_SECS 3 struct mtx racct_lock; MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF); static uma_zone_t racct_zone; static void racct_sub_racct(struct racct *dest, const struct racct *src); static void racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount); static void racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount); SDT_PROVIDER_DEFINE(racct); SDT_PROBE_DEFINE3(racct, , rusage, add, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, add__failure, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, add__buf, "struct proc *", "const struct buf *", "int"); SDT_PROBE_DEFINE3(racct, , rusage, add__cred, "struct ucred *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, add__force, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, set, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, set__failure, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, set__force, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, sub, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, sub__cred, "struct ucred *", "int", "uint64_t"); SDT_PROBE_DEFINE1(racct, , racct, create, "struct racct *"); SDT_PROBE_DEFINE1(racct, , racct, destroy, "struct racct *"); SDT_PROBE_DEFINE2(racct, , racct, join, "struct racct *", "struct racct *"); SDT_PROBE_DEFINE2(racct, , racct, join__failure, "struct racct *", "struct racct *"); SDT_PROBE_DEFINE2(racct, , racct, leave, "struct racct *", "struct racct *"); int racct_types[] = { [RACCT_CPU] = RACCT_IN_MILLIONS, [RACCT_DATA] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_STACK] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_CORE] = RACCT_DENIABLE, [RACCT_RSS] = RACCT_RECLAIMABLE, [RACCT_MEMLOCK] = RACCT_RECLAIMABLE | RACCT_DENIABLE, [RACCT_NPROC] = RACCT_RECLAIMABLE | RACCT_DENIABLE, [RACCT_NOFILE] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_VMEM] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_NPTS] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_SWAP] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NTHR] = RACCT_RECLAIMABLE | RACCT_DENIABLE, [RACCT_MSGQQUEUED] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_MSGQSIZE] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NMSGQ] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NSEM] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NSEMOP] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_NSHM] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_SHMSIZE] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_WALLCLOCK] = RACCT_IN_MILLIONS, [RACCT_PCTCPU] = RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS, [RACCT_READBPS] = RACCT_DECAYING, [RACCT_WRITEBPS] = RACCT_DECAYING, [RACCT_READIOPS] = RACCT_DECAYING, [RACCT_WRITEIOPS] = RACCT_DECAYING }; static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE; #ifdef SCHED_4BSD /* * Contains intermediate values for %cpu calculations to avoid using floating * point in the kernel. * ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20) * It is needed only for the 4BSD scheduler, because in ULE ccpu equals * zero, so the calculations are more straightforward.
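 * As a worked example (taken from the table below), ccpu_exp[20] is * FSCALE * exp(-20/20) = FSCALE * 0.36787..., i.e. after 20 seconds an old * %cpu sample retains roughly 37% of its original weight.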
*/ fixpt_t ccpu_exp[] = { [0] = FSCALE * 1, [1] = FSCALE * 0.95122942450071400909, [2] = FSCALE * 0.90483741803595957316, [3] = FSCALE * 0.86070797642505780722, [4] = FSCALE * 0.81873075307798185866, [5] = FSCALE * 0.77880078307140486824, [6] = FSCALE * 0.74081822068171786606, [7] = FSCALE * 0.70468808971871343435, [8] = FSCALE * 0.67032004603563930074, [9] = FSCALE * 0.63762815162177329314, [10] = FSCALE * 0.60653065971263342360, [11] = FSCALE * 0.57694981038048669531, [12] = FSCALE * 0.54881163609402643262, [13] = FSCALE * 0.52204577676101604789, [14] = FSCALE * 0.49658530379140951470, [15] = FSCALE * 0.47236655274101470713, [16] = FSCALE * 0.44932896411722159143, [17] = FSCALE * 0.42741493194872666992, [18] = FSCALE * 0.40656965974059911188, [19] = FSCALE * 0.38674102345450120691, [20] = FSCALE * 0.36787944117144232159, [21] = FSCALE * 0.34993774911115535467, [22] = FSCALE * 0.33287108369807955328, [23] = FSCALE * 0.31663676937905321821, [24] = FSCALE * 0.30119421191220209664, [25] = FSCALE * 0.28650479686019010032, [26] = FSCALE * 0.27253179303401260312, [27] = FSCALE * 0.25924026064589150757, [28] = FSCALE * 0.24659696394160647693, [29] = FSCALE * 0.23457028809379765313, [30] = FSCALE * 0.22313016014842982893, [31] = FSCALE * 0.21224797382674305771, [32] = FSCALE * 0.20189651799465540848, [33] = FSCALE * 0.19204990862075411423, [34] = FSCALE * 0.18268352405273465022, [35] = FSCALE * 0.17377394345044512668, [36] = FSCALE * 0.16529888822158653829, [37] = FSCALE * 0.15723716631362761621, [38] = FSCALE * 0.14956861922263505264, [39] = FSCALE * 0.14227407158651357185, [40] = FSCALE * 0.13533528323661269189, [41] = FSCALE * 0.12873490358780421886, [42] = FSCALE * 0.12245642825298191021, [43] = FSCALE * 0.11648415777349695786, [44] = FSCALE * 0.11080315836233388333, [45] = FSCALE * 0.10539922456186433678, [46] = FSCALE * 0.10025884372280373372, [47] = FSCALE * 0.09536916221554961888, [48] = FSCALE * 0.09071795328941250337, [49] = FSCALE * 0.08629358649937051097, [50] = FSCALE * 0.08208499862389879516, [51] = FSCALE * 0.07808166600115315231, [52] = FSCALE * 0.07427357821433388042, [53] = FSCALE * 0.07065121306042958674, [54] = FSCALE * 0.06720551273974976512, [55] = FSCALE * 0.06392786120670757270, [56] = FSCALE * 0.06081006262521796499, [57] = FSCALE * 0.05784432087483846296, [58] = FSCALE * 0.05502322005640722902, [59] = FSCALE * 0.05233970594843239308, [60] = FSCALE * 0.04978706836786394297, [61] = FSCALE * 0.04735892439114092119, [62] = FSCALE * 0.04504920239355780606, [63] = FSCALE * 0.04285212686704017991, [64] = FSCALE * 0.04076220397836621516, [65] = FSCALE * 0.03877420783172200988, [66] = FSCALE * 0.03688316740124000544, [67] = FSCALE * 0.03508435410084502588, [68] = FSCALE * 0.03337326996032607948, [69] = FSCALE * 0.03174563637806794323, [70] = FSCALE * 0.03019738342231850073, [71] = FSCALE * 0.02872463965423942912, [72] = FSCALE * 0.02732372244729256080, [73] = FSCALE * 0.02599112877875534358, [74] = FSCALE * 0.02472352647033939120, [75] = FSCALE * 0.02351774585600910823, [76] = FSCALE * 0.02237077185616559577, [77] = FSCALE * 0.02127973643837716938, [78] = FSCALE * 0.02024191144580438847, [79] = FSCALE * 0.01925470177538692429, [80] = FSCALE * 0.01831563888873418029, [81] = FSCALE * 0.01742237463949351138, [82] = FSCALE * 0.01657267540176124754, [83] = FSCALE * 0.01576441648485449082, [84] = FSCALE * 0.01499557682047770621, [85] = FSCALE * 0.01426423390899925527, [86] = FSCALE * 0.01356855901220093175, [87] = FSCALE * 0.01290681258047986886, [88] = FSCALE * 0.01227733990306844117, 
[89] = FSCALE * 0.01167856697039544521, [90] = FSCALE * 0.01110899653824230649, [91] = FSCALE * 0.01056720438385265337, [92] = FSCALE * 0.01005183574463358164, [93] = FSCALE * 0.00956160193054350793, [94] = FSCALE * 0.00909527710169581709, [95] = FSCALE * 0.00865169520312063417, [96] = FSCALE * 0.00822974704902002884, [97] = FSCALE * 0.00782837754922577143, [98] = FSCALE * 0.00744658307092434051, [99] = FSCALE * 0.00708340892905212004, [100] = FSCALE * 0.00673794699908546709, [101] = FSCALE * 0.00640933344625638184, [102] = FSCALE * 0.00609674656551563610, [103] = FSCALE * 0.00579940472684214321, [104] = FSCALE * 0.00551656442076077241, [105] = FSCALE * 0.00524751839918138427, [106] = FSCALE * 0.00499159390691021621, [107] = FSCALE * 0.00474815099941147558, [108] = FSCALE * 0.00451658094261266798, [109] = FSCALE * 0.00429630469075234057, [110] = FSCALE * 0.00408677143846406699, }; #endif #define CCPU_EXP_MAX 110 /* * This function is analogous to the getpcpu() function in the ps(1) command. * They should both calculate in the same way so that the racct %cpu * calculations are consistent with the values shown by the ps(1) tool. * The calculations are more complex in the 4BSD scheduler because of the value * of the ccpu variable. In ULE it is defined to be zero, which saves us some * work. */ static uint64_t racct_getpcpu(struct proc *p, u_int pcpu) { u_int swtime; #ifdef SCHED_4BSD fixpt_t pctcpu, pctcpu_next; #endif #ifdef SMP struct pcpu *pc; int found; #endif fixpt_t p_pctcpu; struct thread *td; ASSERT_RACCT_ENABLED(); /* * If the process is swapped out, we count its %cpu usage as zero. * This behaviour is consistent with the userland ps(1) tool. */ if ((p->p_flag & P_INMEM) == 0) return (0); swtime = (ticks - p->p_swtick) / hz; /* * For short-lived processes, sched_pctcpu() returns small * values even for cpu intensive processes. Therefore we use * our own estimate in this case. */ if (swtime < RACCT_PCPU_SECS) return (pcpu); p_pctcpu = 0; FOREACH_THREAD_IN_PROC(p, td) { if (td == PCPU_GET(idlethread)) continue; #ifdef SMP found = 0; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (td == pc->pc_idlethread) { found = 1; break; } } if (found) continue; #endif thread_lock(td); #ifdef SCHED_4BSD pctcpu = sched_pctcpu(td); /* Count also the yet unfinished second. */ pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT; pctcpu_next += sched_pctcpu_delta(td); p_pctcpu += max(pctcpu, pctcpu_next); #else /* * In ULE the %cpu statistics are updated on every * sched_pctcpu() call. So special calculations to * account for the latest (unfinished) second are * not needed. */ p_pctcpu += sched_pctcpu(td); #endif thread_unlock(td); } #ifdef SCHED_4BSD if (swtime <= CCPU_EXP_MAX) return ((100 * (uint64_t)p_pctcpu * 1000000) / (FSCALE - ccpu_exp[swtime])); #endif return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE); } static void racct_add_racct(struct racct *dest, const struct racct *src) { int i; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * Update resource usage in dest. */ for (i = 0; i <= RACCT_MAX; i++) { KASSERT(dest->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: dest < 0", __func__, i)); KASSERT(src->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: src < 0", __func__, i)); dest->r_resources[i] += src->r_resources[i]; } } static void racct_sub_racct(struct racct *dest, const struct racct *src) { int i; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * Update resource usage in dest.
*/ for (i = 0; i <= RACCT_MAX; i++) { if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) { KASSERT(dest->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: dest < 0", __func__, i)); KASSERT(src->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: src < 0", __func__, i)); KASSERT(src->r_resources[i] <= dest->r_resources[i], ("%s: resource %d propagation meltdown: src > dest", __func__, i)); } if (RACCT_CAN_DROP(i)) { dest->r_resources[i] -= src->r_resources[i]; if (dest->r_resources[i] < 0) dest->r_resources[i] = 0; } } } void racct_create(struct racct **racctp) { if (!racct_enable) return; SDT_PROBE1(racct, , racct, create, racctp); KASSERT(*racctp == NULL, ("racct already allocated")); *racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO); } static void racct_destroy_locked(struct racct **racctp) { struct racct *racct; int i; ASSERT_RACCT_ENABLED(); SDT_PROBE1(racct, , racct, destroy, racctp); RACCT_LOCK_ASSERT(); KASSERT(racctp != NULL, ("NULL racctp")); KASSERT(*racctp != NULL, ("NULL racct")); racct = *racctp; for (i = 0; i <= RACCT_MAX; i++) { if (RACCT_IS_SLOPPY(i)) continue; if (!RACCT_IS_RECLAIMABLE(i)) continue; KASSERT(racct->r_resources[i] == 0, ("destroying non-empty racct: " "%ju allocated for resource %d\n", racct->r_resources[i], i)); } uma_zfree(racct_zone, racct); *racctp = NULL; } void racct_destroy(struct racct **racct) { if (!racct_enable) return; RACCT_LOCK(); racct_destroy_locked(racct); RACCT_UNLOCK(); } /* * Increase consumption of 'resource' by 'amount' for 'racct', * but not its parents. Unlike in other cases, 'amount' here * may be less than zero. */ static void racct_adjust_resource(struct racct *racct, int resource, int64_t amount) { ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); KASSERT(racct != NULL, ("NULL racct")); racct->r_resources[resource] += amount; if (racct->r_resources[resource] < 0) { KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource), ("%s: resource %d usage < 0", __func__, resource)); racct->r_resources[resource] = 0; } /* * There are some cases where the racct %cpu resource would grow * beyond 100% per core. For example in racct_proc_exit() we add * the process %cpu usage to the ucred racct containers. If too * many processes terminated in a short time span, the ucred %cpu * resource could grow too much. Also, the 4BSD scheduler sometimes * returns more than 100% cpu usage for a thread. So we set a sane * boundary here to 100% * the maximum number of CPUs. */ if ((resource == RACCT_PCTCPU) && (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000 * (int64_t)MAXCPU)) racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 * (int64_t)MAXCPU; } static int racct_add_locked(struct proc *p, int resource, uint64_t amount, int force) { #ifdef RCTL int error; #endif ASSERT_RACCT_ENABLED(); /* * We need proc lock to dereference p->p_ucred. */ PROC_LOCK_ASSERT(p, MA_OWNED); #ifdef RCTL error = rctl_enforce(p, resource, amount); if (error && !force && RACCT_IS_DENIABLE(resource)) { SDT_PROBE3(racct, , rusage, add__failure, p, resource, amount); return (error); } #endif racct_adjust_resource(p->p_racct, resource, amount); racct_add_cred_locked(p->p_ucred, resource, amount); return (0); } /* * Increase allocation of 'resource' by 'amount' for process 'p'. * Return 0 if it's below limits, or errno, if it's not.
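 * A hypothetical caller sketch (the resource and errno are picked for * illustration only): * *	error = racct_add(p, RACCT_NPTS, 1); *	if (error != 0) *		return (EAGAIN); * * with the caller mapping the generic error to one appropriate for the * syscall at hand.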
*/ int racct_add(struct proc *p, int resource, uint64_t amount) { int error; if (!racct_enable) return (0); SDT_PROBE3(racct, , rusage, add, p, resource, amount); RACCT_LOCK(); error = racct_add_locked(p, resource, amount, 0); RACCT_UNLOCK(); return (error); } /* * Increase allocation of 'resource' by 'amount' for process 'p'. * Doesn't check for limits and never fails. */ void racct_add_force(struct proc *p, int resource, uint64_t amount) { if (!racct_enable) return; SDT_PROBE3(racct, , rusage, add__force, p, resource, amount); RACCT_LOCK(); racct_add_locked(p, resource, amount, 1); RACCT_UNLOCK(); } static void racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount) { struct prison *pr; ASSERT_RACCT_ENABLED(); racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, amount); for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource, amount); racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, amount); } /* * Increase allocation of 'resource' by 'amount' for credential 'cred'. * Doesn't check for limits and never fails. */ void racct_add_cred(struct ucred *cred, int resource, uint64_t amount) { if (!racct_enable) return; SDT_PROBE3(racct, , rusage, add__cred, cred, resource, amount); RACCT_LOCK(); racct_add_cred_locked(cred, resource, amount); RACCT_UNLOCK(); } /* * Account for disk IO resource consumption. Checks for limits, * but never fails, due to disk limits being undeniable. */ void racct_add_buf(struct proc *p, const struct buf *bp, int is_write) { ASSERT_RACCT_ENABLED(); PROC_LOCK_ASSERT(p, MA_OWNED); SDT_PROBE3(racct, , rusage, add__buf, p, bp, is_write); RACCT_LOCK(); if (is_write) { racct_add_locked(curproc, RACCT_WRITEBPS, bp->b_bcount, 1); racct_add_locked(curproc, RACCT_WRITEIOPS, 1, 1); } else { racct_add_locked(curproc, RACCT_READBPS, bp->b_bcount, 1); racct_add_locked(curproc, RACCT_READIOPS, 1, 1); } RACCT_UNLOCK(); } static int racct_set_locked(struct proc *p, int resource, uint64_t amount, int force) { int64_t old_amount, decayed_amount, diff_proc, diff_cred; #ifdef RCTL int error; #endif ASSERT_RACCT_ENABLED(); /* * We need proc lock to dereference p->p_ucred. */ PROC_LOCK_ASSERT(p, MA_OWNED); old_amount = p->p_racct->r_resources[resource]; /* * The diffs may be negative. */ diff_proc = amount - old_amount; if (resource == RACCT_PCTCPU) { /* * Resources in per-credential racct containers may decay. * If this is the case, we need to calculate the difference * between the new amount and the proportional value of the * old amount that has decayed in the ucred racct containers. */ decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE; diff_cred = amount - decayed_amount; } else diff_cred = diff_proc; #ifdef notyet KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource), ("%s: usage of non-droppable resource %d dropping", __func__, resource)); #endif #ifdef RCTL if (diff_proc > 0) { error = rctl_enforce(p, resource, diff_proc); if (error && !force && RACCT_IS_DENIABLE(resource)) { SDT_PROBE3(racct, , rusage, set__failure, p, resource, amount); return (error); } } #endif racct_adjust_resource(p->p_racct, resource, diff_proc); if (diff_cred > 0) racct_add_cred_locked(p->p_ucred, resource, diff_cred); else if (diff_cred < 0) racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); return (0); } /* * Set allocation of 'resource' to 'amount' for process 'p'. * Return 0 if it's below limits, or errno, if it's not. 
* * Note that decreasing the allocation always returns 0, * even if it's above the limit. */ int racct_set(struct proc *p, int resource, uint64_t amount) { int error; if (!racct_enable) return (0); SDT_PROBE3(racct, , rusage, set, p, resource, amount); RACCT_LOCK(); error = racct_set_locked(p, resource, amount, 0); RACCT_UNLOCK(); return (error); } void racct_set_force(struct proc *p, int resource, uint64_t amount) { if (!racct_enable) return; SDT_PROBE3(racct, , rusage, set__force, p, resource, amount); RACCT_LOCK(); racct_set_locked(p, resource, amount, 1); RACCT_UNLOCK(); } /* * Returns amount of 'resource' the process 'p' can keep allocated. * Allocating more than that would be denied, unless the resource * is marked undeniable. Amount of already allocated resource does * not matter. */ uint64_t racct_get_limit(struct proc *p, int resource) { #ifdef RCTL uint64_t available; if (!racct_enable) return (UINT64_MAX); RACCT_LOCK(); available = rctl_get_limit(p, resource); RACCT_UNLOCK(); return (available); #else return (UINT64_MAX); #endif } /* * Returns amount of 'resource' the process 'p' can keep allocated. * Allocating more than that would be denied, unless the resource * is marked undeniable. Amount of already allocated resource does * matter. */ uint64_t racct_get_available(struct proc *p, int resource) { #ifdef RCTL uint64_t available; if (!racct_enable) return (UINT64_MAX); RACCT_LOCK(); available = rctl_get_available(p, resource); RACCT_UNLOCK(); return (available); #else return (UINT64_MAX); #endif } /* * Returns amount of the %cpu resource that process 'p' can add to its %cpu * utilization. Adding more than that would lead to the process being * throttled. */ static int64_t racct_pcpu_available(struct proc *p) { #ifdef RCTL uint64_t available; ASSERT_RACCT_ENABLED(); RACCT_LOCK(); available = rctl_pcpu_available(p); RACCT_UNLOCK(); return (available); #else return (INT64_MAX); #endif } /* * Decrease allocation of 'resource' by 'amount' for process 'p'. */ void racct_sub(struct proc *p, int resource, uint64_t amount) { if (!racct_enable) return; SDT_PROBE3(racct, , rusage, sub, p, resource, amount); /* * We need proc lock to dereference p->p_ucred. */ PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(RACCT_CAN_DROP(resource), ("%s: called for non-droppable resource %d", __func__, resource)); RACCT_LOCK(); KASSERT(amount <= p->p_racct->r_resources[resource], ("%s: freeing %ju of resource %d, which is more " "than allocated %jd for %s (pid %d)", __func__, amount, resource, (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid)); racct_adjust_resource(p->p_racct, resource, -amount); racct_sub_cred_locked(p->p_ucred, resource, amount); RACCT_UNLOCK(); } static void racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount) { struct prison *pr; ASSERT_RACCT_ENABLED(); racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, -amount); for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource, -amount); racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, -amount); } /* * Decrease allocation of 'resource' by 'amount' for credential 'cred'.
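 * This is the inverse of racct_add_cred(); a caller is expected to subtract * no more than it previously added (a usage note; the KASSERT below that * would enforce this is still under #ifdef notyet).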
*/ void racct_sub_cred(struct ucred *cred, int resource, uint64_t amount) { if (!racct_enable) return; SDT_PROBE3(racct, , rusage, sub__cred, cred, resource, amount); #ifdef notyet KASSERT(RACCT_CAN_DROP(resource), ("%s: called for resource %d which can not drop", __func__, resource)); #endif RACCT_LOCK(); racct_sub_cred_locked(cred, resource, amount); RACCT_UNLOCK(); } /* * Inherit resource usage information from the parent process. */ int racct_proc_fork(struct proc *parent, struct proc *child) { int i, error = 0; if (!racct_enable) return (0); /* * Create racct for the child process. */ racct_create(&child->p_racct); PROC_LOCK(parent); PROC_LOCK(child); RACCT_LOCK(); #ifdef RCTL error = rctl_proc_fork(parent, child); if (error != 0) goto out; #endif /* Init process cpu time. */ child->p_prev_runtime = 0; child->p_throttled = 0; /* * Inherit resource usage. */ for (i = 0; i <= RACCT_MAX; i++) { if (parent->p_racct->r_resources[i] == 0 || !RACCT_IS_INHERITABLE(i)) continue; error = racct_set_locked(child, i, parent->p_racct->r_resources[i], 0); if (error != 0) goto out; } error = racct_add_locked(child, RACCT_NPROC, 1, 0); error += racct_add_locked(child, RACCT_NTHR, 1, 0); out: RACCT_UNLOCK(); PROC_UNLOCK(child); PROC_UNLOCK(parent); if (error != 0) racct_proc_exit(child); return (error); } /* * Called at the end of fork1(), to handle rules that require the process * to be fully initialized. */ void racct_proc_fork_done(struct proc *child) { if (!racct_enable) return; PROC_LOCK_ASSERT(child, MA_OWNED); #ifdef RCTL RACCT_LOCK(); rctl_enforce(child, RACCT_NPROC, 0); rctl_enforce(child, RACCT_NTHR, 0); RACCT_UNLOCK(); #endif } void racct_proc_exit(struct proc *p) { struct timeval wallclock; uint64_t pct_estimate, pct, runtime; int i; if (!racct_enable) return; PROC_LOCK(p); /* * We don't need to calculate rux, proc_reap() has already done this. */ runtime = cputick2usec(p->p_rux.rux_runtime); #ifdef notyet KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime")); #else if (runtime < p->p_prev_runtime) runtime = p->p_prev_runtime; #endif microuptime(&wallclock); timevalsub(&wallclock, &p->p_stats->p_start); if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { pct_estimate = (1000000 * runtime * 100) / ((uint64_t)wallclock.tv_sec * 1000000 + wallclock.tv_usec); } else pct_estimate = 0; pct = racct_getpcpu(p, pct_estimate); RACCT_LOCK(); racct_set_locked(p, RACCT_CPU, runtime, 0); racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct); KASSERT(p->p_racct->r_resources[RACCT_RSS] == 0, ("process reaped with %ju allocated for RSS\n", p->p_racct->r_resources[RACCT_RSS])); for (i = 0; i <= RACCT_MAX; i++) { if (p->p_racct->r_resources[i] == 0) continue; if (!RACCT_IS_RECLAIMABLE(i)) continue; racct_set_locked(p, i, 0, 0); } #ifdef RCTL rctl_racct_release(p->p_racct); #endif racct_destroy_locked(&p->p_racct); RACCT_UNLOCK(); PROC_UNLOCK(p); } /* * Called after credentials change, to move resource utilisation * between raccts. 
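 * For example (an illustrative scenario), when a process setuid()s from * uid A to uid B, its usage is subtracted from A's uidinfo racct and added * to B's; the same applies to the login class and the prison chain when * those change.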
*/ void racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred, struct ucred *newcred) { struct uidinfo *olduip, *newuip; struct loginclass *oldlc, *newlc; struct prison *oldpr, *newpr, *pr; if (!racct_enable) return; PROC_LOCK_ASSERT(p, MA_NOTOWNED); newuip = newcred->cr_ruidinfo; olduip = oldcred->cr_ruidinfo; newlc = newcred->cr_loginclass; oldlc = oldcred->cr_loginclass; newpr = newcred->cr_prison; oldpr = oldcred->cr_prison; RACCT_LOCK(); if (newuip != olduip) { racct_sub_racct(olduip->ui_racct, p->p_racct); racct_add_racct(newuip->ui_racct, p->p_racct); } if (newlc != oldlc) { racct_sub_racct(oldlc->lc_racct, p->p_racct); racct_add_racct(newlc->lc_racct, p->p_racct); } if (newpr != oldpr) { for (pr = oldpr; pr != NULL; pr = pr->pr_parent) racct_sub_racct(pr->pr_prison_racct->prr_racct, p->p_racct); for (pr = newpr; pr != NULL; pr = pr->pr_parent) racct_add_racct(pr->pr_prison_racct->prr_racct, p->p_racct); } RACCT_UNLOCK(); #ifdef RCTL rctl_proc_ucred_changed(p, newcred); #endif } void racct_move(struct racct *dest, struct racct *src) { ASSERT_RACCT_ENABLED(); RACCT_LOCK(); racct_add_racct(dest, src); racct_sub_racct(src, src); RACCT_UNLOCK(); } /* * Make the process sleep in userret() for 'timeout' ticks. Setting * timeout to -1 makes it sleep until woken up by racct_proc_wakeup(). */ void racct_proc_throttle(struct proc *p, int timeout) { struct thread *td; #ifdef SMP int cpuid; #endif KASSERT(timeout != 0, ("timeout %d", timeout)); ASSERT_RACCT_ENABLED(); PROC_LOCK_ASSERT(p, MA_OWNED); /* * Do not block kernel processes. Also do not block processes with * low %cpu utilization to improve interactivity. */ if ((p->p_flag & (P_SYSTEM | P_KPROC)) != 0) return; if (p->p_throttled < 0 || (timeout > 0 && p->p_throttled > timeout)) return; p->p_throttled = timeout; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); switch (td->td_state) { case TDS_RUNQ: /* * If the thread is on the scheduler run-queue, we cannot * just remove it from there. So we set the * TDF_NEEDRESCHED flag for the thread, so that once it is * running, it is taken off the cpu as soon as possible. */ td->td_flags |= TDF_NEEDRESCHED; break; case TDS_RUNNING: /* * If the thread is running, we request a context * switch for it by setting the TDF_NEEDRESCHED flag. */ td->td_flags |= TDF_NEEDRESCHED; #ifdef SMP cpuid = td->td_oncpu; if ((cpuid != NOCPU) && (td != curthread)) ipi_cpu(cpuid, IPI_AST); #endif break; default: break; } thread_unlock(td); } } static void racct_proc_wakeup(struct proc *p) { ASSERT_RACCT_ENABLED(); PROC_LOCK_ASSERT(p, MA_OWNED); if (p->p_throttled != 0) { p->p_throttled = 0; wakeup(p->p_racct); } } static void racct_decay_callback(struct racct *racct, void *dummy1, void *dummy2) { int64_t r_old, r_new; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); #ifdef RCTL rctl_throttle_decay(racct, RACCT_READBPS); rctl_throttle_decay(racct, RACCT_WRITEBPS); rctl_throttle_decay(racct, RACCT_READIOPS); rctl_throttle_decay(racct, RACCT_WRITEIOPS); #endif r_old = racct->r_resources[RACCT_PCTCPU]; /* If there is nothing to decay, just exit.
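 * Otherwise the value is multiplied by RACCT_DECAY_FACTOR (0.3 of FSCALE), * so each pass keeps 30% of the previous %cpu reading; e.g. a hypothetical * reading of 100% decays to 30%, then 9%, and so on.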
*/ if (r_old <= 0) return; r_new = r_old * RACCT_DECAY_FACTOR / FSCALE; racct->r_resources[RACCT_PCTCPU] = r_new; } static void racct_decay_pre(void) { RACCT_LOCK(); } static void racct_decay_post(void) { RACCT_UNLOCK(); } static void racct_decay(void) { ASSERT_RACCT_ENABLED(); ui_racct_foreach(racct_decay_callback, racct_decay_pre, racct_decay_post, NULL, NULL); loginclass_racct_foreach(racct_decay_callback, racct_decay_pre, racct_decay_post, NULL, NULL); prison_racct_foreach(racct_decay_callback, racct_decay_pre, racct_decay_post, NULL, NULL); } static void racctd(void) { struct thread *td; struct proc *p; struct timeval wallclock; uint64_t pct, pct_estimate, runtime; ASSERT_RACCT_ENABLED(); for (;;) { racct_decay(); sx_slock(&allproc_lock); LIST_FOREACH(p, &zombproc, p_list) { PROC_LOCK(p); racct_set(p, RACCT_PCTCPU, 0); PROC_UNLOCK(p); } FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state != PRS_NORMAL) { PROC_UNLOCK(p); continue; } microuptime(&wallclock); timevalsub(&wallclock, &p->p_stats->p_start); PROC_STATLOCK(p); FOREACH_THREAD_IN_PROC(p, td) ruxagg(p, td); runtime = cputick2usec(p->p_rux.rux_runtime); PROC_STATUNLOCK(p); #ifdef notyet KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime")); #else if (runtime < p->p_prev_runtime) runtime = p->p_prev_runtime; #endif p->p_prev_runtime = runtime; if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { pct_estimate = (1000000 * runtime * 100) / ((uint64_t)wallclock.tv_sec * 1000000 + wallclock.tv_usec); } else pct_estimate = 0; pct = racct_getpcpu(p, pct_estimate); RACCT_LOCK(); #ifdef RCTL rctl_throttle_decay(p->p_racct, RACCT_READBPS); rctl_throttle_decay(p->p_racct, RACCT_WRITEBPS); rctl_throttle_decay(p->p_racct, RACCT_READIOPS); rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS); #endif racct_set_locked(p, RACCT_PCTCPU, pct, 1); racct_set_locked(p, RACCT_CPU, runtime, 0); racct_set_locked(p, RACCT_WALLCLOCK, (uint64_t)wallclock.tv_sec * 1000000 + wallclock.tv_usec, 0); RACCT_UNLOCK(); PROC_UNLOCK(p); } /* * To ensure that processes are throttled in a fair way, we need * to iterate over all processes again and check the limits * for %cpu resource only after ucred racct containers have been * properly filled. */ FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state != PRS_NORMAL) { PROC_UNLOCK(p); continue; } if (racct_pcpu_available(p) <= 0) { if (p->p_racct->r_resources[RACCT_PCTCPU] > pcpu_threshold) racct_proc_throttle(p, -1); } else if (p->p_throttled == -1) { racct_proc_wakeup(p); } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); pause("-", hz); } } static struct kproc_desc racctd_kp = { "racctd", racctd, NULL }; static void racctd_init(void) { if (!racct_enable) return; kproc_start(&racctd_kp); } SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, racctd_init, NULL); static void racct_init(void) { if (!racct_enable) return; racct_zone = uma_zcreate("racct", sizeof(struct racct), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* * XXX: Move this somewhere. */ prison0.pr_prison_racct = prison_racct_find("0"); } SYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL); #endif /* !RACCT */ Index: head/sys/kern/kern_rangelock.c =================================================================== --- head/sys/kern/kern_rangelock.c (revision 326270) +++ head/sys/kern/kern_rangelock.c (revision 326271) @@ -1,248 +1,250 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2009 Konstantin Belousov * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include struct rl_q_entry { TAILQ_ENTRY(rl_q_entry) rl_q_link; off_t rl_q_start, rl_q_end; int rl_q_flags; }; static uma_zone_t rl_entry_zone; static void rangelock_sys_init(void) { rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } SYSINIT(vfs, SI_SUB_LOCK, SI_ORDER_ANY, rangelock_sys_init, NULL); static struct rl_q_entry * rlqentry_alloc(void) { return (uma_zalloc(rl_entry_zone, M_WAITOK)); } void rlqentry_free(struct rl_q_entry *rleq) { uma_zfree(rl_entry_zone, rleq); } void rangelock_init(struct rangelock *lock) { TAILQ_INIT(&lock->rl_waiters); lock->rl_currdep = NULL; } void rangelock_destroy(struct rangelock *lock) { KASSERT(TAILQ_EMPTY(&lock->rl_waiters), ("Dangling waiters")); } /* * Two entries are compatible if their ranges do not overlap, or both * entries are for read. */ static int ranges_overlap(const struct rl_q_entry *e1, const struct rl_q_entry *e2) { if (e1->rl_q_start < e2->rl_q_end && e1->rl_q_end > e2->rl_q_start) return (1); return (0); } /* * Recalculate the lock->rl_currdep after an unlock. */ static void rangelock_calc_block(struct rangelock *lock) { struct rl_q_entry *entry, *nextentry, *entry1; for (entry = lock->rl_currdep; entry != NULL; entry = nextentry) { nextentry = TAILQ_NEXT(entry, rl_q_link); if (entry->rl_q_flags & RL_LOCK_READ) { /* Reads must not overlap with granted writes. */ for (entry1 = TAILQ_FIRST(&lock->rl_waiters); !(entry1->rl_q_flags & RL_LOCK_READ); entry1 = TAILQ_NEXT(entry1, rl_q_link)) { if (ranges_overlap(entry, entry1)) goto out; } } else { /* Write must not overlap with any granted locks. */ for (entry1 = TAILQ_FIRST(&lock->rl_waiters); entry1 != entry; entry1 = TAILQ_NEXT(entry1, rl_q_link)) { if (ranges_overlap(entry, entry1)) goto out; } /* Move grantable write locks to the front. */ TAILQ_REMOVE(&lock->rl_waiters, entry, rl_q_link); TAILQ_INSERT_HEAD(&lock->rl_waiters, entry, rl_q_link); } /* Grant this lock. 
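 * Granting sets RL_LOCK_GRANTED on the entry and wakes up the thread * sleeping on it in rangelock_enqueue().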
*/ entry->rl_q_flags |= RL_LOCK_GRANTED; wakeup(entry); } out: lock->rl_currdep = entry; } static void rangelock_unlock_locked(struct rangelock *lock, struct rl_q_entry *entry, struct mtx *ilk) { MPASS(lock != NULL && entry != NULL && ilk != NULL); mtx_assert(ilk, MA_OWNED); KASSERT(entry != lock->rl_currdep, ("stuck currdep")); TAILQ_REMOVE(&lock->rl_waiters, entry, rl_q_link); rangelock_calc_block(lock); mtx_unlock(ilk); if (curthread->td_rlqe == NULL) curthread->td_rlqe = entry; else rlqentry_free(entry); } void rangelock_unlock(struct rangelock *lock, void *cookie, struct mtx *ilk) { MPASS(lock != NULL && cookie != NULL && ilk != NULL); mtx_lock(ilk); rangelock_unlock_locked(lock, cookie, ilk); } /* * Unlock a sub-range of the granted lock. */ void * rangelock_unlock_range(struct rangelock *lock, void *cookie, off_t start, off_t end, struct mtx *ilk) { struct rl_q_entry *entry; MPASS(lock != NULL && cookie != NULL && ilk != NULL); entry = cookie; KASSERT(entry->rl_q_flags & RL_LOCK_GRANTED, ("Unlocking non-granted lock")); KASSERT(entry->rl_q_start == start, ("wrong start")); KASSERT(entry->rl_q_end >= end, ("wrong end")); mtx_lock(ilk); if (entry->rl_q_end == end) { rangelock_unlock_locked(lock, cookie, ilk); return (NULL); } entry->rl_q_end = end; rangelock_calc_block(lock); mtx_unlock(ilk); return (cookie); } /* * Add the lock request to the queue of pending requests for the * rangelock. Sleep until the request can be granted. */ static void * rangelock_enqueue(struct rangelock *lock, off_t start, off_t end, int mode, struct mtx *ilk) { struct rl_q_entry *entry; struct thread *td; MPASS(lock != NULL && ilk != NULL); td = curthread; if (td->td_rlqe != NULL) { entry = td->td_rlqe; td->td_rlqe = NULL; } else entry = rlqentry_alloc(); MPASS(entry != NULL); entry->rl_q_flags = mode; entry->rl_q_start = start; entry->rl_q_end = end; mtx_lock(ilk); /* * XXXKIB TODO. Check that a thread does not try to enqueue a * lock that is incompatible with another request from the same * thread. */ TAILQ_INSERT_TAIL(&lock->rl_waiters, entry, rl_q_link); if (lock->rl_currdep == NULL) lock->rl_currdep = entry; rangelock_calc_block(lock); while (!(entry->rl_q_flags & RL_LOCK_GRANTED)) msleep(entry, ilk, 0, "range", 0); mtx_unlock(ilk); return (entry); } void * rangelock_rlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk) { return (rangelock_enqueue(lock, start, end, RL_LOCK_READ, ilk)); } void * rangelock_wlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk) { return (rangelock_enqueue(lock, start, end, RL_LOCK_WRITE, ilk)); } Index: head/sys/kern/kern_rctl.c =================================================================== --- head/sys/kern/kern_rctl.c (revision 326270) +++ head/sys/kern/kern_rctl.c (revision 326271) @@ -1,2233 +1,2235 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RCTL #ifndef RACCT #error "The RCTL option requires the RACCT option" #endif FEATURE(rctl, "Resource Limits"); #define HRF_DEFAULT 0 #define HRF_DONT_INHERIT 1 #define HRF_DONT_ACCUMULATE 2 #define RCTL_MAX_INBUFSIZE 4 * 1024 #define RCTL_MAX_OUTBUFSIZE 16 * 1024 * 1024 #define RCTL_LOG_BUFSIZE 128 #define RCTL_PCPU_SHIFT (10 * 1000000) static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE; static int rctl_log_rate_limit = 10; static int rctl_devctl_rate_limit = 10; /* * Values below are initialized in rctl_init(). */ static int rctl_throttle_min = -1; static int rctl_throttle_max = -1; static int rctl_throttle_pct = -1; static int rctl_throttle_pct2 = -1; static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS); static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS); static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS); static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW, 0, "Resource Limits"); SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN, &rctl_maxbufsize, 0, "Maximum output buffer size"); SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW, &rctl_log_rate_limit, 0, "Maximum number of log messages per second"); SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RWTUN, &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second"); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_min, CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_min_sysctl, "IU", "Shortest throttling duration, in hz"); TUNABLE_INT("kern.racct.rctl.throttle_min", &rctl_throttle_min); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_max, CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_max_sysctl, "IU", "Longest throttling duration, in hz"); TUNABLE_INT("kern.racct.rctl.throttle_max", &rctl_throttle_max); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct, CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct_sysctl, "IU", "Throttling penalty for process consumption, in percent"); TUNABLE_INT("kern.racct.rctl.throttle_pct", &rctl_throttle_pct); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct2, CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct2_sysctl, "IU", "Throttling penalty for container consumption, in percent"); TUNABLE_INT("kern.racct.rctl.throttle_pct2", &rctl_throttle_pct2); /* * 'rctl_rule_link' connects a rule with every racct it's related to. 
* For example, rule 'user:X:openfiles:deny=N/process' is linked * with uidinfo for user X, and to each process of that user. */ struct rctl_rule_link { LIST_ENTRY(rctl_rule_link) rrl_next; struct rctl_rule *rrl_rule; int rrl_exceeded; }; struct dict { const char *d_name; int d_value; }; static struct dict subjectnames[] = { { "process", RCTL_SUBJECT_TYPE_PROCESS }, { "user", RCTL_SUBJECT_TYPE_USER }, { "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS }, { "jail", RCTL_SUBJECT_TYPE_JAIL }, { NULL, -1 }}; static struct dict resourcenames[] = { { "cputime", RACCT_CPU }, { "datasize", RACCT_DATA }, { "stacksize", RACCT_STACK }, { "coredumpsize", RACCT_CORE }, { "memoryuse", RACCT_RSS }, { "memorylocked", RACCT_MEMLOCK }, { "maxproc", RACCT_NPROC }, { "openfiles", RACCT_NOFILE }, { "vmemoryuse", RACCT_VMEM }, { "pseudoterminals", RACCT_NPTS }, { "swapuse", RACCT_SWAP }, { "nthr", RACCT_NTHR }, { "msgqqueued", RACCT_MSGQQUEUED }, { "msgqsize", RACCT_MSGQSIZE }, { "nmsgq", RACCT_NMSGQ }, { "nsem", RACCT_NSEM }, { "nsemop", RACCT_NSEMOP }, { "nshm", RACCT_NSHM }, { "shmsize", RACCT_SHMSIZE }, { "wallclock", RACCT_WALLCLOCK }, { "pcpu", RACCT_PCTCPU }, { "readbps", RACCT_READBPS }, { "writebps", RACCT_WRITEBPS }, { "readiops", RACCT_READIOPS }, { "writeiops", RACCT_WRITEIOPS }, { NULL, -1 }}; static struct dict actionnames[] = { { "sighup", RCTL_ACTION_SIGHUP }, { "sigint", RCTL_ACTION_SIGINT }, { "sigquit", RCTL_ACTION_SIGQUIT }, { "sigill", RCTL_ACTION_SIGILL }, { "sigtrap", RCTL_ACTION_SIGTRAP }, { "sigabrt", RCTL_ACTION_SIGABRT }, { "sigemt", RCTL_ACTION_SIGEMT }, { "sigfpe", RCTL_ACTION_SIGFPE }, { "sigkill", RCTL_ACTION_SIGKILL }, { "sigbus", RCTL_ACTION_SIGBUS }, { "sigsegv", RCTL_ACTION_SIGSEGV }, { "sigsys", RCTL_ACTION_SIGSYS }, { "sigpipe", RCTL_ACTION_SIGPIPE }, { "sigalrm", RCTL_ACTION_SIGALRM }, { "sigterm", RCTL_ACTION_SIGTERM }, { "sigurg", RCTL_ACTION_SIGURG }, { "sigstop", RCTL_ACTION_SIGSTOP }, { "sigtstp", RCTL_ACTION_SIGTSTP }, { "sigchld", RCTL_ACTION_SIGCHLD }, { "sigttin", RCTL_ACTION_SIGTTIN }, { "sigttou", RCTL_ACTION_SIGTTOU }, { "sigio", RCTL_ACTION_SIGIO }, { "sigxcpu", RCTL_ACTION_SIGXCPU }, { "sigxfsz", RCTL_ACTION_SIGXFSZ }, { "sigvtalrm", RCTL_ACTION_SIGVTALRM }, { "sigprof", RCTL_ACTION_SIGPROF }, { "sigwinch", RCTL_ACTION_SIGWINCH }, { "siginfo", RCTL_ACTION_SIGINFO }, { "sigusr1", RCTL_ACTION_SIGUSR1 }, { "sigusr2", RCTL_ACTION_SIGUSR2 }, { "sigthr", RCTL_ACTION_SIGTHR }, { "deny", RCTL_ACTION_DENY }, { "log", RCTL_ACTION_LOG }, { "devctl", RCTL_ACTION_DEVCTL }, { "throttle", RCTL_ACTION_THROTTLE }, { NULL, -1 }}; static void rctl_init(void); SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL); static uma_zone_t rctl_rule_zone; static uma_zone_t rctl_rule_link_zone; static int rctl_rule_fully_specified(const struct rctl_rule *rule); static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule); static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits"); static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_min; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < 1 || val > rctl_throttle_max) return (EINVAL); RACCT_LOCK(); rctl_throttle_min = val; RACCT_UNLOCK(); return (0); } static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_max; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < rctl_throttle_min) return (EINVAL); RACCT_LOCK(); rctl_throttle_max = val; RACCT_UNLOCK(); return 
(0); } static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_pct; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < 0) return (EINVAL); RACCT_LOCK(); rctl_throttle_pct = val; RACCT_UNLOCK(); return (0); } static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_pct2; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < 0) return (EINVAL); RACCT_LOCK(); rctl_throttle_pct2 = val; RACCT_UNLOCK(); return (0); } static const char * rctl_subject_type_name(int subject) { int i; for (i = 0; subjectnames[i].d_name != NULL; i++) { if (subjectnames[i].d_value == subject) return (subjectnames[i].d_name); } panic("rctl_subject_type_name: unknown subject type %d", subject); } static const char * rctl_action_name(int action) { int i; for (i = 0; actionnames[i].d_name != NULL; i++) { if (actionnames[i].d_value == action) return (actionnames[i].d_name); } panic("rctl_action_name: unknown action %d", action); } const char * rctl_resource_name(int resource) { int i; for (i = 0; resourcenames[i].d_name != NULL; i++) { if (resourcenames[i].d_value == resource) return (resourcenames[i].d_name); } panic("rctl_resource_name: unknown resource %d", resource); } static struct racct * rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule) { struct ucred *cred = p->p_ucred; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); switch (rule->rr_per) { case RCTL_SUBJECT_TYPE_PROCESS: return (p->p_racct); case RCTL_SUBJECT_TYPE_USER: return (cred->cr_ruidinfo->ui_racct); case RCTL_SUBJECT_TYPE_LOGINCLASS: return (cred->cr_loginclass->lc_racct); case RCTL_SUBJECT_TYPE_JAIL: return (cred->cr_prison->pr_prison_racct->prr_racct); default: panic("%s: unknown per %d", __func__, rule->rr_per); } } /* * Return the amount of resource that can be allocated by 'p' before * hitting 'rule'. */ static int64_t rctl_available_resource(const struct proc *p, const struct rctl_rule *rule) { const struct racct *racct; int64_t available; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); racct = rctl_proc_rule_to_racct(p, rule); available = rule->rr_amount - racct->r_resources[rule->rr_resource]; return (available); } /* * Called every second for proc, uidinfo, loginclass, and jail containers. * If the limit isn't exceeded, it decreases the usage amount to zero. * Otherwise, it decreases it by the value of the limit. This way * resource consumption exceeding the limit "carries over" to the next * period. */ void rctl_throttle_decay(struct racct *racct, int resource) { struct rctl_rule *rule; struct rctl_rule_link *link; int64_t minavailable; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); minavailable = INT64_MAX; LIST_FOREACH(link, &racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; if (rule->rr_action != RCTL_ACTION_THROTTLE) continue; if (rule->rr_amount < minavailable) minavailable = rule->rr_amount; } if (racct->r_resources[resource] < minavailable) { racct->r_resources[resource] = 0; } else { /* * Cap utilization counter at ten times the limit. Otherwise, * if we changed the rule lowering the allowed amount, it could * take an unreasonably long time for the accumulated resource * usage to drop. */ if (racct->r_resources[resource] > minavailable * 10) racct->r_resources[resource] = minavailable * 10; racct->r_resources[resource] -= minavailable; } } /* * Special version of rctl_get_available() for the %CPU resource.
* We slightly cheat here and return less than we normally would. */ int64_t rctl_pcpu_available(const struct proc *p) { struct rctl_rule *rule; struct rctl_rule_link *link; int64_t available, minavailable, limit; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); minavailable = INT64_MAX; limit = 0; LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != RACCT_PCTCPU) continue; if (rule->rr_action != RCTL_ACTION_DENY) continue; available = rctl_available_resource(p, rule); if (available < minavailable) { minavailable = available; limit = rule->rr_amount; } } /* * Return slightly less than actual value of the available * %cpu resource. This makes %cpu throttling more aggressive * and lets us act before the limits are actually exceeded. */ if (limit != 0) { if (limit > 2 * RCTL_PCPU_SHIFT) minavailable -= RCTL_PCPU_SHIFT; else minavailable -= (limit / 2); } return (minavailable); } static uint64_t xadd(uint64_t a, uint64_t b) { uint64_t c; c = a + b; /* * Detect overflow. */ if (c < a || c < b) return (UINT64_MAX); return (c); } static uint64_t xmul(uint64_t a, uint64_t b) { if (b != 0 && a > UINT64_MAX / b) return (UINT64_MAX); return (a * b); } /* * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition * to what it keeps allocated now. Returns non-zero if the allocation should * be denied, 0 otherwise. */ int rctl_enforce(struct proc *p, int resource, uint64_t amount) { static struct timeval log_lasttime, devctl_lasttime; static int log_curtime = 0, devctl_curtime = 0; struct rctl_rule *rule; struct rctl_rule_link *link; struct sbuf sb; char *buf; int64_t available; uint64_t sleep_ms, sleep_ratio; int should_deny = 0; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * There may be more than one matching rule; go through all of them. * Denial should be done last, after logging and sending signals. */ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; available = rctl_available_resource(p, rule); if (available >= (int64_t)amount) { link->rrl_exceeded = 0; continue; } switch (rule->rr_action) { case RCTL_ACTION_DENY: should_deny = 1; continue; case RCTL_ACTION_LOG: /* * If rrl_exceeded != 0, it means we've already * logged a warning for this process. */ if (link->rrl_exceeded != 0) continue; /* * If the process state is not fully initialized yet, * we can't access most of the required fields, e.g. * p->p_comm. This happens when called from fork1(). * Ignore this rule for now; it will be processed just * after fork, when called from racct_proc_fork_done().
*/ if (p->p_state != PRS_NORMAL) continue; if (!ppsratecheck(&log_lasttime, &log_curtime, rctl_log_rate_limit)) continue; buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT); if (buf == NULL) { printf("rctl_enforce: out of memory\n"); continue; } sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN); rctl_rule_to_sbuf(&sb, rule); sbuf_finish(&sb); printf("rctl: rule \"%s\" matched by pid %d " "(%s), uid %d, jail %s\n", sbuf_data(&sb), p->p_pid, p->p_comm, p->p_ucred->cr_uid, p->p_ucred->cr_prison->pr_prison_racct->prr_name); sbuf_delete(&sb); free(buf, M_RCTL); link->rrl_exceeded = 1; continue; case RCTL_ACTION_DEVCTL: if (link->rrl_exceeded != 0) continue; if (p->p_state != PRS_NORMAL) continue; if (!ppsratecheck(&devctl_lasttime, &devctl_curtime, rctl_devctl_rate_limit)) continue; buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT); if (buf == NULL) { printf("rctl_enforce: out of memory\n"); continue; } sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN); sbuf_printf(&sb, "rule="); rctl_rule_to_sbuf(&sb, rule); sbuf_printf(&sb, " pid=%d ruid=%d jail=%s", p->p_pid, p->p_ucred->cr_ruid, p->p_ucred->cr_prison->pr_prison_racct->prr_name); sbuf_finish(&sb); devctl_notify_f("RCTL", "rule", "matched", sbuf_data(&sb), M_NOWAIT); sbuf_delete(&sb); free(buf, M_RCTL); link->rrl_exceeded = 1; continue; case RCTL_ACTION_THROTTLE: if (p->p_state != PRS_NORMAL) continue; /* * Make the process sleep for a fraction of a second, * proportional to the ratio of the process' resource * utilization compared to the limit. The point is * to penalize resource hogs: processes that consume * more of the available resources sleep for longer. * * We're trying to defer division until the very end, * to minimize the rounding effects. The following * calculation could have been written in a clearer * way like this: * * sleep_ms = hz * p->p_racct->r_resources[resource] / * rule->rr_amount; * sleep_ms *= rctl_throttle_pct / 100; * if (sleep_ms < rctl_throttle_min) * sleep_ms = rctl_throttle_min; * */ sleep_ms = xmul(hz, p->p_racct->r_resources[resource]); sleep_ms = xmul(sleep_ms, rctl_throttle_pct) / 100; if (sleep_ms < rctl_throttle_min * rule->rr_amount) sleep_ms = rctl_throttle_min * rule->rr_amount; /* * Multiply that by the ratio of the resource * consumption for the container compared to the limit, * squared. In other words, a process in a container * that is two times over the limit will be throttled * four times as much for hitting the same rule. The * point is to penalize processes more if the container * itself (e.g. a certain UID or jail) is above the limit. */ if (available < 0) sleep_ratio = -available / rule->rr_amount; else sleep_ratio = 0; sleep_ratio = xmul(sleep_ratio, sleep_ratio); sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100; sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio)); /* * Finally the division.
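 * A worked example with hypothetical numbers: hz = 1000, * rctl_throttle_pct = 100, process usage at twice rr_amount and no * container penalty (sleep_ratio = 0) yield sleep_ms = 2 * hz ticks after * the division, subject to the rctl_throttle_max clamp below.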
*/ sleep_ms /= rule->rr_amount; if (sleep_ms > rctl_throttle_max) sleep_ms = rctl_throttle_max; #if 0 printf("%s: pid %d (%s), %jd of %jd, will sleep for %ju ms (ratio %ju, available %jd)\n", __func__, p->p_pid, p->p_comm, p->p_racct->r_resources[resource], rule->rr_amount, (uintmax_t)sleep_ms, (uintmax_t)sleep_ratio, (intmax_t)available); #endif KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n", __func__, (uintmax_t)sleep_ms, rctl_throttle_min)); racct_proc_throttle(p, sleep_ms); continue; default: if (link->rrl_exceeded != 0) continue; if (p->p_state != PRS_NORMAL) continue; KASSERT(rule->rr_action > 0 && rule->rr_action <= RCTL_ACTION_SIGNAL_MAX, ("rctl_enforce: unknown action %d", rule->rr_action)); /* * We're using the fact that RCTL_ACTION_SIG* values * are equal to their counterparts from sys/signal.h. */ kern_psignal(p, rule->rr_action); link->rrl_exceeded = 1; continue; } } if (should_deny) { /* * Return fake error code; the caller should change it * into one proper for the situation - EFBIG, ENOMEM etc. */ return (EDOOFUS); } return (0); } uint64_t rctl_get_limit(struct proc *p, int resource) { struct rctl_rule *rule; struct rctl_rule_link *link; uint64_t amount = UINT64_MAX; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * There may be more than one matching rule; go through all of them. * Denial should be done last, after logging and sending signals. */ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; if (rule->rr_action != RCTL_ACTION_DENY) continue; if (rule->rr_amount < amount) amount = rule->rr_amount; } return (amount); } uint64_t rctl_get_available(struct proc *p, int resource) { struct rctl_rule *rule; struct rctl_rule_link *link; int64_t available, minavailable, allocated; minavailable = INT64_MAX; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * There may be more than one matching rule; go through all of them. * Denial should be done last, after logging and sending signals. */ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; if (rule->rr_action != RCTL_ACTION_DENY) continue; available = rctl_available_resource(p, rule); if (available < minavailable) minavailable = available; } /* * XXX: Think about this _hard_.
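 * What the code below computes: the headroom remaining under the strictest * deny rule plus the process' current allocation, guarded against int64 * overflow and clamped to be non-negative.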
*/ allocated = p->p_racct->r_resources[resource]; if (minavailable < INT64_MAX - allocated) minavailable += allocated; if (minavailable < 0) minavailable = 0; return (minavailable); } static int rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter) { ASSERT_RACCT_ENABLED(); if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) { if (rule->rr_subject_type != filter->rr_subject_type) return (0); switch (filter->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: if (filter->rr_subject.rs_proc != NULL && rule->rr_subject.rs_proc != filter->rr_subject.rs_proc) return (0); break; case RCTL_SUBJECT_TYPE_USER: if (filter->rr_subject.rs_uip != NULL && rule->rr_subject.rs_uip != filter->rr_subject.rs_uip) return (0); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (filter->rr_subject.rs_loginclass != NULL && rule->rr_subject.rs_loginclass != filter->rr_subject.rs_loginclass) return (0); break; case RCTL_SUBJECT_TYPE_JAIL: if (filter->rr_subject.rs_prison_racct != NULL && rule->rr_subject.rs_prison_racct != filter->rr_subject.rs_prison_racct) return (0); break; default: panic("rctl_rule_matches: unknown subject type %d", filter->rr_subject_type); } } if (filter->rr_resource != RACCT_UNDEFINED) { if (rule->rr_resource != filter->rr_resource) return (0); } if (filter->rr_action != RCTL_ACTION_UNDEFINED) { if (rule->rr_action != filter->rr_action) return (0); } if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) { if (rule->rr_amount != filter->rr_amount) return (0); } if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) { if (rule->rr_per != filter->rr_per) return (0); } return (1); } static int str2value(const char *str, int *value, struct dict *table) { int i; if (value == NULL) return (EINVAL); for (i = 0; table[i].d_name != NULL; i++) { if (strcasecmp(table[i].d_name, str) == 0) { *value = table[i].d_value; return (0); } } return (EINVAL); } static int str2id(const char *str, id_t *value) { char *end; if (str == NULL) return (EINVAL); *value = strtoul(str, &end, 10); if ((size_t)(end - str) != strlen(str)) return (EINVAL); return (0); } static int str2int64(const char *str, int64_t *value) { char *end; if (str == NULL) return (EINVAL); *value = strtoul(str, &end, 10); if ((size_t)(end - str) != strlen(str)) return (EINVAL); if (*value < 0) return (ERANGE); return (0); } /* * Connect the rule to the racct, increasing refcount for the rule. */ static void rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule) { struct rctl_rule_link *link; ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); rctl_rule_acquire(rule); link = uma_zalloc(rctl_rule_link_zone, M_WAITOK); link->rrl_rule = rule; link->rrl_exceeded = 0; RACCT_LOCK(); LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next); RACCT_UNLOCK(); } static int rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule) { struct rctl_rule_link *link; ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); RACCT_LOCK_ASSERT(); link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT); if (link == NULL) return (ENOMEM); rctl_rule_acquire(rule); link->rrl_rule = rule; link->rrl_exceeded = 0; LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next); return (0); } /* * Remove limits for all rules matching the filter and release * the refcounts for the rules, possibly freeing them. Returns * the number of limit structures removed.
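 * For instance, a filter parsed from the rctl(8)-style string "user:1001:" * (an illustrative value) matches every rule this racct carries on behalf * of uid 1001, so all of them would be removed.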
*/ static int rctl_racct_remove_rules(struct racct *racct, const struct rctl_rule *filter) { struct rctl_rule_link *link, *linktmp; int removed = 0; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) { if (!rctl_rule_matches(link->rrl_rule, filter)) continue; LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); removed++; } return (removed); } static void rctl_rule_acquire_subject(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: case RCTL_SUBJECT_TYPE_PROCESS: break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct != NULL) prison_racct_hold(rule->rr_subject.rs_prison_racct); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip != NULL) uihold(rule->rr_subject.rs_uip); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass != NULL) loginclass_hold(rule->rr_subject.rs_loginclass); break; default: panic("rctl_rule_acquire_subject: unknown subject type %d", rule->rr_subject_type); } } static void rctl_rule_release_subject(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: case RCTL_SUBJECT_TYPE_PROCESS: break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct != NULL) prison_racct_free(rule->rr_subject.rs_prison_racct); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip != NULL) uifree(rule->rr_subject.rs_uip); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass != NULL) loginclass_free(rule->rr_subject.rs_loginclass); break; default: panic("rctl_rule_release_subject: unknown subject type %d", rule->rr_subject_type); } } struct rctl_rule * rctl_rule_alloc(int flags) { struct rctl_rule *rule; ASSERT_RACCT_ENABLED(); rule = uma_zalloc(rctl_rule_zone, flags); if (rule == NULL) return (NULL); rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED; rule->rr_subject.rs_proc = NULL; rule->rr_subject.rs_uip = NULL; rule->rr_subject.rs_loginclass = NULL; rule->rr_subject.rs_prison_racct = NULL; rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED; rule->rr_resource = RACCT_UNDEFINED; rule->rr_action = RCTL_ACTION_UNDEFINED; rule->rr_amount = RCTL_AMOUNT_UNDEFINED; refcount_init(&rule->rr_refcount, 1); return (rule); } struct rctl_rule * rctl_rule_duplicate(const struct rctl_rule *rule, int flags) { struct rctl_rule *copy; ASSERT_RACCT_ENABLED(); copy = uma_zalloc(rctl_rule_zone, flags); if (copy == NULL) return (NULL); copy->rr_subject_type = rule->rr_subject_type; copy->rr_subject.rs_proc = rule->rr_subject.rs_proc; copy->rr_subject.rs_uip = rule->rr_subject.rs_uip; copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass; copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct; copy->rr_per = rule->rr_per; copy->rr_resource = rule->rr_resource; copy->rr_action = rule->rr_action; copy->rr_amount = rule->rr_amount; refcount_init(&copy->rr_refcount, 1); rctl_rule_acquire_subject(copy); return (copy); } void rctl_rule_acquire(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0")); refcount_acquire(&rule->rr_refcount); } static void rctl_rule_free(void *context, int pending) { struct rctl_rule *rule; rule = (struct rctl_rule *)context; ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0")); /* * We don't need locking here; rule is guaranteed to be inaccessible.
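A hedged outline of the reference-count lifecycle implied by the routines above (rctl_rule_release() follows just below):

/*
 * Illustrative sequence, not from the source:
 *
 *	rule = rctl_rule_alloc(M_WAITOK);   refcount == 1 (caller's ref)
 *	rctl_racct_add_rule(racct, rule);   acquires -> refcount == 2
 *	rctl_rule_release(rule);            caller's drop -> refcount == 1
 *	...rule later unlinked from racct... last drop -> deferred free
 */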
*/ rctl_rule_release_subject(rule); uma_zfree(rctl_rule_zone, rule); } void rctl_rule_release(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0")); if (refcount_release(&rule->rr_refcount)) { /* * rctl_rule_release() is often called when iterating * over all the uidinfo structures in the system, * holding uihashtbl_lock. Since rctl_rule_free() * might end up calling uifree(), this would lead * to lock recursion. Use taskqueue to avoid this. */ TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule); taskqueue_enqueue(taskqueue_thread, &rule->rr_task); } } static int rctl_rule_fully_specified(const struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: return (0); case RCTL_SUBJECT_TYPE_PROCESS: if (rule->rr_subject.rs_proc == NULL) return (0); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip == NULL) return (0); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass == NULL) return (0); break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct == NULL) return (0); break; default: panic("rctl_rule_fully_specified: unknown subject type %d", rule->rr_subject_type); } if (rule->rr_resource == RACCT_UNDEFINED) return (0); if (rule->rr_action == RCTL_ACTION_UNDEFINED) return (0); if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED) return (0); if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED) return (0); return (1); } static int rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep) { struct rctl_rule *rule; char *subjectstr, *subject_idstr, *resourcestr, *actionstr, *amountstr, *perstr; id_t id; int error = 0; ASSERT_RACCT_ENABLED(); rule = rctl_rule_alloc(M_WAITOK); subjectstr = strsep(&rulestr, ":"); subject_idstr = strsep(&rulestr, ":"); resourcestr = strsep(&rulestr, ":"); actionstr = strsep(&rulestr, "=/"); amountstr = strsep(&rulestr, "/"); perstr = rulestr; if (subjectstr == NULL || subjectstr[0] == '\0') rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED; else { error = str2value(subjectstr, &rule->rr_subject_type, subjectnames); if (error != 0) goto out; } if (subject_idstr == NULL || subject_idstr[0] == '\0') { rule->rr_subject.rs_proc = NULL; rule->rr_subject.rs_uip = NULL; rule->rr_subject.rs_loginclass = NULL; rule->rr_subject.rs_prison_racct = NULL; } else { switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: error = EINVAL; goto out; case RCTL_SUBJECT_TYPE_PROCESS: error = str2id(subject_idstr, &id); if (error != 0) goto out; sx_assert(&allproc_lock, SA_LOCKED); rule->rr_subject.rs_proc = pfind(id); if (rule->rr_subject.rs_proc == NULL) { error = ESRCH; goto out; } PROC_UNLOCK(rule->rr_subject.rs_proc); break; case RCTL_SUBJECT_TYPE_USER: error = str2id(subject_idstr, &id); if (error != 0) goto out; rule->rr_subject.rs_uip = uifind(id); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: rule->rr_subject.rs_loginclass = loginclass_find(subject_idstr); if (rule->rr_subject.rs_loginclass == NULL) { error = ENAMETOOLONG; goto out; } break; case RCTL_SUBJECT_TYPE_JAIL: rule->rr_subject.rs_prison_racct = prison_racct_find(subject_idstr); if (rule->rr_subject.rs_prison_racct == NULL) { error = ENAMETOOLONG; goto out; } break; default: panic("rctl_string_to_rule: unknown subject type %d", rule->rr_subject_type); } } if (resourcestr == NULL || resourcestr[0] == '\0') rule->rr_resource = RACCT_UNDEFINED; else { error = str2value(resourcestr, &rule->rr_resource, resourcenames); if (error != 0) goto out; } if (actionstr == 
NULL || actionstr[0] == '\0') rule->rr_action = RCTL_ACTION_UNDEFINED; else { error = str2value(actionstr, &rule->rr_action, actionnames); if (error != 0) goto out; } if (amountstr == NULL || amountstr[0] == '\0') rule->rr_amount = RCTL_AMOUNT_UNDEFINED; else { error = str2int64(amountstr, &rule->rr_amount); if (error != 0) goto out; if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) { if (rule->rr_amount > INT64_MAX / 1000000) { error = ERANGE; goto out; } rule->rr_amount *= 1000000; } } if (perstr == NULL || perstr[0] == '\0') rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED; else { error = str2value(perstr, &rule->rr_per, subjectnames); if (error != 0) goto out; } out: if (error == 0) *rulep = rule; else rctl_rule_release(rule); return (error); } /* * Link a rule with all the subjects it applies to. */ int rctl_rule_add(struct rctl_rule *rule) { struct proc *p; struct ucred *cred; struct uidinfo *uip; struct prison *pr; struct prison_racct *prr; struct loginclass *lc; struct rctl_rule *rule2; int match; ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); /* * Some rules just don't make sense, like "deny" rule for an undeniable * resource. The exception are the RSS and %CPU resources - they are * not deniable in the racct sense, but the limit is enforced in * a different way. */ if (rule->rr_action == RCTL_ACTION_DENY && !RACCT_IS_DENIABLE(rule->rr_resource) && rule->rr_resource != RACCT_RSS && rule->rr_resource != RACCT_PCTCPU) { return (EOPNOTSUPP); } if (rule->rr_action == RCTL_ACTION_THROTTLE && !RACCT_IS_DECAYING(rule->rr_resource)) { return (EOPNOTSUPP); } if (rule->rr_action == RCTL_ACTION_THROTTLE && rule->rr_resource == RACCT_PCTCPU) { return (EOPNOTSUPP); } if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS && RACCT_IS_SLOPPY(rule->rr_resource)) { return (EOPNOTSUPP); } /* * Make sure there are no duplicated rules. Also, for the "deny" * rules, remove ones differing only by "amount". */ if (rule->rr_action == RCTL_ACTION_DENY) { rule2 = rctl_rule_duplicate(rule, M_WAITOK); rule2->rr_amount = RCTL_AMOUNT_UNDEFINED; rctl_rule_remove(rule2); rctl_rule_release(rule2); } else rctl_rule_remove(rule); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: p = rule->rr_subject.rs_proc; KASSERT(p != NULL, ("rctl_rule_add: NULL proc")); rctl_racct_add_rule(p->p_racct, rule); /* * In case of per-process rule, we don't have anything more * to do. */ return (0); case RCTL_SUBJECT_TYPE_USER: uip = rule->rr_subject.rs_uip; KASSERT(uip != NULL, ("rctl_rule_add: NULL uip")); rctl_racct_add_rule(uip->ui_racct, rule); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: lc = rule->rr_subject.rs_loginclass; KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass")); rctl_racct_add_rule(lc->lc_racct, rule); break; case RCTL_SUBJECT_TYPE_JAIL: prr = rule->rr_subject.rs_prison_racct; KASSERT(prr != NULL, ("rctl_rule_add: NULL pr")); rctl_racct_add_rule(prr->prr_racct, rule); break; default: panic("rctl_rule_add: unknown subject type %d", rule->rr_subject_type); } /* * Now go through all the processes and add the new rule to the ones * it applies to. 
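The strsep() separators in rctl_string_to_rule() above imply the grammar subject:subject-id:resource:action=amount/per, matching rctl(8). A few hedged examples (subjects and amounts invented; amounts must be plain integers at this layer, since str2int64() uses strtoul() and unit suffixes are expanded by rctl(8) before the syscall):

/* Deny user 1001 more than 1 GB of virtual memory. */
const char *r1 = "user:1001:vmemoryuse:deny=1073741824";

/* SIGTERM processes of login class "users" above 100 processes; the
 * "/per" part is optional and defaults to the subject type. */
const char *r2 = "loginclass:users:maxproc:sigterm=100";

/* Log when jail "www" exceeds 512 MB of resident memory. */
const char *r3 = "jail:www:memoryuse:log=536870912/jail";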
*/ sx_assert(&allproc_lock, SA_LOCKED); FOREACH_PROC_IN_SYSTEM(p) { cred = p->p_ucred; switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_USER: if (cred->cr_uidinfo == rule->rr_subject.rs_uip || cred->cr_ruidinfo == rule->rr_subject.rs_uip) break; continue; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (cred->cr_loginclass == rule->rr_subject.rs_loginclass) break; continue; case RCTL_SUBJECT_TYPE_JAIL: match = 0; for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) { if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) { match = 1; break; } } if (match) break; continue; default: panic("rctl_rule_add: unknown subject type %d", rule->rr_subject_type); } rctl_racct_add_rule(p->p_racct, rule); } return (0); } static void rctl_rule_pre_callback(void) { RACCT_LOCK(); } static void rctl_rule_post_callback(void) { RACCT_UNLOCK(); } static void rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3) { struct rctl_rule *filter = (struct rctl_rule *)arg2; int found = 0; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); found += rctl_racct_remove_rules(racct, filter); *((int *)arg3) += found; } /* * Remove all rules that match the filter. */ int rctl_rule_remove(struct rctl_rule *filter) { struct proc *p; int found = 0; ASSERT_RACCT_ENABLED(); if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS && filter->rr_subject.rs_proc != NULL) { p = filter->rr_subject.rs_proc; RACCT_LOCK(); found = rctl_racct_remove_rules(p->p_racct, filter); RACCT_UNLOCK(); if (found) return (0); return (ESRCH); } loginclass_racct_foreach(rctl_rule_remove_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, (void *)&found); ui_racct_foreach(rctl_rule_remove_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, (void *)&found); prison_racct_foreach(rctl_rule_remove_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, (void *)&found); sx_assert(&allproc_lock, SA_LOCKED); RACCT_LOCK(); FOREACH_PROC_IN_SYSTEM(p) { found += rctl_racct_remove_rules(p->p_racct, filter); } RACCT_UNLOCK(); if (found) return (0); return (ESRCH); } /* * Appends a rule to the sbuf. */ static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule) { int64_t amount; ASSERT_RACCT_ENABLED(); sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type)); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: if (rule->rr_subject.rs_proc == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%d:", rule->rr_subject.rs_proc->p_pid); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%d:", rule->rr_subject.rs_uip->ui_uid); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%s:", rule->rr_subject.rs_loginclass->lc_name); break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%s:", rule->rr_subject.rs_prison_racct->prr_name); break; default: panic("rctl_rule_to_sbuf: unknown subject type %d", rule->rr_subject_type); } amount = rule->rr_amount; if (amount != RCTL_AMOUNT_UNDEFINED && RACCT_IS_IN_MILLIONS(rule->rr_resource)) amount /= 1000000; sbuf_printf(sb, "%s:%s=%jd", rctl_resource_name(rule->rr_resource), rctl_action_name(rule->rr_action), amount); if (rule->rr_per != rule->rr_subject_type) sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per)); } /* * Routine used by RCTL syscalls to read in input string. 
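rctl_rule_to_sbuf() above emits the same grammar the parser consumes, so rules round-trip, with two quirks visible in the code: amounts for RACCT_IS_IN_MILLIONS resources are scaled back down on output, and the /per suffix is printed only when it differs from the subject type. A hedged illustration (values invented):

/*
 * Illustrative only:
 *
 *	rr_resource RACCT_PCTCPU, rr_amount 50000000 (stored x 10^6)
 *		-> "user:1001:pcpu:deny=50"
 *
 *	loginclass subject with rr_per == RCTL_SUBJECT_TYPE_USER
 *		-> "loginclass:users:vmemoryuse:deny=1073741824/user"
 */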
*/ static int rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen) { char *str; int error; ASSERT_RACCT_ENABLED(); if (inbuflen <= 0) return (EINVAL); if (inbuflen > RCTL_MAX_INBUFSIZE) return (E2BIG); str = malloc(inbuflen + 1, M_RCTL, M_WAITOK); error = copyinstr(inbufp, str, inbuflen, NULL); if (error != 0) { free(str, M_RCTL); return (error); } *inputstr = str; return (0); } /* * Routine used by RCTL syscalls to write out output string. */ static int rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen) { int error; ASSERT_RACCT_ENABLED(); if (outputsbuf == NULL) return (0); sbuf_finish(outputsbuf); if (outbuflen < sbuf_len(outputsbuf) + 1) { sbuf_delete(outputsbuf); return (ERANGE); } error = copyout(sbuf_data(outputsbuf), outbufp, sbuf_len(outputsbuf) + 1); sbuf_delete(outputsbuf); return (error); } static struct sbuf * rctl_racct_to_sbuf(struct racct *racct, int sloppy) { struct sbuf *sb; int64_t amount; int i; ASSERT_RACCT_ENABLED(); sb = sbuf_new_auto(); for (i = 0; i <= RACCT_MAX; i++) { if (sloppy == 0 && RACCT_IS_SLOPPY(i)) continue; RACCT_LOCK(); amount = racct->r_resources[i]; RACCT_UNLOCK(); if (RACCT_IS_IN_MILLIONS(i)) amount /= 1000000; sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount); } sbuf_setpos(sb, sbuf_len(sb) - 1); return (sb); } int sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap) { struct rctl_rule *filter; struct sbuf *outputsbuf = NULL; struct proc *p; struct uidinfo *uip; struct loginclass *lc; struct prison_racct *prr; char *inputstr; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_GET_RACCT); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } switch (filter->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: p = filter->rr_subject.rs_proc; if (p == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0); break; case RCTL_SUBJECT_TYPE_USER: uip = filter->rr_subject.rs_uip; if (uip == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: lc = filter->rr_subject.rs_loginclass; if (lc == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1); break; case RCTL_SUBJECT_TYPE_JAIL: prr = filter->rr_subject.rs_prison_racct; if (prr == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1); break; default: error = EINVAL; } out: rctl_rule_release(filter); sx_sunlock(&allproc_lock); if (error != 0) return (error); error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen); return (error); } static void rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3) { struct rctl_rule *filter = (struct rctl_rule *)arg2; struct rctl_rule_link *link; struct sbuf *sb = (struct sbuf *)arg3; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); LIST_FOREACH(link, &racct->r_rule_links, rrl_next) { if (!rctl_rule_matches(link->rrl_rule, filter)) continue; rctl_rule_to_sbuf(sb, link->rrl_rule); sbuf_printf(sb, ","); } } int sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap) { struct sbuf *sb; struct rctl_rule *filter; struct rctl_rule_link *link; struct proc *p; char *inputstr, *buf; size_t bufsize; int error; if (!racct_enable) return (ENOSYS); error = 
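rctl_racct_to_sbuf() above produces a comma-separated resource=amount list. A hedged userland sketch of retrieving it through rctl_get_racct(2) (subject and buffer size invented; requires racct to be enabled and the PRIV_RCTL_GET_RACCT privilege):

#include <sys/types.h>
#include <sys/rctl.h>
#include <err.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	const char *filter = "user:1001";	/* hypothetical subject */
	char buf[4096];				/* assumed large enough */

	if (rctl_get_racct(filter, strlen(filter) + 1, buf,
	    sizeof(buf)) != 0)
		err(1, "rctl_get_racct");
	printf("%s\n", buf);	/* e.g. "cputime=0,...,vmemoryuse=..." */
	return (0);
}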
priv_check(td, PRIV_RCTL_GET_RULES); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } bufsize = uap->outbuflen; if (bufsize > rctl_maxbufsize) { sx_sunlock(&allproc_lock); return (E2BIG); } buf = malloc(bufsize, M_RCTL, M_WAITOK); sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN); KASSERT(sb != NULL, ("sbuf_new failed")); FOREACH_PROC_IN_SYSTEM(p) { RACCT_LOCK(); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { /* * Non-process rules will be added to the buffer later. * Adding them here would result in duplicated output. */ if (link->rrl_rule->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) continue; if (!rctl_rule_matches(link->rrl_rule, filter)) continue; rctl_rule_to_sbuf(sb, link->rrl_rule); sbuf_printf(sb, ","); } RACCT_UNLOCK(); } loginclass_racct_foreach(rctl_get_rules_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, sb); ui_racct_foreach(rctl_get_rules_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, sb); prison_racct_foreach(rctl_get_rules_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, sb); if (sbuf_error(sb) == ENOMEM) { error = ERANGE; goto out; } /* * Remove trailing ",". */ if (sbuf_len(sb) > 0) sbuf_setpos(sb, sbuf_len(sb) - 1); error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen); out: rctl_rule_release(filter); sx_sunlock(&allproc_lock); free(buf, M_RCTL); return (error); } int sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap) { struct sbuf *sb; struct rctl_rule *filter; struct rctl_rule_link *link; char *inputstr, *buf; size_t bufsize; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_GET_LIMITS); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (EINVAL); } if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (EOPNOTSUPP); } if (filter->rr_subject.rs_proc == NULL) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (EINVAL); } bufsize = uap->outbuflen; if (bufsize > rctl_maxbufsize) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (E2BIG); } buf = malloc(bufsize, M_RCTL, M_WAITOK); sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN); KASSERT(sb != NULL, ("sbuf_new failed")); RACCT_LOCK(); LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links, rrl_next) { rctl_rule_to_sbuf(sb, link->rrl_rule); sbuf_printf(sb, ","); } RACCT_UNLOCK(); if (sbuf_error(sb) == ENOMEM) { error = ERANGE; sbuf_delete(sb); goto out; } /* * Remove trailing ",". 
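Because the syscall reports ERANGE when the output buffer is too small (including the sbuf ENOMEM case mapped above), callers usually retry with a bigger buffer. A hedged sketch (helper name and sizing invented; an empty filter matches every rule):

#include <sys/types.h>
#include <sys/rctl.h>
#include <err.h>
#include <errno.h>
#include <stdlib.h>

/* Hypothetical helper: grow the buffer until the rule list fits. */
static char *
get_all_rules(void)
{
	char *buf;
	size_t len;

	for (len = 1024;; len *= 2) {
		buf = malloc(len);
		if (buf == NULL)
			err(1, "malloc");
		if (rctl_get_rules("", 1, buf, len) == 0)
			return (buf);		/* "rule,rule,..." */
		if (errno != ERANGE)
			err(1, "rctl_get_rules");
		free(buf);			/* too small; retry */
	}
}

Note that the kernel also bounds outbuflen by rctl_maxbufsize and returns E2BIG past it, so the doubling cannot grow without limit.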
*/ if (sbuf_len(sb) > 0) sbuf_setpos(sb, sbuf_len(sb) - 1); error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen); out: rctl_rule_release(filter); sx_sunlock(&allproc_lock); free(buf, M_RCTL); return (error); } int sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap) { struct rctl_rule *rule; char *inputstr; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_ADD_RULE); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &rule); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } /* * The 'per' part of a rule is optional. */ if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED && rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) rule->rr_per = rule->rr_subject_type; if (!rctl_rule_fully_specified(rule)) { error = EINVAL; goto out; } error = rctl_rule_add(rule); out: rctl_rule_release(rule); sx_sunlock(&allproc_lock); return (error); } int sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap) { struct rctl_rule *filter; char *inputstr; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_REMOVE_RULE); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } error = rctl_rule_remove(filter); rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (error); } /* * Update RCTL rule list after credential change. */ void rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred) { LIST_HEAD(, rctl_rule_link) newrules; struct rctl_rule_link *link, *newlink; struct uidinfo *newuip; struct loginclass *newlc; struct prison_racct *newprr; int rulecnt, i; ASSERT_RACCT_ENABLED(); newuip = newcred->cr_ruidinfo; newlc = newcred->cr_loginclass; newprr = newcred->cr_prison->pr_prison_racct; LIST_INIT(&newrules); again: /* * First, count the rules that apply to the process with new * credentials. */ rulecnt = 0; RACCT_LOCK(); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) rulecnt++; } LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) rulecnt++; LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) rulecnt++; LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) rulecnt++; RACCT_UNLOCK(); /* * Create temporary list. We've dropped the rctl_lock in order * to use M_WAITOK. */ for (i = 0; i < rulecnt; i++) { newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK); newlink->rrl_rule = NULL; newlink->rrl_exceeded = 0; LIST_INSERT_HEAD(&newrules, newlink, rrl_next); } newlink = LIST_FIRST(&newrules); /* * Assign rules to the newly allocated list entries. 
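A hedged userland counterpart to sys_rctl_add_rule() above (rule string invented; the output buffer pair is unused by this call, matching the kernel side):

#include <sys/types.h>
#include <sys/rctl.h>
#include <err.h>
#include <string.h>

int
main(void)
{
	/* Hypothetical rule: cap user 1001 at 1 GB of virtual memory. */
	const char *rule = "user:1001:vmemoryuse:deny=1073741824";

	if (rctl_add_rule(rule, strlen(rule) + 1, NULL, 0) != 0)
		err(1, "rctl_add_rule");
	return (0);
}

As the kernel code shows, omitting the /per part makes rr_per default to the subject type.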
*/ RACCT_LOCK(); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } } LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } if (rulecnt == 0) { /* * Free the old rule list. */ while (!LIST_EMPTY(&p->p_racct->r_rule_links)) { link = LIST_FIRST(&p->p_racct->r_rule_links); LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); } /* * Replace lists and we're done. * * XXX: Is there any way to switch list heads instead * of iterating here? */ while (!LIST_EMPTY(&newrules)) { newlink = LIST_FIRST(&newrules); LIST_REMOVE(newlink, rrl_next); LIST_INSERT_HEAD(&p->p_racct->r_rule_links, newlink, rrl_next); } RACCT_UNLOCK(); return; } goaround: RACCT_UNLOCK(); /* * Rule list changed while we were not holding the rctl_lock. * Free the new list and try again. */ while (!LIST_EMPTY(&newrules)) { newlink = LIST_FIRST(&newrules); LIST_REMOVE(newlink, rrl_next); if (newlink->rrl_rule != NULL) rctl_rule_release(newlink->rrl_rule); uma_zfree(rctl_rule_link_zone, newlink); } goto again; } /* * Assign RCTL rules to the newly created process. */ int rctl_proc_fork(struct proc *parent, struct proc *child) { struct rctl_rule *rule; struct rctl_rule_link *link; int error; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent)); LIST_INIT(&child->p_racct->r_rule_links); /* * Go through limits applicable to the parent and assign them * to the child. Rules with 'process' subject have to be duplicated * in order to make their rr_subject point to the new process. */ LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) { if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) { rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT); if (rule == NULL) goto fail; KASSERT(rule->rr_subject.rs_proc == parent, ("rule->rr_subject.rs_proc != parent")); rule->rr_subject.rs_proc = child; error = rctl_racct_add_rule_locked(child->p_racct, rule); rctl_rule_release(rule); if (error != 0) goto fail; } else { error = rctl_racct_add_rule_locked(child->p_racct, link->rrl_rule); if (error != 0) goto fail; } } return (0); fail: while (!LIST_EMPTY(&child->p_racct->r_rule_links)) { link = LIST_FIRST(&child->p_racct->r_rule_links); LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); } return (EAGAIN); } /* * Release rules attached to the racct. 
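rctl_proc_ucred_changed() above is an instance of a standard kernel idiom: count under the lock, allocate while unlocked (M_WAITOK may sleep), then retake the lock and verify nothing changed. A hedged distilled outline:

/*
 * Illustrative outline, not the actual function:
 *
 *  again:
 *	lock; count the rules that will apply; unlock;
 *	allocate that many link structures with M_WAITOK (may sleep);
 *	lock; walk the lists, filling the preallocated links;
 *	if the count no longer matches (a rule came or went while we
 *	slept): unlock, free the new links, goto again;
 *	otherwise free the old list, install the new one, unlock.
 */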
*/ void rctl_racct_release(struct racct *racct) { struct rctl_rule_link *link; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); while (!LIST_EMPTY(&racct->r_rule_links)) { link = LIST_FIRST(&racct->r_rule_links); LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); } } static void rctl_init(void) { if (!racct_enable) return; rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); rctl_rule_link_zone = uma_zcreate("rctl_rule_link", sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* * Set default values, making sure not to overwrite the ones * fetched from tunables. Most of those could be set at the * declaration, except for the rctl_throttle_max - we cannot * set it there due to hz not being compile time constant. */ if (rctl_throttle_min < 1) rctl_throttle_min = 1; if (rctl_throttle_max < rctl_throttle_min) rctl_throttle_max = 2 * hz; if (rctl_throttle_pct < 0) rctl_throttle_pct = 100; if (rctl_throttle_pct2 < 0) rctl_throttle_pct2 = 100; } #else /* !RCTL */ int sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap) { return (ENOSYS); } int sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap) { return (ENOSYS); } int sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap) { return (ENOSYS); } int sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap) { return (ENOSYS); } int sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap) { return (ENOSYS); } #endif /* !RCTL */ Index: head/sys/kern/kern_rwlock.c =================================================================== --- head/sys/kern/kern_rwlock.c (revision 326270) +++ head/sys/kern/kern_rwlock.c (revision 326271) @@ -1,1464 +1,1466 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2006 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Machine independent bits of reader/writer lock implementation. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_no_adaptive_rwlocks.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(SMP) && !defined(NO_ADAPTIVE_RWLOCKS) #define ADAPTIVE_RWLOCKS #endif #ifdef HWPMC_HOOKS #include PMC_SOFT_DECLARE( , , lock, failed); #endif /* * Return the rwlock address when the lock cookie address is provided. * This functionality assumes that struct rwlock* have a member named rw_lock. */ #define rwlock2rw(c) (__containerof(c, struct rwlock, rw_lock)) #ifdef DDB #include static void db_show_rwlock(const struct lock_object *lock); #endif static void assert_rw(const struct lock_object *lock, int what); static void lock_rw(struct lock_object *lock, uintptr_t how); #ifdef KDTRACE_HOOKS static int owner_rw(const struct lock_object *lock, struct thread **owner); #endif static uintptr_t unlock_rw(struct lock_object *lock); struct lock_class lock_class_rw = { .lc_name = "rw", .lc_flags = LC_SLEEPLOCK | LC_RECURSABLE | LC_UPGRADABLE, .lc_assert = assert_rw, #ifdef DDB .lc_ddb_show = db_show_rwlock, #endif .lc_lock = lock_rw, .lc_unlock = unlock_rw, #ifdef KDTRACE_HOOKS .lc_owner = owner_rw, #endif }; #ifdef ADAPTIVE_RWLOCKS static int __read_frequently rowner_retries = 10; static int __read_frequently rowner_loops = 10000; static SYSCTL_NODE(_debug, OID_AUTO, rwlock, CTLFLAG_RD, NULL, "rwlock debugging"); SYSCTL_INT(_debug_rwlock, OID_AUTO, retry, CTLFLAG_RW, &rowner_retries, 0, ""); SYSCTL_INT(_debug_rwlock, OID_AUTO, loops, CTLFLAG_RW, &rowner_loops, 0, ""); static struct lock_delay_config __read_frequently rw_delay; SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_base, CTLFLAG_RW, &rw_delay.base, 0, ""); SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_max, CTLFLAG_RW, &rw_delay.max, 0, ""); LOCK_DELAY_SYSINIT_DEFAULT(rw_delay); #endif /* * Return a pointer to the owning thread if the lock is write-locked or * NULL if the lock is unlocked or read-locked. */ #define lv_rw_wowner(v) \ ((v) & RW_LOCK_READ ? NULL : \ (struct thread *)RW_OWNER((v))) #define rw_wowner(rw) lv_rw_wowner(RW_READ_VALUE(rw)) /* * Returns if a write owner is recursed. Write ownership is not assured * here and should be previously checked. */ #define rw_recursed(rw) ((rw)->rw_recurse != 0) /* * Return true if curthread helds the lock. */ #define rw_wlocked(rw) (rw_wowner((rw)) == curthread) /* * Return a pointer to the owning thread for this lock who should receive * any priority lent by threads that block on this lock. Currently this * is identical to rw_wowner(). */ #define rw_owner(rw) rw_wowner(rw) #ifndef INVARIANTS #define __rw_assert(c, what, file, line) #endif void assert_rw(const struct lock_object *lock, int what) { rw_assert((const struct rwlock *)lock, what); } void lock_rw(struct lock_object *lock, uintptr_t how) { struct rwlock *rw; rw = (struct rwlock *)lock; if (how) rw_rlock(rw); else rw_wlock(rw); } uintptr_t unlock_rw(struct lock_object *lock) { struct rwlock *rw; rw = (struct rwlock *)lock; rw_assert(rw, RA_LOCKED | LA_NOTRECURSED); if (rw->rw_lock & RW_LOCK_READ) { rw_runlock(rw); return (1); } else { rw_wunlock(rw); return (0); } } #ifdef KDTRACE_HOOKS int owner_rw(const struct lock_object *lock, struct thread **owner) { const struct rwlock *rw = (const struct rwlock *)lock; uintptr_t x = rw->rw_lock; *owner = rw_wowner(rw); return ((x & RW_LOCK_READ) != 0 ? 
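Before the implementation details below, a hedged sketch of the consumer-facing rwlock(9) API that this file implements (the lock and data names are invented):

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/rwlock.h>

static struct rwlock data_lock;		/* hypothetical lock */
static int data_value;			/* hypothetical shared state */

static void
data_init(void)
{
	rw_init(&data_lock, "data lock");
}

static int
data_read(void)
{
	int v;

	rw_rlock(&data_lock);		/* shared: readers run concurrently */
	v = data_value;
	rw_runlock(&data_lock);
	return (v);
}

static void
data_write(int v)
{
	rw_wlock(&data_lock);		/* exclusive: a single writer */
	data_value = v;
	rw_wunlock(&data_lock);
}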
(RW_READERS(x) != 0) : (*owner != NULL)); } #endif void _rw_init_flags(volatile uintptr_t *c, const char *name, int opts) { struct rwlock *rw; int flags; rw = rwlock2rw(c); MPASS((opts & ~(RW_DUPOK | RW_NOPROFILE | RW_NOWITNESS | RW_QUIET | RW_RECURSE | RW_NEW)) == 0); ASSERT_ATOMIC_LOAD_PTR(rw->rw_lock, ("%s: rw_lock not aligned for %s: %p", __func__, name, &rw->rw_lock)); flags = LO_UPGRADABLE; if (opts & RW_DUPOK) flags |= LO_DUPOK; if (opts & RW_NOPROFILE) flags |= LO_NOPROFILE; if (!(opts & RW_NOWITNESS)) flags |= LO_WITNESS; if (opts & RW_RECURSE) flags |= LO_RECURSABLE; if (opts & RW_QUIET) flags |= LO_QUIET; if (opts & RW_NEW) flags |= LO_NEW; lock_init(&rw->lock_object, &lock_class_rw, name, NULL, flags); rw->rw_lock = RW_UNLOCKED; rw->rw_recurse = 0; } void _rw_destroy(volatile uintptr_t *c) { struct rwlock *rw; rw = rwlock2rw(c); KASSERT(rw->rw_lock == RW_UNLOCKED, ("rw lock %p not unlocked", rw)); KASSERT(rw->rw_recurse == 0, ("rw lock %p still recursed", rw)); rw->rw_lock = RW_DESTROYED; lock_destroy(&rw->lock_object); } void rw_sysinit(void *arg) { struct rw_args *args; args = arg; rw_init_flags((struct rwlock *)args->ra_rw, args->ra_desc, args->ra_flags); } int _rw_wowned(const volatile uintptr_t *c) { return (rw_wowner(rwlock2rw(c)) == curthread); } void _rw_wlock_cookie(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; uintptr_t tid, v; rw = rwlock2rw(c); KASSERT(kdb_active != 0 || SCHEDULER_STOPPED() || !TD_IS_IDLETHREAD(curthread), ("rw_wlock() by idle thread %p on rwlock %s @ %s:%d", curthread, rw->lock_object.lo_name, file, line)); KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_wlock() of destroyed rwlock @ %s:%d", file, line)); WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); tid = (uintptr_t)curthread; v = RW_UNLOCKED; if (!_rw_write_lock_fetch(rw, &v, tid)) _rw_wlock_hard(rw, v, file, line); else LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, 0, 0, file, line, LOCKSTAT_WRITER); LOCK_LOG_LOCK("WLOCK", &rw->lock_object, 0, rw->rw_recurse, file, line); WITNESS_LOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line); TD_LOCKS_INC(curthread); } int __rw_try_wlock_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF) { struct thread *td; uintptr_t tid, v; int rval; bool recursed; td = curthread; tid = (uintptr_t)td; if (SCHEDULER_STOPPED_TD(td)) return (1); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(td), ("rw_try_wlock() by idle thread %p on rwlock %s @ %s:%d", curthread, rw->lock_object.lo_name, file, line)); KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_try_wlock() of destroyed rwlock @ %s:%d", file, line)); rval = 1; recursed = false; v = RW_UNLOCKED; for (;;) { if (atomic_fcmpset_acq_ptr(&rw->rw_lock, &v, tid)) break; if (v == RW_UNLOCKED) continue; if (v == tid && (rw->lock_object.lo_flags & LO_RECURSABLE)) { rw->rw_recurse++; atomic_set_ptr(&rw->rw_lock, RW_LOCK_WRITER_RECURSED); break; } rval = 0; break; } LOCK_LOG_TRY("WLOCK", &rw->lock_object, 0, rval, file, line); if (rval) { WITNESS_LOCK(&rw->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK, file, line); if (!recursed) LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, 0, 0, file, line, LOCKSTAT_WRITER); TD_LOCKS_INC(curthread); } return (rval); } int __rw_try_wlock(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; rw = rwlock2rw(c); return (__rw_try_wlock_int(rw LOCK_FILE_LINE_ARG)); } void _rw_wunlock_cookie(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; rw = rwlock2rw(c); KASSERT(rw->rw_lock != RW_DESTROYED, 
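As __rw_try_wlock_int() above shows, write recursion is honored only when the lock carries LO_RECURSABLE, i.e. was initialized with RW_RECURSE. A hedged sketch (names invented):

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/rwlock.h>

static struct rwlock rec_lock;		/* hypothetical lock */

static void
recursion_demo(void)
{
	rw_init_flags(&rec_lock, "rec lock", RW_RECURSE);
	rw_wlock(&rec_lock);
	if (rw_try_wlock(&rec_lock) != 0) {
		/* Succeeds only because of RW_RECURSE. */
		rw_wunlock(&rec_lock);	/* drops the recursed level */
	}
	rw_wunlock(&rec_lock);
	rw_destroy(&rec_lock);
}

Without RW_RECURSE, rw_try_wlock() simply fails here, and a recursive rw_wlock() would trip the recursion KASSERT in the hard path instead.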
("rw_wunlock() of destroyed rwlock @ %s:%d", file, line)); __rw_assert(c, RA_WLOCKED, file, line); WITNESS_UNLOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line); LOCK_LOG_LOCK("WUNLOCK", &rw->lock_object, 0, rw->rw_recurse, file, line); #ifdef LOCK_PROFILING _rw_wunlock_hard(rw, (uintptr_t)curthread, file, line); #else __rw_wunlock(rw, curthread, file, line); #endif TD_LOCKS_DEC(curthread); } /* * Determines whether a new reader can acquire a lock. Succeeds if the * reader already owns a read lock and the lock is locked for read to * prevent deadlock from reader recursion. Also succeeds if the lock * is unlocked and has no writer waiters or spinners. Failing otherwise * prioritizes writers before readers. */ static bool __always_inline __rw_can_read(struct thread *td, uintptr_t v, bool fp) { if ((v & (RW_LOCK_READ | RW_LOCK_WRITE_WAITERS | RW_LOCK_WRITE_SPINNER)) == RW_LOCK_READ) return (true); if (!fp && td->td_rw_rlocks && (v & RW_LOCK_READ)) return (true); return (false); } static bool __always_inline __rw_rlock_try(struct rwlock *rw, struct thread *td, uintptr_t *vp, bool fp LOCK_FILE_LINE_ARG_DEF) { /* * Handle the easy case. If no other thread has a write * lock, then try to bump up the count of read locks. Note * that we have to preserve the current state of the * RW_LOCK_WRITE_WAITERS flag. If we fail to acquire a * read lock, then rw_lock must have changed, so restart * the loop. Note that this handles the case of a * completely unlocked rwlock since such a lock is encoded * as a read lock with no waiters. */ while (__rw_can_read(td, *vp, fp)) { if (atomic_fcmpset_acq_ptr(&rw->rw_lock, vp, *vp + RW_ONE_READER)) { if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR4(KTR_LOCK, "%s: %p succeed %p -> %p", __func__, rw, (void *)*vp, (void *)(*vp + RW_ONE_READER)); td->td_rw_rlocks++; return (true); } } return (false); } static void __noinline __rw_rlock_hard(struct rwlock *rw, struct thread *td, uintptr_t v LOCK_FILE_LINE_ARG_DEF) { struct turnstile *ts; struct thread *owner; #ifdef ADAPTIVE_RWLOCKS int spintries = 0; int i, n; #endif #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif #if defined(ADAPTIVE_RWLOCKS) || defined(KDTRACE_HOOKS) struct lock_delay_arg lda; #endif #ifdef KDTRACE_HOOKS u_int sleep_cnt = 0; int64_t sleep_time = 0; int64_t all_time = 0; #endif #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) uintptr_t state; int doing_lockprof; #endif if (SCHEDULER_STOPPED()) return; #if defined(ADAPTIVE_RWLOCKS) lock_delay_arg_init(&lda, &rw_delay); #elif defined(KDTRACE_HOOKS) lock_delay_arg_init(&lda, NULL); #endif #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&rw->lock_object, &contested, &waittime); #ifdef LOCK_PROFILING doing_lockprof = 1; state = v; #elif defined(KDTRACE_HOOKS) doing_lockprof = lockstat_enabled; if (__predict_false(doing_lockprof)) { all_time -= lockstat_nsecs(&rw->lock_object); state = v; } #endif for (;;) { if (__rw_rlock_try(rw, td, &v, false LOCK_FILE_LINE_ARG)) break; #ifdef KDTRACE_HOOKS lda.spin_cnt++; #endif #ifdef ADAPTIVE_RWLOCKS /* * If the owner is running on another CPU, spin until * the owner stops running or the state of the lock * changes. 
*/ if ((v & RW_LOCK_READ) == 0) { owner = (struct thread *)RW_OWNER(v); if (TD_IS_RUNNING(owner)) { if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, rw, owner); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", rw->lock_object.lo_name); do { lock_delay(&lda); v = RW_READ_VALUE(rw); owner = lv_rw_wowner(v); } while (owner != NULL && TD_IS_RUNNING(owner)); KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); continue; } } else if (spintries < rowner_retries) { spintries++; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", rw->lock_object.lo_name); for (i = 0; i < rowner_loops; i += n) { n = RW_READERS(v); lock_delay_spin(n); v = RW_READ_VALUE(rw); if ((v & RW_LOCK_READ) == 0 || __rw_can_read(td, v, false)) break; } #ifdef KDTRACE_HOOKS lda.spin_cnt += rowner_loops - i; #endif KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); if (i != rowner_loops) continue; } #endif /* * Okay, now it's the hard case. Some other thread already * has a write lock or there are write waiters present, * acquire the turnstile lock so we can begin the process * of blocking. */ ts = turnstile_trywait(&rw->lock_object); /* * The lock might have been released while we spun, so * recheck its state and restart the loop if needed. */ v = RW_READ_VALUE(rw); retry_ts: if (__rw_can_read(td, v, false)) { turnstile_cancel(ts); continue; } owner = lv_rw_wowner(v); #ifdef ADAPTIVE_RWLOCKS /* * The current lock owner might have started executing * on another CPU (or the lock could have changed * owners) while we were waiting on the turnstile * chain lock. If so, drop the turnstile lock and try * again. */ if (owner != NULL) { if (TD_IS_RUNNING(owner)) { turnstile_cancel(ts); continue; } } #endif /* * The lock is held in write mode or it already has waiters. */ MPASS(!__rw_can_read(td, v, false)); /* * If the RW_LOCK_READ_WAITERS flag is already set, then * we can go ahead and block. If it is not set then try * to set it. If we fail to set it drop the turnstile * lock and restart the loop. */ if (!(v & RW_LOCK_READ_WAITERS)) { if (!atomic_fcmpset_ptr(&rw->rw_lock, &v, v | RW_LOCK_READ_WAITERS)) goto retry_ts; if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p set read waiters flag", __func__, rw); } /* * We were unable to acquire the lock and the read waiters * flag is set, so we must block on the turnstile. */ if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__, rw); #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&rw->lock_object); #endif MPASS(owner == rw_owner(rw)); turnstile_wait(ts, owner, TS_SHARED_QUEUE); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&rw->lock_object); sleep_cnt++; #endif if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from turnstile", __func__, rw); v = RW_READ_VALUE(rw); } #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) if (__predict_true(!doing_lockprof)) return; #endif #ifdef KDTRACE_HOOKS all_time += lockstat_nsecs(&rw->lock_object); if (sleep_time) LOCKSTAT_RECORD4(rw__block, rw, sleep_time, LOCKSTAT_READER, (state & RW_LOCK_READ) == 0, (state & RW_LOCK_READ) == 0 ? 0 : RW_READERS(state)); /* Record only the loops spinning and not sleeping. */ if (lda.spin_cnt > sleep_cnt) LOCKSTAT_RECORD4(rw__spin, rw, all_time - sleep_time, LOCKSTAT_READER, (state & RW_LOCK_READ) == 0, (state & RW_LOCK_READ) == 0 ? 
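All of the slow paths in this file share one turnstile(9) handshake, first seen above; a hedged distilled outline:

/*
 * Distilled shape of the blocking protocol (illustrative):
 *
 *	1. ts = turnstile_trywait(&lock_object): lock the chain;
 *	2. reread the lock word; if it became acquirable, do
 *	   turnstile_cancel(ts) and retry the fast path;
 *	3. fcmpset the relevant *_WAITERS bit; on failure go to 2;
 *	4. turnstile_wait(ts, owner, queue): block, releasing the
 *	   chain lock;
 *	5. on wakeup, reread the lock word and start over.
 */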
0 : RW_READERS(state)); #endif /* * TODO: acquire "owner of record" here. Here be turnstile dragons * however. turnstiles don't like owners changing between calls to * turnstile_wait() currently. */ LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, contested, waittime, file, line, LOCKSTAT_READER); } void __rw_rlock_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF) { struct thread *td; uintptr_t v; td = curthread; KASSERT(kdb_active != 0 || SCHEDULER_STOPPED_TD(td) || !TD_IS_IDLETHREAD(td), ("rw_rlock() by idle thread %p on rwlock %s @ %s:%d", td, rw->lock_object.lo_name, file, line)); KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_rlock() of destroyed rwlock @ %s:%d", file, line)); KASSERT(rw_wowner(rw) != td, ("rw_rlock: wlock already held for %s @ %s:%d", rw->lock_object.lo_name, file, line)); WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER, file, line, NULL); v = RW_READ_VALUE(rw); if (__predict_false(LOCKSTAT_OOL_PROFILE_ENABLED(rw__acquire) || !__rw_rlock_try(rw, td, &v, true LOCK_FILE_LINE_ARG))) __rw_rlock_hard(rw, td, v LOCK_FILE_LINE_ARG); LOCK_LOG_LOCK("RLOCK", &rw->lock_object, 0, 0, file, line); WITNESS_LOCK(&rw->lock_object, 0, file, line); TD_LOCKS_INC(curthread); } void __rw_rlock(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; rw = rwlock2rw(c); __rw_rlock_int(rw LOCK_FILE_LINE_ARG); } int __rw_try_rlock_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF) { uintptr_t x; if (SCHEDULER_STOPPED()) return (1); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("rw_try_rlock() by idle thread %p on rwlock %s @ %s:%d", curthread, rw->lock_object.lo_name, file, line)); x = rw->rw_lock; for (;;) { KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_try_rlock() of destroyed rwlock @ %s:%d", file, line)); if (!(x & RW_LOCK_READ)) break; if (atomic_fcmpset_acq_ptr(&rw->rw_lock, &x, x + RW_ONE_READER)) { LOCK_LOG_TRY("RLOCK", &rw->lock_object, 0, 1, file, line); WITNESS_LOCK(&rw->lock_object, LOP_TRYLOCK, file, line); LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, 0, 0, file, line, LOCKSTAT_READER); TD_LOCKS_INC(curthread); curthread->td_rw_rlocks++; return (1); } } LOCK_LOG_TRY("RLOCK", &rw->lock_object, 0, 0, file, line); return (0); } int __rw_try_rlock(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; rw = rwlock2rw(c); return (__rw_try_rlock_int(rw LOCK_FILE_LINE_ARG)); } static bool __always_inline __rw_runlock_try(struct rwlock *rw, struct thread *td, uintptr_t *vp) { for (;;) { /* * See if there is more than one read lock held. If so, * just drop one and return. */ if (RW_READERS(*vp) > 1) { if (atomic_fcmpset_rel_ptr(&rw->rw_lock, vp, *vp - RW_ONE_READER)) { if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR4(KTR_LOCK, "%s: %p succeeded %p -> %p", __func__, rw, (void *)*vp, (void *)(*vp - RW_ONE_READER)); td->td_rw_rlocks--; return (true); } continue; } /* * If there aren't any waiters for a write lock, then try * to drop it quickly. 
*/ if (!(*vp & RW_LOCK_WAITERS)) { MPASS((*vp & ~RW_LOCK_WRITE_SPINNER) == RW_READERS_LOCK(1)); if (atomic_fcmpset_rel_ptr(&rw->rw_lock, vp, RW_UNLOCKED)) { if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p last succeeded", __func__, rw); td->td_rw_rlocks--; return (true); } continue; } break; } return (false); } static void __noinline __rw_runlock_hard(struct rwlock *rw, struct thread *td, uintptr_t v LOCK_FILE_LINE_ARG_DEF) { struct turnstile *ts; uintptr_t x, queue; if (SCHEDULER_STOPPED()) return; for (;;) { if (__rw_runlock_try(rw, td, &v)) break; /* * Ok, we know we have waiters and we think we are the * last reader, so grab the turnstile lock. */ turnstile_chain_lock(&rw->lock_object); v = RW_READ_VALUE(rw); retry_ts: if (__predict_false(RW_READERS(v) > 1)) { turnstile_chain_unlock(&rw->lock_object); continue; } v &= (RW_LOCK_WAITERS | RW_LOCK_WRITE_SPINNER); MPASS(v & RW_LOCK_WAITERS); /* * Try to drop our lock leaving the lock in a unlocked * state. * * If you wanted to do explicit lock handoff you'd have to * do it here. You'd also want to use turnstile_signal() * and you'd have to handle the race where a higher * priority thread blocks on the write lock before the * thread you wakeup actually runs and have the new thread * "steal" the lock. For now it's a lot simpler to just * wakeup all of the waiters. * * As above, if we fail, then another thread might have * acquired a read lock, so drop the turnstile lock and * restart. */ x = RW_UNLOCKED; if (v & RW_LOCK_WRITE_WAITERS) { queue = TS_EXCLUSIVE_QUEUE; x |= (v & RW_LOCK_READ_WAITERS); } else queue = TS_SHARED_QUEUE; v |= RW_READERS_LOCK(1); if (!atomic_fcmpset_rel_ptr(&rw->rw_lock, &v, x)) goto retry_ts; if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p last succeeded with waiters", __func__, rw); /* * Ok. The lock is released and all that's left is to * wake up the waiters. Note that the lock might not be * free anymore, but in that case the writers will just * block again if they run before the new lock holder(s) * release the lock. */ ts = turnstile_lookup(&rw->lock_object); MPASS(ts != NULL); turnstile_broadcast(ts, queue); turnstile_unpend(ts, TS_SHARED_LOCK); turnstile_chain_unlock(&rw->lock_object); td->td_rw_rlocks--; break; } LOCKSTAT_PROFILE_RELEASE_RWLOCK(rw__release, rw, LOCKSTAT_READER); } void _rw_runlock_cookie_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF) { struct thread *td; uintptr_t v; KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_runlock() of destroyed rwlock @ %s:%d", file, line)); __rw_assert(&rw->rw_lock, RA_RLOCKED, file, line); WITNESS_UNLOCK(&rw->lock_object, 0, file, line); LOCK_LOG_LOCK("RUNLOCK", &rw->lock_object, 0, 0, file, line); td = curthread; v = RW_READ_VALUE(rw); if (__predict_false(LOCKSTAT_OOL_PROFILE_ENABLED(rw__release) || !__rw_runlock_try(rw, td, &v))) __rw_runlock_hard(rw, td, v LOCK_FILE_LINE_ARG); TD_LOCKS_DEC(curthread); } void _rw_runlock_cookie(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; rw = rwlock2rw(c); _rw_runlock_cookie_int(rw LOCK_FILE_LINE_ARG); } /* * This function is called when we are unable to obtain a write lock on the * first try. This means that at least one other thread holds either a * read or write lock. 
*/ void __rw_wlock_hard(volatile uintptr_t *c, uintptr_t v LOCK_FILE_LINE_ARG_DEF) { uintptr_t tid; struct rwlock *rw; struct turnstile *ts; struct thread *owner; #ifdef ADAPTIVE_RWLOCKS int spintries = 0; int i, n; #endif uintptr_t x; #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif #if defined(ADAPTIVE_RWLOCKS) || defined(KDTRACE_HOOKS) struct lock_delay_arg lda; #endif #ifdef KDTRACE_HOOKS u_int sleep_cnt = 0; int64_t sleep_time = 0; int64_t all_time = 0; #endif #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) uintptr_t state; int doing_lockprof; #endif tid = (uintptr_t)curthread; if (SCHEDULER_STOPPED()) return; #if defined(ADAPTIVE_RWLOCKS) lock_delay_arg_init(&lda, &rw_delay); #elif defined(KDTRACE_HOOKS) lock_delay_arg_init(&lda, NULL); #endif rw = rwlock2rw(c); if (__predict_false(v == RW_UNLOCKED)) v = RW_READ_VALUE(rw); if (__predict_false(lv_rw_wowner(v) == (struct thread *)tid)) { KASSERT(rw->lock_object.lo_flags & LO_RECURSABLE, ("%s: recursing but non-recursive rw %s @ %s:%d\n", __func__, rw->lock_object.lo_name, file, line)); rw->rw_recurse++; atomic_set_ptr(&rw->rw_lock, RW_LOCK_WRITER_RECURSED); if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p recursing", __func__, rw); return; } if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__, rw->lock_object.lo_name, (void *)rw->rw_lock, file, line); #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&rw->lock_object, &contested, &waittime); #ifdef LOCK_PROFILING doing_lockprof = 1; state = v; #elif defined(KDTRACE_HOOKS) doing_lockprof = lockstat_enabled; if (__predict_false(doing_lockprof)) { all_time -= lockstat_nsecs(&rw->lock_object); state = v; } #endif for (;;) { if (v == RW_UNLOCKED) { if (_rw_write_lock_fetch(rw, &v, tid)) break; continue; } #ifdef KDTRACE_HOOKS lda.spin_cnt++; #endif #ifdef ADAPTIVE_RWLOCKS /* * If the lock is write locked and the owner is * running on another CPU, spin until the owner stops * running or the state of the lock changes. */ owner = lv_rw_wowner(v); if (!(v & RW_LOCK_READ) && TD_IS_RUNNING(owner)) { if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, rw, owner); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", rw->lock_object.lo_name); do { lock_delay(&lda); v = RW_READ_VALUE(rw); owner = lv_rw_wowner(v); } while (owner != NULL && TD_IS_RUNNING(owner)); KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); continue; } if ((v & RW_LOCK_READ) && RW_READERS(v) && spintries < rowner_retries) { if (!(v & RW_LOCK_WRITE_SPINNER)) { if (!atomic_fcmpset_ptr(&rw->rw_lock, &v, v | RW_LOCK_WRITE_SPINNER)) { continue; } } spintries++; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", rw->lock_object.lo_name); for (i = 0; i < rowner_loops; i += n) { n = RW_READERS(v); lock_delay_spin(n); v = RW_READ_VALUE(rw); if ((v & RW_LOCK_WRITE_SPINNER) == 0) break; } KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); #ifdef KDTRACE_HOOKS lda.spin_cnt += rowner_loops - i; #endif if (i != rowner_loops) continue; } #endif ts = turnstile_trywait(&rw->lock_object); v = RW_READ_VALUE(rw); retry_ts: owner = lv_rw_wowner(v); #ifdef ADAPTIVE_RWLOCKS /* * The current lock owner might have started executing * on another CPU (or the lock could have changed * owners) while we were waiting on the turnstile * chain lock. 
If so, drop the turnstile lock and try * again. */ if (owner != NULL) { if (TD_IS_RUNNING(owner)) { turnstile_cancel(ts); continue; } } #endif /* * Check for the waiters flags about this rwlock. * If the lock was released, without maintain any pending * waiters queue, simply try to acquire it. * If a pending waiters queue is present, claim the lock * ownership and maintain the pending queue. */ x = v & (RW_LOCK_WAITERS | RW_LOCK_WRITE_SPINNER); if ((v & ~x) == RW_UNLOCKED) { x &= ~RW_LOCK_WRITE_SPINNER; if (atomic_fcmpset_acq_ptr(&rw->rw_lock, &v, tid | x)) { if (x) turnstile_claim(ts); else turnstile_cancel(ts); break; } goto retry_ts; } /* * If the RW_LOCK_WRITE_WAITERS flag isn't set, then try to * set it. If we fail to set it, then loop back and try * again. */ if (!(v & RW_LOCK_WRITE_WAITERS)) { if (!atomic_fcmpset_ptr(&rw->rw_lock, &v, v | RW_LOCK_WRITE_WAITERS)) goto retry_ts; if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p set write waiters flag", __func__, rw); } /* * We were unable to acquire the lock and the write waiters * flag is set, so we must block on the turnstile. */ if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__, rw); #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&rw->lock_object); #endif MPASS(owner == rw_owner(rw)); turnstile_wait(ts, owner, TS_EXCLUSIVE_QUEUE); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&rw->lock_object); sleep_cnt++; #endif if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from turnstile", __func__, rw); #ifdef ADAPTIVE_RWLOCKS spintries = 0; #endif v = RW_READ_VALUE(rw); } #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) if (__predict_true(!doing_lockprof)) return; #endif #ifdef KDTRACE_HOOKS all_time += lockstat_nsecs(&rw->lock_object); if (sleep_time) LOCKSTAT_RECORD4(rw__block, rw, sleep_time, LOCKSTAT_WRITER, (state & RW_LOCK_READ) == 0, (state & RW_LOCK_READ) == 0 ? 0 : RW_READERS(state)); /* Record only the loops spinning and not sleeping. */ if (lda.spin_cnt > sleep_cnt) LOCKSTAT_RECORD4(rw__spin, rw, all_time - sleep_time, LOCKSTAT_WRITER, (state & RW_LOCK_READ) == 0, (state & RW_LOCK_READ) == 0 ? 0 : RW_READERS(state)); #endif LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, contested, waittime, file, line, LOCKSTAT_WRITER); } /* * This function is called if lockstat is active or the first try at releasing * a write lock failed. The latter means that the lock is recursed or one of * the 2 waiter bits must be set indicating that at least one thread is waiting * on this lock. */ void __rw_wunlock_hard(volatile uintptr_t *c, uintptr_t v LOCK_FILE_LINE_ARG_DEF) { struct rwlock *rw; struct turnstile *ts; uintptr_t tid, setv; int queue; tid = (uintptr_t)curthread; if (SCHEDULER_STOPPED()) return; rw = rwlock2rw(c); if (__predict_false(v == tid)) v = RW_READ_VALUE(rw); if (v & RW_LOCK_WRITER_RECURSED) { if (--(rw->rw_recurse) == 0) atomic_clear_ptr(&rw->rw_lock, RW_LOCK_WRITER_RECURSED); if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, rw); return; } LOCKSTAT_PROFILE_RELEASE_RWLOCK(rw__release, rw, LOCKSTAT_WRITER); if (v == tid && _rw_write_unlock(rw, tid)) return; KASSERT(rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS), ("%s: neither of the waiter flags are set", __func__)); if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p contested", __func__, rw); turnstile_chain_lock(&rw->lock_object); /* * Use the same algo as sx locks for now. 
Prefer waking up shared * waiters if we have any over writers. This is probably not ideal. * * 'v' is the value we are going to write back to rw_lock. If we * have waiters on both queues, we need to preserve the state of * the waiter flag for the queue we don't wake up. For now this is * hardcoded for the algorithm mentioned above. * * In the case of both readers and writers waiting we wakeup the * readers but leave the RW_LOCK_WRITE_WAITERS flag set. If a * new writer comes in before a reader it will claim the lock up * above. There is probably a potential priority inversion in * there that could be worked around either by waking both queues * of waiters or doing some complicated lock handoff gymnastics. */ setv = RW_UNLOCKED; v = RW_READ_VALUE(rw); queue = TS_SHARED_QUEUE; if (v & RW_LOCK_WRITE_WAITERS) { queue = TS_EXCLUSIVE_QUEUE; setv |= (v & RW_LOCK_READ_WAITERS); } atomic_store_rel_ptr(&rw->rw_lock, setv); /* Wake up all waiters for the specific queue. */ if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR3(KTR_LOCK, "%s: %p waking up %s waiters", __func__, rw, queue == TS_SHARED_QUEUE ? "read" : "write"); ts = turnstile_lookup(&rw->lock_object); MPASS(ts != NULL); turnstile_broadcast(ts, queue); turnstile_unpend(ts, TS_EXCLUSIVE_LOCK); turnstile_chain_unlock(&rw->lock_object); } /* * Attempt to do a non-blocking upgrade from a read lock to a write * lock. This will only succeed if this thread holds a single read * lock. Returns true if the upgrade succeeded and false otherwise. */ int __rw_try_upgrade_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF) { uintptr_t v, x, tid; struct turnstile *ts; int success; if (SCHEDULER_STOPPED()) return (1); KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_try_upgrade() of destroyed rwlock @ %s:%d", file, line)); __rw_assert(&rw->rw_lock, RA_RLOCKED, file, line); /* * Attempt to switch from one reader to a writer. If there * are any write waiters, then we will have to lock the * turnstile first to prevent races with another writer * calling turnstile_wait() before we have claimed this * turnstile. So, do the simple case of no waiters first. */ tid = (uintptr_t)curthread; success = 0; for (;;) { v = rw->rw_lock; if (RW_READERS(v) > 1) break; if (!(v & RW_LOCK_WAITERS)) { success = atomic_cmpset_acq_ptr(&rw->rw_lock, v, tid); if (!success) continue; break; } /* * Ok, we think we have waiters, so lock the turnstile. */ ts = turnstile_trywait(&rw->lock_object); v = rw->rw_lock; if (RW_READERS(v) > 1) { turnstile_cancel(ts); break; } /* * Try to switch from one reader to a writer again. This time * we honor the current state of the waiters flags. * If we obtain the lock with the flags set, then claim * ownership of the turnstile. */ x = rw->rw_lock & RW_LOCK_WAITERS; success = atomic_cmpset_ptr(&rw->rw_lock, v, tid | x); if (success) { if (x) turnstile_claim(ts); else turnstile_cancel(ts); break; } turnstile_cancel(ts); } LOCK_LOG_TRY("WUPGRADE", &rw->lock_object, 0, success, file, line); if (success) { curthread->td_rw_rlocks--; WITNESS_UPGRADE(&rw->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK, file, line); LOCKSTAT_RECORD0(rw__upgrade, rw); } return (success); } int __rw_try_upgrade(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; rw = rwlock2rw(c); return (__rw_try_upgrade_int(rw LOCK_FILE_LINE_ARG)); } /* * Downgrade a write lock into a single read lock. 
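 *
 * A minimal usage sketch (editor's illustration, not part of the original
 * source; "cache_lock" and the cache helpers are hypothetical):
 *
 *	rw_wlock(&cache_lock);
 *	cache_insert(entry);		exclusive access while modifying
 *	rw_downgrade(&cache_lock);	atomically become a reader
 *	cache_lookup(key);		other readers may run concurrently
 *	rw_runlock(&cache_lock);
 *
 * The downgrade is atomic, so no writer can slip in between the
 * exclusive and shared phases.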
*/ void __rw_downgrade_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF) { struct turnstile *ts; uintptr_t tid, v; int rwait, wwait; if (SCHEDULER_STOPPED()) return; KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_downgrade() of destroyed rwlock @ %s:%d", file, line)); __rw_assert(&rw->rw_lock, RA_WLOCKED | RA_NOTRECURSED, file, line); #ifndef INVARIANTS if (rw_recursed(rw)) panic("downgrade of a recursed lock"); #endif WITNESS_DOWNGRADE(&rw->lock_object, 0, file, line); /* * Convert from a writer to a single reader. First we handle * the easy case with no waiters. If there are any waiters, we * lock the turnstile and "disown" the lock. */ tid = (uintptr_t)curthread; if (atomic_cmpset_rel_ptr(&rw->rw_lock, tid, RW_READERS_LOCK(1))) goto out; /* * Ok, we think we have waiters, so lock the turnstile so we can * read the waiter flags without any races. */ turnstile_chain_lock(&rw->lock_object); v = rw->rw_lock & RW_LOCK_WAITERS; rwait = v & RW_LOCK_READ_WAITERS; wwait = v & RW_LOCK_WRITE_WAITERS; MPASS(rwait | wwait); /* * Downgrade from a write lock while preserving waiters flag * and give up ownership of the turnstile. */ ts = turnstile_lookup(&rw->lock_object); MPASS(ts != NULL); if (!wwait) v &= ~RW_LOCK_READ_WAITERS; atomic_store_rel_ptr(&rw->rw_lock, RW_READERS_LOCK(1) | v); /* * Wake other readers if there are no writers pending. Otherwise they * won't be able to acquire the lock anyway. */ if (rwait && !wwait) { turnstile_broadcast(ts, TS_SHARED_QUEUE); turnstile_unpend(ts, TS_EXCLUSIVE_LOCK); } else turnstile_disown(ts); turnstile_chain_unlock(&rw->lock_object); out: curthread->td_rw_rlocks++; LOCK_LOG_LOCK("WDOWNGRADE", &rw->lock_object, 0, 0, file, line); LOCKSTAT_RECORD0(rw__downgrade, rw); } void __rw_downgrade(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; rw = rwlock2rw(c); __rw_downgrade_int(rw LOCK_FILE_LINE_ARG); } #ifdef INVARIANT_SUPPORT #ifndef INVARIANTS #undef __rw_assert #endif /* * In the non-WITNESS case, rw_assert() can only detect that at least * *some* thread owns an rlock, but it cannot guarantee that *this* * thread owns an rlock. */ void __rw_assert(const volatile uintptr_t *c, int what, const char *file, int line) { const struct rwlock *rw; if (panicstr != NULL) return; rw = rwlock2rw(c); switch (what) { case RA_LOCKED: case RA_LOCKED | RA_RECURSED: case RA_LOCKED | RA_NOTRECURSED: case RA_RLOCKED: case RA_RLOCKED | RA_RECURSED: case RA_RLOCKED | RA_NOTRECURSED: #ifdef WITNESS witness_assert(&rw->lock_object, what, file, line); #else /* * If some other thread has a write lock or we have one * and are asserting a read lock, fail. Also, if no one * has a lock at all, fail. */ if (rw->rw_lock == RW_UNLOCKED || (!(rw->rw_lock & RW_LOCK_READ) && (what & RA_RLOCKED || rw_wowner(rw) != curthread))) panic("Lock %s not %slocked @ %s:%d\n", rw->lock_object.lo_name, (what & RA_RLOCKED) ? 
"read " : "", file, line); if (!(rw->rw_lock & RW_LOCK_READ) && !(what & RA_RLOCKED)) { if (rw_recursed(rw)) { if (what & RA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", rw->lock_object.lo_name, file, line); } else if (what & RA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", rw->lock_object.lo_name, file, line); } #endif break; case RA_WLOCKED: case RA_WLOCKED | RA_RECURSED: case RA_WLOCKED | RA_NOTRECURSED: if (rw_wowner(rw) != curthread) panic("Lock %s not exclusively locked @ %s:%d\n", rw->lock_object.lo_name, file, line); if (rw_recursed(rw)) { if (what & RA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", rw->lock_object.lo_name, file, line); } else if (what & RA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", rw->lock_object.lo_name, file, line); break; case RA_UNLOCKED: #ifdef WITNESS witness_assert(&rw->lock_object, what, file, line); #else /* * If we hold a write lock fail. We can't reliably check * to see if we hold a read lock or not. */ if (rw_wowner(rw) == curthread) panic("Lock %s exclusively locked @ %s:%d\n", rw->lock_object.lo_name, file, line); #endif break; default: panic("Unknown rw lock assertion: %d @ %s:%d", what, file, line); } } #endif /* INVARIANT_SUPPORT */ #ifdef DDB void db_show_rwlock(const struct lock_object *lock) { const struct rwlock *rw; struct thread *td; rw = (const struct rwlock *)lock; db_printf(" state: "); if (rw->rw_lock == RW_UNLOCKED) db_printf("UNLOCKED\n"); else if (rw->rw_lock == RW_DESTROYED) { db_printf("DESTROYED\n"); return; } else if (rw->rw_lock & RW_LOCK_READ) db_printf("RLOCK: %ju locks\n", (uintmax_t)(RW_READERS(rw->rw_lock))); else { td = rw_wowner(rw); db_printf("WLOCK: %p (tid %d, pid %d, \"%s\")\n", td, td->td_tid, td->td_proc->p_pid, td->td_name); if (rw_recursed(rw)) db_printf(" recursed: %u\n", rw->rw_recurse); } db_printf(" waiters: "); switch (rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS)) { case RW_LOCK_READ_WAITERS: db_printf("readers\n"); break; case RW_LOCK_WRITE_WAITERS: db_printf("writers\n"); break; case RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS: db_printf("readers and writers\n"); break; default: db_printf("none\n"); break; } } #endif Index: head/sys/kern/kern_sdt.c =================================================================== --- head/sys/kern/kern_sdt.c (revision 326270) +++ head/sys/kern/kern_sdt.c (revision 326271) @@ -1,54 +1,56 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright 2006-2008 John Birrell * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include SDT_PROVIDER_DEFINE(sdt); /* * Hook for the DTrace probe function. The SDT provider will set this to * dtrace_probe() when it loads. */ sdt_probe_func_t sdt_probe_func = sdt_probe_stub; volatile bool __read_frequently sdt_probes_enabled; /* * This is a stub for probe calls in case kernel DTrace support isn't * enabled. It should never get called because there is no DTrace support * to enable it. */ void sdt_probe_stub(uint32_t id, uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4) { printf("sdt_probe_stub: unexpectedly called\n"); kdb_backtrace(); } Index: head/sys/kern/kern_sema.c =================================================================== --- head/sys/kern/kern_sema.c (revision 326270) +++ head/sys/kern/kern_sema.c (revision 326271) @@ -1,176 +1,178 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (C) 2001 Jason Evans . All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ /* * Counting semaphores. * * Priority propagation will not generally raise the priority of semaphore * "owners" (a misnomer in the context of semaphores), so should not be relied * upon in combination with semaphores. 
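 *
 * A minimal producer/consumer sketch (editor's illustration; the work
 * queue and its helpers are hypothetical):
 *
 *	static struct sema work_sema;
 *	sema_init(&work_sema, 0, "pending work");
 *
 *	producer:  enqueue(item); sema_post(&work_sema);
 *	consumer:  sema_wait(&work_sema); item = dequeue();
 *
 * The semaphore counts queued items, so a consumer sleeps only while
 * the queue is empty.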
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include void sema_init(struct sema *sema, int value, const char *description) { KASSERT((value >= 0), ("%s(): negative value\n", __func__)); bzero(sema, sizeof(*sema)); mtx_init(&sema->sema_mtx, description, "sema backing lock", MTX_DEF | MTX_NOWITNESS | MTX_QUIET); cv_init(&sema->sema_cv, description); sema->sema_value = value; CTR4(KTR_LOCK, "%s(%p, %d, \"%s\")", __func__, sema, value, description); } void sema_destroy(struct sema *sema) { CTR3(KTR_LOCK, "%s(%p) \"%s\"", __func__, sema, cv_wmesg(&sema->sema_cv)); KASSERT((sema->sema_waiters == 0), ("%s(): waiters\n", __func__)); mtx_destroy(&sema->sema_mtx); cv_destroy(&sema->sema_cv); } void _sema_post(struct sema *sema, const char *file, int line) { mtx_lock(&sema->sema_mtx); sema->sema_value++; if (sema->sema_waiters && sema->sema_value > 0) cv_signal(&sema->sema_cv); CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema, cv_wmesg(&sema->sema_cv), sema->sema_value, file, line); mtx_unlock(&sema->sema_mtx); } void _sema_wait(struct sema *sema, const char *file, int line) { mtx_lock(&sema->sema_mtx); while (sema->sema_value == 0) { sema->sema_waiters++; cv_wait(&sema->sema_cv, &sema->sema_mtx); sema->sema_waiters--; } sema->sema_value--; CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema, cv_wmesg(&sema->sema_cv), sema->sema_value, file, line); mtx_unlock(&sema->sema_mtx); } int _sema_timedwait(struct sema *sema, int timo, const char *file, int line) { int error; mtx_lock(&sema->sema_mtx); /* * A spurious wakeup will cause the timeout interval to start over. * This isn't a big deal as long as spurious wakeups don't occur * continuously, since the timeout period is merely a lower bound on how * long to wait. */ for (error = 0; sema->sema_value == 0 && error == 0;) { sema->sema_waiters++; error = cv_timedwait(&sema->sema_cv, &sema->sema_mtx, timo); sema->sema_waiters--; } if (sema->sema_value > 0) { /* Success. */ sema->sema_value--; error = 0; CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema, cv_wmesg(&sema->sema_cv), sema->sema_value, file, line); } else { CTR5(KTR_LOCK, "%s(%p) \"%s\" fail at %s:%d", __func__, sema, cv_wmesg(&sema->sema_cv), file, line); } mtx_unlock(&sema->sema_mtx); return (error); } int _sema_trywait(struct sema *sema, const char *file, int line) { int ret; mtx_lock(&sema->sema_mtx); if (sema->sema_value > 0) { /* Success. */ sema->sema_value--; ret = 1; CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema, cv_wmesg(&sema->sema_cv), sema->sema_value, file, line); } else { ret = 0; CTR5(KTR_LOCK, "%s(%p) \"%s\" fail at %s:%d", __func__, sema, cv_wmesg(&sema->sema_cv), file, line); } mtx_unlock(&sema->sema_mtx); return (ret); } int sema_value(struct sema *sema) { int ret; mtx_lock(&sema->sema_mtx); ret = sema->sema_value; mtx_unlock(&sema->sema_mtx); return (ret); } Index: head/sys/kern/kern_sharedpage.c =================================================================== --- head/sys/kern/kern_sharedpage.c (revision 326270) +++ head/sys/kern/kern_sharedpage.c (revision 326271) @@ -1,288 +1,290 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2010, 2012 Konstantin Belousov * Copyright (c) 2015 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct sx shared_page_alloc_sx; static vm_object_t shared_page_obj; static int shared_page_free; char *shared_page_mapping; void shared_page_write(int base, int size, const void *data) { bcopy(data, shared_page_mapping + base, size); } static int shared_page_alloc_locked(int size, int align) { int res; res = roundup(shared_page_free, align); if (res + size >= IDX_TO_OFF(shared_page_obj->size)) res = -1; else shared_page_free = res + size; return (res); } int shared_page_alloc(int size, int align) { int res; sx_xlock(&shared_page_alloc_sx); res = shared_page_alloc_locked(size, align); sx_xunlock(&shared_page_alloc_sx); return (res); } int shared_page_fill(int size, int align, const void *data) { int res; sx_xlock(&shared_page_alloc_sx); res = shared_page_alloc_locked(size, align); if (res != -1) shared_page_write(res, size, data); sx_xunlock(&shared_page_alloc_sx); return (res); } static void shared_page_init(void *dummy __unused) { vm_page_t m; vm_offset_t addr; sx_init(&shared_page_alloc_sx, "shpsx"); shared_page_obj = vm_pager_allocate(OBJT_PHYS, 0, PAGE_SIZE, VM_PROT_DEFAULT, 0, NULL); VM_OBJECT_WLOCK(shared_page_obj); m = vm_page_grab(shared_page_obj, 0, VM_ALLOC_NOBUSY | VM_ALLOC_ZERO); m->valid = VM_PAGE_BITS_ALL; VM_OBJECT_WUNLOCK(shared_page_obj); addr = kva_alloc(PAGE_SIZE); pmap_qenter(addr, &m, 1); shared_page_mapping = (char *)addr; } SYSINIT(shp, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)shared_page_init, NULL); /* * Push the timehands update to the shared page. * * The lockless update scheme is similar to the one used to update the * in-kernel timehands, see sys/kern/kern_tc.c:tc_windup() (which * calls us after the timehands are updated). 
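 *
 * For reference, a consumer of the shared page pairs with this update
 * roughly as follows (editor's sketch of the generation protocol, not
 * the literal userspace code):
 *
 *	do {
 *		curr = tk->tk_current;
 *		gen = tk->tk_th[curr].th_gen;	acquire load
 *		copy tk->tk_th[curr] to a local buffer;
 *	} while (gen == 0 || gen != tk->tk_th[curr].th_gen);
 *
 * A generation of zero marks an update in progress, so a reader retries
 * until it sees the same nonzero generation before and after the copy.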
*/ static void timehands_update(struct vdso_sv_tk *svtk) { struct vdso_timehands th; struct vdso_timekeep *tk; uint32_t enabled, idx; enabled = tc_fill_vdso_timehands(&th); th.th_gen = 0; idx = svtk->sv_timekeep_curr; if (++idx >= VDSO_TH_NUM) idx = 0; svtk->sv_timekeep_curr = idx; if (++svtk->sv_timekeep_gen == 0) svtk->sv_timekeep_gen = 1; tk = (struct vdso_timekeep *)(shared_page_mapping + svtk->sv_timekeep_off); tk->tk_th[idx].th_gen = 0; atomic_thread_fence_rel(); if (enabled) tk->tk_th[idx] = th; atomic_store_rel_32(&tk->tk_th[idx].th_gen, svtk->sv_timekeep_gen); atomic_store_rel_32(&tk->tk_current, idx); /* * The ordering of the assignment to tk_enabled relative to * the update of the vdso_timehands is not important. */ tk->tk_enabled = enabled; } #ifdef COMPAT_FREEBSD32 static void timehands_update32(struct vdso_sv_tk *svtk) { struct vdso_timehands32 th; struct vdso_timekeep32 *tk; uint32_t enabled, idx; enabled = tc_fill_vdso_timehands32(&th); th.th_gen = 0; idx = svtk->sv_timekeep_curr; if (++idx >= VDSO_TH_NUM) idx = 0; svtk->sv_timekeep_curr = idx; if (++svtk->sv_timekeep_gen == 0) svtk->sv_timekeep_gen = 1; tk = (struct vdso_timekeep32 *)(shared_page_mapping + svtk->sv_timekeep_off); tk->tk_th[idx].th_gen = 0; atomic_thread_fence_rel(); if (enabled) tk->tk_th[idx] = th; atomic_store_rel_32(&tk->tk_th[idx].th_gen, svtk->sv_timekeep_gen); atomic_store_rel_32(&tk->tk_current, idx); tk->tk_enabled = enabled; } #endif /* * This is hackish, but it is the easiest way to avoid creating list * structures that need to be iterated over from the hardclock * interrupt context. */ static struct vdso_sv_tk *host_svtk; #ifdef COMPAT_FREEBSD32 static struct vdso_sv_tk *compat32_svtk; #endif void timekeep_push_vdso(void) { if (host_svtk != NULL) timehands_update(host_svtk); #ifdef COMPAT_FREEBSD32 if (compat32_svtk != NULL) timehands_update32(compat32_svtk); #endif } struct vdso_sv_tk * alloc_sv_tk(void) { struct vdso_sv_tk *svtk; int tk_base; uint32_t tk_ver; tk_ver = VDSO_TK_VER_CURR; svtk = malloc(sizeof(struct vdso_sv_tk), M_TEMP, M_WAITOK | M_ZERO); tk_base = shared_page_alloc(sizeof(struct vdso_timekeep) + sizeof(struct vdso_timehands) * VDSO_TH_NUM, 16); KASSERT(tk_base != -1, ("tk_base -1 for native")); shared_page_write(tk_base + offsetof(struct vdso_timekeep, tk_ver), sizeof(uint32_t), &tk_ver); svtk->sv_timekeep_off = tk_base; timekeep_push_vdso(); return (svtk); } #ifdef COMPAT_FREEBSD32 struct vdso_sv_tk * alloc_sv_tk_compat32(void) { struct vdso_sv_tk *svtk; int tk_base; uint32_t tk_ver; svtk = malloc(sizeof(struct vdso_sv_tk), M_TEMP, M_WAITOK | M_ZERO); tk_ver = VDSO_TK_VER_CURR; tk_base = shared_page_alloc(sizeof(struct vdso_timekeep32) + sizeof(struct vdso_timehands32) * VDSO_TH_NUM, 16); KASSERT(tk_base != -1, ("tk_base -1 for 32bit")); shared_page_write(tk_base + offsetof(struct vdso_timekeep32, tk_ver), sizeof(uint32_t), &tk_ver); svtk->sv_timekeep_off = tk_base; timekeep_push_vdso(); return (svtk); } #endif void exec_sysvec_init(void *param) { struct sysentvec *sv; sv = (struct sysentvec *)param; if ((sv->sv_flags & SV_SHP) == 0) return; sv->sv_shared_page_obj = shared_page_obj; sv->sv_sigcode_base = sv->sv_shared_page_base + shared_page_fill(*(sv->sv_szsigcode), 16, sv->sv_sigcode); if ((sv->sv_flags & SV_ABI_MASK) != SV_ABI_FREEBSD) return; if ((sv->sv_flags & SV_TIMEKEEP) != 0) { #ifdef COMPAT_FREEBSD32 if ((sv->sv_flags & SV_ILP32) != 0) { KASSERT(compat32_svtk == NULL, ("Compat32 already registered")); compat32_svtk = alloc_sv_tk_compat32(); sv->sv_timekeep_base =
sv->sv_shared_page_base + compat32_svtk->sv_timekeep_off; } else { #endif KASSERT(host_svtk == NULL, ("Host already registered")); host_svtk = alloc_sv_tk(); sv->sv_timekeep_base = sv->sv_shared_page_base + host_svtk->sv_timekeep_off; #ifdef COMPAT_FREEBSD32 } #endif } } Index: head/sys/kern/kern_switch.c =================================================================== --- head/sys/kern/kern_switch.c (revision 326270) +++ head/sys/kern/kern_switch.c (revision 326271) @@ -1,541 +1,543 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2001 Jake Burkholder * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include /* Uncomment this to enable logging of critical_enter/exit. */ #if 0 #define KTR_CRITICAL KTR_SCHED #else #define KTR_CRITICAL 0 #endif #ifdef FULL_PREEMPTION #ifndef PREEMPTION #error "The FULL_PREEMPTION option requires the PREEMPTION option" #endif #endif CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS); /* * kern.sched.preemption allows user space to determine if preemption support * is compiled in or not. It is not currently a boot or runtime flag that * can be changed. */ #ifdef PREEMPTION static int kern_sched_preemption = 1; #else static int kern_sched_preemption = 0; #endif SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD, &kern_sched_preemption, 0, "Kernel preemption enabled"); /* * Support for scheduler stats exported via kern.sched.stats. All stats may * be reset with kern.sched.stats.reset = 1. Stats may be defined elsewhere * with SCHED_STAT_DEFINE(). */ #ifdef SCHED_STATS SYSCTL_NODE(_kern_sched, OID_AUTO, stats, CTLFLAG_RW, 0, "switch stats"); /* Switch reasons from mi_switch(). 
*/ DPCPU_DEFINE(long, sched_switch_stats[SWT_COUNT]); SCHED_STAT_DEFINE_VAR(uncategorized, &DPCPU_NAME(sched_switch_stats[SWT_NONE]), ""); SCHED_STAT_DEFINE_VAR(preempt, &DPCPU_NAME(sched_switch_stats[SWT_PREEMPT]), ""); SCHED_STAT_DEFINE_VAR(owepreempt, &DPCPU_NAME(sched_switch_stats[SWT_OWEPREEMPT]), ""); SCHED_STAT_DEFINE_VAR(turnstile, &DPCPU_NAME(sched_switch_stats[SWT_TURNSTILE]), ""); SCHED_STAT_DEFINE_VAR(sleepq, &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQ]), ""); SCHED_STAT_DEFINE_VAR(sleepqtimo, &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQTIMO]), ""); SCHED_STAT_DEFINE_VAR(relinquish, &DPCPU_NAME(sched_switch_stats[SWT_RELINQUISH]), ""); SCHED_STAT_DEFINE_VAR(needresched, &DPCPU_NAME(sched_switch_stats[SWT_NEEDRESCHED]), ""); SCHED_STAT_DEFINE_VAR(idle, &DPCPU_NAME(sched_switch_stats[SWT_IDLE]), ""); SCHED_STAT_DEFINE_VAR(iwait, &DPCPU_NAME(sched_switch_stats[SWT_IWAIT]), ""); SCHED_STAT_DEFINE_VAR(suspend, &DPCPU_NAME(sched_switch_stats[SWT_SUSPEND]), ""); SCHED_STAT_DEFINE_VAR(remotepreempt, &DPCPU_NAME(sched_switch_stats[SWT_REMOTEPREEMPT]), ""); SCHED_STAT_DEFINE_VAR(remotewakeidle, &DPCPU_NAME(sched_switch_stats[SWT_REMOTEWAKEIDLE]), ""); static int sysctl_stats_reset(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *p; uintptr_t counter; int error; int val; int i; val = 0; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val == 0) return (0); /* * Traverse the list of children of _kern_sched_stats and reset each * to 0. Skip the reset entry. */ SLIST_FOREACH(p, oidp->oid_parent, oid_link) { if (p == oidp || p->oid_arg1 == NULL) continue; counter = (uintptr_t)p->oid_arg1; CPU_FOREACH(i) { *(long *)(dpcpu_off[i] + counter) = 0; } } return (0); } SYSCTL_PROC(_kern_sched_stats, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_WR, NULL, 0, sysctl_stats_reset, "I", "Reset scheduler statistics"); #endif /************************************************************************ * Functions that manipulate runnability from a thread perspective. * ************************************************************************/ /* * Select the thread that will be run next. */ static __noinline struct thread * choosethread_panic(struct thread *td) { /* * If we are in panic, only allow system threads, * plus the one we are running in, to be run. */ retry: if (((td->td_proc->p_flag & P_SYSTEM) == 0 && (td->td_flags & TDF_INPANIC) == 0)) { /* note that it is no longer on the run queue */ TD_SET_CAN_RUN(td); td = sched_choose(); goto retry; } TD_SET_RUNNING(td); return (td); } struct thread * choosethread(void) { struct thread *td; td = sched_choose(); if (__predict_false(panicstr != NULL)) return (choosethread_panic(td)); TD_SET_RUNNING(td); return (td); } /* * Kernel thread preemption implementation. Critical sections mark * regions of code in which preemptions are not allowed. * * It might seem a good idea to inline critical_enter() but, in order * to prevent instructions reordering by the compiler, a __compiler_membar() * would have to be used here (the same as sched_pin()). The performance * penalty imposed by the membar could, then, produce slower code than * the function call itself, for most cases. 
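 *
 * Typical usage (editor's illustration; the surrounding per-CPU logic
 * is hypothetical):
 *
 *	critical_enter();
 *	cpu = PCPU_GET(cpuid);	no preemption, so "cpu" stays accurate
 *	... touch per-CPU state belonging to "cpu" ...
 *	critical_exit();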
*/ void critical_enter(void) { struct thread *td; td = curthread; td->td_critnest++; CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td, (long)td->td_proc->p_pid, td->td_name, td->td_critnest); } void critical_exit(void) { struct thread *td; int flags; td = curthread; KASSERT(td->td_critnest != 0, ("critical_exit: td_critnest == 0")); if (td->td_critnest == 1) { td->td_critnest = 0; /* * Interrupt handlers execute critical_exit() on * leave, and td_owepreempt may be left set by an * interrupt handler only when td_critnest > 0. If we * are decrementing td_critnest from 1 to 0, read * td_owepreempt after decrementing, to not miss the * preempt. Disallow compiler to reorder operations. */ __compiler_membar(); if (td->td_owepreempt && !kdb_active) { /* * Microoptimization: we committed to switch, * disable preemption in interrupt handlers * while spinning for the thread lock. */ td->td_critnest = 1; thread_lock(td); td->td_critnest--; flags = SW_INVOL | SW_PREEMPT; if (TD_IS_IDLETHREAD(td)) flags |= SWT_IDLE; else flags |= SWT_OWEPREEMPT; mi_switch(flags, NULL); thread_unlock(td); } } else td->td_critnest--; CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td, (long)td->td_proc->p_pid, td->td_name, td->td_critnest); } /************************************************************************ * SYSTEM RUN QUEUE manipulations and tests * ************************************************************************/ /* * Initialize a run structure. */ void runq_init(struct runq *rq) { int i; bzero(rq, sizeof *rq); for (i = 0; i < RQ_NQS; i++) TAILQ_INIT(&rq->rq_queues[i]); } /* * Clear the status bit of the queue corresponding to priority level pri, * indicating that it is empty. */ static __inline void runq_clrbit(struct runq *rq, int pri) { struct rqbits *rqb; rqb = &rq->rq_status; CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d", rqb->rqb_bits[RQB_WORD(pri)], rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri), RQB_BIT(pri), RQB_WORD(pri)); rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri); } /* * Find the index of the first non-empty run queue. This is done by * scanning the status bits, a set bit indicates a non-empty queue. */ static __inline int runq_findbit(struct runq *rq) { struct rqbits *rqb; int pri; int i; rqb = &rq->rq_status; for (i = 0; i < RQB_LEN; i++) if (rqb->rqb_bits[i]) { pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW); CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d", rqb->rqb_bits[i], i, pri); return (pri); } return (-1); } static __inline int runq_findbit_from(struct runq *rq, u_char pri) { struct rqbits *rqb; rqb_word_t mask; int i; /* * Set the mask for the first word so we ignore priorities before 'pri'. */ mask = (rqb_word_t)-1 << (pri & (RQB_BPW - 1)); rqb = &rq->rq_status; again: for (i = RQB_WORD(pri); i < RQB_LEN; mask = -1, i++) { mask = rqb->rqb_bits[i] & mask; if (mask == 0) continue; pri = RQB_FFS(mask) + (i << RQB_L2BPW); CTR3(KTR_RUNQ, "runq_findbit_from: bits=%#x i=%d pri=%d", mask, i, pri); return (pri); } if (pri == 0) return (-1); /* * Wrap back around to the beginning of the list just once so we * scan the whole thing. */ pri = 0; goto again; } /* * Set the status bit of the queue corresponding to priority level pri, * indicating that it is non-empty. 
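 *
 * Worked example (editor's note): assuming RQB_BPW == 32, priority
 * level pri == 37 lands in status word RQB_WORD(37) == 37 >> 5 == 1,
 * at bit RQB_BIT(37) == 1 << (37 & 31) == 1 << 5. runq_setbit() ORs
 * that bit in; runq_clrbit() clears it once the queue drains.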
*/ static __inline void runq_setbit(struct runq *rq, int pri) { struct rqbits *rqb; rqb = &rq->rq_status; CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d", rqb->rqb_bits[RQB_WORD(pri)], rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri), RQB_BIT(pri), RQB_WORD(pri)); rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri); } /* * Add the thread to the queue specified by its priority, and set the * corresponding status bit. */ void runq_add(struct runq *rq, struct thread *td, int flags) { struct rqhead *rqh; int pri; pri = td->td_priority / RQ_PPQ; td->td_rqindex = pri; runq_setbit(rq, pri); rqh = &rq->rq_queues[pri]; CTR4(KTR_RUNQ, "runq_add: td=%p pri=%d %d rqh=%p", td, td->td_priority, pri, rqh); if (flags & SRQ_PREEMPTED) { TAILQ_INSERT_HEAD(rqh, td, td_runq); } else { TAILQ_INSERT_TAIL(rqh, td, td_runq); } } void runq_add_pri(struct runq *rq, struct thread *td, u_char pri, int flags) { struct rqhead *rqh; KASSERT(pri < RQ_NQS, ("runq_add_pri: %d out of range", pri)); td->td_rqindex = pri; runq_setbit(rq, pri); rqh = &rq->rq_queues[pri]; CTR4(KTR_RUNQ, "runq_add_pri: td=%p pri=%d idx=%d rqh=%p", td, td->td_priority, pri, rqh); if (flags & SRQ_PREEMPTED) { TAILQ_INSERT_HEAD(rqh, td, td_runq); } else { TAILQ_INSERT_TAIL(rqh, td, td_runq); } } /* * Return true if there are runnable processes of any priority on the run * queue, false otherwise. Has no side effects, does not modify the run * queue structure. */ int runq_check(struct runq *rq) { struct rqbits *rqb; int i; rqb = &rq->rq_status; for (i = 0; i < RQB_LEN; i++) if (rqb->rqb_bits[i]) { CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d", rqb->rqb_bits[i], i); return (1); } CTR0(KTR_RUNQ, "runq_check: empty"); return (0); } /* * Find the highest priority process on the run queue. */ struct thread * runq_choose_fuzz(struct runq *rq, int fuzz) { struct rqhead *rqh; struct thread *td; int pri; while ((pri = runq_findbit(rq)) != -1) { rqh = &rq->rq_queues[pri]; /* fuzz == 1 is normal.. 0 or less are ignored */ if (fuzz > 1) { /* * In the first couple of entries, check if * there is one for our CPU as a preference. */ int count = fuzz; int cpu = PCPU_GET(cpuid); struct thread *td2; td2 = td = TAILQ_FIRST(rqh); while (count-- && td2) { if (td2->td_lastcpu == cpu) { td = td2; break; } td2 = TAILQ_NEXT(td2, td_runq); } } else td = TAILQ_FIRST(rqh); KASSERT(td != NULL, ("runq_choose_fuzz: no proc on busy queue")); CTR3(KTR_RUNQ, "runq_choose_fuzz: pri=%d thread=%p rqh=%p", pri, td, rqh); return (td); } CTR1(KTR_RUNQ, "runq_choose_fuzz: idleproc pri=%d", pri); return (NULL); } /* * Find the highest priority process on the run queue. 
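 *
 * Editor's note: "highest priority" means the lowest numeric index,
 * since runq_findbit() returns the first set status bit. Assuming
 * RQ_PPQ == 4, for example, a thread queued by runq_add() with
 * td_priority == 100 sits on queue 100 / 4 == 25 and is chosen ahead
 * of anything queued at index 26 or higher.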
*/ struct thread * runq_choose(struct runq *rq) { struct rqhead *rqh; struct thread *td; int pri; while ((pri = runq_findbit(rq)) != -1) { rqh = &rq->rq_queues[pri]; td = TAILQ_FIRST(rqh); KASSERT(td != NULL, ("runq_choose: no thread on busy queue")); CTR3(KTR_RUNQ, "runq_choose: pri=%d thread=%p rqh=%p", pri, td, rqh); return (td); } CTR1(KTR_RUNQ, "runq_choose: idlethread pri=%d", pri); return (NULL); } struct thread * runq_choose_from(struct runq *rq, u_char idx) { struct rqhead *rqh; struct thread *td; int pri; if ((pri = runq_findbit_from(rq, idx)) != -1) { rqh = &rq->rq_queues[pri]; td = TAILQ_FIRST(rqh); KASSERT(td != NULL, ("runq_choose: no thread on busy queue")); CTR4(KTR_RUNQ, "runq_choose_from: pri=%d thread=%p idx=%d rqh=%p", pri, td, td->td_rqindex, rqh); return (td); } CTR1(KTR_RUNQ, "runq_choose_from: idlethread pri=%d", pri); return (NULL); } /* * Remove the thread from the queue specified by its priority, and clear the * corresponding status bit if the queue becomes empty. * Caller must set state afterwards. */ void runq_remove(struct runq *rq, struct thread *td) { runq_remove_idx(rq, td, NULL); } void runq_remove_idx(struct runq *rq, struct thread *td, u_char *idx) { struct rqhead *rqh; u_char pri; KASSERT(td->td_flags & TDF_INMEM, ("runq_remove_idx: thread swapped out")); pri = td->td_rqindex; KASSERT(pri < RQ_NQS, ("runq_remove_idx: Invalid index %d\n", pri)); rqh = &rq->rq_queues[pri]; CTR4(KTR_RUNQ, "runq_remove_idx: td=%p, pri=%d %d rqh=%p", td, td->td_priority, pri, rqh); TAILQ_REMOVE(rqh, td, td_runq); if (TAILQ_EMPTY(rqh)) { CTR0(KTR_RUNQ, "runq_remove_idx: empty"); runq_clrbit(rq, pri); if (idx != NULL && *idx == pri) *idx = (pri + 1) % RQ_NQS; } } Index: head/sys/kern/kern_sx.c =================================================================== --- head/sys/kern/kern_sx.c (revision 326270) +++ head/sys/kern/kern_sx.c (revision 326271) @@ -1,1404 +1,1406 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2007 Attilio Rao * Copyright (c) 2001 Jason Evans * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ /* * Shared/exclusive locks. 
This implementation attempts to ensure * deterministic lock granting behavior, so that slocks and xlocks are * interleaved. * * Priority propagation will not generally raise the priority of lock holders, * so should not be relied upon in combination with sx locks. */ #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_no_adaptive_sx.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(SMP) && !defined(NO_ADAPTIVE_SX) #include #endif #ifdef DDB #include #endif #if defined(SMP) && !defined(NO_ADAPTIVE_SX) #define ADAPTIVE_SX #endif CTASSERT((SX_NOADAPTIVE & LO_CLASSFLAGS) == SX_NOADAPTIVE); #ifdef HWPMC_HOOKS #include PMC_SOFT_DECLARE( , , lock, failed); #endif /* Handy macros for sleep queues. */ #define SQ_EXCLUSIVE_QUEUE 0 #define SQ_SHARED_QUEUE 1 /* * Variations on DROP_GIANT()/PICKUP_GIANT() for use in this file. We * drop Giant anytime we have to sleep or if we adaptively spin. */ #define GIANT_DECLARE \ int _giantcnt = 0; \ WITNESS_SAVE_DECL(Giant) \ #define GIANT_SAVE(work) do { \ if (mtx_owned(&Giant)) { \ work++; \ WITNESS_SAVE(&Giant.lock_object, Giant); \ while (mtx_owned(&Giant)) { \ _giantcnt++; \ mtx_unlock(&Giant); \ } \ } \ } while (0) #define GIANT_RESTORE() do { \ if (_giantcnt > 0) { \ mtx_assert(&Giant, MA_NOTOWNED); \ while (_giantcnt--) \ mtx_lock(&Giant); \ WITNESS_RESTORE(&Giant.lock_object, Giant); \ } \ } while (0) /* * Returns true if an exclusive lock is recursed. It assumes * curthread currently has an exclusive lock. */ #define sx_recursed(sx) ((sx)->sx_recurse != 0) static void assert_sx(const struct lock_object *lock, int what); #ifdef DDB static void db_show_sx(const struct lock_object *lock); #endif static void lock_sx(struct lock_object *lock, uintptr_t how); #ifdef KDTRACE_HOOKS static int owner_sx(const struct lock_object *lock, struct thread **owner); #endif static uintptr_t unlock_sx(struct lock_object *lock); struct lock_class lock_class_sx = { .lc_name = "sx", .lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE | LC_UPGRADABLE, .lc_assert = assert_sx, #ifdef DDB .lc_ddb_show = db_show_sx, #endif .lc_lock = lock_sx, .lc_unlock = unlock_sx, #ifdef KDTRACE_HOOKS .lc_owner = owner_sx, #endif }; #ifndef INVARIANTS #define _sx_assert(sx, what, file, line) #endif #ifdef ADAPTIVE_SX static __read_frequently u_int asx_retries = 10; static __read_frequently u_int asx_loops = 10000; static SYSCTL_NODE(_debug, OID_AUTO, sx, CTLFLAG_RD, NULL, "sxlock debugging"); SYSCTL_UINT(_debug_sx, OID_AUTO, retries, CTLFLAG_RW, &asx_retries, 0, ""); SYSCTL_UINT(_debug_sx, OID_AUTO, loops, CTLFLAG_RW, &asx_loops, 0, ""); static struct lock_delay_config __read_frequently sx_delay; SYSCTL_INT(_debug_sx, OID_AUTO, delay_base, CTLFLAG_RW, &sx_delay.base, 0, ""); SYSCTL_INT(_debug_sx, OID_AUTO, delay_max, CTLFLAG_RW, &sx_delay.max, 0, ""); LOCK_DELAY_SYSINIT_DEFAULT(sx_delay); #endif void assert_sx(const struct lock_object *lock, int what) { sx_assert((const struct sx *)lock, what); } void lock_sx(struct lock_object *lock, uintptr_t how) { struct sx *sx; sx = (struct sx *)lock; if (how) sx_slock(sx); else sx_xlock(sx); } uintptr_t unlock_sx(struct lock_object *lock) { struct sx *sx; sx = (struct sx *)lock; sx_assert(sx, SA_LOCKED | SA_NOTRECURSED); if (sx_xlocked(sx)) { sx_xunlock(sx); return (0); } else { sx_sunlock(sx); return (1); } } #ifdef KDTRACE_HOOKS int owner_sx(const struct lock_object *lock, struct thread **owner) { const struct sx *sx; 
uintptr_t x; sx = (const struct sx *)lock; x = sx->sx_lock; *owner = NULL; return ((x & SX_LOCK_SHARED) != 0 ? (SX_SHARERS(x) != 0) : ((*owner = (struct thread *)SX_OWNER(x)) != NULL)); } #endif void sx_sysinit(void *arg) { struct sx_args *sargs = arg; sx_init_flags(sargs->sa_sx, sargs->sa_desc, sargs->sa_flags); } void sx_init_flags(struct sx *sx, const char *description, int opts) { int flags; MPASS((opts & ~(SX_QUIET | SX_RECURSE | SX_NOWITNESS | SX_DUPOK | SX_NOPROFILE | SX_NOADAPTIVE | SX_NEW)) == 0); ASSERT_ATOMIC_LOAD_PTR(sx->sx_lock, ("%s: sx_lock not aligned for %s: %p", __func__, description, &sx->sx_lock)); flags = LO_SLEEPABLE | LO_UPGRADABLE; if (opts & SX_DUPOK) flags |= LO_DUPOK; if (opts & SX_NOPROFILE) flags |= LO_NOPROFILE; if (!(opts & SX_NOWITNESS)) flags |= LO_WITNESS; if (opts & SX_RECURSE) flags |= LO_RECURSABLE; if (opts & SX_QUIET) flags |= LO_QUIET; if (opts & SX_NEW) flags |= LO_NEW; flags |= opts & SX_NOADAPTIVE; lock_init(&sx->lock_object, &lock_class_sx, description, NULL, flags); sx->sx_lock = SX_LOCK_UNLOCKED; sx->sx_recurse = 0; } void sx_destroy(struct sx *sx) { KASSERT(sx->sx_lock == SX_LOCK_UNLOCKED, ("sx lock still held")); KASSERT(sx->sx_recurse == 0, ("sx lock still recursed")); sx->sx_lock = SX_LOCK_DESTROYED; lock_destroy(&sx->lock_object); } int sx_try_slock_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF) { uintptr_t x; if (SCHEDULER_STOPPED()) return (1); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("sx_try_slock() by idle thread %p on sx %s @ %s:%d", curthread, sx->lock_object.lo_name, file, line)); x = sx->sx_lock; for (;;) { KASSERT(x != SX_LOCK_DESTROYED, ("sx_try_slock() of destroyed sx @ %s:%d", file, line)); if (!(x & SX_LOCK_SHARED)) break; if (atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, x + SX_ONE_SHARER)) { LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 1, file, line); WITNESS_LOCK(&sx->lock_object, LOP_TRYLOCK, file, line); LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, 0, 0, file, line, LOCKSTAT_READER); TD_LOCKS_INC(curthread); return (1); } } LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 0, file, line); return (0); } int sx_try_slock_(struct sx *sx, const char *file, int line) { return (sx_try_slock_int(sx LOCK_FILE_LINE_ARG)); } int _sx_xlock(struct sx *sx, int opts, const char *file, int line) { uintptr_t tid, x; int error = 0; KASSERT(kdb_active != 0 || SCHEDULER_STOPPED() || !TD_IS_IDLETHREAD(curthread), ("sx_xlock() by idle thread %p on sx %s @ %s:%d", curthread, sx->lock_object.lo_name, file, line)); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_xlock() of destroyed sx @ %s:%d", file, line)); WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); tid = (uintptr_t)curthread; x = SX_LOCK_UNLOCKED; if (!atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, tid)) error = _sx_xlock_hard(sx, x, opts LOCK_FILE_LINE_ARG); else LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, 0, 0, file, line, LOCKSTAT_WRITER); if (!error) { LOCK_LOG_LOCK("XLOCK", &sx->lock_object, 0, sx->sx_recurse, file, line); WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line); TD_LOCKS_INC(curthread); } return (error); } int sx_try_xlock_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF) { struct thread *td; uintptr_t tid, x; int rval; bool recursed; td = curthread; tid = (uintptr_t)td; if (SCHEDULER_STOPPED_TD(td)) return (1); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(td), ("sx_try_xlock() by idle thread %p on sx %s @ %s:%d", curthread, sx->lock_object.lo_name, file, line)); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_try_xlock() 
of destroyed sx @ %s:%d", file, line)); rval = 1; recursed = false; x = SX_LOCK_UNLOCKED; for (;;) { if (atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, tid)) break; if (x == SX_LOCK_UNLOCKED) continue; if (x == tid && (sx->lock_object.lo_flags & LO_RECURSABLE)) { sx->sx_recurse++; atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED); break; } rval = 0; break; } LOCK_LOG_TRY("XLOCK", &sx->lock_object, 0, rval, file, line); if (rval) { WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK, file, line); if (!recursed) LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, 0, 0, file, line, LOCKSTAT_WRITER); TD_LOCKS_INC(curthread); } return (rval); } int sx_try_xlock_(struct sx *sx, const char *file, int line) { return (sx_try_xlock_int(sx LOCK_FILE_LINE_ARG)); } void _sx_xunlock(struct sx *sx, const char *file, int line) { KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_xunlock() of destroyed sx @ %s:%d", file, line)); _sx_assert(sx, SA_XLOCKED, file, line); WITNESS_UNLOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line); LOCK_LOG_LOCK("XUNLOCK", &sx->lock_object, 0, sx->sx_recurse, file, line); #if LOCK_DEBUG > 0 _sx_xunlock_hard(sx, (uintptr_t)curthread, file, line); #else __sx_xunlock(sx, curthread, file, line); #endif TD_LOCKS_DEC(curthread); } /* * Try to do a non-blocking upgrade from a shared lock to an exclusive lock. * This will only succeed if this thread holds a single shared lock. * Return 1 if the upgrade succeeded, 0 otherwise. */ int sx_try_upgrade_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF) { uintptr_t x; int success; if (SCHEDULER_STOPPED()) return (1); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_try_upgrade() of destroyed sx @ %s:%d", file, line)); _sx_assert(sx, SA_SLOCKED, file, line); /* * Try to switch from one shared lock to an exclusive lock. We need * to maintain the SX_LOCK_EXCLUSIVE_WAITERS flag if set so that * we will wake up the exclusive waiters when we drop the lock. */ x = sx->sx_lock & SX_LOCK_EXCLUSIVE_WAITERS; success = atomic_cmpset_acq_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1) | x, (uintptr_t)curthread | x); LOCK_LOG_TRY("XUPGRADE", &sx->lock_object, 0, success, file, line); if (success) { WITNESS_UPGRADE(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK, file, line); LOCKSTAT_RECORD0(sx__upgrade, sx); } return (success); } int sx_try_upgrade_(struct sx *sx, const char *file, int line) { return (sx_try_upgrade_int(sx LOCK_FILE_LINE_ARG)); } /* * Downgrade an unrecursed exclusive lock into a single shared lock. */ void sx_downgrade_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF) { uintptr_t x; int wakeup_swapper; if (SCHEDULER_STOPPED()) return; KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_downgrade() of destroyed sx @ %s:%d", file, line)); _sx_assert(sx, SA_XLOCKED | SA_NOTRECURSED, file, line); #ifndef INVARIANTS if (sx_recursed(sx)) panic("downgrade of a recursed lock"); #endif WITNESS_DOWNGRADE(&sx->lock_object, 0, file, line); /* * Try to switch from an exclusive lock with no shared waiters * to one sharer with no shared waiters. If there are * exclusive waiters, we don't need to lock the sleep queue so * long as we preserve the flag. We do one quick try and if * that fails we grab the sleepq lock to keep the flags from * changing and do it the slow way. * * We have to lock the sleep queue if there are shared waiters * so we can wake them up.
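 *
 * In terms of the lock word, the quick try below attempts one atomic
 * transition (editor's sketch, shown with an exclusive waiter
 * present):
 *
 *	tid | SX_LOCK_EXCLUSIVE_WAITERS
 *	    --> SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS
 *
 * i.e. the owner pointer is replaced by a share count of one while the
 * exclusive-waiters bit rides along unchanged.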
*/ x = sx->sx_lock; if (!(x & SX_LOCK_SHARED_WAITERS) && atomic_cmpset_rel_ptr(&sx->sx_lock, x, SX_SHARERS_LOCK(1) | (x & SX_LOCK_EXCLUSIVE_WAITERS))) goto out; /* * Lock the sleep queue so we can read the waiters bits * without any races and wakeup any shared waiters. */ sleepq_lock(&sx->lock_object); /* * Preserve SX_LOCK_EXCLUSIVE_WAITERS while downgraded to a single * shared lock. If there are any shared waiters, wake them up. */ wakeup_swapper = 0; x = sx->sx_lock; atomic_store_rel_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1) | (x & SX_LOCK_EXCLUSIVE_WAITERS)); if (x & SX_LOCK_SHARED_WAITERS) wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, 0, SQ_SHARED_QUEUE); sleepq_release(&sx->lock_object); if (wakeup_swapper) kick_proc0(); out: LOCK_LOG_LOCK("XDOWNGRADE", &sx->lock_object, 0, 0, file, line); LOCKSTAT_RECORD0(sx__downgrade, sx); } void sx_downgrade_(struct sx *sx, const char *file, int line) { sx_downgrade_int(sx LOCK_FILE_LINE_ARG); } /* * This function represents the so-called 'hard case' for sx_xlock * operation. All 'easy case' failures are redirected to this. Note * that ideally this would be a static function, but it needs to be * accessible from at least sx.h. */ int _sx_xlock_hard(struct sx *sx, uintptr_t x, int opts LOCK_FILE_LINE_ARG_DEF) { GIANT_DECLARE; uintptr_t tid; #ifdef ADAPTIVE_SX volatile struct thread *owner; u_int i, n, spintries = 0; #endif #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif int error = 0; #if defined(ADAPTIVE_SX) || defined(KDTRACE_HOOKS) struct lock_delay_arg lda; #endif #ifdef KDTRACE_HOOKS u_int sleep_cnt = 0; int64_t sleep_time = 0; int64_t all_time = 0; #endif #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) uintptr_t state; #endif int extra_work = 0; tid = (uintptr_t)curthread; if (SCHEDULER_STOPPED()) return (0); #if defined(ADAPTIVE_SX) lock_delay_arg_init(&lda, &sx_delay); #elif defined(KDTRACE_HOOKS) lock_delay_arg_init(&lda, NULL); #endif if (__predict_false(x == SX_LOCK_UNLOCKED)) x = SX_READ_VALUE(sx); /* If we already hold an exclusive lock, then recurse. */ if (__predict_false(lv_sx_owner(x) == (struct thread *)tid)) { KASSERT((sx->lock_object.lo_flags & LO_RECURSABLE) != 0, ("_sx_xlock_hard: recursed on non-recursive sx %s @ %s:%d\n", sx->lock_object.lo_name, file, line)); sx->sx_recurse++; atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED); if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p recursing", __func__, sx); return (0); } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__, sx->lock_object.lo_name, (void *)sx->sx_lock, file, line); #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&sx->lock_object, &contested, &waittime); #ifdef LOCK_PROFILING extra_work = 1; state = x; #elif defined(KDTRACE_HOOKS) extra_work = lockstat_enabled; if (__predict_false(extra_work)) { all_time -= lockstat_nsecs(&sx->lock_object); state = x; } #endif for (;;) { if (x == SX_LOCK_UNLOCKED) { if (atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, tid)) break; continue; } #ifdef KDTRACE_HOOKS lda.spin_cnt++; #endif #ifdef ADAPTIVE_SX /* * If the lock is write locked and the owner is * running on another CPU, spin until the owner stops * running or the state of the lock changes. 
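 *
 * Editor's note: lock_delay() supplies the backoff between polls of
 * the lock word; its parameters come from sx_delay and can be
 * inspected or tuned at runtime, e.g.:
 *
 *	# sysctl debug.sx.delay_base
 *	# sysctl debug.sx.delay_max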
*/ if ((sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) { if ((x & SX_LOCK_SHARED) == 0) { owner = lv_sx_owner(x); if (TD_IS_RUNNING(owner)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, sx, owner); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", sx->lock_object.lo_name); GIANT_SAVE(extra_work); do { lock_delay(&lda); x = SX_READ_VALUE(sx); owner = lv_sx_owner(x); } while (owner != NULL && TD_IS_RUNNING(owner)); KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); continue; } } else if (SX_SHARERS(x) && spintries < asx_retries) { KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", sx->lock_object.lo_name); GIANT_SAVE(extra_work); spintries++; for (i = 0; i < asx_loops; i += n) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR4(KTR_LOCK, "%s: shared spinning on %p with %u and %u", __func__, sx, spintries, i); n = SX_SHARERS(x); lock_delay_spin(n); x = SX_READ_VALUE(sx); if ((x & SX_LOCK_SHARED) == 0 || SX_SHARERS(x) == 0) break; } #ifdef KDTRACE_HOOKS lda.spin_cnt += i; #endif KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); if (i != asx_loops) continue; } } #endif sleepq_lock(&sx->lock_object); x = SX_READ_VALUE(sx); retry_sleepq: /* * If the lock was released while spinning on the * sleep queue chain lock, try again. */ if (x == SX_LOCK_UNLOCKED) { sleepq_release(&sx->lock_object); continue; } #ifdef ADAPTIVE_SX /* * The current lock owner might have started executing * on another CPU (or the lock could have changed * owners) while we were waiting on the sleep queue * chain lock. If so, drop the sleep queue lock and try * again. */ if (!(x & SX_LOCK_SHARED) && (sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) { owner = (struct thread *)SX_OWNER(x); if (TD_IS_RUNNING(owner)) { sleepq_release(&sx->lock_object); continue; } } #endif /* * If an exclusive lock was released with both shared * and exclusive waiters and a shared waiter hasn't * woken up and acquired the lock yet, sx_lock will be * set to SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS. * If we see that value, try to acquire it once. Note * that we have to preserve SX_LOCK_EXCLUSIVE_WAITERS, * since there may still be other exclusive waiters. If we * fail, restart the loop. */ if (x == (SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS)) { if (!atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, tid | SX_LOCK_EXCLUSIVE_WAITERS)) goto retry_sleepq; sleepq_release(&sx->lock_object); CTR2(KTR_LOCK, "%s: %p claimed by new writer", __func__, sx); break; } /* * Try to set the SX_LOCK_EXCLUSIVE_WAITERS flag. If we fail, * then loop back and retry. */ if (!(x & SX_LOCK_EXCLUSIVE_WAITERS)) { if (!atomic_fcmpset_ptr(&sx->sx_lock, &x, x | SX_LOCK_EXCLUSIVE_WAITERS)) { goto retry_sleepq; } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p set excl waiters flag", __func__, sx); } /* * Since we have been unable to acquire the exclusive * lock and the exclusive waiters flag is set, we have * to sleep. */ if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on sleep queue", __func__, sx); #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&sx->lock_object); #endif GIANT_SAVE(extra_work); sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name, SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ?
SLEEPQ_INTERRUPTIBLE : 0), SQ_EXCLUSIVE_QUEUE); if (!(opts & SX_INTERRUPTIBLE)) sleepq_wait(&sx->lock_object, 0); else error = sleepq_wait_sig(&sx->lock_object, 0); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&sx->lock_object); sleep_cnt++; #endif if (error) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: interruptible sleep by %p suspended by signal", __func__, sx); break; } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from sleep queue", __func__, sx); x = SX_READ_VALUE(sx); } #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) if (__predict_true(!extra_work)) return (error); #endif #ifdef KDTRACE_HOOKS all_time += lockstat_nsecs(&sx->lock_object); if (sleep_time) LOCKSTAT_RECORD4(sx__block, sx, sleep_time, LOCKSTAT_WRITER, (state & SX_LOCK_SHARED) == 0, (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state)); if (lda.spin_cnt > sleep_cnt) LOCKSTAT_RECORD4(sx__spin, sx, all_time - sleep_time, LOCKSTAT_WRITER, (state & SX_LOCK_SHARED) == 0, (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state)); #endif if (!error) LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, contested, waittime, file, line, LOCKSTAT_WRITER); GIANT_RESTORE(); return (error); } /* * This function represents the so-called 'hard case' for sx_xunlock * operation. All 'easy case' failures are redirected to this. Note * that ideally this would be a static function, but it needs to be * accessible from at least sx.h. */ void _sx_xunlock_hard(struct sx *sx, uintptr_t x LOCK_FILE_LINE_ARG_DEF) { uintptr_t tid, setx; int queue, wakeup_swapper; if (SCHEDULER_STOPPED()) return; tid = (uintptr_t)curthread; if (__predict_false(x == tid)) x = SX_READ_VALUE(sx); MPASS(!(x & SX_LOCK_SHARED)); if (__predict_false(x & SX_LOCK_RECURSED)) { /* The lock is recursed, unrecurse one level. */ if ((--sx->sx_recurse) == 0) atomic_clear_ptr(&sx->sx_lock, SX_LOCK_RECURSED); if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, sx); return; } LOCKSTAT_PROFILE_RELEASE_RWLOCK(sx__release, sx, LOCKSTAT_WRITER); if (x == tid && atomic_cmpset_rel_ptr(&sx->sx_lock, tid, SX_LOCK_UNLOCKED)) return; if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p contested", __func__, sx); sleepq_lock(&sx->lock_object); x = SX_READ_VALUE(sx); MPASS(x & (SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS)); /* * The wake up algorithm here is quite simple and probably not * ideal. It gives precedence to shared waiters if they are * present. For this condition, we have to preserve the * state of the exclusive waiters flag. * If interruptible sleeps left the shared queue empty, avoid * starvation of the threads sleeping on the exclusive queue by * giving them precedence and clearing the shared waiters bit anyway. */ setx = SX_LOCK_UNLOCKED; queue = SQ_EXCLUSIVE_QUEUE; if ((x & SX_LOCK_SHARED_WAITERS) != 0 && sleepq_sleepcnt(&sx->lock_object, SQ_SHARED_QUEUE) != 0) { queue = SQ_SHARED_QUEUE; setx |= (x & SX_LOCK_EXCLUSIVE_WAITERS); } atomic_store_rel_ptr(&sx->sx_lock, setx); /* Wake up all the waiters for the specific queue. */ if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR3(KTR_LOCK, "%s: %p waking up all threads on %s queue", __func__, sx, queue == SQ_SHARED_QUEUE ?
"shared" : "exclusive"); wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, 0, queue); sleepq_release(&sx->lock_object); if (wakeup_swapper) kick_proc0(); } static bool __always_inline __sx_slock_try(struct sx *sx, uintptr_t *xp LOCK_FILE_LINE_ARG_DEF) { /* * If no other thread has an exclusive lock then try to bump up * the count of sharers. Since we have to preserve the state * of SX_LOCK_EXCLUSIVE_WAITERS, if we fail to acquire the * shared lock loop back and retry. */ while (*xp & SX_LOCK_SHARED) { MPASS(!(*xp & SX_LOCK_SHARED_WAITERS)); if (atomic_fcmpset_acq_ptr(&sx->sx_lock, xp, *xp + SX_ONE_SHARER)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR4(KTR_LOCK, "%s: %p succeed %p -> %p", __func__, sx, (void *)*xp, (void *)(*xp + SX_ONE_SHARER)); return (true); } } return (false); } static int __noinline _sx_slock_hard(struct sx *sx, int opts, uintptr_t x LOCK_FILE_LINE_ARG_DEF) { GIANT_DECLARE; #ifdef ADAPTIVE_SX volatile struct thread *owner; #endif #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif int error = 0; #if defined(ADAPTIVE_SX) || defined(KDTRACE_HOOKS) struct lock_delay_arg lda; #endif #ifdef KDTRACE_HOOKS u_int sleep_cnt = 0; int64_t sleep_time = 0; int64_t all_time = 0; #endif #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) uintptr_t state; #endif int extra_work = 0; if (SCHEDULER_STOPPED()) return (0); #if defined(ADAPTIVE_SX) lock_delay_arg_init(&lda, &sx_delay); #elif defined(KDTRACE_HOOKS) lock_delay_arg_init(&lda, NULL); #endif #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&sx->lock_object, &contested, &waittime); #ifdef LOCK_PROFILING extra_work = 1; state = x; #elif defined(KDTRACE_HOOKS) extra_work = lockstat_enabled; if (__predict_false(extra_work)) { all_time -= lockstat_nsecs(&sx->lock_object); state = x; } #endif /* * As with rwlocks, we don't make any attempt to try to block * shared locks once there is an exclusive waiter. */ for (;;) { if (__sx_slock_try(sx, &x LOCK_FILE_LINE_ARG)) break; #ifdef KDTRACE_HOOKS lda.spin_cnt++; #endif #ifdef ADAPTIVE_SX /* * If the owner is running on another CPU, spin until * the owner stops running or the state of the lock * changes. */ if ((sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) { owner = lv_sx_owner(x); if (TD_IS_RUNNING(owner)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, sx, owner); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", sx->lock_object.lo_name); GIANT_SAVE(extra_work); do { lock_delay(&lda); x = SX_READ_VALUE(sx); owner = lv_sx_owner(x); } while (owner != NULL && TD_IS_RUNNING(owner)); KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); continue; } } #endif /* * Some other thread already has an exclusive lock, so * start the process of blocking. */ sleepq_lock(&sx->lock_object); x = SX_READ_VALUE(sx); retry_sleepq: /* * The lock could have been released while we spun. * In this case loop back and retry. */ if (x & SX_LOCK_SHARED) { sleepq_release(&sx->lock_object); continue; } #ifdef ADAPTIVE_SX /* * If the owner is running on another CPU, spin until * the owner stops running or the state of the lock * changes. */ if (!(x & SX_LOCK_SHARED) && (sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) { owner = (struct thread *)SX_OWNER(x); if (TD_IS_RUNNING(owner)) { sleepq_release(&sx->lock_object); x = SX_READ_VALUE(sx); continue; } } #endif /* * Try to set the SX_LOCK_SHARED_WAITERS flag. 
If we * fail to set it, drop the sleep queue lock and loop * back. */ if (!(x & SX_LOCK_SHARED_WAITERS)) { if (!atomic_fcmpset_ptr(&sx->sx_lock, &x, x | SX_LOCK_SHARED_WAITERS)) goto retry_sleepq; if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p set shared waiters flag", __func__, sx); } /* * Since we have been unable to acquire the shared lock, * we have to sleep. */ if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on sleep queue", __func__, sx); #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&sx->lock_object); #endif GIANT_SAVE(extra_work); sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name, SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ? SLEEPQ_INTERRUPTIBLE : 0), SQ_SHARED_QUEUE); if (!(opts & SX_INTERRUPTIBLE)) sleepq_wait(&sx->lock_object, 0); else error = sleepq_wait_sig(&sx->lock_object, 0); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&sx->lock_object); sleep_cnt++; #endif if (error) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: interruptible sleep by %p suspended by signal", __func__, sx); break; } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from sleep queue", __func__, sx); x = SX_READ_VALUE(sx); } #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) if (__predict_true(!extra_work)) return (error); #endif #ifdef KDTRACE_HOOKS all_time += lockstat_nsecs(&sx->lock_object); if (sleep_time) LOCKSTAT_RECORD4(sx__block, sx, sleep_time, LOCKSTAT_READER, (state & SX_LOCK_SHARED) == 0, (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state)); if (lda.spin_cnt > sleep_cnt) LOCKSTAT_RECORD4(sx__spin, sx, all_time - sleep_time, LOCKSTAT_READER, (state & SX_LOCK_SHARED) == 0, (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state)); #endif if (error == 0) { LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, contested, waittime, file, line, LOCKSTAT_READER); } GIANT_RESTORE(); return (error); } int _sx_slock_int(struct sx *sx, int opts LOCK_FILE_LINE_ARG_DEF) { uintptr_t x; int error; KASSERT(kdb_active != 0 || SCHEDULER_STOPPED() || !TD_IS_IDLETHREAD(curthread), ("sx_slock() by idle thread %p on sx %s @ %s:%d", curthread, sx->lock_object.lo_name, file, line)); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_slock() of destroyed sx @ %s:%d", file, line)); WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER, file, line, NULL); error = 0; x = SX_READ_VALUE(sx); if (__predict_false(LOCKSTAT_OOL_PROFILE_ENABLED(sx__acquire) || !__sx_slock_try(sx, &x LOCK_FILE_LINE_ARG))) error = _sx_slock_hard(sx, opts, x LOCK_FILE_LINE_ARG); if (error == 0) { LOCK_LOG_LOCK("SLOCK", &sx->lock_object, 0, 0, file, line); WITNESS_LOCK(&sx->lock_object, 0, file, line); TD_LOCKS_INC(curthread); } return (error); } int _sx_slock(struct sx *sx, int opts, const char *file, int line) { return (_sx_slock_int(sx, opts LOCK_FILE_LINE_ARG)); } static bool __always_inline _sx_sunlock_try(struct sx *sx, uintptr_t *xp) { for (;;) { /* * We should never have waiting sharers (threads queued for a * shared lock) while at least one thread already holds a shared lock. */ KASSERT(!(*xp & SX_LOCK_SHARED_WAITERS), ("%s: waiting sharers", __func__)); /* * See if there is more than one shared lock held. If * so, just drop one and return. */ if (SX_SHARERS(*xp) > 1) { if (atomic_fcmpset_rel_ptr(&sx->sx_lock, xp, *xp - SX_ONE_SHARER)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR4(KTR_LOCK, "%s: %p succeeded %p -> %p", __func__, sx, (void *)*xp, (void *)(*xp - SX_ONE_SHARER)); return (true); } continue; } /* * If there aren't any waiters for an exclusive lock, * then try to drop it quickly.
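The release side, _sx_sunlock_try(), is the mirror image. Continuing the hedged C11 sketch above (same invented rl_* layout), the last sharer either resets the word or bails out to a wakeup slow path when a writer is queued:

/*
 * Mirror of _sx_sunlock_try(): with more than one sharer, just CAS the
 * count down by one.  The last sharer resets the word to "shared mode,
 * no holders"; if RL_EXCL_WAITERS is set it instead returns false so
 * the caller can run a slow path that wakes the queued writers, as
 * _sx_sunlock_hard() does with sleepq_broadcast().
 */
static bool
rl_try_runlock(_Atomic uintptr_t *lockp)
{
        uintptr_t x = atomic_load_explicit(lockp, memory_order_relaxed);

        for (;;) {
                if (x / RL_ONE_SHARER > 1) {    /* more than one sharer */
                        if (atomic_compare_exchange_weak_explicit(lockp, &x,
                            x - RL_ONE_SHARER, memory_order_release,
                            memory_order_relaxed))
                                return (true);
                        continue;               /* x reloaded; retry */
                }
                if (x & RL_EXCL_WAITERS)
                        return (false);         /* caller must wake writers */
                if (atomic_compare_exchange_weak_explicit(lockp, &x,
                    RL_SHARED, memory_order_release, memory_order_relaxed))
                        return (true);
        }
}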
*/ if (!(*xp & SX_LOCK_EXCLUSIVE_WAITERS)) { MPASS(*xp == SX_SHARERS_LOCK(1)); *xp = SX_SHARERS_LOCK(1); if (atomic_fcmpset_rel_ptr(&sx->sx_lock, xp, SX_LOCK_UNLOCKED)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p last succeeded", __func__, sx); return (true); } continue; } break; } return (false); } static void __noinline _sx_sunlock_hard(struct sx *sx, uintptr_t x LOCK_FILE_LINE_ARG_DEF) { int wakeup_swapper; uintptr_t setx; if (SCHEDULER_STOPPED()) return; if (_sx_sunlock_try(sx, &x)) goto out_lockstat; /* * At this point, there should just be one sharer with * exclusive waiters. */ MPASS(x == (SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS)); sleepq_lock(&sx->lock_object); x = SX_READ_VALUE(sx); for (;;) { MPASS(x & SX_LOCK_EXCLUSIVE_WAITERS); MPASS(!(x & SX_LOCK_SHARED_WAITERS)); /* * The wakeup semantics here are quite simple: * just wake up all the exclusive waiters. * Note that the state of the lock could have changed, * so if it fails, loop back and retry. */ setx = x - SX_ONE_SHARER; setx &= ~SX_LOCK_EXCLUSIVE_WAITERS; if (!atomic_fcmpset_rel_ptr(&sx->sx_lock, &x, setx)) continue; if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p waking up all threads on exclusive queue", __func__, sx); wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, 0, SQ_EXCLUSIVE_QUEUE); break; } sleepq_release(&sx->lock_object); if (wakeup_swapper) kick_proc0(); out_lockstat: LOCKSTAT_PROFILE_RELEASE_RWLOCK(sx__release, sx, LOCKSTAT_READER); } void _sx_sunlock_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF) { uintptr_t x; KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_sunlock() of destroyed sx @ %s:%d", file, line)); _sx_assert(sx, SA_SLOCKED, file, line); WITNESS_UNLOCK(&sx->lock_object, 0, file, line); LOCK_LOG_LOCK("SUNLOCK", &sx->lock_object, 0, 0, file, line); x = SX_READ_VALUE(sx); if (__predict_false(LOCKSTAT_OOL_PROFILE_ENABLED(sx__release) || !_sx_sunlock_try(sx, &x))) _sx_sunlock_hard(sx, x LOCK_FILE_LINE_ARG); TD_LOCKS_DEC(curthread); } void _sx_sunlock(struct sx *sx, const char *file, int line) { _sx_sunlock_int(sx LOCK_FILE_LINE_ARG); } #ifdef INVARIANT_SUPPORT #ifndef INVARIANTS #undef _sx_assert #endif /* * In the non-WITNESS case, sx_assert() can only detect that at least * *some* thread owns an slock, but it cannot guarantee that *this* * thread owns an slock. */ void _sx_assert(const struct sx *sx, int what, const char *file, int line) { #ifndef WITNESS int slocked = 0; #endif if (panicstr != NULL) return; switch (what) { case SA_SLOCKED: case SA_SLOCKED | SA_NOTRECURSED: case SA_SLOCKED | SA_RECURSED: #ifndef WITNESS slocked = 1; /* FALLTHROUGH */ #endif case SA_LOCKED: case SA_LOCKED | SA_NOTRECURSED: case SA_LOCKED | SA_RECURSED: #ifdef WITNESS witness_assert(&sx->lock_object, what, file, line); #else /* * If some other thread has an exclusive lock or we * have one and are asserting a shared lock, fail. * Also, if no one has a lock at all, fail. */ if (sx->sx_lock == SX_LOCK_UNLOCKED || (!(sx->sx_lock & SX_LOCK_SHARED) && (slocked || sx_xholder(sx) != curthread))) panic("Lock %s not %slocked @ %s:%d\n", sx->lock_object.lo_name, slocked ?
"share " : "", file, line); if (!(sx->sx_lock & SX_LOCK_SHARED)) { if (sx_recursed(sx)) { if (what & SA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", sx->lock_object.lo_name, file, line); } else if (what & SA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", sx->lock_object.lo_name, file, line); } #endif break; case SA_XLOCKED: case SA_XLOCKED | SA_NOTRECURSED: case SA_XLOCKED | SA_RECURSED: if (sx_xholder(sx) != curthread) panic("Lock %s not exclusively locked @ %s:%d\n", sx->lock_object.lo_name, file, line); if (sx_recursed(sx)) { if (what & SA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", sx->lock_object.lo_name, file, line); } else if (what & SA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", sx->lock_object.lo_name, file, line); break; case SA_UNLOCKED: #ifdef WITNESS witness_assert(&sx->lock_object, what, file, line); #else /* * If we hold an exclusve lock fail. We can't * reliably check to see if we hold a shared lock or * not. */ if (sx_xholder(sx) == curthread) panic("Lock %s exclusively locked @ %s:%d\n", sx->lock_object.lo_name, file, line); #endif break; default: panic("Unknown sx lock assertion: %d @ %s:%d", what, file, line); } } #endif /* INVARIANT_SUPPORT */ #ifdef DDB static void db_show_sx(const struct lock_object *lock) { struct thread *td; const struct sx *sx; sx = (const struct sx *)lock; db_printf(" state: "); if (sx->sx_lock == SX_LOCK_UNLOCKED) db_printf("UNLOCKED\n"); else if (sx->sx_lock == SX_LOCK_DESTROYED) { db_printf("DESTROYED\n"); return; } else if (sx->sx_lock & SX_LOCK_SHARED) db_printf("SLOCK: %ju\n", (uintmax_t)SX_SHARERS(sx->sx_lock)); else { td = sx_xholder(sx); db_printf("XLOCK: %p (tid %d, pid %d, \"%s\")\n", td, td->td_tid, td->td_proc->p_pid, td->td_name); if (sx_recursed(sx)) db_printf(" recursed: %d\n", sx->sx_recurse); } db_printf(" waiters: "); switch(sx->sx_lock & (SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS)) { case SX_LOCK_SHARED_WAITERS: db_printf("shared\n"); break; case SX_LOCK_EXCLUSIVE_WAITERS: db_printf("exclusive\n"); break; case SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS: db_printf("exclusive and shared\n"); break; default: db_printf("none\n"); } } /* * Check to see if a thread that is blocked on a sleep queue is actually * blocked on an sx lock. If so, output some details and return true. * If the lock has an exclusive owner, return that in *ownerp. */ int sx_chain(struct thread *td, struct thread **ownerp) { struct sx *sx; /* * Check to see if this thread is blocked on an sx lock. * First, we check the lock class. If that is ok, then we * compare the lock name against the wait message. */ sx = td->td_wchan; if (LOCK_CLASS(&sx->lock_object) != &lock_class_sx || sx->lock_object.lo_name != td->td_wmesg) return (0); /* We think we have an sx lock, so output some details. */ db_printf("blocked on sx \"%s\" ", td->td_wmesg); *ownerp = sx_xholder(sx); if (sx->sx_lock & SX_LOCK_SHARED) db_printf("SLOCK (count %ju)\n", (uintmax_t)SX_SHARERS(sx->sx_lock)); else db_printf("XLOCK\n"); return (1); } #endif Index: head/sys/kern/kern_syscalls.c =================================================================== --- head/sys/kern/kern_syscalls.c (revision 326270) +++ head/sys/kern/kern_syscalls.c (revision 326271) @@ -1,231 +1,233 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1999 Assar Westerlund * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Acts like "nosys" but can be identified in sysent for dynamic call * number assignment for a limited number of calls. * * Place holder for system call slots reserved for loadable modules. */ int lkmnosys(struct thread *td, struct nosys_args *args) { return (nosys(td, args)); } int lkmressys(struct thread *td, struct nosys_args *args) { return (nosys(td, args)); } static void syscall_thread_drain(struct sysent *se) { u_int32_t cnt, oldcnt; do { oldcnt = se->sy_thrcnt; KASSERT((oldcnt & SY_THR_STATIC) == 0, ("drain on static syscall")); cnt = oldcnt | SY_THR_DRAINING; } while (atomic_cmpset_acq_32(&se->sy_thrcnt, oldcnt, cnt) == 0); while (atomic_cmpset_32(&se->sy_thrcnt, SY_THR_DRAINING, SY_THR_ABSENT) == 0) pause("scdrn", hz/2); } int syscall_thread_enter(struct thread *td, struct sysent *se) { u_int32_t cnt, oldcnt; do { oldcnt = se->sy_thrcnt; if ((oldcnt & SY_THR_STATIC) != 0) return (0); if ((oldcnt & (SY_THR_DRAINING | SY_THR_ABSENT)) != 0) return (ENOSYS); cnt = oldcnt + SY_THR_INCR; } while (atomic_cmpset_acq_32(&se->sy_thrcnt, oldcnt, cnt) == 0); return (0); } void syscall_thread_exit(struct thread *td, struct sysent *se) { u_int32_t cnt, oldcnt; do { oldcnt = se->sy_thrcnt; if ((oldcnt & SY_THR_STATIC) != 0) return; cnt = oldcnt - SY_THR_INCR; } while (atomic_cmpset_rel_32(&se->sy_thrcnt, oldcnt, cnt) == 0); } int syscall_register(int *offset, struct sysent *new_sysent, struct sysent *old_sysent, int flags) { int i; if ((flags & ~SY_THR_STATIC) != 0) return (EINVAL); if (*offset == NO_SYSCALL) { for (i = 1; i < SYS_MAXSYSCALL; ++i) if (sysent[i].sy_call == (sy_call_t *)lkmnosys) break; if (i == SYS_MAXSYSCALL) return (ENFILE); *offset = i; } else if (*offset < 0 || *offset >= SYS_MAXSYSCALL) return (EINVAL); else if (sysent[*offset].sy_call != (sy_call_t *)lkmnosys && sysent[*offset].sy_call != (sy_call_t *)lkmressys) return (EEXIST); KASSERT(sysent[*offset].sy_thrcnt == SY_THR_ABSENT, ("dynamic syscall is not protected")); *old_sysent = sysent[*offset]; new_sysent->sy_thrcnt = SY_THR_ABSENT; sysent[*offset] = *new_sysent; atomic_store_rel_32(&sysent[*offset].sy_thrcnt, flags); return (0); } int syscall_deregister(int *offset, struct sysent *old_sysent) { struct sysent *se; if (*offset == 0) return (0); /* XXX? 
*/ se = &sysent[*offset]; if ((se->sy_thrcnt & SY_THR_STATIC) != 0) return (EINVAL); syscall_thread_drain(se); sysent[*offset] = *old_sysent; return (0); } int syscall_module_handler(struct module *mod, int what, void *arg) { struct syscall_module_data *data = arg; modspecific_t ms; int error; switch (what) { case MOD_LOAD: error = syscall_register(data->offset, data->new_sysent, &data->old_sysent, data->flags); if (error) { /* Leave a mark so we know to safely unload below. */ data->offset = NULL; return (error); } ms.intval = *data->offset; MOD_XLOCK; module_setspecific(mod, &ms); MOD_XUNLOCK; if (data->chainevh) error = data->chainevh(mod, what, data->chainarg); return (error); case MOD_UNLOAD: /* * MOD_LOAD failed, so just return without calling the * chained handler since we didn't pass along the MOD_LOAD * event. */ if (data->offset == NULL) return (0); if (data->chainevh) { error = data->chainevh(mod, what, data->chainarg); if (error) return error; } error = syscall_deregister(data->offset, &data->old_sysent); return (error); default: if (data->chainevh) return (data->chainevh(mod, what, data->chainarg)); return (EOPNOTSUPP); } /* NOTREACHED */ } int syscall_helper_register(struct syscall_helper_data *sd, int flags) { struct syscall_helper_data *sd1; int error; for (sd1 = sd; sd1->syscall_no != NO_SYSCALL; sd1++) { error = syscall_register(&sd1->syscall_no, &sd1->new_sysent, &sd1->old_sysent, flags); if (error != 0) { syscall_helper_unregister(sd); return (error); } sd1->registered = 1; } return (0); } int syscall_helper_unregister(struct syscall_helper_data *sd) { struct syscall_helper_data *sd1; for (sd1 = sd; sd1->registered != 0; sd1++) { syscall_deregister(&sd1->syscall_no, &sd1->old_sysent); sd1->registered = 0; } return (0); } Index: head/sys/kern/kern_thr.c =================================================================== --- head/sys/kern/kern_thr.c (revision 326270) +++ head/sys/kern/kern_thr.c (revision 326271) @@ -1,616 +1,618 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2003, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
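syscall_register() and the module handler above are what the SYSCALL_MODULE(9) convenience macro ties together. As a hedged illustration (the hello names are invented; the macro and struct sysent fields are the documented ones), a minimal loadable module that lets syscall_register() pick a free lkmnosys slot via NO_SYSCALL could look like:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/module.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>

static int
hello_syscall(struct thread *td, void *arg)
{
        printf("hello from a dynamically registered syscall\n");
        return (0);
}

static struct sysent hello_sysent = {
        .sy_narg = 0,
        .sy_call = (sy_call_t *)hello_syscall,
};

/* NO_SYSCALL asks syscall_register() to scan for a free lkmnosys slot. */
static int hello_offset = NO_SYSCALL;

/* Chained handler; syscall_module_handler() invokes it after MOD_LOAD. */
static int
hello_mod_event(struct module *m, int what, void *arg)
{
        switch (what) {
        case MOD_LOAD:
                uprintf("hello syscall registered at slot %d\n", hello_offset);
                return (0);
        case MOD_UNLOAD:
                return (0);
        default:
                return (EOPNOTSUPP);
        }
}

SYSCALL_MODULE(hello, &hello_offset, &hello_sysent, hello_mod_event, NULL);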
*/ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_posix.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation"); static int max_threads_per_proc = 1500; SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW, &max_threads_per_proc, 0, "Limit on threads per proc"); static int max_threads_hits; SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_hits, CTLFLAG_RD, &max_threads_hits, 0, "kern.threads.max_threads_per_proc hit count"); #ifdef COMPAT_FREEBSD32 static inline int suword_lwpid(void *addr, lwpid_t lwpid) { int error; if (SV_CURPROC_FLAG(SV_LP64)) error = suword(addr, lwpid); else error = suword32(addr, lwpid); return (error); } #else #define suword_lwpid suword #endif /* * System call interface. */ struct thr_create_initthr_args { ucontext_t ctx; long *tid; }; static int thr_create_initthr(struct thread *td, void *thunk) { struct thr_create_initthr_args *args; /* Copy out the child tid. */ args = thunk; if (args->tid != NULL && suword_lwpid(args->tid, td->td_tid)) return (EFAULT); return (set_mcontext(td, &args->ctx.uc_mcontext)); } int sys_thr_create(struct thread *td, struct thr_create_args *uap) /* ucontext_t *ctx, long *id, int flags */ { struct thr_create_initthr_args args; int error; if ((error = copyin(uap->ctx, &args.ctx, sizeof(args.ctx)))) return (error); args.tid = uap->id; return (thread_create(td, NULL, thr_create_initthr, &args)); } int sys_thr_new(struct thread *td, struct thr_new_args *uap) /* struct thr_param * */ { struct thr_param param; int error; if (uap->param_size < 0 || uap->param_size > sizeof(param)) return (EINVAL); bzero(&param, sizeof(param)); if ((error = copyin(uap->param, &param, uap->param_size))) return (error); return (kern_thr_new(td, &param)); } static int thr_new_initthr(struct thread *td, void *thunk) { stack_t stack; struct thr_param *param; /* * Here we copy out the tid to two places, one for the child and one * for the parent, because pthread can create a detached thread; * if the parent wants to safely access the child's tid, it has to * provide its own storage, because the child thread may exit quickly * and the memory may be freed before the parent thread can access it. */ param = thunk; if ((param->child_tid != NULL && suword_lwpid(param->child_tid, td->td_tid)) || (param->parent_tid != NULL && suword_lwpid(param->parent_tid, td->td_tid))) return (EFAULT); /* Set up our machine context. */ stack.ss_sp = param->stack_base; stack.ss_size = param->stack_size; /* Set upcall address to user thread entry function. */ cpu_set_upcall(td, param->start_func, param->arg, &stack); /* Set up the user TLS address and TLS pointer register.
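For context, the thr_param fields consumed by thr_new_initthr() above are filled in from userland roughly as follows. This is a deliberately bare, hedged sketch of thr_new(2): no TLS is set up, so the child must stay out of libc, and the polling sync is crude; real programs should use pthread_create(3), which handles all of this.

#include <sys/types.h>
#include <sys/thr.h>
#include <string.h>
#include <unistd.h>

static long child_tid;          /* thr_new_initthr() copies the tid out here */
static volatile int child_ran;

static void
entry(void *arg)
{
        child_ran = 1;
        thr_exit(NULL);         /* a raw thread must never simply return */
}

int
main(void)
{
        struct thr_param p;
        static char stk[64 * 1024];

        memset(&p, 0, sizeof(p));
        p.start_func = entry;
        p.arg = NULL;
        p.stack_base = stk;
        p.stack_size = sizeof(stk);
        p.child_tid = &child_tid;       /* and/or parent_tid, as noted above */
        if (thr_new(&p, sizeof(p)) != 0)
                return (1);
        while (!child_ran)              /* crude polling; real code uses umtx */
                usleep(1000);
        return (0);
}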
*/ return (cpu_set_user_tls(td, param->tls_base)); } int kern_thr_new(struct thread *td, struct thr_param *param) { struct rtprio rtp, *rtpp; int error; rtpp = NULL; if (param->rtp != 0) { error = copyin(param->rtp, &rtp, sizeof(struct rtprio)); if (error) return (error); rtpp = &rtp; } return (thread_create(td, rtpp, thr_new_initthr, param)); } int thread_create(struct thread *td, struct rtprio *rtp, int (*initialize_thread)(struct thread *, void *), void *thunk) { struct thread *newtd; struct proc *p; int error; p = td->td_proc; if (rtp != NULL) { switch(rtp->type) { case RTP_PRIO_REALTIME: case RTP_PRIO_FIFO: /* Only root can set scheduler policy */ if (priv_check(td, PRIV_SCHED_SETPOLICY) != 0) return (EPERM); if (rtp->prio > RTP_PRIO_MAX) return (EINVAL); break; case RTP_PRIO_NORMAL: rtp->prio = 0; break; default: return (EINVAL); } } #ifdef RACCT if (racct_enable) { PROC_LOCK(p); error = racct_add(p, RACCT_NTHR, 1); PROC_UNLOCK(p); if (error != 0) return (EPROCLIM); } #endif /* Initialize our td */ error = kern_thr_alloc(p, 0, &newtd); if (error) goto fail; cpu_copy_thread(newtd, td); bzero(&newtd->td_startzero, __rangeof(struct thread, td_startzero, td_endzero)); bcopy(&td->td_startcopy, &newtd->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); newtd->td_proc = td->td_proc; newtd->td_rb_list = newtd->td_rbp_list = newtd->td_rb_inact = 0; thread_cow_get(newtd, td); error = initialize_thread(newtd, thunk); if (error != 0) { thread_cow_free(newtd); thread_free(newtd); goto fail; } PROC_LOCK(p); p->p_flag |= P_HADTHREADS; thread_link(newtd, p); bcopy(p->p_comm, newtd->td_name, sizeof(newtd->td_name)); thread_lock(td); /* let the scheduler know about these things. */ sched_fork_thread(td, newtd); thread_unlock(td); if (P_SHOULDSTOP(p)) newtd->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; if (p->p_ptevents & PTRACE_LWP) newtd->td_dbgflags |= TDB_BORN; /* * Copy the existing thread VM policy into the new thread. */ vm_domain_policy_localcopy(&newtd->td_vm_dom_policy, &td->td_vm_dom_policy); PROC_UNLOCK(p); tidhash_add(newtd); thread_lock(newtd); if (rtp != NULL) { if (!(td->td_pri_class == PRI_TIMESHARE && rtp->type == RTP_PRIO_NORMAL)) { rtp_to_pri(rtp, newtd); sched_prio(newtd, newtd->td_user_pri); } /* ignore timesharing class */ } TD_SET_CAN_RUN(newtd); sched_add(newtd, SRQ_BORING); thread_unlock(newtd); return (0); fail: #ifdef RACCT if (racct_enable) { PROC_LOCK(p); racct_sub(p, RACCT_NTHR, 1); PROC_UNLOCK(p); } #endif return (error); } int sys_thr_self(struct thread *td, struct thr_self_args *uap) /* long *id */ { int error; error = suword_lwpid(uap->id, (unsigned)td->td_tid); if (error == -1) return (EFAULT); return (0); } int sys_thr_exit(struct thread *td, struct thr_exit_args *uap) /* long *state */ { umtx_thread_exit(td); /* Signal userland that it can free the stack. */ if ((void *)uap->state != NULL) { suword_lwpid(uap->state, 1); kern_umtx_wake(td, uap->state, INT_MAX, 0); } return (kern_thr_exit(td)); } int kern_thr_exit(struct thread *td) { struct proc *p; p = td->td_proc; /* * If all of the threads in a process call this routine to * exit (e.g. all threads call pthread_exit()), exactly one * thread should return to the caller to terminate the process * instead of the thread. * * Checking p_numthreads alone is not sufficient since threads * might be committed to terminating while the PROC_LOCK is * dropped in either ptracestop() or while removing this thread * from the tidhash. 
Instead, the p_pendingexits field holds * the count of threads in either of those states and a thread * is considered the "last" thread if all of the other threads * in a process are already terminating. */ PROC_LOCK(p); if (p->p_numthreads == p->p_pendingexits + 1) { /* * Ignore attempts to shut down last thread in the * proc. This will actually call _exit(2) in the * usermode trampoline when it returns. */ PROC_UNLOCK(p); return (0); } p->p_pendingexits++; td->td_dbgflags |= TDB_EXIT; if (p->p_ptevents & PTRACE_LWP) ptracestop(td, SIGTRAP, NULL); PROC_UNLOCK(p); tidhash_remove(td); PROC_LOCK(p); p->p_pendingexits--; /* * The check above should prevent all other threads from this * process from exiting while the PROC_LOCK is dropped, so * there must be at least one other thread other than the * current thread. */ KASSERT(p->p_numthreads > 1, ("too few threads")); racct_sub(p, RACCT_NTHR, 1); tdsigcleanup(td); PROC_SLOCK(p); thread_stopped(p); thread_exit(); /* NOTREACHED */ } int sys_thr_kill(struct thread *td, struct thr_kill_args *uap) /* long id, int sig */ { ksiginfo_t ksi; struct thread *ttd; struct proc *p; int error; p = td->td_proc; ksiginfo_init(&ksi); ksi.ksi_signo = uap->sig; ksi.ksi_code = SI_LWP; ksi.ksi_pid = p->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; if (uap->id == -1) { if (uap->sig != 0 && !_SIG_VALID(uap->sig)) { error = EINVAL; } else { error = ESRCH; PROC_LOCK(p); FOREACH_THREAD_IN_PROC(p, ttd) { if (ttd != td) { error = 0; if (uap->sig == 0) break; tdksignal(ttd, uap->sig, &ksi); } } PROC_UNLOCK(p); } } else { error = 0; ttd = tdfind((lwpid_t)uap->id, p->p_pid); if (ttd == NULL) return (ESRCH); if (uap->sig == 0) ; else if (!_SIG_VALID(uap->sig)) error = EINVAL; else tdksignal(ttd, uap->sig, &ksi); PROC_UNLOCK(ttd->td_proc); } return (error); } int sys_thr_kill2(struct thread *td, struct thr_kill2_args *uap) /* pid_t pid, long id, int sig */ { ksiginfo_t ksi; struct thread *ttd; struct proc *p; int error; AUDIT_ARG_SIGNUM(uap->sig); ksiginfo_init(&ksi); ksi.ksi_signo = uap->sig; ksi.ksi_code = SI_LWP; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; if (uap->id == -1) { if ((p = pfind(uap->pid)) == NULL) return (ESRCH); AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, uap->sig); if (error) { PROC_UNLOCK(p); return (error); } if (uap->sig != 0 && !_SIG_VALID(uap->sig)) { error = EINVAL; } else { error = ESRCH; FOREACH_THREAD_IN_PROC(p, ttd) { if (ttd != td) { error = 0; if (uap->sig == 0) break; tdksignal(ttd, uap->sig, &ksi); } } } PROC_UNLOCK(p); } else { ttd = tdfind((lwpid_t)uap->id, uap->pid); if (ttd == NULL) return (ESRCH); p = ttd->td_proc; AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, uap->sig); if (uap->sig == 0) ; else if (!_SIG_VALID(uap->sig)) error = EINVAL; else tdksignal(ttd, uap->sig, &ksi); PROC_UNLOCK(p); } return (error); } int sys_thr_suspend(struct thread *td, struct thr_suspend_args *uap) /* const struct timespec *timeout */ { struct timespec ts, *tsp; int error; tsp = NULL; if (uap->timeout != NULL) { error = umtx_copyin_timeout(uap->timeout, &ts); if (error != 0) return (error); tsp = &ts; } return (kern_thr_suspend(td, tsp)); } int kern_thr_suspend(struct thread *td, struct timespec *tsp) { struct proc *p = td->td_proc; struct timeval tv; int error = 0; int timo = 0; if (td->td_pflags & TDP_WAKEUP) { td->td_pflags &= ~TDP_WAKEUP; return (0); } if (tsp != NULL) { if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) error = EWOULDBLOCK; else { TIMESPEC_TO_TIMEVAL(&tv, tsp); timo = tvtohz(&tv); } } PROC_LOCK(p); if (error == 0 
&& (td->td_flags & TDF_THRWAKEUP) == 0) error = msleep((void *)td, &p->p_mtx, PCATCH, "lthr", timo); if (td->td_flags & TDF_THRWAKEUP) { thread_lock(td); td->td_flags &= ~TDF_THRWAKEUP; thread_unlock(td); PROC_UNLOCK(p); return (0); } PROC_UNLOCK(p); if (error == EWOULDBLOCK) error = ETIMEDOUT; else if (error == ERESTART) { if (timo != 0) error = EINTR; } return (error); } int sys_thr_wake(struct thread *td, struct thr_wake_args *uap) /* long id */ { struct proc *p; struct thread *ttd; if (uap->id == td->td_tid) { td->td_pflags |= TDP_WAKEUP; return (0); } p = td->td_proc; ttd = tdfind((lwpid_t)uap->id, p->p_pid); if (ttd == NULL) return (ESRCH); thread_lock(ttd); ttd->td_flags |= TDF_THRWAKEUP; thread_unlock(ttd); wakeup((void *)ttd); PROC_UNLOCK(p); return (0); } int sys_thr_set_name(struct thread *td, struct thr_set_name_args *uap) { struct proc *p; char name[MAXCOMLEN + 1]; struct thread *ttd; int error; error = 0; name[0] = '\0'; if (uap->name != NULL) { error = copyinstr(uap->name, name, sizeof(name), NULL); if (error == ENAMETOOLONG) { error = copyin(uap->name, name, sizeof(name) - 1); name[sizeof(name) - 1] = '\0'; } if (error) return (error); } p = td->td_proc; ttd = tdfind((lwpid_t)uap->id, p->p_pid); if (ttd == NULL) return (ESRCH); strcpy(ttd->td_name, name); #ifdef KTR sched_clear_tdname(ttd); #endif PROC_UNLOCK(p); return (error); } int kern_thr_alloc(struct proc *p, int pages, struct thread **ntd) { /* Have race condition but it is cheap. */ if (p->p_numthreads >= max_threads_per_proc) { ++max_threads_hits; return (EPROCLIM); } *ntd = thread_alloc(pages); if (*ntd == NULL) return (ENOMEM); return (0); } Index: head/sys/kern/kern_thread.c =================================================================== --- head/sys/kern/kern_thread.c (revision 326270) +++ head/sys/kern/kern_thread.c (revision 326271) @@ -1,1265 +1,1267 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (C) 2001 Julian Elischer . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. 
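The kern_thr_exit() rule above, that exactly one thread survives to really terminate the process, is observable from userland: if main() calls pthread_exit() (which reaches thr_exit(2)) instead of returning, the process lives on until its last thread finishes. A small demonstration:

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static void *
worker(void *arg)
{
        sleep(1);
        printf("worker: last thread out; only now does the process exit\n");
        return (NULL);
}

int
main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, worker, NULL);
        /*
         * pthread_exit() in main maps onto thr_exit(2); kern_thr_exit()
         * keeps the process alive until a single thread remains, and
         * that final thread's exit terminates the process.
         */
        pthread_exit(NULL);
}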
*/ #include "opt_witness.h" #include "opt_hwpmc_hooks.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #include #include #include /* * Asserts below verify the stability of struct thread and struct proc * layout, as exposed by KBI to modules. On head, the KBI is allowed * to drift, change to the structures must be accompanied by the * assert update. * * On the stable branches after KBI freeze, conditions must not be * violated. Typically new fields are moved to the end of the * structures. */ #ifdef __amd64__ _Static_assert(offsetof(struct thread, td_flags) == 0xf4, "struct thread KBI td_flags"); _Static_assert(offsetof(struct thread, td_pflags) == 0xfc, "struct thread KBI td_pflags"); _Static_assert(offsetof(struct thread, td_frame) == 0x460, "struct thread KBI td_frame"); _Static_assert(offsetof(struct thread, td_emuldata) == 0x508, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0xb0, "struct proc KBI p_flag"); _Static_assert(offsetof(struct proc, p_pid) == 0xbc, "struct proc KBI p_pid"); _Static_assert(offsetof(struct proc, p_filemon) == 0x3d0, "struct proc KBI p_filemon"); _Static_assert(offsetof(struct proc, p_comm) == 0x3e0, "struct proc KBI p_comm"); _Static_assert(offsetof(struct proc, p_emuldata) == 0x4b8, "struct proc KBI p_emuldata"); #endif #ifdef __i386__ _Static_assert(offsetof(struct thread, td_flags) == 0x9c, "struct thread KBI td_flags"); _Static_assert(offsetof(struct thread, td_pflags) == 0xa4, "struct thread KBI td_pflags"); _Static_assert(offsetof(struct thread, td_frame) == 0x2ec, "struct thread KBI td_frame"); _Static_assert(offsetof(struct thread, td_emuldata) == 0x338, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0x68, "struct proc KBI p_flag"); _Static_assert(offsetof(struct proc, p_pid) == 0x74, "struct proc KBI p_pid"); _Static_assert(offsetof(struct proc, p_filemon) == 0x27c, "struct proc KBI p_filemon"); _Static_assert(offsetof(struct proc, p_comm) == 0x288, "struct proc KBI p_comm"); _Static_assert(offsetof(struct proc, p_emuldata) == 0x314, "struct proc KBI p_emuldata"); #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE(proc, , , lwp__exit); /* * thread related storage. 
*/ static uma_zone_t thread_zone; TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads); static struct mtx zombie_lock; MTX_SYSINIT(zombie_lock, &zombie_lock, "zombie lock", MTX_SPIN); static void thread_zombie(struct thread *); static int thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary); #define TID_BUFFER_SIZE 1024 struct mtx tid_lock; static struct unrhdr *tid_unrhdr; static lwpid_t tid_buffer[TID_BUFFER_SIZE]; static int tid_head, tid_tail; static MALLOC_DEFINE(M_TIDHASH, "tidhash", "thread hash"); struct tidhashhead *tidhashtbl; u_long tidhash; struct rwlock tidhash_lock; EVENTHANDLER_LIST_DEFINE(thread_ctor); EVENTHANDLER_LIST_DEFINE(thread_dtor); EVENTHANDLER_LIST_DEFINE(thread_init); EVENTHANDLER_LIST_DEFINE(thread_fini); static lwpid_t tid_alloc(void) { lwpid_t tid; tid = alloc_unr(tid_unrhdr); if (tid != -1) return (tid); mtx_lock(&tid_lock); if (tid_head == tid_tail) { mtx_unlock(&tid_lock); return (-1); } tid = tid_buffer[tid_head]; tid_head = (tid_head + 1) % TID_BUFFER_SIZE; mtx_unlock(&tid_lock); return (tid); } static void tid_free(lwpid_t tid) { lwpid_t tmp_tid = -1; mtx_lock(&tid_lock); if ((tid_tail + 1) % TID_BUFFER_SIZE == tid_head) { tmp_tid = tid_buffer[tid_head]; tid_head = (tid_head + 1) % TID_BUFFER_SIZE; } tid_buffer[tid_tail] = tid; tid_tail = (tid_tail + 1) % TID_BUFFER_SIZE; mtx_unlock(&tid_lock); if (tmp_tid != -1) free_unr(tid_unrhdr, tmp_tid); } /* * Prepare a thread for use. */ static int thread_ctor(void *mem, int size, void *arg, int flags) { struct thread *td; td = (struct thread *)mem; td->td_state = TDS_INACTIVE; td->td_oncpu = NOCPU; td->td_tid = tid_alloc(); /* * Note that td_critnest begins life as 1 because the thread is not * running and is thereby implicitly waiting to be on the receiving * end of a context switch. */ td->td_critnest = 1; td->td_lend_user_pri = PRI_MAX; EVENTHANDLER_DIRECT_INVOKE(thread_ctor, td); #ifdef AUDIT audit_thread_alloc(td); #endif umtx_thread_alloc(td); return (0); } /* * Reclaim a thread after use. */ static void thread_dtor(void *mem, int size, void *arg) { struct thread *td; td = (struct thread *)mem; #ifdef INVARIANTS /* Verify that this thread is in a safe state to free. */ switch (td->td_state) { case TDS_INHIBITED: case TDS_RUNNING: case TDS_CAN_RUN: case TDS_RUNQ: /* * We must never unlink a thread that is in one of * these states, because it is currently active. */ panic("bad state for thread unlinking"); /* NOTREACHED */ case TDS_INACTIVE: break; default: panic("bad thread state"); /* NOTREACHED */ } #endif #ifdef AUDIT audit_thread_free(td); #endif /* Free all OSD associated to this thread. */ osd_thread_exit(td); td_softdep_cleanup(td); MPASS(td->td_su == NULL); EVENTHANDLER_DIRECT_INVOKE(thread_dtor, td); tid_free(td->td_tid); } /* * Initialize type-stable parts of a thread (when newly created). */ static int thread_init(void *mem, int size, int flags) { struct thread *td; td = (struct thread *)mem; td->td_sleepqueue = sleepq_alloc(); td->td_turnstile = turnstile_alloc(); td->td_rlqe = NULL; EVENTHANDLER_DIRECT_INVOKE(thread_init, td); umtx_thread_init(td); td->td_kstack = 0; td->td_sel = NULL; return (0); } /* * Tear down type-stable parts of a thread (just before being discarded). 
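The tid_alloc()/tid_free() pair above layers a small deferred-free ring over the unit-number allocator so recently freed TIDs are not immediately reused. The ring logic reads cleanly on its own; a hedged userspace rendition (invented names, the tid_lock mutex elided):

#define CACHE_SIZE 1024

static int cache_buf[CACHE_SIZE];
static int cache_head, cache_tail;      /* head == tail means empty */

/* Mirror of tid_alloc()'s fallback: pop the oldest cached id, or -1. */
static int
cache_get(void)
{
        int id;

        if (cache_head == cache_tail)
                return (-1);
        id = cache_buf[cache_head];
        cache_head = (cache_head + 1) % CACHE_SIZE;
        return (id);
}

/*
 * Mirror of tid_free(): always stash the newly freed id; if the ring
 * is full, evict the oldest entry and hand it back to the caller so it
 * can be returned to the real allocator (free_unr() in the kernel).
 */
static int
cache_put(int id)
{
        int evicted = -1;

        if ((cache_tail + 1) % CACHE_SIZE == cache_head) {
                evicted = cache_buf[cache_head];
                cache_head = (cache_head + 1) % CACHE_SIZE;
        }
        cache_buf[cache_tail] = id;
        cache_tail = (cache_tail + 1) % CACHE_SIZE;
        return (evicted);
}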
*/ static void thread_fini(void *mem, int size) { struct thread *td; td = (struct thread *)mem; EVENTHANDLER_DIRECT_INVOKE(thread_fini, td); rlqentry_free(td->td_rlqe); turnstile_free(td->td_turnstile); sleepq_free(td->td_sleepqueue); umtx_thread_fini(td); seltdfini(td); } /* * For a newly created process, * link up all the structures and its initial threads etc. * called from: * {arch}/{arch}/machdep.c {arch}_init(), init386() etc. * proc_dtor() (should go away) * proc_init() */ void proc_linkup0(struct proc *p, struct thread *td) { TAILQ_INIT(&p->p_threads); /* all threads in proc */ proc_linkup(p, td); } void proc_linkup(struct proc *p, struct thread *td) { sigqueue_init(&p->p_sigqueue, p); p->p_ksi = ksiginfo_alloc(1); if (p->p_ksi != NULL) { /* XXX p_ksi may be null if ksiginfo zone is not ready */ p->p_ksi->ksi_flags = KSI_EXT | KSI_INS; } LIST_INIT(&p->p_mqnotifier); p->p_numthreads = 0; thread_link(td, p); } /* * Initialize global thread allocation resources. */ void threadinit(void) { mtx_init(&tid_lock, "TID lock", NULL, MTX_DEF); /* * pid_max cannot be greater than PID_MAX. * leave one number for thread0. */ tid_unrhdr = new_unrhdr(PID_MAX + 2, INT_MAX, &tid_lock); thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(), thread_ctor, thread_dtor, thread_init, thread_fini, 32 - 1, UMA_ZONE_NOFREE); tidhashtbl = hashinit(maxproc / 2, M_TIDHASH, &tidhash); rw_init(&tidhash_lock, "tidhash"); } /* * Place an unused thread on the zombie list. * Use the slpq as that must be unused by now. */ void thread_zombie(struct thread *td) { mtx_lock_spin(&zombie_lock); TAILQ_INSERT_HEAD(&zombie_threads, td, td_slpq); mtx_unlock_spin(&zombie_lock); } /* * Release a thread that has exited after cpu_throw(). */ void thread_stash(struct thread *td) { atomic_subtract_rel_int(&td->td_proc->p_exitthreads, 1); thread_zombie(td); } /* * Reap zombie resources. */ void thread_reap(void) { struct thread *td_first, *td_next; /* * Don't even bother to lock if none at this instant, * we really don't care about the next instant. */ if (!TAILQ_EMPTY(&zombie_threads)) { mtx_lock_spin(&zombie_lock); td_first = TAILQ_FIRST(&zombie_threads); if (td_first) TAILQ_INIT(&zombie_threads); mtx_unlock_spin(&zombie_lock); while (td_first) { td_next = TAILQ_NEXT(td_first, td_slpq); thread_cow_free(td_first); thread_free(td_first); td_first = td_next; } } } /* * Allocate a thread. */ struct thread * thread_alloc(int pages) { struct thread *td; thread_reap(); /* check if any zombies to get */ td = (struct thread *)uma_zalloc(thread_zone, M_WAITOK); KASSERT(td->td_kstack == 0, ("thread_alloc got thread with kstack")); if (!vm_thread_new(td, pages)) { uma_zfree(thread_zone, td); return (NULL); } cpu_thread_alloc(td); vm_domain_policy_init(&td->td_vm_dom_policy); return (td); } int thread_alloc_stack(struct thread *td, int pages) { KASSERT(td->td_kstack == 0, ("thread_alloc_stack called on a thread with kstack")); if (!vm_thread_new(td, pages)) return (0); cpu_thread_alloc(td); return (1); } /* * Deallocate a thread. 
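The THREAD zone above uses UMA's two-level lifecycle: ctor/dtor run on every allocation and free, while init/fini run only when items enter or leave the zone, so expensive type-stable state (sleep queues, turnstiles) survives alloc/free cycles. A hedged sketch of the same pattern with an invented type:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <vm/uma.h>

struct widget {
        void    *w_buf;         /* type-stable: survives alloc/free cycles */
        int      w_busy;
};

static uma_zone_t widget_zone;

static int
widget_ctor(void *mem, int size, void *arg, int flags)
{
        struct widget *w = mem;

        w->w_busy = 1;          /* cheap per-allocation state */
        return (0);
}

static void
widget_dtor(void *mem, int size, void *arg)
{
        struct widget *w = mem;

        w->w_busy = 0;
}

static int
widget_init(void *mem, int size, int flags)
{
        struct widget *w = mem;

        /* Expensive setup done once per item, like sleepq_alloc() above. */
        w->w_buf = malloc(128, M_TEMP, M_WAITOK);
        return (0);
}

static void
widget_fini(void *mem, int size)
{
        struct widget *w = mem;

        free(w->w_buf, M_TEMP);
}

/* Mirrors threadinit(): wire the four hooks into one zone. */
static void
widget_zone_init(void)
{
        widget_zone = uma_zcreate("WIDGET", sizeof(struct widget),
            widget_ctor, widget_dtor, widget_init, widget_fini,
            UMA_ALIGN_CACHE, 0);
}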
*/ void thread_free(struct thread *td) { lock_profile_thread_exit(td); if (td->td_cpuset) cpuset_rel(td->td_cpuset); td->td_cpuset = NULL; cpu_thread_free(td); if (td->td_kstack != 0) vm_thread_dispose(td); vm_domain_policy_cleanup(&td->td_vm_dom_policy); callout_drain(&td->td_slpcallout); uma_zfree(thread_zone, td); } void thread_cow_get_proc(struct thread *newtd, struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); newtd->td_ucred = crhold(p->p_ucred); newtd->td_limit = lim_hold(p->p_limit); newtd->td_cowgen = p->p_cowgen; } void thread_cow_get(struct thread *newtd, struct thread *td) { newtd->td_ucred = crhold(td->td_ucred); newtd->td_limit = lim_hold(td->td_limit); newtd->td_cowgen = td->td_cowgen; } void thread_cow_free(struct thread *td) { if (td->td_ucred != NULL) crfree(td->td_ucred); if (td->td_limit != NULL) lim_free(td->td_limit); } void thread_cow_update(struct thread *td) { struct proc *p; struct ucred *oldcred; struct plimit *oldlimit; p = td->td_proc; oldcred = NULL; oldlimit = NULL; PROC_LOCK(p); if (td->td_ucred != p->p_ucred) { oldcred = td->td_ucred; td->td_ucred = crhold(p->p_ucred); } if (td->td_limit != p->p_limit) { oldlimit = td->td_limit; td->td_limit = lim_hold(p->p_limit); } td->td_cowgen = p->p_cowgen; PROC_UNLOCK(p); if (oldcred != NULL) crfree(oldcred); if (oldlimit != NULL) lim_free(oldlimit); } /* * Discard the current thread and exit from its context. * Always called with scheduler locked. * * Because we can't free a thread while we're operating under its context, * push the current thread into our CPU's deadthread holder. This means * we needn't worry about someone else grabbing our context before we * do a cpu_throw(). */ void thread_exit(void) { uint64_t runtime, new_switchtime; struct thread *td; struct thread *td2; struct proc *p; int wakeup_swapper; td = curthread; p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p != NULL, ("thread exiting without a process")); CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td, (long)p->p_pid, td->td_name); SDT_PROBE0(proc, , , lwp__exit); KASSERT(TAILQ_EMPTY(&td->td_sigqueue.sq_list), ("signal pending")); #ifdef AUDIT AUDIT_SYSCALL_EXIT(0, td); #endif /* * drop FPU & debug register state storage, or any other * architecture specific resources that * would not be on a new untouched process. */ cpu_thread_exit(td); /* * The last thread is left attached to the process * So that the whole bundle gets recycled. Skip * all this stuff if we never had threads. * EXIT clears all sign of other threads when * it goes to single threading, so the last thread always * takes the short path. */ if (p->p_flag & P_HADTHREADS) { if (p->p_numthreads > 1) { atomic_add_int(&td->td_proc->p_exitthreads, 1); thread_unlink(td); td2 = FIRST_THREAD_IN_PROC(p); sched_exit_thread(td2, td); /* * The test below is NOT true if we are the * sole exiting thread. P_STOPPED_SINGLE is unset * in exit1() after it is the only survivor. */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); thread_unlock(p->p_singlethread); if (wakeup_swapper) kick_proc0(); } } PCPU_SET(deadthread, td); } else { /* * The last thread is exiting.. 
but not through exit() */ panic ("thread_exit: Last thread exiting on its own"); } } #ifdef HWPMC_HOOKS /* * If this thread is part of a process that is being tracked by hwpmc(4), * inform the module of the thread's impending exit. */ if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif PROC_UNLOCK(p); PROC_STATLOCK(p); thread_lock(td); PROC_SUNLOCK(p); /* Do the same timestamp bookkeeping that mi_switch() would do. */ new_switchtime = cpu_ticks(); runtime = new_switchtime - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, new_switchtime); PCPU_SET(switchticks, ticks); VM_CNT_INC(v_swtch); /* Save our resource usage in our process. */ td->td_ru.ru_nvcsw++; ruxagg(p, td); rucollect(&p->p_ru, &td->td_ru); PROC_STATUNLOCK(p); td->td_state = TDS_INACTIVE; #ifdef WITNESS witness_thread_exit(td); #endif CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td); sched_throw(td); panic("I'm a teapot!"); /* NOTREACHED */ } /* * Do any thread specific cleanups that may be needed in wait() * called with Giant, proc and schedlock not held. */ void thread_wait(struct proc *p) { struct thread *td; mtx_assert(&Giant, MA_NOTOWNED); KASSERT(p->p_numthreads == 1, ("multiple threads in thread_wait()")); KASSERT(p->p_exitthreads == 0, ("p_exitthreads leaking")); td = FIRST_THREAD_IN_PROC(p); /* Lock the last thread so we spin until it exits cpu_throw(). */ thread_lock(td); thread_unlock(td); lock_profile_thread_exit(td); cpuset_rel(td->td_cpuset); td->td_cpuset = NULL; cpu_thread_clean(td); thread_cow_free(td); callout_drain(&td->td_slpcallout); thread_reap(); /* check for zombie threads etc. */ } /* * Link a thread to a process. * set up anything that needs to be initialized for it to * be used by the process. */ void thread_link(struct thread *td, struct proc *p) { /* * XXX This can't be enabled because it's called for proc0 before * its lock has been created. * PROC_LOCK_ASSERT(p, MA_OWNED); */ td->td_state = TDS_INACTIVE; td->td_proc = p; td->td_flags = TDF_INMEM; LIST_INIT(&td->td_contested); LIST_INIT(&td->td_lprof[0]); LIST_INIT(&td->td_lprof[1]); sigqueue_init(&td->td_sigqueue, p); callout_init(&td->td_slpcallout, 1); TAILQ_INSERT_TAIL(&p->p_threads, td, td_plist); p->p_numthreads++; } /* * Called from: * thread_exit() */ void thread_unlink(struct thread *td) { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); TAILQ_REMOVE(&p->p_threads, td, td_plist); p->p_numthreads--; /* could clear a few other things here */ /* Must NOT clear links to proc! */ } static int calc_remaining(struct proc *p, int mode) { int remaining; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); if (mode == SINGLE_EXIT) remaining = p->p_numthreads; else if (mode == SINGLE_BOUNDARY) remaining = p->p_numthreads - p->p_boundary_count; else if (mode == SINGLE_NO_EXIT || mode == SINGLE_ALLPROC) remaining = p->p_numthreads - p->p_suspcount; else panic("calc_remaining: wrong mode %d", mode); return (remaining); } static int remain_for_mode(int mode) { return (mode == SINGLE_ALLPROC ? 
0 : 1); } static int weed_inhib(int mode, struct thread *td2, struct proc *p) { int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td2, MA_OWNED); wakeup_swapper = 0; switch (mode) { case SINGLE_EXIT: if (TD_IS_SUSPENDED(td2)) wakeup_swapper |= thread_unsuspend_one(td2, p, true); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) wakeup_swapper |= sleepq_abort(td2, EINTR); break; case SINGLE_BOUNDARY: case SINGLE_NO_EXIT: if (TD_IS_SUSPENDED(td2) && (td2->td_flags & TDF_BOUNDARY) == 0) wakeup_swapper |= thread_unsuspend_one(td2, p, false); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) wakeup_swapper |= sleepq_abort(td2, ERESTART); break; case SINGLE_ALLPROC: /* * ALLPROC suspend tries to avoid spurious EINTR for * threads sleeping interruptibly, by suspending the * thread directly, similarly to sig_suspend_threads(). * Since such sleep is not performed at the user * boundary, TDF_BOUNDARY flag is not set, and TDF_ALLPROCSUSP * is used to avoid immediate un-suspend. */ if (TD_IS_SUSPENDED(td2) && (td2->td_flags & (TDF_BOUNDARY | TDF_ALLPROCSUSP)) == 0) wakeup_swapper |= thread_unsuspend_one(td2, p, false); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) { if ((td2->td_flags & TDF_SBDRY) == 0) { thread_suspend_one(td2); td2->td_flags |= TDF_ALLPROCSUSP; } else { wakeup_swapper |= sleepq_abort(td2, ERESTART); } } break; } return (wakeup_swapper); } /* * Enforce single-threading. * * Returns 1 if the caller must abort (another thread is waiting to * exit the process or similar). Process is locked! * Returns 0 when you are successfully the only thread running. * A process has successfully single threaded in the suspend mode when * there are no threads in user mode. Threads in the kernel must be * allowed to continue until they get to the user boundary. They may even * copy out their return values and data before suspending. They may however be * accelerated in reaching the user boundary as we will wake up * any sleeping threads that are interruptible (PCATCH). */ int thread_single(struct proc *p, int mode) { struct thread *td; struct thread *td2; int remaining, wakeup_swapper; td = curthread; KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY || mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT, ("invalid mode %d", mode)); /* * If allowing non-ALLPROC singlethreading for non-curproc * callers, calc_remaining() and remain_for_mode() should be * adjusted to also account for td->td_proc != p. For now * this is not implemented because it is not used. */ KASSERT((mode == SINGLE_ALLPROC && td->td_proc != p) || (mode != SINGLE_ALLPROC && td->td_proc == p), ("mode %d proc %p curproc %p", mode, p, td->td_proc)); mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_flag & P_HADTHREADS) == 0 && mode != SINGLE_ALLPROC) return (0); /* Is someone already single threading?
*/ if (p->p_singlethread != NULL && p->p_singlethread != td) return (1); if (mode == SINGLE_EXIT) { p->p_flag |= P_SINGLE_EXIT; p->p_flag &= ~P_SINGLE_BOUNDARY; } else { p->p_flag &= ~P_SINGLE_EXIT; if (mode == SINGLE_BOUNDARY) p->p_flag |= P_SINGLE_BOUNDARY; else p->p_flag &= ~P_SINGLE_BOUNDARY; } if (mode == SINGLE_ALLPROC) p->p_flag |= P_TOTAL_STOP; p->p_flag |= P_STOPPED_SINGLE; PROC_SLOCK(p); p->p_singlethread = td; remaining = calc_remaining(p, mode); while (remaining != remain_for_mode(mode)) { if (P_SHOULDSTOP(p) != P_STOPPED_SINGLE) goto stopme; wakeup_swapper = 0; FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; thread_lock(td2); td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; if (TD_IS_INHIBITED(td2)) { wakeup_swapper |= weed_inhib(mode, td2, p); #ifdef SMP } else if (TD_IS_RUNNING(td2) && td != td2) { forward_signal(td2); #endif } thread_unlock(td2); } if (wakeup_swapper) kick_proc0(); remaining = calc_remaining(p, mode); /* * Maybe we suspended some threads.. was it enough? */ if (remaining == remain_for_mode(mode)) break; stopme: /* * Wake us up when everyone else has suspended. * In the mean time we suspend as well. */ thread_suspend_switch(td, p); remaining = calc_remaining(p, mode); } if (mode == SINGLE_EXIT) { /* * Convert the process to an unthreaded process. The * SINGLE_EXIT is called by exit1() or execve(), in * both cases other threads must be retired. */ KASSERT(p->p_numthreads == 1, ("Unthreading with >1 threads")); p->p_singlethread = NULL; p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_HADTHREADS); /* * Wait for any remaining threads to exit cpu_throw(). */ while (p->p_exitthreads != 0) { PROC_SUNLOCK(p); PROC_UNLOCK(p); sched_relinquish(td); PROC_LOCK(p); PROC_SLOCK(p); } } else if (mode == SINGLE_BOUNDARY) { /* * Wait until all suspended threads are removed from * the processors. The thread_suspend_check() * increments p_boundary_count while it is still * running, which makes it possible for the execve() * to destroy vmspace while our other threads are * still using the address space. * * We lock the thread, which is only allowed to * succeed after context switch code finished using * the address space. */ FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; thread_lock(td2); KASSERT((td2->td_flags & TDF_BOUNDARY) != 0, ("td %p not on boundary", td2)); KASSERT(TD_IS_SUSPENDED(td2), ("td %p is not suspended", td2)); thread_unlock(td2); } } PROC_SUNLOCK(p); return (0); } bool thread_suspend_check_needed(void) { struct proc *p; struct thread *td; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); return (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) != 0 && (td->td_dbgflags & TDB_SUSPEND) != 0)); } /* * Called in from locations that can safely check to see * whether we have to suspend or at least throttle for a * single-thread event (e.g. fork). * * Such locations include userret(). * If the "return_instead" argument is non zero, the thread must be able to * accept 0 (caller may continue), or 1 (caller must abort) as a result. * * The 'return_instead' argument tells the function if it may do a * thread_exit() or suspend, or whether the caller must abort and back * out instead. * * If the thread that set the single_threading request has set the * P_SINGLE_EXIT bit in the process flags then this call will never return * if 'return_instead' is false, but will exit. 
* * P_SINGLE_EXIT | return_instead == 0| return_instead != 0 *---------------+--------------------+--------------------- * 0 | returns 0 | returns 0 or 1 * | when ST ends | immediately *---------------+--------------------+--------------------- * 1 | thread exits | returns 1 * | | immediately * 0 = thread_exit() or suspension ok, * other = return error instead of stopping the thread. * * While a full suspension is under effect, even a single threading * thread would be suspended if it made this call (but it shouldn't). * This call should only be made from places where * thread_exit() would be safe as that may be the outcome unless * return_instead is set. */ int thread_suspend_check(int return_instead) { struct thread *td; struct proc *p; int wakeup_swapper; td = curthread; p = td->td_proc; mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); while (thread_suspend_check_needed()) { if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { KASSERT(p->p_singlethread != NULL, ("singlethread not set")); /* * The only suspension in action is a * single-threading. Single threader need not stop. * It is safe to access p->p_singlethread unlocked * because it can only be set to our address by us. */ if (p->p_singlethread == td) return (0); /* Exempt from stopping. */ } if ((p->p_flag & P_SINGLE_EXIT) && return_instead) return (EINTR); /* Should we goto user boundary if we didn't come from there? */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE && (p->p_flag & P_SINGLE_BOUNDARY) && return_instead) return (ERESTART); /* * Ignore suspend requests if they are deferred. */ if ((td->td_flags & TDF_SBDRY) != 0) { KASSERT(return_instead, ("TDF_SBDRY set for unsafe thread_suspend_check")); KASSERT((td->td_flags & (TDF_SEINTR | TDF_SERESTART)) != (TDF_SEINTR | TDF_SERESTART), ("both TDF_SEINTR and TDF_SERESTART")); return (TD_SBDRY_INTR(td) ? TD_SBDRY_ERRNO(td) : 0); } /* * If the process is waiting for us to exit, * this thread should just suicide. * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE. */ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) { PROC_UNLOCK(p); /* * Allow Linux emulation layer to do some work * before thread suicide. */ if (__predict_false(p->p_sysent->sv_thread_detach != NULL)) (p->p_sysent->sv_thread_detach)(td); umtx_thread_exit(td); kern_thr_exit(td); panic("stopped thread did not exit"); } PROC_SLOCK(p); thread_stopped(p); if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount + 1) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); thread_unlock(p->p_singlethread); if (wakeup_swapper) kick_proc0(); } } PROC_UNLOCK(p); thread_lock(td); /* * When a thread suspends, it just * gets taken off all queues. */ thread_suspend_one(td); if (return_instead == 0) { p->p_boundary_count++; td->td_flags |= TDF_BOUNDARY; } PROC_SUNLOCK(p); mi_switch(SW_INVOL | SWT_SUSPEND, NULL); thread_unlock(td); PROC_LOCK(p); } return (0); } void thread_suspend_switch(struct thread *td, struct proc *p) { KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); /* * We implement thread_suspend_one in stages here to avoid * dropping the proc lock while the thread lock is owned. 
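thread_suspend_check() is, at heart, a cooperative park-at-safe-points protocol. A hedged userspace analogue with pthreads shows the shape of the handshake between the suspending threads and the single-threading coordinator (all names invented; the kernel's sleepqueue and scheduler machinery is reduced to one mutex and two condvars):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t s_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t s_parked = PTHREAD_COND_INITIALIZER;   /* -> coordinator */
static pthread_cond_t s_release = PTHREAD_COND_INITIALIZER;  /* -> workers */
static bool s_stop;
static int s_nworkers, s_suspended;

/*
 * Workers call this at safe points, the way userret() calls
 * thread_suspend_check(); they park until the coordinator is done.
 */
static void
suspend_check(void)
{
        pthread_mutex_lock(&s_mtx);
        while (s_stop) {
                if (++s_suspended == s_nworkers)
                        pthread_cond_signal(&s_parked);
                pthread_cond_wait(&s_release, &s_mtx);
                s_suspended--;
        }
        pthread_mutex_unlock(&s_mtx);
}

/*
 * Coordinator: the moral equivalent of thread_single() followed by
 * thread_single_end(): request the stop, wait for every worker to
 * park, run the critical work single-threaded, then release them.
 */
static void
single_thread_section(void (*work)(void))
{
        pthread_mutex_lock(&s_mtx);
        s_stop = true;
        while (s_suspended < s_nworkers)
                pthread_cond_wait(&s_parked, &s_mtx);
        work();                         /* all workers parked here */
        s_stop = false;
        pthread_cond_broadcast(&s_release);
        pthread_mutex_unlock(&s_mtx);
}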
*/ if (p == td->td_proc) { thread_stopped(p); p->p_suspcount++; } PROC_UNLOCK(p); thread_lock(td); td->td_flags &= ~TDF_NEEDSUSPCHK; TD_SET_SUSPENDED(td); sched_sleep(td, 0); PROC_SUNLOCK(p); DROP_GIANT(); mi_switch(SW_VOL | SWT_SUSPEND, NULL); thread_unlock(td); PICKUP_GIANT(); PROC_LOCK(p); PROC_SLOCK(p); } void thread_suspend_one(struct thread *td) { struct proc *p; p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); p->p_suspcount++; td->td_flags &= ~TDF_NEEDSUSPCHK; TD_SET_SUSPENDED(td); sched_sleep(td, 0); } static int thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary) { THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended")); TD_CLR_SUSPENDED(td); td->td_flags &= ~TDF_ALLPROCSUSP; if (td->td_proc == p) { PROC_SLOCK_ASSERT(p, MA_OWNED); p->p_suspcount--; if (boundary && (td->td_flags & TDF_BOUNDARY) != 0) { td->td_flags &= ~TDF_BOUNDARY; p->p_boundary_count--; } } return (setrunnable(td)); } /* * Allow all threads blocked by single threading to continue running. */ void thread_unsuspend(struct proc *p) { struct thread *td; int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); wakeup_swapper = 0; if (!P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_IS_SUSPENDED(td)) { wakeup_swapper |= thread_unsuspend_one(td, p, true); } thread_unlock(td); } } else if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE && p->p_numthreads == p->p_suspcount) { /* * Stopping everything also did the job for the single * threading request. Now we've downgraded to single-threaded, * let it continue. */ if (p->p_singlethread->td_proc == p) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); thread_unlock(p->p_singlethread); } } if (wakeup_swapper) kick_proc0(); } /* * End the single threading mode.. */ void thread_single_end(struct proc *p, int mode) { struct thread *td; int wakeup_swapper; KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY || mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT, ("invalid mode %d", mode)); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT((mode == SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) != 0) || (mode != SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) == 0), ("mode %d does not match P_TOTAL_STOP", mode)); KASSERT(mode == SINGLE_ALLPROC || p->p_singlethread == curthread, ("thread_single_end from other thread %p %p", curthread, p->p_singlethread)); KASSERT(mode != SINGLE_BOUNDARY || (p->p_flag & P_SINGLE_BOUNDARY) != 0, ("mis-matched SINGLE_BOUNDARY flags %x", p->p_flag)); p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY | P_TOTAL_STOP); PROC_SLOCK(p); p->p_singlethread = NULL; wakeup_swapper = 0; /* * If there are other threads they may now run, * unless of course there is a blanket 'stop order' * on the process. The single threader must be allowed * to continue however as this is a bad place to stop. 
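 *
 * (Illustrative pairing, not from this file: users such as exit or
 * exec bracket their single-threaded region roughly as follows.)
 *
 *	PROC_LOCK(p);
 *	if (thread_single(p, SINGLE_BOUNDARY) != 0)
 *		... fail, process is still multi-threaded ...
 *	... single-threaded work ...
 *	thread_single_end(p, SINGLE_BOUNDARY);
 *	PROC_UNLOCK(p);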
*/ if (p->p_numthreads != remain_for_mode(mode) && !P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_IS_SUSPENDED(td)) { wakeup_swapper |= thread_unsuspend_one(td, p, mode == SINGLE_BOUNDARY); } thread_unlock(td); } } KASSERT(mode != SINGLE_BOUNDARY || p->p_boundary_count == 0, ("inconsistent boundary count %d", p->p_boundary_count)); PROC_SUNLOCK(p); if (wakeup_swapper) kick_proc0(); } struct thread * thread_find(struct proc *p, lwpid_t tid) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); FOREACH_THREAD_IN_PROC(p, td) { if (td->td_tid == tid) break; } return (td); } /* Locate a thread by number; return with proc lock held. */ struct thread * tdfind(lwpid_t tid, pid_t pid) { #define RUN_THRESH 16 struct thread *td; int run = 0; rw_rlock(&tidhash_lock); LIST_FOREACH(td, TIDHASH(tid), td_hash) { if (td->td_tid == tid) { if (pid != -1 && td->td_proc->p_pid != pid) { td = NULL; break; } PROC_LOCK(td->td_proc); if (td->td_proc->p_state == PRS_NEW) { PROC_UNLOCK(td->td_proc); td = NULL; break; } if (run > RUN_THRESH) { if (rw_try_upgrade(&tidhash_lock)) { LIST_REMOVE(td, td_hash); LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); rw_wunlock(&tidhash_lock); return (td); } } break; } run++; } rw_runlock(&tidhash_lock); return (td); } void tidhash_add(struct thread *td) { rw_wlock(&tidhash_lock); LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); rw_wunlock(&tidhash_lock); } void tidhash_remove(struct thread *td) { rw_wlock(&tidhash_lock); LIST_REMOVE(td, td_hash); rw_wunlock(&tidhash_lock); } Index: head/sys/kern/kern_umtx.c =================================================================== --- head/sys/kern/kern_umtx.c (revision 326270) +++ head/sys/kern/kern_umtx.c (revision 326271) @@ -1,4574 +1,4576 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015, 2016 The FreeBSD Foundation * Copyright (c) 2004, David Xu * Copyright (c) 2002, Jeffrey Roberson * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_umtx_profiling.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #endif #define _UMUTEX_TRY 1 #define _UMUTEX_WAIT 2 #ifdef UMTX_PROFILING #define UPROF_PERC_BIGGER(w, f, sw, sf) \ (((w) > (sw)) || ((w) == (sw) && (f) > (sf))) #endif /* Priority inheritance mutex info. */ struct umtx_pi { /* Owner thread */ struct thread *pi_owner; /* Reference count */ int pi_refcount; /* List entry to link umtx holding by thread */ TAILQ_ENTRY(umtx_pi) pi_link; /* List entry in hash */ TAILQ_ENTRY(umtx_pi) pi_hashlink; /* List for waiters */ TAILQ_HEAD(,umtx_q) pi_blocked; /* Identify a userland lock object */ struct umtx_key pi_key; }; /* A userland synchronous object user. */ struct umtx_q { /* Linked list for the hash. */ TAILQ_ENTRY(umtx_q) uq_link; /* Umtx key. */ struct umtx_key uq_key; /* Umtx flags. */ int uq_flags; #define UQF_UMTXQ 0x0001 /* The thread waits on. */ struct thread *uq_thread; /* * Blocked on PI mutex. read can use chain lock * or umtx_lock, write must have both chain lock and * umtx_lock being hold. */ struct umtx_pi *uq_pi_blocked; /* On blocked list */ TAILQ_ENTRY(umtx_q) uq_lockq; /* Thread contending with us */ TAILQ_HEAD(,umtx_pi) uq_pi_contested; /* Inherited priority from PP mutex */ u_char uq_inherited_pri; /* Spare queue ready to be reused */ struct umtxq_queue *uq_spare_queue; /* The queue we on */ struct umtxq_queue *uq_cur_queue; }; TAILQ_HEAD(umtxq_head, umtx_q); /* Per-key wait-queue */ struct umtxq_queue { struct umtxq_head head; struct umtx_key key; LIST_ENTRY(umtxq_queue) link; int length; }; LIST_HEAD(umtxq_list, umtxq_queue); /* Userland lock object's wait-queue chain */ struct umtxq_chain { /* Lock for this chain. */ struct mtx uc_lock; /* List of sleep queues. */ struct umtxq_list uc_queue[2]; #define UMTX_SHARED_QUEUE 0 #define UMTX_EXCLUSIVE_QUEUE 1 LIST_HEAD(, umtxq_queue) uc_spare_queue; /* Busy flag */ char uc_busy; /* Chain lock waiters */ int uc_waiters; /* All PI in the list */ TAILQ_HEAD(,umtx_pi) uc_pi_list; #ifdef UMTX_PROFILING u_int length; u_int max_length; #endif }; #define UMTXQ_LOCKED_ASSERT(uc) mtx_assert(&(uc)->uc_lock, MA_OWNED) /* * Don't propagate time-sharing priority, there is a security reason, * a user can simply introduce PI-mutex, let thread A lock the mutex, * and let another thread B block on the mutex, because B is * sleeping, its priority will be boosted, this causes A's priority to * be boosted via priority propagating too and will never be lowered even * if it is using 100%CPU, this is unfair to other processes. */ #define UPRI(td) (((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\ (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\ PRI_MAX_TIMESHARE : (td)->td_user_pri) #define GOLDEN_RATIO_PRIME 2654404609U #ifndef UMTX_CHAINS #define UMTX_CHAINS 512 #endif #define UMTX_SHIFTS (__WORD_BIT - 9) #define GET_SHARE(flags) \ (((flags) & USYNC_PROCESS_SHARED) == 0 ? 
THREAD_SHARE : PROCESS_SHARE) #define BUSY_SPINS 200 struct abs_timeout { int clockid; bool is_abs_real; /* TIMER_ABSTIME && CLOCK_REALTIME* */ struct timespec cur; struct timespec end; }; #ifdef COMPAT_FREEBSD32 struct umutex32 { volatile __lwpid_t m_owner; /* Owner of the mutex */ __uint32_t m_flags; /* Flags of the mutex */ __uint32_t m_ceilings[2]; /* Priority protect ceiling */ __uint32_t m_rb_lnk; /* Robust linkage */ __uint32_t m_pad; __uint32_t m_spare[2]; }; _Static_assert(sizeof(struct umutex) == sizeof(struct umutex32), "umutex32"); _Static_assert(__offsetof(struct umutex, m_spare[0]) == __offsetof(struct umutex32, m_spare[0]), "m_spare32"); #endif int umtx_shm_vnobj_persistent = 0; SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_vnode_persistent, CTLFLAG_RWTUN, &umtx_shm_vnobj_persistent, 0, "False forces destruction of umtx attached to file, on last close"); static int umtx_max_rb = 1000; SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_max_robust, CTLFLAG_RWTUN, &umtx_max_rb, 0, ""); static uma_zone_t umtx_pi_zone; static struct umtxq_chain umtxq_chains[2][UMTX_CHAINS]; static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory"); static int umtx_pi_allocated; static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug"); SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD, &umtx_pi_allocated, 0, "Allocated umtx_pi"); static int umtx_verbose_rb = 1; SYSCTL_INT(_debug_umtx, OID_AUTO, robust_faults_verbose, CTLFLAG_RWTUN, &umtx_verbose_rb, 0, ""); #ifdef UMTX_PROFILING static long max_length; SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length"); static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats"); #endif static void abs_timeout_update(struct abs_timeout *timo); static void umtx_shm_init(void); static void umtxq_sysinit(void *); static void umtxq_hash(struct umtx_key *key); static struct umtxq_chain *umtxq_getchain(struct umtx_key *key); static void umtxq_lock(struct umtx_key *key); static void umtxq_unlock(struct umtx_key *key); static void umtxq_busy(struct umtx_key *key); static void umtxq_unbusy(struct umtx_key *key); static void umtxq_insert_queue(struct umtx_q *uq, int q); static void umtxq_remove_queue(struct umtx_q *uq, int q); static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *); static int umtxq_count(struct umtx_key *key); static struct umtx_pi *umtx_pi_alloc(int); static void umtx_pi_free(struct umtx_pi *pi); static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags, bool rb); static void umtx_thread_cleanup(struct thread *td); static void umtx_exec_hook(void *arg __unused, struct proc *p __unused, struct image_params *imgp __unused); SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL); #define umtxq_signal(key, nwake) umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE) #define umtxq_insert(uq) umtxq_insert_queue((uq), UMTX_SHARED_QUEUE) #define umtxq_remove(uq) umtxq_remove_queue((uq), UMTX_SHARED_QUEUE) static struct mtx umtx_lock; #ifdef UMTX_PROFILING static void umtx_init_profiling(void) { struct sysctl_oid *chain_oid; char chain_name[10]; int i; for (i = 0; i < UMTX_CHAINS; ++i) { snprintf(chain_name, sizeof(chain_name), "%d", i); chain_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO, chain_name, CTLFLAG_RD, NULL, "umtx hash stats"); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL); SYSCTL_ADD_INT(NULL, 
SYSCTL_CHILDREN(chain_oid), OID_AUTO, "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL); } } static int sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS) { char buf[512]; struct sbuf sb; struct umtxq_chain *uc; u_int fract, i, j, tot, whole; u_int sf0, sf1, sf2, sf3, sf4; u_int si0, si1, si2, si3, si4; u_int sw0, sw1, sw2, sw3, sw4; sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); for (i = 0; i < 2; i++) { tot = 0; for (j = 0; j < UMTX_CHAINS; ++j) { uc = &umtxq_chains[i][j]; mtx_lock(&uc->uc_lock); tot += uc->max_length; mtx_unlock(&uc->uc_lock); } if (tot == 0) sbuf_printf(&sb, "%u) Empty ", i); else { sf0 = sf1 = sf2 = sf3 = sf4 = 0; si0 = si1 = si2 = si3 = si4 = 0; sw0 = sw1 = sw2 = sw3 = sw4 = 0; for (j = 0; j < UMTX_CHAINS; j++) { uc = &umtxq_chains[i][j]; mtx_lock(&uc->uc_lock); whole = uc->max_length * 100; mtx_unlock(&uc->uc_lock); fract = (whole % tot) * 100; if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) { sf0 = fract; si0 = j; sw0 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw1, sf1)) { sf1 = fract; si1 = j; sw1 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw2, sf2)) { sf2 = fract; si2 = j; sw2 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw3, sf3)) { sf3 = fract; si3 = j; sw3 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw4, sf4)) { sf4 = fract; si4 = j; sw4 = whole; } } sbuf_printf(&sb, "queue %u:\n", i); sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot, sf0 / tot, si0); sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot, sf1 / tot, si1); sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot, sf2 / tot, si2); sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot, sf3 / tot, si3); sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot, sf4 / tot, si4); } } sbuf_trim(&sb); sbuf_finish(&sb); sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); sbuf_delete(&sb); return (0); } static int sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS) { struct umtxq_chain *uc; u_int i, j; int clear, error; clear = 0; error = sysctl_handle_int(oidp, &clear, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (clear != 0) { for (i = 0; i < 2; ++i) { for (j = 0; j < UMTX_CHAINS; ++j) { uc = &umtxq_chains[i][j]; mtx_lock(&uc->uc_lock); uc->length = 0; uc->max_length = 0; mtx_unlock(&uc->uc_lock); } } } return (0); } SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics"); SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length"); #endif static void umtxq_sysinit(void *arg __unused) { int i, j; umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); for (i = 0; i < 2; ++i) { for (j = 0; j < UMTX_CHAINS; ++j) { mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL, MTX_DEF | MTX_DUPOK); LIST_INIT(&umtxq_chains[i][j].uc_queue[0]); LIST_INIT(&umtxq_chains[i][j].uc_queue[1]); LIST_INIT(&umtxq_chains[i][j].uc_spare_queue); TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list); umtxq_chains[i][j].uc_busy = 0; umtxq_chains[i][j].uc_waiters = 0; #ifdef UMTX_PROFILING umtxq_chains[i][j].length = 0; umtxq_chains[i][j].max_length = 0; #endif } } #ifdef UMTX_PROFILING umtx_init_profiling(); #endif mtx_init(&umtx_lock, "umtx lock", NULL, MTX_DEF); EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL, EVENTHANDLER_PRI_ANY); umtx_shm_init(); } struct umtx_q * 
umtxq_alloc(void) { struct umtx_q *uq; uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO); uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO); TAILQ_INIT(&uq->uq_spare_queue->head); TAILQ_INIT(&uq->uq_pi_contested); uq->uq_inherited_pri = PRI_MAX; return (uq); } void umtxq_free(struct umtx_q *uq) { MPASS(uq->uq_spare_queue != NULL); free(uq->uq_spare_queue, M_UMTX); free(uq, M_UMTX); } static inline void umtxq_hash(struct umtx_key *key) { unsigned n; n = (uintptr_t)key->info.both.a + key->info.both.b; key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS; } static inline struct umtxq_chain * umtxq_getchain(struct umtx_key *key) { if (key->type <= TYPE_SEM) return (&umtxq_chains[1][key->hash]); return (&umtxq_chains[0][key->hash]); } /* * Lock a chain. */ static inline void umtxq_lock(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_lock(&uc->uc_lock); } /* * Unlock a chain. */ static inline void umtxq_unlock(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_unlock(&uc->uc_lock); } /* * Set chain to busy state when following operation * may be blocked (kernel mutex can not be used). */ static inline void umtxq_busy(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_assert(&uc->uc_lock, MA_OWNED); if (uc->uc_busy) { #ifdef SMP if (smp_cpus > 1) { int count = BUSY_SPINS; if (count > 0) { umtxq_unlock(key); while (uc->uc_busy && --count > 0) cpu_spinwait(); umtxq_lock(key); } } #endif while (uc->uc_busy) { uc->uc_waiters++; msleep(uc, &uc->uc_lock, 0, "umtxqb", 0); uc->uc_waiters--; } } uc->uc_busy = 1; } /* * Unbusy a chain. */ static inline void umtxq_unbusy(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_assert(&uc->uc_lock, MA_OWNED); KASSERT(uc->uc_busy != 0, ("not busy")); uc->uc_busy = 0; if (uc->uc_waiters) wakeup_one(uc); } static inline void umtxq_unbusy_unlocked(struct umtx_key *key) { umtxq_lock(key); umtxq_unbusy(key); umtxq_unlock(key); } static struct umtxq_queue * umtxq_queue_lookup(struct umtx_key *key, int q) { struct umtxq_queue *uh; struct umtxq_chain *uc; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); LIST_FOREACH(uh, &uc->uc_queue[q], link) { if (umtx_key_match(&uh->key, key)) return (uh); } return (NULL); } static inline void umtxq_insert_queue(struct umtx_q *uq, int q) { struct umtxq_queue *uh; struct umtxq_chain *uc; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue")); uh = umtxq_queue_lookup(&uq->uq_key, q); if (uh != NULL) { LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link); } else { uh = uq->uq_spare_queue; uh->key = uq->uq_key; LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link); #ifdef UMTX_PROFILING uc->length++; if (uc->length > uc->max_length) { uc->max_length = uc->length; if (uc->max_length > max_length) max_length = uc->max_length; } #endif } uq->uq_spare_queue = NULL; TAILQ_INSERT_TAIL(&uh->head, uq, uq_link); uh->length++; uq->uq_flags |= UQF_UMTXQ; uq->uq_cur_queue = uh; return; } static inline void umtxq_remove_queue(struct umtx_q *uq, int q) { struct umtxq_chain *uc; struct umtxq_queue *uh; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); if (uq->uq_flags & UQF_UMTXQ) { uh = uq->uq_cur_queue; TAILQ_REMOVE(&uh->head, uq, uq_link); uh->length--; uq->uq_flags &= ~UQF_UMTXQ; if (TAILQ_EMPTY(&uh->head)) { KASSERT(uh->length == 0, ("inconsistent umtxq_queue length")); #ifdef UMTX_PROFILING 
uc->length--; #endif LIST_REMOVE(uh, link); } else { uh = LIST_FIRST(&uc->uc_spare_queue); KASSERT(uh != NULL, ("uc_spare_queue is empty")); LIST_REMOVE(uh, link); } uq->uq_spare_queue = uh; uq->uq_cur_queue = NULL; } } /* * Check if there are multiple waiters */ static int umtxq_count(struct umtx_key *key) { struct umtxq_chain *uc; struct umtxq_queue *uh; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE); if (uh != NULL) return (uh->length); return (0); } /* * Check if there are multiple PI waiters and returns first * waiter. */ static int umtxq_count_pi(struct umtx_key *key, struct umtx_q **first) { struct umtxq_chain *uc; struct umtxq_queue *uh; *first = NULL; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE); if (uh != NULL) { *first = TAILQ_FIRST(&uh->head); return (uh->length); } return (0); } static int umtxq_check_susp(struct thread *td) { struct proc *p; int error; /* * The check for TDF_NEEDSUSPCHK is racy, but it is enough to * eventually break the lockstep loop. */ if ((td->td_flags & TDF_NEEDSUSPCHK) == 0) return (0); error = 0; p = td->td_proc; PROC_LOCK(p); if (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) { if (p->p_flag & P_SINGLE_EXIT) error = EINTR; else error = ERESTART; } PROC_UNLOCK(p); return (error); } /* * Wake up threads waiting on an userland object. */ static int umtxq_signal_queue(struct umtx_key *key, int n_wake, int q) { struct umtxq_chain *uc; struct umtxq_queue *uh; struct umtx_q *uq; int ret; ret = 0; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); uh = umtxq_queue_lookup(key, q); if (uh != NULL) { while ((uq = TAILQ_FIRST(&uh->head)) != NULL) { umtxq_remove_queue(uq, q); wakeup(uq); if (++ret >= n_wake) return (ret); } } return (ret); } /* * Wake up specified thread. */ static inline void umtxq_signal_thread(struct umtx_q *uq) { struct umtxq_chain *uc; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); umtxq_remove(uq); wakeup(uq); } static inline int tstohz(const struct timespec *tsp) { struct timeval tv; TIMESPEC_TO_TIMEVAL(&tv, tsp); return tvtohz(&tv); } static void abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute, const struct timespec *timeout) { timo->clockid = clockid; if (!absolute) { timo->is_abs_real = false; abs_timeout_update(timo); timo->end = timo->cur; timespecadd(&timo->end, timeout); } else { timo->end = *timeout; timo->is_abs_real = clockid == CLOCK_REALTIME || clockid == CLOCK_REALTIME_FAST || clockid == CLOCK_REALTIME_PRECISE; /* * If is_abs_real, umtxq_sleep will read the clock * after setting td_rtcgen; otherwise, read it here. 
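 *
 * (Illustrative userland view, not from this file: an absolute
 * deadline arrives as a struct _umtx_time with UMTX_ABSTIME set in
 * _flags, e.g.)
 *
 *	struct _umtx_time to;
 *	to._clockid = CLOCK_MONOTONIC;
 *	to._flags = UMTX_ABSTIME;
 *	clock_gettime(CLOCK_MONOTONIC, &to._timeout);
 *	to._timeout.tv_sec += 5;	-- deadline: now + 5 seconds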
*/ if (!timo->is_abs_real) { abs_timeout_update(timo); } } } static void abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime) { abs_timeout_init(timo, umtxtime->_clockid, (umtxtime->_flags & UMTX_ABSTIME) != 0, &umtxtime->_timeout); } static inline void abs_timeout_update(struct abs_timeout *timo) { kern_clock_gettime(curthread, timo->clockid, &timo->cur); } static int abs_timeout_gethz(struct abs_timeout *timo) { struct timespec tts; if (timespeccmp(&timo->end, &timo->cur, <=)) return (-1); tts = timo->end; timespecsub(&tts, &timo->cur); return (tstohz(&tts)); } static uint32_t umtx_unlock_val(uint32_t flags, bool rb) { if (rb) return (UMUTEX_RB_OWNERDEAD); else if ((flags & UMUTEX_NONCONSISTENT) != 0) return (UMUTEX_RB_NOTRECOV); else return (UMUTEX_UNOWNED); } /* * Put thread into sleep state, before sleeping, check if * thread was removed from umtx queue. */ static inline int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime) { struct umtxq_chain *uc; int error, timo; if (abstime != NULL && abstime->is_abs_real) { curthread->td_rtcgen = atomic_load_acq_int(&rtc_generation); abs_timeout_update(abstime); } uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); for (;;) { if (!(uq->uq_flags & UQF_UMTXQ)) { error = 0; break; } if (abstime != NULL) { timo = abs_timeout_gethz(abstime); if (timo < 0) { error = ETIMEDOUT; break; } } else timo = 0; error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo); if (error == EINTR || error == ERESTART) { umtxq_lock(&uq->uq_key); break; } if (abstime != NULL) { if (abstime->is_abs_real) curthread->td_rtcgen = atomic_load_acq_int(&rtc_generation); abs_timeout_update(abstime); } umtxq_lock(&uq->uq_key); } curthread->td_rtcgen = 0; return (error); } /* * Convert userspace address into unique logical address. */ int umtx_key_get(const void *addr, int type, int share, struct umtx_key *key) { struct thread *td = curthread; vm_map_t map; vm_map_entry_t entry; vm_pindex_t pindex; vm_prot_t prot; boolean_t wired; key->type = type; if (share == THREAD_SHARE) { key->shared = 0; key->info.private.vs = td->td_proc->p_vmspace; key->info.private.addr = (uintptr_t)addr; } else { MPASS(share == PROCESS_SHARE || share == AUTO_SHARE); map = &td->td_proc->p_vmspace->vm_map; if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE, &entry, &key->info.shared.object, &pindex, &prot, &wired) != KERN_SUCCESS) { return (EFAULT); } if ((share == PROCESS_SHARE) || (share == AUTO_SHARE && VM_INHERIT_SHARE == entry->inheritance)) { key->shared = 1; key->info.shared.offset = (vm_offset_t)addr - entry->start + entry->offset; vm_object_reference(key->info.shared.object); } else { key->shared = 0; key->info.private.vs = td->td_proc->p_vmspace; key->info.private.addr = (uintptr_t)addr; } vm_map_lookup_done(map, entry); } umtxq_hash(key); return (0); } /* * Release key. */ void umtx_key_release(struct umtx_key *key) { if (key->shared) vm_object_deallocate(key->info.shared.object); } /* * Fetch and compare value, sleep on the address if value is not changed. */ static int do_wait(struct thread *td, void *addr, u_long id, struct _umtx_time *timeout, int compat32, int is_private) { struct abs_timeout timo; struct umtx_q *uq; u_long tmp; uint32_t tmp32; int error = 0; uq = td->td_umtxq; if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT, is_private ? 
THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); if (compat32 == 0) { error = fueword(addr, &tmp); if (error != 0) error = EFAULT; } else { error = fueword32(addr, &tmp32); if (error == 0) tmp = tmp32; else error = EFAULT; } umtxq_lock(&uq->uq_key); if (error == 0) { if (tmp == id) error = umtxq_sleep(uq, "uwait", timeout == NULL ? NULL : &timo); if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else umtxq_remove(uq); } else if ((uq->uq_flags & UQF_UMTXQ) != 0) { umtxq_remove(uq); } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == ERESTART) error = EINTR; return (error); } /* * Wake up threads sleeping on the specified address. */ int kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private) { struct umtx_key key; int ret; if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0) return (ret); umtxq_lock(&key); umtxq_signal(&key, n_wake); umtxq_unlock(&key); umtx_key_release(&key); return (0); } /* * Lock PTHREAD_PRIO_NONE protocol POSIX mutex. */ static int do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, struct _umtx_time *timeout, int mode) { struct abs_timeout timo; struct umtx_q *uq; uint32_t owner, old, id; int error, rv; id = td->td_tid; uq = td->td_umtxq; error = 0; if (timeout != NULL) abs_timeout_init2(&timo, timeout); /* * Care must be exercised when dealing with umtx structure. It * can fault on any access. */ for (;;) { rv = fueword32(&m->m_owner, &owner); if (rv == -1) return (EFAULT); if (mode == _UMUTEX_WAIT) { if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED || owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV) return (0); } else { /* * Robust mutex terminated. Kernel duty is to * return EOWNERDEAD to the userspace. The * umutex.m_flags UMUTEX_NONCONSISTENT is set * by the common userspace code. */ if (owner == UMUTEX_RB_OWNERDEAD) { rv = casueword32(&m->m_owner, UMUTEX_RB_OWNERDEAD, &owner, id | UMUTEX_CONTESTED); if (rv == -1) return (EFAULT); if (owner == UMUTEX_RB_OWNERDEAD) return (EOWNERDEAD); /* success */ rv = umtxq_check_susp(td); if (rv != 0) return (rv); continue; } if (owner == UMUTEX_RB_NOTRECOV) return (ENOTRECOVERABLE); /* * Try the uncontested case. This should be * done in userland. */ rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id); /* The address was invalid. */ if (rv == -1) return (EFAULT); /* The acquire succeeded. */ if (owner == UMUTEX_UNOWNED) return (0); /* * If no one owns it but it is contested try * to acquire it. */ if (owner == UMUTEX_CONTESTED) { rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) return (EFAULT); if (owner == UMUTEX_CONTESTED) return (0); rv = umtxq_check_susp(td); if (rv != 0) return (rv); /* * If this failed the lock has * changed, restart. */ continue; } } if (mode == _UMUTEX_TRY) return (EBUSY); /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) return (error); if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); /* * Set the contested bit so that a release in user space * knows to use the system call for unlock. If this fails * either some one else has acquired the lock or it has been * released. 
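 *
 * (Illustrative userland fast path, not from this file: the
 * uncontested acquire is a single compare-and-swap on m_owner; only
 * the UMUTEX_CONTESTED bit forces the unlocker into the kernel.)
 *
 *	if (atomic_cmpset_acq_32(&m->m_owner, UMUTEX_UNOWNED, tid))
 *		return (0);	-- locked, no syscall needed
 *	(void)_umtx_op(m, UMTX_OP_MUTEX_LOCK, 0, NULL, NULL);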
*/ rv = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { umtxq_lock(&uq->uq_key); umtxq_remove(uq); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (EFAULT); } /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. */ umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); if (old == owner) error = umtxq_sleep(uq, "umtxn", timeout == NULL ? NULL : &timo); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == 0) error = umtxq_check_susp(td); } return (0); } /* * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex. */ static int do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags, bool rb) { struct umtx_key key; uint32_t owner, old, id, newlock; int error, count; id = td->td_tid; /* * Make sure we own this mtx. */ error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); newlock = umtx_unlock_val(flags, rb); if ((owner & UMUTEX_CONTESTED) == 0) { error = casueword32(&m->m_owner, owner, &old, newlock); if (error == -1) return (EFAULT); if (old == owner) return (0); owner = old; } /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); /* * When unlocking the umtx, it must be marked as unowned if * there is zero or one thread only waiting for it. * Otherwise, it must be marked as contested. */ if (count > 1) newlock |= UMUTEX_CONTESTED; error = casueword32(&m->m_owner, owner, &old, newlock); umtxq_lock(&key); umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); if (error == -1) return (EFAULT); if (old != owner) return (EINVAL); return (0); } /* * Check if the mutex is available and wake up a waiter, * only for simple mutex. */ static int do_wake_umutex(struct thread *td, struct umutex *m) { struct umtx_key key; uint32_t owner; uint32_t flags; int error; int count; error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != 0 && owner != UMUTEX_RB_OWNERDEAD && owner != UMUTEX_RB_NOTRECOV) return (0); error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); if (count <= 1 && owner != UMUTEX_RB_OWNERDEAD && owner != UMUTEX_RB_NOTRECOV) { error = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, UMUTEX_UNOWNED); if (error == -1) error = EFAULT; } umtxq_lock(&key); if (error == 0 && count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 || owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV)) umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } /* * Check if the mutex has waiters and tries to fix contention bit. 
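 *
 * (Added note: while waiters remain queued this re-asserts
 * UMUTEX_CONTESTED in m_owner, and it wakes one waiter when the word
 * shows the mutex free or robust-terminated, so a stale owner word
 * cannot strand sleepers; userland reaches this via the
 * UMTX_OP_MUTEX_WAKE2 request.)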
*/ static int do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags) { struct umtx_key key; uint32_t owner, old; int type; int error; int count; switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT | UMUTEX_ROBUST)) { case 0: case UMUTEX_ROBUST: type = TYPE_NORMAL_UMUTEX; break; case UMUTEX_PRIO_INHERIT: type = TYPE_PI_UMUTEX; break; case (UMUTEX_PRIO_INHERIT | UMUTEX_ROBUST): type = TYPE_PI_ROBUST_UMUTEX; break; case UMUTEX_PRIO_PROTECT: type = TYPE_PP_UMUTEX; break; case (UMUTEX_PRIO_PROTECT | UMUTEX_ROBUST): type = TYPE_PP_ROBUST_UMUTEX; break; default: return (EINVAL); } if ((error = umtx_key_get(m, type, GET_SHARE(flags), &key)) != 0) return (error); owner = 0; umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); /* * Only repair contention bit if there is a waiter, this means the mutex * is still being referenced by userland code, otherwise don't update * any memory. */ if (count > 1) { error = fueword32(&m->m_owner, &owner); if (error == -1) error = EFAULT; while (error == 0 && (owner & UMUTEX_CONTESTED) == 0) { error = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); if (error == -1) { error = EFAULT; break; } if (old == owner) break; owner = old; error = umtxq_check_susp(td); if (error != 0) break; } } else if (count == 1) { error = fueword32(&m->m_owner, &owner); if (error == -1) error = EFAULT; while (error == 0 && (owner & ~UMUTEX_CONTESTED) != 0 && (owner & UMUTEX_CONTESTED) == 0) { error = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); if (error == -1) { error = EFAULT; break; } if (old == owner) break; owner = old; error = umtxq_check_susp(td); if (error != 0) break; } } umtxq_lock(&key); if (error == EFAULT) { umtxq_signal(&key, INT_MAX); } else if (count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 || owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV)) umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } static inline struct umtx_pi * umtx_pi_alloc(int flags) { struct umtx_pi *pi; pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags); TAILQ_INIT(&pi->pi_blocked); atomic_add_int(&umtx_pi_allocated, 1); return (pi); } static inline void umtx_pi_free(struct umtx_pi *pi) { uma_zfree(umtx_pi_zone, pi); atomic_add_int(&umtx_pi_allocated, -1); } /* * Adjust the thread's position on a pi_state after its priority has been * changed. */ static int umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td) { struct umtx_q *uq, *uq1, *uq2; struct thread *td1; mtx_assert(&umtx_lock, MA_OWNED); if (pi == NULL) return (0); uq = td->td_umtxq; /* * Check if the thread needs to be moved on the blocked chain. * It needs to be moved if either its priority is lower than * the previous thread or higher than the next thread. */ uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq); uq2 = TAILQ_NEXT(uq, uq_lockq); if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) || (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) { /* * Remove thread from blocked chain and determine where * it should be moved to. 
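 *
 * (Added note: pi_blocked is kept sorted by UPRI(), best priority
 * first, so the loop below re-inserts the thread in front of the
 * first waiter whose priority value is numerically larger, i.e.
 * worse.)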
*/ TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq); TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) { td1 = uq1->uq_thread; MPASS(td1->td_proc->p_magic == P_MAGIC); if (UPRI(td1) > UPRI(td)) break; } if (uq1 == NULL) TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq); else TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq); } return (1); } static struct umtx_pi * umtx_pi_next(struct umtx_pi *pi) { struct umtx_q *uq_owner; if (pi->pi_owner == NULL) return (NULL); uq_owner = pi->pi_owner->td_umtxq; if (uq_owner == NULL) return (NULL); return (uq_owner->uq_pi_blocked); } /* * Floyd's Cycle-Finding Algorithm. */ static bool umtx_pi_check_loop(struct umtx_pi *pi) { struct umtx_pi *pi1; /* fast iterator */ mtx_assert(&umtx_lock, MA_OWNED); if (pi == NULL) return (false); pi1 = pi; for (;;) { pi = umtx_pi_next(pi); if (pi == NULL) break; pi1 = umtx_pi_next(pi1); if (pi1 == NULL) break; pi1 = umtx_pi_next(pi1); if (pi1 == NULL) break; if (pi == pi1) return (true); } return (false); } /* * Propagate priority when a thread is blocked on POSIX * PI mutex. */ static void umtx_propagate_priority(struct thread *td) { struct umtx_q *uq; struct umtx_pi *pi; int pri; mtx_assert(&umtx_lock, MA_OWNED); pri = UPRI(td); uq = td->td_umtxq; pi = uq->uq_pi_blocked; if (pi == NULL) return; if (umtx_pi_check_loop(pi)) return; for (;;) { td = pi->pi_owner; if (td == NULL || td == curthread) return; MPASS(td->td_proc != NULL); MPASS(td->td_proc->p_magic == P_MAGIC); thread_lock(td); if (td->td_lend_user_pri > pri) sched_lend_user_prio(td, pri); else { thread_unlock(td); break; } thread_unlock(td); /* * Pick up the lock that td is blocked on. */ uq = td->td_umtxq; pi = uq->uq_pi_blocked; if (pi == NULL) break; /* Resort td on the list if needed. */ umtx_pi_adjust_thread(pi, td); } } /* * Unpropagate priority for a PI mutex when a thread blocked on * it is interrupted by signal or resumed by others. */ static void umtx_repropagate_priority(struct umtx_pi *pi) { struct umtx_q *uq, *uq_owner; struct umtx_pi *pi2; int pri; mtx_assert(&umtx_lock, MA_OWNED); if (umtx_pi_check_loop(pi)) return; while (pi != NULL && pi->pi_owner != NULL) { pri = PRI_MAX; uq_owner = pi->pi_owner->td_umtxq; TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) { uq = TAILQ_FIRST(&pi2->pi_blocked); if (uq != NULL) { if (pri > UPRI(uq->uq_thread)) pri = UPRI(uq->uq_thread); } } if (pri > uq_owner->uq_inherited_pri) pri = uq_owner->uq_inherited_pri; thread_lock(pi->pi_owner); sched_lend_user_prio(pi->pi_owner, pri); thread_unlock(pi->pi_owner); if ((pi = uq_owner->uq_pi_blocked) != NULL) umtx_pi_adjust_thread(pi, uq_owner->uq_thread); } } /* * Insert a PI mutex into owned list. */ static void umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner) { struct umtx_q *uq_owner; uq_owner = owner->td_umtxq; mtx_assert(&umtx_lock, MA_OWNED); MPASS(pi->pi_owner == NULL); pi->pi_owner = owner; TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link); } /* * Disown a PI mutex, and remove it from the owned list. */ static void umtx_pi_disown(struct umtx_pi *pi) { mtx_assert(&umtx_lock, MA_OWNED); TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested, pi, pi_link); pi->pi_owner = NULL; } /* * Claim ownership of a PI mutex. */ static int umtx_pi_claim(struct umtx_pi *pi, struct thread *owner) { struct umtx_q *uq; int pri; mtx_lock(&umtx_lock); if (pi->pi_owner == owner) { mtx_unlock(&umtx_lock); return (0); } if (pi->pi_owner != NULL) { /* * userland may have already messed the mutex, sigh. 
*/ mtx_unlock(&umtx_lock); return (EPERM); } umtx_pi_setowner(pi, owner); uq = TAILQ_FIRST(&pi->pi_blocked); if (uq != NULL) { pri = UPRI(uq->uq_thread); thread_lock(owner); if (pri < UPRI(owner)) sched_lend_user_prio(owner, pri); thread_unlock(owner); } mtx_unlock(&umtx_lock); return (0); } /* * Adjust a thread's order position in its blocked PI mutex, * this may result new priority propagating process. */ void umtx_pi_adjust(struct thread *td, u_char oldpri) { struct umtx_q *uq; struct umtx_pi *pi; uq = td->td_umtxq; mtx_lock(&umtx_lock); /* * Pick up the lock that td is blocked on. */ pi = uq->uq_pi_blocked; if (pi != NULL) { umtx_pi_adjust_thread(pi, td); umtx_repropagate_priority(pi); } mtx_unlock(&umtx_lock); } /* * Sleep on a PI mutex. */ static int umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi, uint32_t owner, const char *wmesg, struct abs_timeout *timo, bool shared) { struct umtxq_chain *uc; struct thread *td, *td1; struct umtx_q *uq1; int error, pri; error = 0; td = uq->uq_thread; KASSERT(td == curthread, ("inconsistent uq_thread")); uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); KASSERT(uc->uc_busy != 0, ("umtx chain is not busy")); umtxq_insert(uq); mtx_lock(&umtx_lock); if (pi->pi_owner == NULL) { mtx_unlock(&umtx_lock); td1 = tdfind(owner, shared ? -1 : td->td_proc->p_pid); mtx_lock(&umtx_lock); if (td1 != NULL) { if (pi->pi_owner == NULL) umtx_pi_setowner(pi, td1); PROC_UNLOCK(td1->td_proc); } } TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) { pri = UPRI(uq1->uq_thread); if (pri > UPRI(td)) break; } if (uq1 != NULL) TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq); else TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq); uq->uq_pi_blocked = pi; thread_lock(td); td->td_flags |= TDF_UPIBLOCKED; thread_unlock(td); umtx_propagate_priority(td); mtx_unlock(&umtx_lock); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, wmesg, timo); umtxq_remove(uq); mtx_lock(&umtx_lock); uq->uq_pi_blocked = NULL; thread_lock(td); td->td_flags &= ~TDF_UPIBLOCKED; thread_unlock(td); TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq); umtx_repropagate_priority(pi); mtx_unlock(&umtx_lock); umtxq_unlock(&uq->uq_key); return (error); } /* * Add reference count for a PI mutex. */ static void umtx_pi_ref(struct umtx_pi *pi) { struct umtxq_chain *uc; uc = umtxq_getchain(&pi->pi_key); UMTXQ_LOCKED_ASSERT(uc); pi->pi_refcount++; } /* * Decrease reference count for a PI mutex, if the counter * is decreased to zero, its memory space is freed. */ static void umtx_pi_unref(struct umtx_pi *pi) { struct umtxq_chain *uc; uc = umtxq_getchain(&pi->pi_key); UMTXQ_LOCKED_ASSERT(uc); KASSERT(pi->pi_refcount > 0, ("invalid reference count")); if (--pi->pi_refcount == 0) { mtx_lock(&umtx_lock); if (pi->pi_owner != NULL) umtx_pi_disown(pi); KASSERT(TAILQ_EMPTY(&pi->pi_blocked), ("blocked queue not empty")); mtx_unlock(&umtx_lock); TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink); umtx_pi_free(pi); } } /* * Find a PI mutex in hash table. */ static struct umtx_pi * umtx_pi_lookup(struct umtx_key *key) { struct umtxq_chain *uc; struct umtx_pi *pi; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) { if (umtx_key_match(&pi->pi_key, key)) { return (pi); } } return (NULL); } /* * Insert a PI mutex into hash table. */ static inline void umtx_pi_insert(struct umtx_pi *pi) { struct umtxq_chain *uc; uc = umtxq_getchain(&pi->pi_key); UMTXQ_LOCKED_ASSERT(uc); TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink); } /* * Lock a PI mutex. 
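 *
 * (Illustrative userland setup, not from this file: a PI umutex is an
 * ordinary umutex whose m_flags carry UMUTEX_PRIO_INHERIT, as arranged
 * by pthread_mutexattr_setprotocol(..., PTHREAD_PRIO_INHERIT).)
 *
 *	struct umutex m = DEFAULT_UMUTEX;
 *	m.m_flags = UMUTEX_PRIO_INHERIT;
 *	(void)_umtx_op(&m, UMTX_OP_MUTEX_TRYLOCK, 0, NULL, NULL);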
*/ static int do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, struct _umtx_time *timeout, int try) { struct abs_timeout timo; struct umtx_q *uq; struct umtx_pi *pi, *new_pi; uint32_t id, old_owner, owner, old; int error, rv; id = td->td_tid; uq = td->td_umtxq; if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); umtxq_lock(&uq->uq_key); pi = umtx_pi_lookup(&uq->uq_key); if (pi == NULL) { new_pi = umtx_pi_alloc(M_NOWAIT); if (new_pi == NULL) { umtxq_unlock(&uq->uq_key); new_pi = umtx_pi_alloc(M_WAITOK); umtxq_lock(&uq->uq_key); pi = umtx_pi_lookup(&uq->uq_key); if (pi != NULL) { umtx_pi_free(new_pi); new_pi = NULL; } } if (new_pi != NULL) { new_pi->pi_key = uq->uq_key; umtx_pi_insert(new_pi); pi = new_pi; } } umtx_pi_ref(pi); umtxq_unlock(&uq->uq_key); /* * Care must be exercised when dealing with umtx structure. It * can fault on any access. */ for (;;) { /* * Try the uncontested case. This should be done in userland. */ rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id); /* The address was invalid. */ if (rv == -1) { error = EFAULT; break; } /* The acquire succeeded. */ if (owner == UMUTEX_UNOWNED) { error = 0; break; } if (owner == UMUTEX_RB_NOTRECOV) { error = ENOTRECOVERABLE; break; } /* If no one owns it but it is contested try to acquire it. */ if (owner == UMUTEX_CONTESTED || owner == UMUTEX_RB_OWNERDEAD) { old_owner = owner; rv = casueword32(&m->m_owner, owner, &owner, id | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { error = EFAULT; break; } if (owner == old_owner) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); error = umtx_pi_claim(pi, td); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); if (error != 0) { /* * Since we're going to return an * error, restore the m_owner to its * previous, unowned state to avoid * compounding the problem. */ (void)casuword32(&m->m_owner, id | UMUTEX_CONTESTED, old_owner); } if (error == 0 && old_owner == UMUTEX_RB_OWNERDEAD) error = EOWNERDEAD; break; } error = umtxq_check_susp(td); if (error != 0) break; /* If this failed the lock has changed, restart. */ continue; } if ((owner & ~UMUTEX_CONTESTED) == id) { error = EDEADLK; break; } if (try != 0) { error = EBUSY; break; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * Set the contested bit so that a release in user space * knows to use the system call for unlock. If this fails * either some one else has acquired the lock or it has been * released. */ rv = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } umtxq_lock(&uq->uq_key); /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. Note that the UMUTEX_RB_OWNERDEAD * value for owner is impossible there. */ if (old == owner) { error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED, "umtxpi", timeout == NULL ? 
NULL : &timo, (flags & USYNC_PROCESS_SHARED) != 0); if (error != 0) continue; } else { umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); } error = umtxq_check_susp(td); if (error != 0) break; } umtxq_lock(&uq->uq_key); umtx_pi_unref(pi); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Unlock a PI mutex. */ static int do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags, bool rb) { struct umtx_key key; struct umtx_q *uq_first, *uq_first2, *uq_me; struct umtx_pi *pi, *pi2; uint32_t id, new_owner, old, owner; int count, error, pri; id = td->td_tid; /* * Make sure we own this mtx. */ error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); new_owner = umtx_unlock_val(flags, rb); /* This should be done in userland */ if ((owner & UMUTEX_CONTESTED) == 0) { error = casueword32(&m->m_owner, owner, &old, new_owner); if (error == -1) return (EFAULT); if (old == owner) return (0); owner = old; } /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count_pi(&key, &uq_first); if (uq_first != NULL) { mtx_lock(&umtx_lock); pi = uq_first->uq_pi_blocked; KASSERT(pi != NULL, ("pi == NULL?")); if (pi->pi_owner != td && !(rb && pi->pi_owner == NULL)) { mtx_unlock(&umtx_lock); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); /* userland messed the mutex */ return (EPERM); } uq_me = td->td_umtxq; if (pi->pi_owner == td) umtx_pi_disown(pi); /* get highest priority thread which is still sleeping. */ uq_first = TAILQ_FIRST(&pi->pi_blocked); while (uq_first != NULL && (uq_first->uq_flags & UQF_UMTXQ) == 0) { uq_first = TAILQ_NEXT(uq_first, uq_lockq); } pri = PRI_MAX; TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) { uq_first2 = TAILQ_FIRST(&pi2->pi_blocked); if (uq_first2 != NULL) { if (pri > UPRI(uq_first2->uq_thread)) pri = UPRI(uq_first2->uq_thread); } } thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); if (uq_first) umtxq_signal_thread(uq_first); } else { pi = umtx_pi_lookup(&key); /* * A umtx_pi can exist if a signal or timeout removed the * last waiter from the umtxq, but there is still * a thread in do_lock_pi() holding the umtx_pi. */ if (pi != NULL) { /* * The umtx_pi can be unowned, such as when a thread * has just entered do_lock_pi(), allocated the * umtx_pi, and unlocked the umtxq. * If the current thread owns it, it must disown it. */ mtx_lock(&umtx_lock); if (pi->pi_owner == td) umtx_pi_disown(pi); mtx_unlock(&umtx_lock); } } umtxq_unlock(&key); /* * When unlocking the umtx, it must be marked as unowned if * there is zero or one thread only waiting for it. * Otherwise, it must be marked as contested. */ if (count > 1) new_owner |= UMUTEX_CONTESTED; error = casueword32(&m->m_owner, owner, &old, new_owner); umtxq_unbusy_unlocked(&key); umtx_key_release(&key); if (error == -1) return (EFAULT); if (old != owner) return (EINVAL); return (0); } /* * Lock a PP mutex. */ static int do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, struct _umtx_time *timeout, int try) { struct abs_timeout timo; struct umtx_q *uq, *uq2; struct umtx_pi *pi; uint32_t ceiling; uint32_t owner, id; int error, pri, old_inherited_pri, su, rv; id = td->td_tid; uq = td->td_umtxq; if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? 
TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0); for (;;) { old_inherited_pri = uq->uq_inherited_pri; umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); rv = fueword32(&m->m_ceilings[0], &ceiling); if (rv == -1) { error = EFAULT; goto out; } ceiling = RTP_PRIO_MAX - ceiling; if (ceiling > RTP_PRIO_MAX) { error = EINVAL; goto out; } mtx_lock(&umtx_lock); if (UPRI(td) < PRI_MIN_REALTIME + ceiling) { mtx_unlock(&umtx_lock); error = EINVAL; goto out; } if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) { uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling; thread_lock(td); if (uq->uq_inherited_pri < UPRI(td)) sched_lend_user_prio(td, uq->uq_inherited_pri); thread_unlock(td); } mtx_unlock(&umtx_lock); rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { error = EFAULT; break; } if (owner == UMUTEX_CONTESTED) { error = 0; break; } else if (owner == UMUTEX_RB_OWNERDEAD) { rv = casueword32(&m->m_owner, UMUTEX_RB_OWNERDEAD, &owner, id | UMUTEX_CONTESTED); if (rv == -1) { error = EFAULT; break; } if (owner == UMUTEX_RB_OWNERDEAD) { error = EOWNERDEAD; /* success */ break; } error = 0; } else if (owner == UMUTEX_RB_NOTRECOV) { error = ENOTRECOVERABLE; break; } if (try != 0) { error = EBUSY; break; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "umtxpp", timeout == NULL ? NULL : &timo); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); mtx_lock(&umtx_lock); uq->uq_inherited_pri = old_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { uq2 = TAILQ_FIRST(&pi->pi_blocked); if (uq2 != NULL) { if (pri > UPRI(uq2->uq_thread)) pri = UPRI(uq2->uq_thread); } } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); } if (error != 0 && error != EOWNERDEAD) { mtx_lock(&umtx_lock); uq->uq_inherited_pri = old_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { uq2 = TAILQ_FIRST(&pi->pi_blocked); if (uq2 != NULL) { if (pri > UPRI(uq2->uq_thread)) pri = UPRI(uq2->uq_thread); } } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); } out: umtxq_unbusy_unlocked(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Unlock a PP mutex. */ static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags, bool rb) { struct umtx_key key; struct umtx_q *uq, *uq2; struct umtx_pi *pi; uint32_t id, owner, rceiling; int error, pri, new_inherited_pri, su; id = td->td_tid; uq = td->td_umtxq; su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0); /* * Make sure we own this mtx. */ error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t)); if (error != 0) return (error); if (rceiling == -1) new_inherited_pri = PRI_MAX; else { rceiling = RTP_PRIO_MAX - rceiling; if (rceiling > RTP_PRIO_MAX) return (EINVAL); new_inherited_pri = PRI_MIN_REALTIME + rceiling; } if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? 
TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); umtxq_unlock(&key); /* * For priority protected mutex, always set unlocked state * to UMUTEX_CONTESTED, so that userland always enters kernel * to lock the mutex, it is necessary because thread priority * has to be adjusted for such mutex. */ error = suword32(&m->m_owner, umtx_unlock_val(flags, rb) | UMUTEX_CONTESTED); umtxq_lock(&key); if (error == 0) umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); if (error == -1) error = EFAULT; else { mtx_lock(&umtx_lock); if (su != 0) uq->uq_inherited_pri = new_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { uq2 = TAILQ_FIRST(&pi->pi_blocked); if (uq2 != NULL) { if (pri > UPRI(uq2->uq_thread)) pri = UPRI(uq2->uq_thread); } } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); } umtx_key_release(&key); return (error); } static int do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling, uint32_t *old_ceiling) { struct umtx_q *uq; uint32_t flags, id, owner, save_ceiling; int error, rv, rv1; error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); if ((flags & UMUTEX_PRIO_PROTECT) == 0) return (EINVAL); if (ceiling > RTP_PRIO_MAX) return (EINVAL); id = td->td_tid; uq = td->td_umtxq; if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); for (;;) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); rv = fueword32(&m->m_ceilings[0], &save_ceiling); if (rv == -1) { error = EFAULT; break; } rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED); if (rv == -1) { error = EFAULT; break; } if (owner == UMUTEX_CONTESTED) { rv = suword32(&m->m_ceilings[0], ceiling); rv1 = suword32(&m->m_owner, UMUTEX_CONTESTED); error = (rv == 0 && rv1 == 0) ? 0: EFAULT; break; } if ((owner & ~UMUTEX_CONTESTED) == id) { rv = suword32(&m->m_ceilings[0], ceiling); error = rv == 0 ? 0 : EFAULT; break; } if (owner == UMUTEX_RB_OWNERDEAD) { error = EOWNERDEAD; break; } else if (owner == UMUTEX_RB_NOTRECOV) { error = ENOTRECOVERABLE; break; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. */ umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "umtxpp", NULL); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); } umtxq_lock(&uq->uq_key); if (error == 0) umtxq_signal(&uq->uq_key, INT_MAX); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == 0 && old_ceiling != NULL) { rv = suword32(old_ceiling, save_ceiling); error = rv == 0 ? 0 : EFAULT; } return (error); } /* * Lock a userland POSIX mutex. 
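 *
 * (Added note: the request is dispatched on m_flags -- 0 selects the
 * normal protocol, UMUTEX_PRIO_INHERIT the PI protocol and
 * UMUTEX_PRIO_PROTECT the PP protocol -- and the errno policy below
 * keeps untimed lock attempts restartable, turning EINTR into
 * ERESTART except for _UMUTEX_WAIT, while timed attempts are not
 * restarted and fail with EINTR.)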
*/ static int do_lock_umutex(struct thread *td, struct umutex *m, struct _umtx_time *timeout, int mode) { uint32_t flags; int error; error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) { case 0: error = do_lock_normal(td, m, flags, timeout, mode); break; case UMUTEX_PRIO_INHERIT: error = do_lock_pi(td, m, flags, timeout, mode); break; case UMUTEX_PRIO_PROTECT: error = do_lock_pp(td, m, flags, timeout, mode); break; default: return (EINVAL); } if (timeout == NULL) { if (error == EINTR && mode != _UMUTEX_WAIT) error = ERESTART; } else { /* Timed-locking is not restarted. */ if (error == ERESTART) error = EINTR; } return (error); } /* * Unlock a userland POSIX mutex. */ static int do_unlock_umutex(struct thread *td, struct umutex *m, bool rb) { uint32_t flags; int error; error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) { case 0: return (do_unlock_normal(td, m, flags, rb)); case UMUTEX_PRIO_INHERIT: return (do_unlock_pi(td, m, flags, rb)); case UMUTEX_PRIO_PROTECT: return (do_unlock_pp(td, m, flags, rb)); } return (EINVAL); } static int do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m, struct timespec *timeout, u_long wflags) { struct abs_timeout timo; struct umtx_q *uq; uint32_t flags, clockid, hasw; int error; uq = td->td_umtxq; error = fueword32(&cv->c_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if ((wflags & CVWAIT_CLOCKID) != 0) { error = fueword32(&cv->c_clockid, &clockid); if (error == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } if (clockid < CLOCK_REALTIME || clockid >= CLOCK_THREAD_CPUTIME_ID) { /* hmm, only HW clock id will work. */ umtx_key_release(&uq->uq_key); return (EINVAL); } } else { clockid = CLOCK_REALTIME; } umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); /* * Set c_has_waiters to 1 before releasing user mutex, also * don't modify cache line when unnecessary. */ error = fueword32(&cv->c_has_waiters, &hasw); if (error == 0 && hasw == 0) suword32(&cv->c_has_waiters, 1); umtxq_unbusy_unlocked(&uq->uq_key); error = do_unlock_umutex(td, m, false); if (timeout != NULL) abs_timeout_init(&timo, clockid, (wflags & CVWAIT_ABSTIME) != 0, timeout); umtxq_lock(&uq->uq_key); if (error == 0) { error = umtxq_sleep(uq, "ucond", timeout == NULL ? NULL : &timo); } if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else { /* * This must be timeout,interrupted by signal or * surprious wakeup, clear c_has_waiter flag when * necessary. */ umtxq_busy(&uq->uq_key); if ((uq->uq_flags & UQF_UMTXQ) != 0) { int oldlen = uq->uq_cur_queue->length; umtxq_remove(uq); if (oldlen == 1) { umtxq_unlock(&uq->uq_key); suword32(&cv->c_has_waiters, 0); umtxq_lock(&uq->uq_key); } } umtxq_unbusy(&uq->uq_key); if (error == ERESTART) error = EINTR; } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Signal a userland condition variable. 
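 *
 * (Illustrative handshake, not from this file: do_cv_wait() sets
 * c_has_waiters before sleeping, so a userland signaller can skip the
 * syscall entirely when the word is clear.)
 *
 *	if (atomic_load_acq_32(&cv->c_has_waiters) != 0)
 *		(void)_umtx_op(cv, UMTX_OP_CV_SIGNAL, 0, NULL, NULL);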
*/ static int do_cv_signal(struct thread *td, struct ucond *cv) { struct umtx_key key; int error, cnt, nwake; uint32_t flags; error = fueword32(&cv->c_flags, &flags); if (error == -1) return (EFAULT); if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); cnt = umtxq_count(&key); nwake = umtxq_signal(&key, 1); if (cnt <= nwake) { umtxq_unlock(&key); error = suword32(&cv->c_has_waiters, 0); if (error == -1) error = EFAULT; umtxq_lock(&key); } umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } static int do_cv_broadcast(struct thread *td, struct ucond *cv) { struct umtx_key key; int error; uint32_t flags; error = fueword32(&cv->c_flags, &flags); if (error == -1) return (EFAULT); if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); umtxq_signal(&key, INT_MAX); umtxq_unlock(&key); error = suword32(&cv->c_has_waiters, 0); if (error == -1) error = EFAULT; umtxq_unbusy_unlocked(&key); umtx_key_release(&key); return (error); } static int do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout) { struct abs_timeout timo; struct umtx_q *uq; uint32_t flags, wrflags; int32_t state, oldstate; int32_t blocked_readers; int error, error1, rv; uq = td->td_umtxq; error = fueword32(&rwlock->rw_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); wrflags = URWLOCK_WRITE_OWNER; if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER)) wrflags |= URWLOCK_WRITE_WAITERS; for (;;) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } /* try to lock it */ while (!(state & wrflags)) { if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) { umtx_key_release(&uq->uq_key); return (EAGAIN); } rv = casueword32(&rwlock->rw_state, state, &oldstate, state + 1); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } if (oldstate == state) { umtx_key_release(&uq->uq_key); return (0); } error = umtxq_check_susp(td); if (error != 0) break; state = oldstate; } if (error) break; /* grab monitor lock */ umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * re-read the state, in case it changed between the try-lock above * and the check below */ rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) error = EFAULT; /* set read contention bit */ while (error == 0 && (state & wrflags) && !(state & URWLOCK_READ_WAITERS)) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state | URWLOCK_READ_WAITERS); if (rv == -1) { error = EFAULT; break; } if (oldstate == state) goto sleep; state = oldstate; error = umtxq_check_susp(td); if (error != 0) break; } if (error != 0) { umtxq_unbusy_unlocked(&uq->uq_key); break; } /* state is changed while setting flags, restart */ if (!(state & wrflags)) { umtxq_unbusy_unlocked(&uq->uq_key); error = umtxq_check_susp(td); if (error != 0) break; continue; } sleep: /* contention bit is set, before sleeping, increase read waiter count */ rv = fueword32(&rwlock->rw_blocked_readers, &blocked_readers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_readers, blocked_readers+1); while (state & wrflags) { umtxq_lock(&uq->uq_key); umtxq_insert(uq); 
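/*
 * Editor's note (added comment): the waiter is already on the shared
 * queue here; dropping the busy bit below lets an unlocking thread
 * find and wake it, so no wakeup can be lost between the rw_state
 * check above and the sleep.
 */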
umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "urdlck", timeout == NULL ? NULL : &timo); umtxq_busy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); if (error) break; rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { error = EFAULT; break; } } /* decrease read waiter count, and may clear read contention bit */ rv = fueword32(&rwlock->rw_blocked_readers, &blocked_readers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_readers, blocked_readers-1); if (blocked_readers == 1) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state & ~URWLOCK_READ_WAITERS); if (rv == -1) { error = EFAULT; break; } if (oldstate == state) break; state = oldstate; error1 = umtxq_check_susp(td); if (error1 != 0) { if (error == 0) error = error1; break; } } } umtxq_unbusy_unlocked(&uq->uq_key); if (error != 0) break; } umtx_key_release(&uq->uq_key); if (error == ERESTART) error = EINTR; return (error); } static int do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout) { struct abs_timeout timo; struct umtx_q *uq; uint32_t flags; int32_t state, oldstate; int32_t blocked_writers; int32_t blocked_readers; int error, error1, rv; uq = td->td_umtxq; error = fueword32(&rwlock->rw_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); blocked_readers = 0; for (;;) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state | URWLOCK_WRITE_OWNER); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } if (oldstate == state) { umtx_key_release(&uq->uq_key); return (0); } state = oldstate; error = umtxq_check_susp(td); if (error != 0) break; } if (error) { if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) && blocked_readers != 0) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); } break; } /* grab monitor lock */ umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * re-read the state, in case it changed between the try-lock above * and the check below */ rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) error = EFAULT; while (error == 0 && ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) && (state & URWLOCK_WRITE_WAITERS) == 0) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state | URWLOCK_WRITE_WAITERS); if (rv == -1) { error = EFAULT; break; } if (oldstate == state) goto sleep; state = oldstate; error = umtxq_check_susp(td); if (error != 0) break; } if (error != 0) { umtxq_unbusy_unlocked(&uq->uq_key); break; } if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) { umtxq_unbusy_unlocked(&uq->uq_key); error = umtxq_check_susp(td); if (error != 0) break; continue; } sleep: rv = fueword32(&rwlock->rw_blocked_writers, &blocked_writers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_writers, blocked_writers+1); while ((state & URWLOCK_WRITE_OWNER) || 
URWLOCK_READER_COUNT(state) != 0) { umtxq_lock(&uq->uq_key); umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "uwrlck", timeout == NULL ? NULL : &timo); umtxq_busy(&uq->uq_key); umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE); umtxq_unlock(&uq->uq_key); if (error) break; rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { error = EFAULT; break; } } rv = fueword32(&rwlock->rw_blocked_writers, &blocked_writers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_writers, blocked_writers-1); if (blocked_writers == 1) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state & ~URWLOCK_WRITE_WAITERS); if (rv == -1) { error = EFAULT; break; } if (oldstate == state) break; state = oldstate; error1 = umtxq_check_susp(td); /* * We are leaving the URWLOCK_WRITE_WAITERS * behind, but this should not harm the * correctness. */ if (error1 != 0) { if (error == 0) error = error1; break; } } rv = fueword32(&rwlock->rw_blocked_readers, &blocked_readers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } } else blocked_readers = 0; umtxq_unbusy_unlocked(&uq->uq_key); } umtx_key_release(&uq->uq_key); if (error == ERESTART) error = EINTR; return (error); } static int do_rw_unlock(struct thread *td, struct urwlock *rwlock) { struct umtx_q *uq; uint32_t flags; int32_t state, oldstate; int error, rv, q, count; uq = td->td_umtxq; error = fueword32(&rwlock->rw_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); error = fueword32(&rwlock->rw_state, &state); if (error == -1) { error = EFAULT; goto out; } if (state & URWLOCK_WRITE_OWNER) { for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state & ~URWLOCK_WRITE_OWNER); if (rv == -1) { error = EFAULT; goto out; } if (oldstate != state) { state = oldstate; if (!(oldstate & URWLOCK_WRITE_OWNER)) { error = EPERM; goto out; } error = umtxq_check_susp(td); if (error != 0) goto out; } else break; } } else if (URWLOCK_READER_COUNT(state) != 0) { for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state - 1); if (rv == -1) { error = EFAULT; goto out; } if (oldstate != state) { state = oldstate; if (URWLOCK_READER_COUNT(oldstate) == 0) { error = EPERM; goto out; } error = umtxq_check_susp(td); if (error != 0) goto out; } else break; } } else { error = EPERM; goto out; } count = 0; if (!(flags & URWLOCK_PREFER_READER)) { if (state & URWLOCK_WRITE_WAITERS) { count = 1; q = UMTX_EXCLUSIVE_QUEUE; } else if (state & URWLOCK_READ_WAITERS) { count = INT_MAX; q = UMTX_SHARED_QUEUE; } } else { if (state & URWLOCK_READ_WAITERS) { count = INT_MAX; q = UMTX_SHARED_QUEUE; } else if (state & URWLOCK_WRITE_WAITERS) { count = 1; q = UMTX_EXCLUSIVE_QUEUE; } } if (count) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_signal_queue(&uq->uq_key, count, q); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); } out: umtx_key_release(&uq->uq_key); return (error); } #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) static int do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout) { struct abs_timeout timo; struct umtx_q *uq; uint32_t flags, count, count1; int error, rv; uq = td->td_umtxq; error = fueword32(&sem->_flags, &flags); if (error == -1) return (EFAULT); 
error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); rv = casueword32(&sem->_has_waiters, 0, &count1, 1); if (rv == 0) rv = fueword32(&sem->_count, &count); if (rv == -1 || count != 0) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (rv == -1 ? EFAULT : 0); } umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo); if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else { umtxq_remove(uq); /* A relative timeout cannot be restarted. */ if (error == ERESTART && timeout != NULL && (timeout->_flags & UMTX_ABSTIME) == 0) error = EINTR; } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Signal a userland semaphore. */ static int do_sem_wake(struct thread *td, struct _usem *sem) { struct umtx_key key; int error, cnt; uint32_t flags; error = fueword32(&sem->_flags, &flags); if (error == -1) return (EFAULT); if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); cnt = umtxq_count(&key); if (cnt > 0) { /* * The count is greater than 0, which means the memory is * still being referenced by user code, so we can safely * update the _has_waiters flag. */ if (cnt == 1) { umtxq_unlock(&key); error = suword32(&sem->_has_waiters, 0); umtxq_lock(&key); if (error == -1) error = EFAULT; } umtxq_signal(&key, 1); } umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } #endif static int do_sem2_wait(struct thread *td, struct _usem2 *sem, struct _umtx_time *timeout) { struct abs_timeout timo; struct umtx_q *uq; uint32_t count, flags; int error, rv; uq = td->td_umtxq; flags = fuword32(&sem->_flags); error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); rv = fueword32(&sem->_count, &count); if (rv == -1) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (EFAULT); } for (;;) { if (USEM_COUNT(count) != 0) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (0); } if (count == USEM_HAS_WAITERS) break; rv = casueword32(&sem->_count, 0, &count, USEM_HAS_WAITERS); if (rv == -1) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (EFAULT); } if (count == 0) break; } umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo); if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else { umtxq_remove(uq); if (timeout != NULL && (timeout->_flags & UMTX_ABSTIME) == 0) { /* A relative timeout cannot be restarted. */ if (error == ERESTART) error = EINTR; if (error == EINTR) { abs_timeout_update(&timo); timeout->_timeout = timo.end; timespecsub(&timeout->_timeout, &timo.cur); } } } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); }
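/*
 * Editor's sketch (not part of the original source): the userland half
 * of the _usem2 protocol implemented by do_sem2_wait() above and
 * do_sem2_wake() below, loosely following what libc's POSIX semaphores
 * do.  _umtx_op(2) and the USEM_* macros are real interfaces from
 * <sys/umtx.h>; error handling and timeouts are elided.
 */
#include <sys/types.h>
#include <sys/umtx.h>
#include <machine/atomic.h>

static struct _usem2 sem2 = { ._count = 0, ._flags = 0 };

static void
sem2_post(void)
{
	uint32_t c;

	/* Make a count available; enter the kernel only if someone sleeps. */
	c = atomic_fetchadd_32(&sem2._count, 1);
	if ((c & USEM_HAS_WAITERS) != 0)
		(void)_umtx_op(&sem2, UMTX_OP_SEM2_WAKE, 0, NULL, NULL);
}

static void
sem2_wait(void)
{
	uint32_t c;

	for (;;) {
		/* Fast path: grab a count without a syscall. */
		c = sem2._count;
		if (USEM_COUNT(c) > 0 &&
		    atomic_cmpset_32(&sem2._count, c, c - 1))
			return;
		/* Kernel sets USEM_HAS_WAITERS and sleeps until posted. */
		(void)_umtx_op(&sem2, UMTX_OP_SEM2_WAIT, 0, NULL, NULL);
	}
}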
/* * Signal a userland semaphore. */ static int do_sem2_wake(struct thread *td, struct _usem2 *sem) { struct umtx_key key; int error, cnt, rv; uint32_t count, flags; rv = fueword32(&sem->_flags, &flags); if (rv == -1) return (EFAULT); if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); cnt = umtxq_count(&key); if (cnt > 0) { /* * If this was the last sleeping thread, clear the waiters * flag in _count. */ if (cnt == 1) { umtxq_unlock(&key); rv = fueword32(&sem->_count, &count); while (rv != -1 && count & USEM_HAS_WAITERS) rv = casueword32(&sem->_count, count, &count, count & ~USEM_HAS_WAITERS); if (rv == -1) error = EFAULT; umtxq_lock(&key); } umtxq_signal(&key, 1); } umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } inline int umtx_copyin_timeout(const void *addr, struct timespec *tsp) { int error; error = copyin(addr, tsp, sizeof(struct timespec)); if (error == 0) { if (tsp->tv_sec < 0 || tsp->tv_nsec >= 1000000000 || tsp->tv_nsec < 0) error = EINVAL; } return (error); } static inline int umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp) { int error; if (size <= sizeof(struct timespec)) { tp->_clockid = CLOCK_REALTIME; tp->_flags = 0; error = copyin(addr, &tp->_timeout, sizeof(struct timespec)); } else error = copyin(addr, tp, sizeof(struct _umtx_time)); if (error != 0) return (error); if (tp->_timeout.tv_sec < 0 || tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0) return (EINVAL); return (0); } static int __umtx_op_unimpl(struct thread *td, struct _umtx_op_args *uap) { return (EOPNOTSUPP); } static int __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout, *tm_p; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_wait(td, uap->obj, uap->val, tm_p, 0, 0)); } static int __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout, *tm_p; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0)); } static int __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1)); } static int __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap) { return (kern_umtx_wake(td, uap->obj, uap->val, 0)); } #define BATCH_SIZE 128 static int __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap) { char *uaddrs[BATCH_SIZE], **upp; int count, error, i, pos, tocopy; upp = (char **)uap->obj; error = 0; for (count = uap->val, pos = 0; count > 0; count -= tocopy, pos += tocopy) { tocopy = MIN(count, BATCH_SIZE); error = copyin(upp + pos, uaddrs, tocopy * sizeof(char *)); if (error != 0) break; for (i = 0; i < tocopy; ++i) kern_umtx_wake(td, uaddrs[i], INT_MAX, 1); maybe_yield(); } return (error); } static int __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap) { return (kern_umtx_wake(td, uap->obj, uap->val, 1)); } static int __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap) { struct
_umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_lock_umutex(td, uap->obj, tm_p, 0)); } static int __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap) { return (do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY)); } static int __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT)); } static int __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap) { return (do_wake_umutex(td, uap->obj)); } static int __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap) { return (do_unlock_umutex(td, uap->obj, false)); } static int __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap) { return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1)); } static int __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap) { struct timespec *ts, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) ts = NULL; else { error = umtx_copyin_timeout(uap->uaddr2, &timeout); if (error != 0) return (error); ts = &timeout; } return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val)); } static int __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap) { return (do_cv_signal(td, uap->obj)); } static int __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap) { return (do_cv_broadcast(td, uap->obj)); } static int __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { error = do_rw_rdlock(td, uap->obj, uap->val, 0); } else { error = umtx_copyin_umtx_time(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); error = do_rw_rdlock(td, uap->obj, uap->val, &timeout); } return (error); } static int __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { error = do_rw_wrlock(td, uap->obj, 0); } else { error = umtx_copyin_umtx_time(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); error = do_rw_wrlock(td, uap->obj, &timeout); } return (error); } static int __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap) { return (do_rw_unlock(td, uap->obj)); } #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) static int __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). 
*/ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_sem_wait(td, uap->obj, tm_p)); } static int __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap) { return (do_sem_wake(td, uap->obj)); } #endif static int __umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap) { return (do_wake2_umutex(td, uap->obj, uap->val)); } static int __umtx_op_sem2_wait(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; size_t uasize; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { uasize = 0; tm_p = NULL; } else { uasize = (size_t)uap->uaddr1; error = umtx_copyin_umtx_time(uap->uaddr2, uasize, &timeout); if (error != 0) return (error); tm_p = &timeout; } error = do_sem2_wait(td, uap->obj, tm_p); if (error == EINTR && uap->uaddr2 != NULL && (timeout._flags & UMTX_ABSTIME) == 0 && uasize >= sizeof(struct _umtx_time) + sizeof(struct timespec)) { error = copyout(&timeout._timeout, (struct _umtx_time *)uap->uaddr2 + 1, sizeof(struct timespec)); if (error == 0) { error = EINTR; } } return (error); } static int __umtx_op_sem2_wake(struct thread *td, struct _umtx_op_args *uap) { return (do_sem2_wake(td, uap->obj)); } #define USHM_OBJ_UMTX(o) \ ((struct umtx_shm_obj_list *)(&(o)->umtx_data)) #define USHMF_REG_LINKED 0x0001 #define USHMF_OBJ_LINKED 0x0002 struct umtx_shm_reg { TAILQ_ENTRY(umtx_shm_reg) ushm_reg_link; LIST_ENTRY(umtx_shm_reg) ushm_obj_link; struct umtx_key ushm_key; struct ucred *ushm_cred; struct shmfd *ushm_obj; u_int ushm_refcnt; u_int ushm_flags; }; LIST_HEAD(umtx_shm_obj_list, umtx_shm_reg); TAILQ_HEAD(umtx_shm_reg_head, umtx_shm_reg); static uma_zone_t umtx_shm_reg_zone; static struct umtx_shm_reg_head umtx_shm_registry[UMTX_CHAINS]; static struct mtx umtx_shm_lock; static struct umtx_shm_reg_head umtx_shm_reg_delfree = TAILQ_HEAD_INITIALIZER(umtx_shm_reg_delfree); static void umtx_shm_free_reg(struct umtx_shm_reg *reg); static void umtx_shm_reg_delfree_tq(void *context __unused, int pending __unused) { struct umtx_shm_reg_head d; struct umtx_shm_reg *reg, *reg1; TAILQ_INIT(&d); mtx_lock(&umtx_shm_lock); TAILQ_CONCAT(&d, &umtx_shm_reg_delfree, ushm_reg_link); mtx_unlock(&umtx_shm_lock); TAILQ_FOREACH_SAFE(reg, &d, ushm_reg_link, reg1) { TAILQ_REMOVE(&d, reg, ushm_reg_link); umtx_shm_free_reg(reg); } } static struct task umtx_shm_reg_delfree_task = TASK_INITIALIZER(0, umtx_shm_reg_delfree_tq, NULL); static struct umtx_shm_reg * umtx_shm_find_reg_locked(const struct umtx_key *key) { struct umtx_shm_reg *reg; struct umtx_shm_reg_head *reg_head; KASSERT(key->shared, ("umtx_p_find_rg: private key")); mtx_assert(&umtx_shm_lock, MA_OWNED); reg_head = &umtx_shm_registry[key->hash]; TAILQ_FOREACH(reg, reg_head, ushm_reg_link) { KASSERT(reg->ushm_key.shared, ("non-shared key on reg %p %d", reg, reg->ushm_key.shared)); if (reg->ushm_key.info.shared.object == key->info.shared.object && reg->ushm_key.info.shared.offset == key->info.shared.offset) { KASSERT(reg->ushm_key.type == TYPE_SHM, ("TYPE_USHM")); KASSERT(reg->ushm_refcnt > 0, ("reg %p refcnt 0 onlist", reg)); KASSERT((reg->ushm_flags & USHMF_REG_LINKED) != 0, ("reg %p not linked", reg)); reg->ushm_refcnt++; return (reg); } } return (NULL); } static struct umtx_shm_reg * umtx_shm_find_reg(const struct umtx_key *key) { struct umtx_shm_reg *reg; mtx_lock(&umtx_shm_lock); reg = umtx_shm_find_reg_locked(key); mtx_unlock(&umtx_shm_lock); 
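/*
 * Editor's note (added comment): if a registration was found, its
 * reference count was bumped by umtx_shm_find_reg_locked() while
 * umtx_shm_lock was held, so the returned pointer stays valid after
 * the unlock.
 */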
return (reg); } static void umtx_shm_free_reg(struct umtx_shm_reg *reg) { chgumtxcnt(reg->ushm_cred->cr_ruidinfo, -1, 0); crfree(reg->ushm_cred); shm_drop(reg->ushm_obj); uma_zfree(umtx_shm_reg_zone, reg); } static bool umtx_shm_unref_reg_locked(struct umtx_shm_reg *reg, bool force) { bool res; mtx_assert(&umtx_shm_lock, MA_OWNED); KASSERT(reg->ushm_refcnt > 0, ("ushm_reg %p refcnt 0", reg)); reg->ushm_refcnt--; res = reg->ushm_refcnt == 0; if (res || force) { if ((reg->ushm_flags & USHMF_REG_LINKED) != 0) { TAILQ_REMOVE(&umtx_shm_registry[reg->ushm_key.hash], reg, ushm_reg_link); reg->ushm_flags &= ~USHMF_REG_LINKED; } if ((reg->ushm_flags & USHMF_OBJ_LINKED) != 0) { LIST_REMOVE(reg, ushm_obj_link); reg->ushm_flags &= ~USHMF_OBJ_LINKED; } } return (res); } static void umtx_shm_unref_reg(struct umtx_shm_reg *reg, bool force) { vm_object_t object; bool dofree; if (force) { object = reg->ushm_obj->shm_object; VM_OBJECT_WLOCK(object); object->flags |= OBJ_UMTXDEAD; VM_OBJECT_WUNLOCK(object); } mtx_lock(&umtx_shm_lock); dofree = umtx_shm_unref_reg_locked(reg, force); mtx_unlock(&umtx_shm_lock); if (dofree) umtx_shm_free_reg(reg); } void umtx_shm_object_init(vm_object_t object) { LIST_INIT(USHM_OBJ_UMTX(object)); } void umtx_shm_object_terminated(vm_object_t object) { struct umtx_shm_reg *reg, *reg1; bool dofree; dofree = false; mtx_lock(&umtx_shm_lock); LIST_FOREACH_SAFE(reg, USHM_OBJ_UMTX(object), ushm_obj_link, reg1) { if (umtx_shm_unref_reg_locked(reg, true)) { TAILQ_INSERT_TAIL(&umtx_shm_reg_delfree, reg, ushm_reg_link); dofree = true; } } mtx_unlock(&umtx_shm_lock); if (dofree) taskqueue_enqueue(taskqueue_thread, &umtx_shm_reg_delfree_task); } static int umtx_shm_create_reg(struct thread *td, const struct umtx_key *key, struct umtx_shm_reg **res) { struct umtx_shm_reg *reg, *reg1; struct ucred *cred; int error; reg = umtx_shm_find_reg(key); if (reg != NULL) { *res = reg; return (0); } cred = td->td_ucred; if (!chgumtxcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_UMTXP))) return (ENOMEM); reg = uma_zalloc(umtx_shm_reg_zone, M_WAITOK | M_ZERO); reg->ushm_refcnt = 1; bcopy(key, &reg->ushm_key, sizeof(*key)); reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR); reg->ushm_cred = crhold(cred); error = shm_dotruncate(reg->ushm_obj, PAGE_SIZE); if (error != 0) { umtx_shm_free_reg(reg); return (error); } mtx_lock(&umtx_shm_lock); reg1 = umtx_shm_find_reg_locked(key); if (reg1 != NULL) { mtx_unlock(&umtx_shm_lock); umtx_shm_free_reg(reg); *res = reg1; return (0); } reg->ushm_refcnt++; TAILQ_INSERT_TAIL(&umtx_shm_registry[key->hash], reg, ushm_reg_link); LIST_INSERT_HEAD(USHM_OBJ_UMTX(key->info.shared.object), reg, ushm_obj_link); reg->ushm_flags = USHMF_REG_LINKED | USHMF_OBJ_LINKED; mtx_unlock(&umtx_shm_lock); *res = reg; return (0); } static int umtx_shm_alive(struct thread *td, void *addr) { vm_map_t map; vm_map_entry_t entry; vm_object_t object; vm_pindex_t pindex; vm_prot_t prot; int res, ret; boolean_t wired; map = &td->td_proc->p_vmspace->vm_map; res = vm_map_lookup(&map, (uintptr_t)addr, VM_PROT_READ, &entry, &object, &pindex, &prot, &wired); if (res != KERN_SUCCESS) return (EFAULT); if (object == NULL) ret = EINVAL; else ret = (object->flags & OBJ_UMTXDEAD) != 0 ? ENOTTY : 0; vm_map_lookup_done(map, entry); return (ret); }
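/*
 * Editor's sketch (not in the original file): one plausible way for
 * userland to drive the UMTX_OP_SHM interface implemented by
 * umtx_shm() below.  A word of memory serves as the lookup key; on
 * success the kernel returns a descriptor for a one-page POSIX shm
 * object that can then be mmap(2)ed.  Error handling is abbreviated.
 */
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/umtx.h>
#include <unistd.h>

static int shm_key;		/* any mappable address works as a key */

static void *
map_umtx_shm_page(void)
{
	int fd;
	void *p;

	fd = _umtx_op(NULL, UMTX_OP_SHM, UMTX_SHM_CREAT, &shm_key, NULL);
	if (fd == -1)
		return (NULL);
	p = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE, MAP_SHARED,
	    fd, 0);
	close(fd);		/* the mapping keeps the object alive */
	return (p == MAP_FAILED ? NULL : p);
}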
static void umtx_shm_init(void) { int i; umtx_shm_reg_zone = uma_zcreate("umtx_shm", sizeof(struct umtx_shm_reg), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mtx_init(&umtx_shm_lock, "umtxshm", NULL, MTX_DEF); for (i = 0; i < nitems(umtx_shm_registry); i++) TAILQ_INIT(&umtx_shm_registry[i]); } static int umtx_shm(struct thread *td, void *addr, u_int flags) { struct umtx_key key; struct umtx_shm_reg *reg; struct file *fp; int error, fd; if (__bitcount(flags & (UMTX_SHM_CREAT | UMTX_SHM_LOOKUP | UMTX_SHM_DESTROY | UMTX_SHM_ALIVE)) != 1) return (EINVAL); if ((flags & UMTX_SHM_ALIVE) != 0) return (umtx_shm_alive(td, addr)); error = umtx_key_get(addr, TYPE_SHM, PROCESS_SHARE, &key); if (error != 0) return (error); KASSERT(key.shared == 1, ("non-shared key")); if ((flags & UMTX_SHM_CREAT) != 0) { error = umtx_shm_create_reg(td, &key, &reg); } else { reg = umtx_shm_find_reg(&key); if (reg == NULL) error = ESRCH; } umtx_key_release(&key); if (error != 0) return (error); KASSERT(reg != NULL, ("no reg")); if ((flags & UMTX_SHM_DESTROY) != 0) { umtx_shm_unref_reg(reg, true); } else { #if 0 #ifdef MAC error = mac_posixshm_check_open(td->td_ucred, reg->ushm_obj, FFLAGS(O_RDWR)); if (error == 0) #endif error = shm_access(reg->ushm_obj, td->td_ucred, FFLAGS(O_RDWR)); if (error == 0) #endif error = falloc_caps(td, &fp, &fd, O_CLOEXEC, NULL); if (error == 0) { shm_hold(reg->ushm_obj); finit(fp, FFLAGS(O_RDWR), DTYPE_SHM, reg->ushm_obj, &shm_ops); td->td_retval[0] = fd; fdrop(fp, td); } } umtx_shm_unref_reg(reg, false); return (error); } static int __umtx_op_shm(struct thread *td, struct _umtx_op_args *uap) { return (umtx_shm(td, uap->uaddr1, uap->val)); } static int umtx_robust_lists(struct thread *td, struct umtx_robust_lists_params *rbp) { td->td_rb_list = rbp->robust_list_offset; td->td_rbp_list = rbp->robust_priv_list_offset; td->td_rb_inact = rbp->robust_inact_offset; return (0); } static int __umtx_op_robust_lists(struct thread *td, struct _umtx_op_args *uap) { struct umtx_robust_lists_params rb; int error; if (uap->val > sizeof(rb)) return (EINVAL); bzero(&rb, sizeof(rb)); error = copyin(uap->uaddr1, &rb, uap->val); if (error != 0) return (error); return (umtx_robust_lists(td, &rb)); } typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap); static const _umtx_op_func op_table[] = { [UMTX_OP_RESERVED0] = __umtx_op_unimpl, [UMTX_OP_RESERVED1] = __umtx_op_unimpl, [UMTX_OP_WAIT] = __umtx_op_wait, [UMTX_OP_WAKE] = __umtx_op_wake, [UMTX_OP_MUTEX_TRYLOCK] = __umtx_op_trylock_umutex, [UMTX_OP_MUTEX_LOCK] = __umtx_op_lock_umutex, [UMTX_OP_MUTEX_UNLOCK] = __umtx_op_unlock_umutex, [UMTX_OP_SET_CEILING] = __umtx_op_set_ceiling, [UMTX_OP_CV_WAIT] = __umtx_op_cv_wait, [UMTX_OP_CV_SIGNAL] = __umtx_op_cv_signal, [UMTX_OP_CV_BROADCAST] = __umtx_op_cv_broadcast, [UMTX_OP_WAIT_UINT] = __umtx_op_wait_uint, [UMTX_OP_RW_RDLOCK] = __umtx_op_rw_rdlock, [UMTX_OP_RW_WRLOCK] = __umtx_op_rw_wrlock, [UMTX_OP_RW_UNLOCK] = __umtx_op_rw_unlock, [UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private, [UMTX_OP_WAKE_PRIVATE] = __umtx_op_wake_private, [UMTX_OP_MUTEX_WAIT] = __umtx_op_wait_umutex, [UMTX_OP_MUTEX_WAKE] = __umtx_op_wake_umutex, #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) [UMTX_OP_SEM_WAIT] = __umtx_op_sem_wait, [UMTX_OP_SEM_WAKE] = __umtx_op_sem_wake, #else [UMTX_OP_SEM_WAIT] = __umtx_op_unimpl, [UMTX_OP_SEM_WAKE] = __umtx_op_unimpl, #endif [UMTX_OP_NWAKE_PRIVATE] = __umtx_op_nwake_private, [UMTX_OP_MUTEX_WAKE2] =
__umtx_op_wake2_umutex, [UMTX_OP_SEM2_WAIT] = __umtx_op_sem2_wait, [UMTX_OP_SEM2_WAKE] = __umtx_op_sem2_wake, [UMTX_OP_SHM] = __umtx_op_shm, [UMTX_OP_ROBUST_LISTS] = __umtx_op_robust_lists, }; int sys__umtx_op(struct thread *td, struct _umtx_op_args *uap) { if ((unsigned)uap->op < nitems(op_table)) return (*op_table[uap->op])(td, uap); return (EINVAL); } #ifdef COMPAT_FREEBSD32 struct timespec32 { int32_t tv_sec; int32_t tv_nsec; }; struct umtx_time32 { struct timespec32 timeout; uint32_t flags; uint32_t clockid; }; static inline int umtx_copyin_timeout32(void *addr, struct timespec *tsp) { struct timespec32 ts32; int error; error = copyin(addr, &ts32, sizeof(struct timespec32)); if (error == 0) { if (ts32.tv_sec < 0 || ts32.tv_nsec >= 1000000000 || ts32.tv_nsec < 0) error = EINVAL; else { tsp->tv_sec = ts32.tv_sec; tsp->tv_nsec = ts32.tv_nsec; } } return (error); } static inline int umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp) { struct umtx_time32 t32; int error; t32.clockid = CLOCK_REALTIME; t32.flags = 0; if (size <= sizeof(struct timespec32)) error = copyin(addr, &t32.timeout, sizeof(struct timespec32)); else error = copyin(addr, &t32, sizeof(struct umtx_time32)); if (error != 0) return (error); if (t32.timeout.tv_sec < 0 || t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0) return (EINVAL); tp->_timeout.tv_sec = t32.timeout.tv_sec; tp->_timeout.tv_nsec = t32.timeout.tv_nsec; tp->_flags = t32.flags; tp->_clockid = t32.clockid; return (0); } static int __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0)); } static int __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_lock_umutex(td, uap->obj, tm_p, 0)); } static int __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT)); } static int __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap) { struct timespec *ts, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) ts = NULL; else { error = umtx_copyin_timeout32(uap->uaddr2, &timeout); if (error != 0) return (error); ts = &timeout; } return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val)); } static int __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). 
*/ if (uap->uaddr2 == NULL) { error = do_rw_rdlock(td, uap->obj, uap->val, 0); } else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); error = do_rw_rdlock(td, uap->obj, uap->val, &timeout); } return (error); } static int __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { error = do_rw_wrlock(td, uap->obj, 0); } else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); error = do_rw_wrlock(td, uap->obj, &timeout); } return (error); } static int __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time32( uap->uaddr2, (size_t)uap->uaddr1,&timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1)); } #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) static int __umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_sem_wait(td, uap->obj, tm_p)); } #endif static int __umtx_op_sem2_wait_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; size_t uasize; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { uasize = 0; tm_p = NULL; } else { uasize = (size_t)uap->uaddr1; error = umtx_copyin_umtx_time32(uap->uaddr2, uasize, &timeout); if (error != 0) return (error); tm_p = &timeout; } error = do_sem2_wait(td, uap->obj, tm_p); if (error == EINTR && uap->uaddr2 != NULL && (timeout._flags & UMTX_ABSTIME) == 0 && uasize >= sizeof(struct umtx_time32) + sizeof(struct timespec32)) { struct timespec32 remain32 = { .tv_sec = timeout._timeout.tv_sec, .tv_nsec = timeout._timeout.tv_nsec }; error = copyout(&remain32, (struct umtx_time32 *)uap->uaddr2 + 1, sizeof(struct timespec32)); if (error == 0) { error = EINTR; } } return (error); } static int __umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap) { uint32_t uaddrs[BATCH_SIZE], **upp; int count, error, i, pos, tocopy; upp = (uint32_t **)uap->obj; error = 0; for (count = uap->val, pos = 0; count > 0; count -= tocopy, pos += tocopy) { tocopy = MIN(count, BATCH_SIZE); error = copyin(upp + pos, uaddrs, tocopy * sizeof(uint32_t)); if (error != 0) break; for (i = 0; i < tocopy; ++i) kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i], INT_MAX, 1); maybe_yield(); } return (error); } struct umtx_robust_lists_params_compat32 { uint32_t robust_list_offset; uint32_t robust_priv_list_offset; uint32_t robust_inact_offset; }; static int __umtx_op_robust_lists_compat32(struct thread *td, struct _umtx_op_args *uap) { struct umtx_robust_lists_params rb; struct umtx_robust_lists_params_compat32 rb32; int error; if (uap->val > sizeof(rb32)) return (EINVAL); bzero(&rb, sizeof(rb)); bzero(&rb32, sizeof(rb32)); error = copyin(uap->uaddr1, &rb32, uap->val); if (error != 0) return (error); rb.robust_list_offset = rb32.robust_list_offset; rb.robust_priv_list_offset = rb32.robust_priv_list_offset; rb.robust_inact_offset = rb32.robust_inact_offset; return 
(umtx_robust_lists(td, &rb)); } static const _umtx_op_func op_table_compat32[] = { [UMTX_OP_RESERVED0] = __umtx_op_unimpl, [UMTX_OP_RESERVED1] = __umtx_op_unimpl, [UMTX_OP_WAIT] = __umtx_op_wait_compat32, [UMTX_OP_WAKE] = __umtx_op_wake, [UMTX_OP_MUTEX_TRYLOCK] = __umtx_op_trylock_umutex, [UMTX_OP_MUTEX_LOCK] = __umtx_op_lock_umutex_compat32, [UMTX_OP_MUTEX_UNLOCK] = __umtx_op_unlock_umutex, [UMTX_OP_SET_CEILING] = __umtx_op_set_ceiling, [UMTX_OP_CV_WAIT] = __umtx_op_cv_wait_compat32, [UMTX_OP_CV_SIGNAL] = __umtx_op_cv_signal, [UMTX_OP_CV_BROADCAST] = __umtx_op_cv_broadcast, [UMTX_OP_WAIT_UINT] = __umtx_op_wait_compat32, [UMTX_OP_RW_RDLOCK] = __umtx_op_rw_rdlock_compat32, [UMTX_OP_RW_WRLOCK] = __umtx_op_rw_wrlock_compat32, [UMTX_OP_RW_UNLOCK] = __umtx_op_rw_unlock, [UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private_compat32, [UMTX_OP_WAKE_PRIVATE] = __umtx_op_wake_private, [UMTX_OP_MUTEX_WAIT] = __umtx_op_wait_umutex_compat32, [UMTX_OP_MUTEX_WAKE] = __umtx_op_wake_umutex, #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) [UMTX_OP_SEM_WAIT] = __umtx_op_sem_wait_compat32, [UMTX_OP_SEM_WAKE] = __umtx_op_sem_wake, #else [UMTX_OP_SEM_WAIT] = __umtx_op_unimpl, [UMTX_OP_SEM_WAKE] = __umtx_op_unimpl, #endif [UMTX_OP_NWAKE_PRIVATE] = __umtx_op_nwake_private32, [UMTX_OP_MUTEX_WAKE2] = __umtx_op_wake2_umutex, [UMTX_OP_SEM2_WAIT] = __umtx_op_sem2_wait_compat32, [UMTX_OP_SEM2_WAKE] = __umtx_op_sem2_wake, [UMTX_OP_SHM] = __umtx_op_shm, [UMTX_OP_ROBUST_LISTS] = __umtx_op_robust_lists_compat32, }; int freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap) { if ((unsigned)uap->op < nitems(op_table_compat32)) { return (*op_table_compat32[uap->op])(td, (struct _umtx_op_args *)uap); } return (EINVAL); } #endif void umtx_thread_init(struct thread *td) { td->td_umtxq = umtxq_alloc(); td->td_umtxq->uq_thread = td; } void umtx_thread_fini(struct thread *td) { umtxq_free(td->td_umtxq); } /* * It will be called when new thread is created, e.g fork(). */ void umtx_thread_alloc(struct thread *td) { struct umtx_q *uq; uq = td->td_umtxq; uq->uq_inherited_pri = PRI_MAX; KASSERT(uq->uq_flags == 0, ("uq_flags != 0")); KASSERT(uq->uq_thread == td, ("uq_thread != td")); KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL")); KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty")); } /* * exec() hook. * * Clear robust lists for all process' threads, not delaying the * cleanup to thread_exit hook, since the relevant address space is * destroyed right now. */ static void umtx_exec_hook(void *arg __unused, struct proc *p, struct image_params *imgp __unused) { struct thread *td; KASSERT(p == curproc, ("need curproc")); PROC_LOCK(p); KASSERT((p->p_flag & P_HADTHREADS) == 0 || (p->p_flag & P_STOPPED_SINGLE) != 0, ("curproc must be single-threaded")); FOREACH_THREAD_IN_PROC(p, td) { KASSERT(td == curthread || ((td->td_flags & TDF_BOUNDARY) != 0 && TD_IS_SUSPENDED(td)), ("running thread %p %p", p, td)); PROC_UNLOCK(p); umtx_thread_cleanup(td); PROC_LOCK(p); td->td_rb_list = td->td_rbp_list = td->td_rb_inact = 0; } PROC_UNLOCK(p); } /* * thread_exit() hook. 
*/ void umtx_thread_exit(struct thread *td) { umtx_thread_cleanup(td); } static int umtx_read_uptr(struct thread *td, uintptr_t ptr, uintptr_t *res) { u_long res1; #ifdef COMPAT_FREEBSD32 uint32_t res32; #endif int error; #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { error = fueword32((void *)ptr, &res32); if (error == 0) res1 = res32; } else #endif { error = fueword((void *)ptr, &res1); } if (error == 0) *res = res1; else error = EFAULT; return (error); } static void umtx_read_rb_list(struct thread *td, struct umutex *m, uintptr_t *rb_list) { #ifdef COMPAT_FREEBSD32 struct umutex32 m32; if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { memcpy(&m32, m, sizeof(m32)); *rb_list = m32.m_rb_lnk; } else #endif *rb_list = m->m_rb_lnk; } static int umtx_handle_rb(struct thread *td, uintptr_t rbp, uintptr_t *rb_list, bool inact) { struct umutex m; int error; KASSERT(td->td_proc == curproc, ("need current vmspace")); error = copyin((void *)rbp, &m, sizeof(m)); if (error != 0) return (error); if (rb_list != NULL) umtx_read_rb_list(td, &m, rb_list); if ((m.m_flags & UMUTEX_ROBUST) == 0) return (EINVAL); if ((m.m_owner & ~UMUTEX_CONTESTED) != td->td_tid) /* inact is cleared after unlock, allow the inconsistency */ return (inact ? 0 : EINVAL); return (do_unlock_umutex(td, (struct umutex *)rbp, true)); } static void umtx_cleanup_rb_list(struct thread *td, uintptr_t rb_list, uintptr_t *rb_inact, const char *name) { int error, i; uintptr_t rbp; bool inact; if (rb_list == 0) return; error = umtx_read_uptr(td, rb_list, &rbp); for (i = 0; error == 0 && rbp != 0 && i < umtx_max_rb; i++) { if (rbp == *rb_inact) { inact = true; *rb_inact = 0; } else inact = false; error = umtx_handle_rb(td, rbp, &rbp, inact); } if (i == umtx_max_rb && umtx_verbose_rb) { uprintf("comm %s pid %d: reached umtx %smax rb %d\n", td->td_proc->p_comm, td->td_proc->p_pid, name, umtx_max_rb); } if (error != 0 && umtx_verbose_rb) { uprintf("comm %s pid %d: handling %srb error %d\n", td->td_proc->p_comm, td->td_proc->p_pid, name, error); } } /* * Clean up umtx data. */ static void umtx_thread_cleanup(struct thread *td) { struct umtx_q *uq; struct umtx_pi *pi; uintptr_t rb_inact; /* * Disown pi mutexes. */ uq = td->td_umtxq; if (uq != NULL) { mtx_lock(&umtx_lock); uq->uq_inherited_pri = PRI_MAX; while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) { pi->pi_owner = NULL; TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link); } mtx_unlock(&umtx_lock); thread_lock(td); sched_lend_user_prio(td, PRI_MAX); thread_unlock(td); } /* * Handle terminated robust mutexes. Must be done after * robust pi disown, otherwise unlock could see unowned * entries. */ rb_inact = td->td_rb_inact; if (rb_inact != 0) (void)umtx_read_uptr(td, rb_inact, &rb_inact); umtx_cleanup_rb_list(td, td->td_rb_list, &rb_inact, ""); umtx_cleanup_rb_list(td, td->td_rbp_list, &rb_inact, "priv "); if (rb_inact != 0) (void)umtx_handle_rb(td, rb_inact, NULL, true); } Index: head/sys/kern/kern_uuid.c =================================================================== --- head/sys/kern/kern_uuid.c (revision 326270) +++ head/sys/kern/kern_uuid.c (revision 326271) @@ -1,433 +1,435 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2002 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * See also: * http://www.opengroup.org/dce/info/draft-leach-uuids-guids-01.txt * http://www.opengroup.org/onlinepubs/009629399/apdxa.htm * * Note that the generator state is itself a UUID, but the time and clock * sequence fields are written in the native byte order. */ CTASSERT(sizeof(struct uuid) == 16); /* We use an alternative, more convenient representation in the generator. */ struct uuid_private { union { uint64_t ll; /* internal, for uuid_last only */ struct { uint32_t low; uint16_t mid; uint16_t hi; } x; } time; uint16_t seq; /* Big-endian. */ uint16_t node[UUID_NODE_LEN>>1]; }; CTASSERT(sizeof(struct uuid_private) == 16); struct uuid_macaddr { uint16_t state; #define UUID_ETHER_EMPTY 0 #define UUID_ETHER_RANDOM 1 #define UUID_ETHER_UNIQUE 2 uint16_t node[UUID_NODE_LEN>>1]; }; static struct uuid_private uuid_last; #define UUID_NETHER 4 static struct uuid_macaddr uuid_ether[UUID_NETHER]; static struct mtx uuid_mutex; MTX_SYSINIT(uuid_lock, &uuid_mutex, "UUID generator mutex lock", MTX_DEF); /* * Return the first MAC address added in the array. If it's empty, then * construct a sufficiently random multicast MAC address first. Any * addresses added later will bump the random MAC address up to the next * index. */ static void uuid_node(uint16_t *node) { int i; if (uuid_ether[0].state == UUID_ETHER_EMPTY) { for (i = 0; i < (UUID_NODE_LEN>>1); i++) uuid_ether[0].node[i] = (uint16_t)arc4random(); *((uint8_t*)uuid_ether[0].node) |= 0x01; uuid_ether[0].state = UUID_ETHER_RANDOM; } for (i = 0; i < (UUID_NODE_LEN>>1); i++) node[i] = uuid_ether[0].node[i]; } /* * Get the current time as a 60 bit count of 100-nanosecond intervals * since 00:00:00.00, October 15, 1582. We apply a magic offset to convert * the Unix time since 00:00:00.00, January 1, 1970 to the date of the * Gregorian reform to the Christian calendar. That offset, * 0x01B21DD213814000, is the 141427 days between those two dates * expressed in 100-ns units (141427 * 86400 * 10^7).
*/ static uint64_t uuid_time(void) { struct bintime bt; uint64_t time = 0x01B21DD213814000LL; bintime(&bt); time += (uint64_t)bt.sec * 10000000LL; time += (10000000LL * (uint32_t)(bt.frac >> 32)) >> 32; return (time & ((1LL << 60) - 1LL)); } struct uuid * kern_uuidgen(struct uuid *store, size_t count) { struct uuid_private uuid; uint64_t time; size_t n; mtx_lock(&uuid_mutex); uuid_node(uuid.node); time = uuid_time(); if (uuid_last.time.ll == 0LL || uuid_last.node[0] != uuid.node[0] || uuid_last.node[1] != uuid.node[1] || uuid_last.node[2] != uuid.node[2]) uuid.seq = (uint16_t)arc4random() & 0x3fff; else if (uuid_last.time.ll >= time) uuid.seq = (uuid_last.seq + 1) & 0x3fff; else uuid.seq = uuid_last.seq; uuid_last = uuid; uuid_last.time.ll = (time + count - 1) & ((1LL << 60) - 1LL); mtx_unlock(&uuid_mutex); /* Set sequence and variant and deal with byte order. */ uuid.seq = htobe16(uuid.seq | 0x8000); for (n = 0; n < count; n++) { /* Set time and version (=1). */ uuid.time.x.low = (uint32_t)time; uuid.time.x.mid = (uint16_t)(time >> 32); uuid.time.x.hi = ((uint16_t)(time >> 48) & 0xfff) | (1 << 12); store[n] = *(struct uuid *)&uuid; time++; } return (store); } #ifndef _SYS_SYSPROTO_H_ struct uuidgen_args { struct uuid *store; int count; }; #endif int sys_uuidgen(struct thread *td, struct uuidgen_args *uap) { struct uuid *store; size_t count; int error; /* * Limit the number of UUIDs that can be created at the same time * to some arbitrary number. This isn't really necessary, but I * like to have some sort of upper-bound that's less than 2G :-) * XXX probably needs to be tunable. */ if (uap->count < 1 || uap->count > 2048) return (EINVAL); count = uap->count; store = malloc(count * sizeof(struct uuid), M_TEMP, M_WAITOK); kern_uuidgen(store, count); error = copyout(store, uap->store, count * sizeof(struct uuid)); free(store, M_TEMP); return (error); } int uuid_ether_add(const uint8_t *addr) { int i, sum; /* * Validate input. No multicast (flag 0x1), no locally administered * (flag 0x2) and no 'all-zeroes' addresses. */ if (addr[0] & 0x03) return (EINVAL); sum = 0; for (i = 0; i < UUID_NODE_LEN; i++) sum += addr[i]; if (sum == 0) return (EINVAL); mtx_lock(&uuid_mutex); /* Make sure the MAC isn't known already and that there's space. */ i = 0; while (i < UUID_NETHER && uuid_ether[i].state == UUID_ETHER_UNIQUE) { if (!bcmp(addr, uuid_ether[i].node, UUID_NODE_LEN)) { mtx_unlock(&uuid_mutex); return (EEXIST); } i++; } if (i == UUID_NETHER) { mtx_unlock(&uuid_mutex); return (ENOSPC); } /* Insert MAC at index, moving the non-empty entry if possible. */ if (uuid_ether[i].state == UUID_ETHER_RANDOM && i < UUID_NETHER - 1) uuid_ether[i + 1] = uuid_ether[i]; uuid_ether[i].state = UUID_ETHER_UNIQUE; bcopy(addr, uuid_ether[i].node, UUID_NODE_LEN); mtx_unlock(&uuid_mutex); return (0); } int uuid_ether_del(const uint8_t *addr) { int i; mtx_lock(&uuid_mutex); i = 0; while (i < UUID_NETHER && uuid_ether[i].state == UUID_ETHER_UNIQUE && bcmp(addr, uuid_ether[i].node, UUID_NODE_LEN)) i++; if (i == UUID_NETHER || uuid_ether[i].state != UUID_ETHER_UNIQUE) { mtx_unlock(&uuid_mutex); return (ENOENT); } /* Remove it by shifting higher index entries down. 
*/ while (i < UUID_NETHER - 1 && uuid_ether[i].state != UUID_ETHER_EMPTY) { uuid_ether[i] = uuid_ether[i + 1]; i++; } if (uuid_ether[i].state != UUID_ETHER_EMPTY) { uuid_ether[i].state = UUID_ETHER_EMPTY; bzero(uuid_ether[i].node, UUID_NODE_LEN); } mtx_unlock(&uuid_mutex); return (0); } int snprintf_uuid(char *buf, size_t sz, struct uuid *uuid) { struct uuid_private *id; int cnt; id = (struct uuid_private *)uuid; cnt = snprintf(buf, sz, "%08x-%04x-%04x-%04x-%04x%04x%04x", id->time.x.low, id->time.x.mid, id->time.x.hi, be16toh(id->seq), be16toh(id->node[0]), be16toh(id->node[1]), be16toh(id->node[2])); return (cnt); } int printf_uuid(struct uuid *uuid) { char buf[38]; snprintf_uuid(buf, sizeof(buf), uuid); return (printf("%s", buf)); } int sbuf_printf_uuid(struct sbuf *sb, struct uuid *uuid) { char buf[38]; snprintf_uuid(buf, sizeof(buf), uuid); return (sbuf_printf(sb, "%s", buf)); } /* * Encode/Decode UUID into byte-stream. * http://www.opengroup.org/dce/info/draft-leach-uuids-guids-01.txt * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | time_low | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | time_mid | time_hi_and_version | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |clk_seq_hi_res | clk_seq_low | node (0-1) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | node (2-5) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ void le_uuid_enc(void *buf, struct uuid const *uuid) { u_char *p; int i; p = buf; le32enc(p, uuid->time_low); le16enc(p + 4, uuid->time_mid); le16enc(p + 6, uuid->time_hi_and_version); p[8] = uuid->clock_seq_hi_and_reserved; p[9] = uuid->clock_seq_low; for (i = 0; i < _UUID_NODE_LEN; i++) p[10 + i] = uuid->node[i]; } void le_uuid_dec(void const *buf, struct uuid *uuid) { u_char const *p; int i; p = buf; uuid->time_low = le32dec(p); uuid->time_mid = le16dec(p + 4); uuid->time_hi_and_version = le16dec(p + 6); uuid->clock_seq_hi_and_reserved = p[8]; uuid->clock_seq_low = p[9]; for (i = 0; i < _UUID_NODE_LEN; i++) uuid->node[i] = p[10 + i]; } void be_uuid_enc(void *buf, struct uuid const *uuid) { u_char *p; int i; p = buf; be32enc(p, uuid->time_low); be16enc(p + 4, uuid->time_mid); be16enc(p + 6, uuid->time_hi_and_version); p[8] = uuid->clock_seq_hi_and_reserved; p[9] = uuid->clock_seq_low; for (i = 0; i < _UUID_NODE_LEN; i++) p[10 + i] = uuid->node[i]; } void be_uuid_dec(void const *buf, struct uuid *uuid) { u_char const *p; int i; p = buf; uuid->time_low = be32dec(p); uuid->time_mid = be16dec(p + 4); uuid->time_hi_and_version = be16dec(p + 6); uuid->clock_seq_hi_and_reserved = p[8]; uuid->clock_seq_low = p[9]; for (i = 0; i < _UUID_NODE_LEN; i++) uuid->node[i] = p[10 + i]; } int parse_uuid(const char *str, struct uuid *uuid) { u_int c[11]; int n; /* An empty string represents a nil UUID. */ if (*str == '\0') { bzero(uuid, sizeof(*uuid)); return (0); } /* The UUID string representation has a fixed length. */ if (strlen(str) != 36) return (EINVAL); /* * We only work with "new" UUIDs. New UUIDs have the form: * 01234567-89ab-cdef-0123-456789abcdef * The so called "old" UUIDs, which we don't support, have the form: * 0123456789ab.cd.ef.01.23.45.67.89.ab */ if (str[8] != '-') return (EINVAL); n = sscanf(str, "%8x-%4x-%4x-%2x%2x-%2x%2x%2x%2x%2x%2x", c + 0, c + 1, c + 2, c + 3, c + 4, c + 5, c + 6, c + 7, c + 8, c + 9, c + 10); /* Make sure we have all conversions. 
*/ if (n != 11) return (EINVAL); /* Successful scan. Build the UUID. */ uuid->time_low = c[0]; uuid->time_mid = c[1]; uuid->time_hi_and_version = c[2]; uuid->clock_seq_hi_and_reserved = c[3]; uuid->clock_seq_low = c[4]; for (n = 0; n < 6; n++) uuid->node[n] = c[n + 5]; /* Check semantics... */ return (((c[3] & 0x80) != 0x00 && /* variant 0? */ (c[3] & 0xc0) != 0x80 && /* variant 1? */ (c[3] & 0xe0) != 0xc0) ? EINVAL : 0); /* variant 2? */ } int uuidcmp(const struct uuid *uuid1, const struct uuid *uuid2) { return (memcmp(uuid1, uuid2, sizeof(struct uuid))); } Index: head/sys/kern/link_elf.c =================================================================== --- head/sys/kern/link_elf.c (revision 326270) +++ head/sys/kern/link_elf.c (revision 326271) @@ -1,1658 +1,1660 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1998-2000 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_gdb.h" #include #include #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SPARSE_MAPPING #include #include #include #endif #include #include #include #ifdef DDB_CTF #include #endif #include "linker_if.h" #define MAXSEGS 4 typedef struct elf_file { struct linker_file lf; /* Common fields */ int preloaded; /* Was file pre-loaded */ caddr_t address; /* Relocation address */ #ifdef SPARSE_MAPPING vm_object_t object; /* VM object to hold file pages */ #endif Elf_Dyn *dynamic; /* Symbol table etc. 
*/ Elf_Hashelt nbuckets; /* DT_HASH info */ Elf_Hashelt nchains; const Elf_Hashelt *buckets; const Elf_Hashelt *chains; caddr_t hash; caddr_t strtab; /* DT_STRTAB */ int strsz; /* DT_STRSZ */ const Elf_Sym *symtab; /* DT_SYMTAB */ Elf_Addr *got; /* DT_PLTGOT */ const Elf_Rel *pltrel; /* DT_JMPREL */ int pltrelsize; /* DT_PLTRELSZ */ const Elf_Rela *pltrela; /* DT_JMPREL */ int pltrelasize; /* DT_PLTRELSZ */ const Elf_Rel *rel; /* DT_REL */ int relsize; /* DT_RELSZ */ const Elf_Rela *rela; /* DT_RELA */ int relasize; /* DT_RELASZ */ caddr_t modptr; const Elf_Sym *ddbsymtab; /* The symbol table we are using */ long ddbsymcnt; /* Number of symbols */ caddr_t ddbstrtab; /* String table */ long ddbstrcnt; /* number of bytes in string table */ caddr_t symbase; /* malloc'ed symbol base */ caddr_t strbase; /* malloc'ed string base */ caddr_t ctftab; /* CTF table */ long ctfcnt; /* number of bytes in CTF table */ caddr_t ctfoff; /* CTF offset table */ caddr_t typoff; /* Type offset table */ long typlen; /* Number of type entries. */ Elf_Addr pcpu_start; /* Pre-relocation pcpu set start. */ Elf_Addr pcpu_stop; /* Pre-relocation pcpu set stop. */ Elf_Addr pcpu_base; /* Relocated pcpu set address. */ #ifdef VIMAGE Elf_Addr vnet_start; /* Pre-relocation vnet set start. */ Elf_Addr vnet_stop; /* Pre-relocation vnet set stop. */ Elf_Addr vnet_base; /* Relocated vnet set address. */ #endif #ifdef GDB struct link_map gdb; /* hooks for gdb */ #endif } *elf_file_t; struct elf_set { Elf_Addr es_start; Elf_Addr es_stop; Elf_Addr es_base; TAILQ_ENTRY(elf_set) es_link; }; TAILQ_HEAD(elf_set_head, elf_set); #include static int link_elf_link_common_finish(linker_file_t); static int link_elf_link_preload(linker_class_t cls, const char *, linker_file_t *); static int link_elf_link_preload_finish(linker_file_t); static int link_elf_load_file(linker_class_t, const char *, linker_file_t *); static int link_elf_lookup_symbol(linker_file_t, const char *, c_linker_sym_t *); static int link_elf_symbol_values(linker_file_t, c_linker_sym_t, linker_symval_t *); static int link_elf_search_symbol(linker_file_t, caddr_t, c_linker_sym_t *, long *); static void link_elf_unload_file(linker_file_t); static void link_elf_unload_preload(linker_file_t); static int link_elf_lookup_set(linker_file_t, const char *, void ***, void ***, int *); static int link_elf_each_function_name(linker_file_t, int (*)(const char *, void *), void *); static int link_elf_each_function_nameval(linker_file_t, linker_function_nameval_callback_t, void *); static void link_elf_reloc_local(linker_file_t); static long link_elf_symtab_get(linker_file_t, const Elf_Sym **); static long link_elf_strtab_get(linker_file_t, caddr_t *); static int elf_lookup(linker_file_t, Elf_Size, int, Elf_Addr *); static kobj_method_t link_elf_methods[] = { KOBJMETHOD(linker_lookup_symbol, link_elf_lookup_symbol), KOBJMETHOD(linker_symbol_values, link_elf_symbol_values), KOBJMETHOD(linker_search_symbol, link_elf_search_symbol), KOBJMETHOD(linker_unload, link_elf_unload_file), KOBJMETHOD(linker_load_file, link_elf_load_file), KOBJMETHOD(linker_link_preload, link_elf_link_preload), KOBJMETHOD(linker_link_preload_finish, link_elf_link_preload_finish), KOBJMETHOD(linker_lookup_set, link_elf_lookup_set), KOBJMETHOD(linker_each_function_name, link_elf_each_function_name), KOBJMETHOD(linker_each_function_nameval, link_elf_each_function_nameval), KOBJMETHOD(linker_ctf_get, link_elf_ctf_get), KOBJMETHOD(linker_symtab_get, link_elf_symtab_get), KOBJMETHOD(linker_strtab_get,
link_elf_strtab_get), { 0, 0 } }; static struct linker_class link_elf_class = { #if ELF_TARG_CLASS == ELFCLASS32 "elf32", #else "elf64", #endif link_elf_methods, sizeof(struct elf_file) }; static int parse_dynamic(elf_file_t); static int relocate_file(elf_file_t); static int link_elf_preload_parse_symbols(elf_file_t); static struct elf_set_head set_pcpu_list; #ifdef VIMAGE static struct elf_set_head set_vnet_list; #endif static void elf_set_add(struct elf_set_head *list, Elf_Addr start, Elf_Addr stop, Elf_Addr base) { struct elf_set *set, *iter; set = malloc(sizeof(*set), M_LINKER, M_WAITOK); set->es_start = start; set->es_stop = stop; set->es_base = base; TAILQ_FOREACH(iter, list, es_link) { KASSERT((set->es_start < iter->es_start && set->es_stop < iter->es_stop) || (set->es_start > iter->es_start && set->es_stop > iter->es_stop), ("linker sets intersection: to insert: 0x%jx-0x%jx; inserted: 0x%jx-0x%jx", (uintmax_t)set->es_start, (uintmax_t)set->es_stop, (uintmax_t)iter->es_start, (uintmax_t)iter->es_stop)); if (iter->es_start > set->es_start) { TAILQ_INSERT_BEFORE(iter, set, es_link); break; } } if (iter == NULL) TAILQ_INSERT_TAIL(list, set, es_link); } static int elf_set_find(struct elf_set_head *list, Elf_Addr addr, Elf_Addr *start, Elf_Addr *base) { struct elf_set *set; TAILQ_FOREACH(set, list, es_link) { if (addr < set->es_start) return (0); if (addr < set->es_stop) { *start = set->es_start; *base = set->es_base; return (1); } } return (0); } static void elf_set_delete(struct elf_set_head *list, Elf_Addr start) { struct elf_set *set; TAILQ_FOREACH(set, list, es_link) { if (start < set->es_start) break; if (start == set->es_start) { TAILQ_REMOVE(list, set, es_link); free(set, M_LINKER); return; } } KASSERT(0, ("deleting unknown linker set (start = 0x%jx)", (uintmax_t)start)); } #ifdef GDB static void r_debug_state(struct r_debug *, struct link_map *); /* * A list of loaded modules for GDB to use for loading symbols. */ struct r_debug r_debug; #define GDB_STATE(s) do { \ r_debug.r_state = s; r_debug_state(NULL, NULL); \ } while (0) /* * Function for the debugger to set a breakpoint on to gain control. */ static void r_debug_state(struct r_debug *dummy_one __unused, struct link_map *dummy_two __unused) { } static void link_elf_add_gdb(struct link_map *l) { struct link_map *prev; l->l_next = NULL; if (r_debug.r_map == NULL) { /* Add first. */ l->l_prev = NULL; r_debug.r_map = l; } else { /* Append to list. */ for (prev = r_debug.r_map; prev->l_next != NULL; prev = prev->l_next) ; l->l_prev = prev; prev->l_next = l; } } static void link_elf_delete_gdb(struct link_map *l) { if (l->l_prev == NULL) { /* Remove first. */ if ((r_debug.r_map = l->l_next) != NULL) l->l_next->l_prev = NULL; } else { /* Remove any but first. */ if ((l->l_prev->l_next = l->l_next) != NULL) l->l_next->l_prev = l->l_prev; } } #endif /* GDB */ /* * The kernel symbol table starts here. */ extern struct _dynamic _DYNAMIC; static void link_elf_error(const char *filename, const char *s) { if (filename == NULL) printf("kldload: %s\n", s); else printf("kldload: %s: %s\n", filename, s); } static void link_elf_invoke_ctors(caddr_t addr, size_t size) { void (**ctor)(void); size_t i, cnt; if (addr == NULL || size == 0) return; cnt = size / sizeof(*ctor); ctor = (void *)addr; for (i = 0; i < cnt; i++) { if (ctor[i] != NULL) (*ctor[i])(); } } /* * Actions performed after linking/loading both the preloaded kernel and any * modules, whether preloaded or dynamically loaded.
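 */
/*
 * link_elf_invoke_ctors() above is just a walk over an array of
 * function pointers found in the module image.  A userland sketch of
 * the same convention; fake_ctors stands in for a .ctors section and
 * the names are made up.
 */
#include <stddef.h>
#include <stdio.h>

static void hello(void) { printf("ctor: hello\n"); }
static void world(void) { printf("ctor: world\n"); }

static void (*fake_ctors[])(void) = { hello, NULL, world };

static void
sketch_invoke_ctors(void (**ctor)(void), size_t cnt)
{
	size_t i;

	/* NULL slots are skipped, exactly as in the kernel routine. */
	for (i = 0; i < cnt; i++)
		if (ctor[i] != NULL)
			(*ctor[i])();
}

int
main(void)
{
	sketch_invoke_ctors(fake_ctors,
	    sizeof(fake_ctors) / sizeof(fake_ctors[0]));
	return (0);
}
/*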
*/ static int link_elf_link_common_finish(linker_file_t lf) { #ifdef GDB elf_file_t ef = (elf_file_t)lf; char *newfilename; #endif int error; /* Notify MD code that a module is being loaded. */ error = elf_cpu_load_file(lf); if (error != 0) return (error); #ifdef GDB GDB_STATE(RT_ADD); ef->gdb.l_addr = lf->address; newfilename = malloc(strlen(lf->filename) + 1, M_LINKER, M_WAITOK); strcpy(newfilename, lf->filename); ef->gdb.l_name = newfilename; ef->gdb.l_ld = ef->dynamic; link_elf_add_gdb(&ef->gdb); GDB_STATE(RT_CONSISTENT); #endif /* Invoke .ctors */ link_elf_invoke_ctors(lf->ctors_addr, lf->ctors_size); return (0); } extern vm_offset_t __startkernel; static void link_elf_init(void* arg) { Elf_Dyn *dp; Elf_Addr *ctors_addrp; Elf_Size *ctors_sizep; caddr_t modptr, baseptr, sizeptr; elf_file_t ef; char *modname; linker_add_class(&link_elf_class); dp = (Elf_Dyn *)&_DYNAMIC; modname = NULL; modptr = preload_search_by_type("elf" __XSTRING(__ELF_WORD_SIZE) " kernel"); if (modptr == NULL) modptr = preload_search_by_type("elf kernel"); modname = (char *)preload_search_info(modptr, MODINFO_NAME); if (modname == NULL) modname = "kernel"; linker_kernel_file = linker_make_file(modname, &link_elf_class); if (linker_kernel_file == NULL) panic("%s: Can't create linker structures for kernel", __func__); ef = (elf_file_t) linker_kernel_file; ef->preloaded = 1; #ifdef __powerpc__ ef->address = (caddr_t) (__startkernel - KERNBASE); #else ef->address = 0; #endif #ifdef SPARSE_MAPPING ef->object = 0; #endif ef->dynamic = dp; if (dp != NULL) parse_dynamic(ef); linker_kernel_file->address += KERNBASE; linker_kernel_file->size = -(intptr_t)linker_kernel_file->address; if (modptr != NULL) { ef->modptr = modptr; baseptr = preload_search_info(modptr, MODINFO_ADDR); if (baseptr != NULL) linker_kernel_file->address = *(caddr_t *)baseptr; sizeptr = preload_search_info(modptr, MODINFO_SIZE); if (sizeptr != NULL) linker_kernel_file->size = *(size_t *)sizeptr; ctors_addrp = (Elf_Addr *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_CTORS_ADDR); ctors_sizep = (Elf_Size *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_CTORS_SIZE); if (ctors_addrp != NULL && ctors_sizep != NULL) { linker_kernel_file->ctors_addr = ef->address + *ctors_addrp; linker_kernel_file->ctors_size = *ctors_sizep; } } (void)link_elf_preload_parse_symbols(ef); #ifdef GDB r_debug.r_map = NULL; r_debug.r_brk = r_debug_state; r_debug.r_state = RT_CONSISTENT; #endif (void)link_elf_link_common_finish(linker_kernel_file); linker_kernel_file->flags |= LINKER_FILE_LINKED; TAILQ_INIT(&set_pcpu_list); #ifdef VIMAGE TAILQ_INIT(&set_vnet_list); #endif } SYSINIT(link_elf, SI_SUB_KLD, SI_ORDER_THIRD, link_elf_init, 0); static int link_elf_preload_parse_symbols(elf_file_t ef) { caddr_t pointer; caddr_t ssym, esym, base; caddr_t strtab; int strcnt; Elf_Sym *symtab; int symcnt; if (ef->modptr == NULL) return (0); pointer = preload_search_info(ef->modptr, MODINFO_METADATA | MODINFOMD_SSYM); if (pointer == NULL) return (0); ssym = *(caddr_t *)pointer; pointer = preload_search_info(ef->modptr, MODINFO_METADATA | MODINFOMD_ESYM); if (pointer == NULL) return (0); esym = *(caddr_t *)pointer; base = ssym; symcnt = *(long *)base; base += sizeof(long); symtab = (Elf_Sym *)base; base += roundup(symcnt, sizeof(long)); if (base > esym || base < ssym) { printf("Symbols are corrupt!\n"); return (EINVAL); } strcnt = *(long *)base; base += sizeof(long); strtab = base; base += roundup(strcnt, sizeof(long)); if (base > esym || base < ssym) { printf("Symbols are 
corrupt!\n"); return (EINVAL); } ef->ddbsymtab = symtab; ef->ddbsymcnt = symcnt / sizeof(Elf_Sym); ef->ddbstrtab = strtab; ef->ddbstrcnt = strcnt; return (0); } static int parse_dynamic(elf_file_t ef) { Elf_Dyn *dp; int plttype = DT_REL; for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) { switch (dp->d_tag) { case DT_HASH: { /* From src/libexec/rtld-elf/rtld.c */ const Elf_Hashelt *hashtab = (const Elf_Hashelt *) (ef->address + dp->d_un.d_ptr); ef->nbuckets = hashtab[0]; ef->nchains = hashtab[1]; ef->buckets = hashtab + 2; ef->chains = ef->buckets + ef->nbuckets; break; } case DT_STRTAB: ef->strtab = (caddr_t) (ef->address + dp->d_un.d_ptr); break; case DT_STRSZ: ef->strsz = dp->d_un.d_val; break; case DT_SYMTAB: ef->symtab = (Elf_Sym*) (ef->address + dp->d_un.d_ptr); break; case DT_SYMENT: if (dp->d_un.d_val != sizeof(Elf_Sym)) return (ENOEXEC); break; case DT_PLTGOT: ef->got = (Elf_Addr *) (ef->address + dp->d_un.d_ptr); break; case DT_REL: ef->rel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr); break; case DT_RELSZ: ef->relsize = dp->d_un.d_val; break; case DT_RELENT: if (dp->d_un.d_val != sizeof(Elf_Rel)) return (ENOEXEC); break; case DT_JMPREL: ef->pltrel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr); break; case DT_PLTRELSZ: ef->pltrelsize = dp->d_un.d_val; break; case DT_RELA: ef->rela = (const Elf_Rela *) (ef->address + dp->d_un.d_ptr); break; case DT_RELASZ: ef->relasize = dp->d_un.d_val; break; case DT_RELAENT: if (dp->d_un.d_val != sizeof(Elf_Rela)) return (ENOEXEC); break; case DT_PLTREL: plttype = dp->d_un.d_val; if (plttype != DT_REL && plttype != DT_RELA) return (ENOEXEC); break; #ifdef GDB case DT_DEBUG: dp->d_un.d_ptr = (Elf_Addr)&r_debug; break; #endif } } if (plttype == DT_RELA) { ef->pltrela = (const Elf_Rela *)ef->pltrel; ef->pltrel = NULL; ef->pltrelasize = ef->pltrelsize; ef->pltrelsize = 0; } ef->ddbsymtab = ef->symtab; ef->ddbsymcnt = ef->nchains; ef->ddbstrtab = ef->strtab; ef->ddbstrcnt = ef->strsz; return (0); } static int parse_dpcpu(elf_file_t ef) { int count; int error; ef->pcpu_start = 0; ef->pcpu_stop = 0; error = link_elf_lookup_set(&ef->lf, "pcpu", (void ***)&ef->pcpu_start, (void ***)&ef->pcpu_stop, &count); /* Error just means there is no pcpu set to relocate. */ if (error != 0) return (0); count *= sizeof(void *); /* * Allocate space in the primary pcpu area. Copy in our * initialization from the data section and then initialize * all per-cpu storage from that. */ ef->pcpu_base = (Elf_Addr)(uintptr_t)dpcpu_alloc(count); if (ef->pcpu_base == 0) return (ENOSPC); memcpy((void *)ef->pcpu_base, (void *)ef->pcpu_start, count); dpcpu_copy((void *)ef->pcpu_base, count); elf_set_add(&set_pcpu_list, ef->pcpu_start, ef->pcpu_stop, ef->pcpu_base); return (0); } #ifdef VIMAGE static int parse_vnet(elf_file_t ef) { int count; int error; ef->vnet_start = 0; ef->vnet_stop = 0; error = link_elf_lookup_set(&ef->lf, "vnet", (void ***)&ef->vnet_start, (void ***)&ef->vnet_stop, &count); /* Error just means there is no vnet data set to relocate. */ if (error != 0) return (0); count *= sizeof(void *); /* * Allocate space in the primary vnet area. Copy in our * initialization from the data section and then initialize * all per-vnet storage from that. 
*/ ef->vnet_base = (Elf_Addr)(uintptr_t)vnet_data_alloc(count); if (ef->vnet_base == 0) return (ENOSPC); memcpy((void *)ef->vnet_base, (void *)ef->vnet_start, count); vnet_data_copy((void *)ef->vnet_base, count); elf_set_add(&set_vnet_list, ef->vnet_start, ef->vnet_stop, ef->vnet_base); return (0); } #endif static int link_elf_link_preload(linker_class_t cls, const char* filename, linker_file_t *result) { Elf_Addr *ctors_addrp; Elf_Size *ctors_sizep; caddr_t modptr, baseptr, sizeptr, dynptr; char *type; elf_file_t ef; linker_file_t lf; int error; vm_offset_t dp; /* Look to see if we have the file preloaded */ modptr = preload_search_by_name(filename); if (modptr == NULL) return (ENOENT); type = (char *)preload_search_info(modptr, MODINFO_TYPE); baseptr = preload_search_info(modptr, MODINFO_ADDR); sizeptr = preload_search_info(modptr, MODINFO_SIZE); dynptr = preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_DYNAMIC); if (type == NULL || (strcmp(type, "elf" __XSTRING(__ELF_WORD_SIZE) " module") != 0 && strcmp(type, "elf module") != 0)) return (EFTYPE); if (baseptr == NULL || sizeptr == NULL || dynptr == NULL) return (EINVAL); lf = linker_make_file(filename, &link_elf_class); if (lf == NULL) return (ENOMEM); ef = (elf_file_t) lf; ef->preloaded = 1; ef->modptr = modptr; ef->address = *(caddr_t *)baseptr; #ifdef SPARSE_MAPPING ef->object = 0; #endif dp = (vm_offset_t)ef->address + *(vm_offset_t *)dynptr; ef->dynamic = (Elf_Dyn *)dp; lf->address = ef->address; lf->size = *(size_t *)sizeptr; ctors_addrp = (Elf_Addr *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_CTORS_ADDR); ctors_sizep = (Elf_Size *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_CTORS_SIZE); if (ctors_addrp != NULL && ctors_sizep != NULL) { lf->ctors_addr = ef->address + *ctors_addrp; lf->ctors_size = *ctors_sizep; } error = parse_dynamic(ef); if (error == 0) error = parse_dpcpu(ef); #ifdef VIMAGE if (error == 0) error = parse_vnet(ef); #endif if (error != 0) { linker_file_unload(lf, LINKER_UNLOAD_FORCE); return (error); } link_elf_reloc_local(lf); *result = lf; return (0); } static int link_elf_link_preload_finish(linker_file_t lf) { elf_file_t ef; int error; ef = (elf_file_t) lf; error = relocate_file(ef); if (error != 0) return (error); (void)link_elf_preload_parse_symbols(ef); return (link_elf_link_common_finish(lf)); } static int link_elf_load_file(linker_class_t cls, const char* filename, linker_file_t* result) { struct nameidata nd; struct thread* td = curthread; /* XXX */ Elf_Ehdr *hdr; caddr_t firstpage; int nbytes, i; Elf_Phdr *phdr; Elf_Phdr *phlimit; Elf_Phdr *segs[MAXSEGS]; int nsegs; Elf_Phdr *phdyn; Elf_Phdr *phphdr; caddr_t mapbase; size_t mapsize; Elf_Off base_offset; Elf_Addr base_vaddr; Elf_Addr base_vlimit; int error = 0; ssize_t resid; int flags; elf_file_t ef; linker_file_t lf; Elf_Shdr *shdr; int symtabindex; int symstrindex; int shstrindex; int symcnt; int strcnt; char *shstrs; shdr = NULL; lf = NULL; shstrs = NULL; NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp->v_type != VREG) { error = ENOEXEC; firstpage = NULL; goto out; } #ifdef MAC error = mac_kld_check_load(curthread->td_ucred, nd.ni_vp); if (error != 0) { firstpage = NULL; goto out; } #endif /* * Read the elf header from the file. 
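 */
/*
 * The header validation that follows is standard ELF consumer
 * boilerplate.  A userland sketch of the same e_ident checks,
 * assuming a host <elf.h> that provides ELFMAG and friends (true on
 * the BSDs and glibc); 64-bit little-endian is hard-coded purely for
 * illustration.
 */
#include <elf.h>
#include <string.h>

static int
sketch_check_ident(const unsigned char *ident)
{
	if (memcmp(ident, ELFMAG, SELFMAG) != 0)
		return (-1);		/* not ELF at all */
	if (ident[EI_CLASS] != ELFCLASS64 || ident[EI_DATA] != ELFDATA2LSB)
		return (-1);		/* wrong layout for this host */
	if (ident[EI_VERSION] != EV_CURRENT)
		return (-1);		/* unsupported version */
	return (0);
}

int
main(void)
{
	unsigned char ident[EI_NIDENT] = { 0x7f, 'E', 'L', 'F',
	    ELFCLASS64, ELFDATA2LSB, EV_CURRENT };

	return (sketch_check_ident(ident) == 0 ? 0 : 1);
}
/*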
*/ firstpage = malloc(PAGE_SIZE, M_LINKER, M_WAITOK); hdr = (Elf_Ehdr *)firstpage; error = vn_rdwr(UIO_READ, nd.ni_vp, firstpage, PAGE_SIZE, 0, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); nbytes = PAGE_SIZE - resid; if (error != 0) goto out; if (!IS_ELF(*hdr)) { error = ENOEXEC; goto out; } if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) { link_elf_error(filename, "Unsupported file layout"); error = ENOEXEC; goto out; } if (hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_version != EV_CURRENT) { link_elf_error(filename, "Unsupported file version"); error = ENOEXEC; goto out; } if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) { error = ENOSYS; goto out; } if (hdr->e_machine != ELF_TARG_MACH) { link_elf_error(filename, "Unsupported machine"); error = ENOEXEC; goto out; } /* * We rely on the program header being in the first page. * This is not strictly required by the ABI specification, but * it seems to always be true in practice. And, it simplifies * things considerably. */ if (!((hdr->e_phentsize == sizeof(Elf_Phdr)) && (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= PAGE_SIZE) && (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= nbytes))) link_elf_error(filename, "Unreadable program headers"); /* * Scan the program header entries, and save key information. * * We rely on there being exactly two load segments, text and data, * in that order. */ phdr = (Elf_Phdr *) (firstpage + hdr->e_phoff); phlimit = phdr + hdr->e_phnum; nsegs = 0; phdyn = NULL; phphdr = NULL; while (phdr < phlimit) { switch (phdr->p_type) { case PT_LOAD: if (nsegs == MAXSEGS) { link_elf_error(filename, "Too many sections"); error = ENOEXEC; goto out; } /* * XXX: We just trust they come in the right order ?? */ segs[nsegs] = phdr; ++nsegs; break; case PT_PHDR: phphdr = phdr; break; case PT_DYNAMIC: phdyn = phdr; break; case PT_INTERP: error = ENOSYS; goto out; } ++phdr; } if (phdyn == NULL) { link_elf_error(filename, "Object is not dynamically-linked"); error = ENOEXEC; goto out; } if (nsegs == 0) { link_elf_error(filename, "No sections"); error = ENOEXEC; goto out; } /* * Allocate the entire address space of the object, to stake * out our contiguous region, and to establish the base * address for relocation. */ base_offset = trunc_page(segs[0]->p_offset); base_vaddr = trunc_page(segs[0]->p_vaddr); base_vlimit = round_page(segs[nsegs - 1]->p_vaddr + segs[nsegs - 1]->p_memsz); mapsize = base_vlimit - base_vaddr; lf = linker_make_file(filename, &link_elf_class); if (lf == NULL) { error = ENOMEM; goto out; } ef = (elf_file_t) lf; #ifdef SPARSE_MAPPING ef->object = vm_object_allocate(OBJT_DEFAULT, mapsize >> PAGE_SHIFT); if (ef->object == NULL) { error = ENOMEM; goto out; } ef->address = (caddr_t) vm_map_min(kernel_map); error = vm_map_find(kernel_map, ef->object, 0, (vm_offset_t *) &ef->address, mapsize, 0, VMFS_OPTIMAL_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error != 0) { vm_object_deallocate(ef->object); ef->object = 0; goto out; } #else ef->address = malloc(mapsize, M_LINKER, M_WAITOK); #endif mapbase = ef->address; /* * Read the text and data sections and zero the bss.
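 */
/*
 * A worked sketch of the mapping-size arithmetic just computed: the
 * object spans from the first load segment's page-truncated start to
 * the last segment's page-rounded end.  The page size and segment
 * values are fabricated.
 */
#include <assert.h>
#include <stdint.h>

#define	PAGE_SIZE_SK		4096UL
#define	trunc_page_sk(x)	((x) & ~(PAGE_SIZE_SK - 1))
#define	round_page_sk(x)	(((x) + PAGE_SIZE_SK - 1) & ~(PAGE_SIZE_SK - 1))

int
main(void)
{
	/* Two fabricated PT_LOAD segments: text, then data+bss. */
	uintptr_t text_vaddr = 0x1000, data_vaddr = 0x5100, data_memsz = 0x2300;
	uintptr_t base_vaddr, base_vlimit, mapsize;

	base_vaddr = trunc_page_sk(text_vaddr);			/* 0x1000 */
	base_vlimit = round_page_sk(data_vaddr + data_memsz);	/* 0x8000 */
	mapsize = base_vlimit - base_vaddr;			/* 0x7000 */
	assert(mapsize == 0x7000);
	return (0);
}
/*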
*/ for (i = 0; i < nsegs; i++) { caddr_t segbase = mapbase + segs[i]->p_vaddr - base_vaddr; error = vn_rdwr(UIO_READ, nd.ni_vp, segbase, segs[i]->p_filesz, segs[i]->p_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error != 0) goto out; bzero(segbase + segs[i]->p_filesz, segs[i]->p_memsz - segs[i]->p_filesz); #ifdef SPARSE_MAPPING /* * Wire down the pages */ error = vm_map_wire(kernel_map, (vm_offset_t) segbase, (vm_offset_t) segbase + segs[i]->p_memsz, VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES); if (error != KERN_SUCCESS) { error = ENOMEM; goto out; } #endif } #ifdef GPROF /* Update profiling information with the new text segment. */ mtx_lock(&Giant); kmupetext((uintfptr_t)(mapbase + segs[0]->p_vaddr - base_vaddr + segs[0]->p_memsz)); mtx_unlock(&Giant); #endif ef->dynamic = (Elf_Dyn *) (mapbase + phdyn->p_vaddr - base_vaddr); lf->address = ef->address; lf->size = mapsize; error = parse_dynamic(ef); if (error != 0) goto out; error = parse_dpcpu(ef); if (error != 0) goto out; #ifdef VIMAGE error = parse_vnet(ef); if (error != 0) goto out; #endif link_elf_reloc_local(lf); VOP_UNLOCK(nd.ni_vp, 0); error = linker_load_dependencies(lf); vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY); if (error != 0) goto out; error = relocate_file(ef); if (error != 0) goto out; /* * Try and load the symbol table if it's present. (you can * strip it!) */ nbytes = hdr->e_shnum * hdr->e_shentsize; if (nbytes == 0 || hdr->e_shoff == 0) goto nosyms; shdr = malloc(nbytes, M_LINKER, M_WAITOK | M_ZERO); error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)shdr, nbytes, hdr->e_shoff, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error != 0) goto out; /* Read section string table */ shstrindex = hdr->e_shstrndx; if (shstrindex != 0 && shdr[shstrindex].sh_type == SHT_STRTAB && shdr[shstrindex].sh_size != 0) { nbytes = shdr[shstrindex].sh_size; shstrs = malloc(nbytes, M_LINKER, M_WAITOK | M_ZERO); error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)shstrs, nbytes, shdr[shstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; } symtabindex = -1; symstrindex = -1; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_type == SHT_SYMTAB) { symtabindex = i; symstrindex = shdr[i].sh_link; } else if (shstrs != NULL && shdr[i].sh_name != 0 && strcmp(shstrs + shdr[i].sh_name, ".ctors") == 0) { /* Record relocated address and size of .ctors. 
*/ lf->ctors_addr = mapbase + shdr[i].sh_addr - base_vaddr; lf->ctors_size = shdr[i].sh_size; } } if (symtabindex < 0 || symstrindex < 0) goto nosyms; symcnt = shdr[symtabindex].sh_size; ef->symbase = malloc(symcnt, M_LINKER, M_WAITOK); strcnt = shdr[symstrindex].sh_size; ef->strbase = malloc(strcnt, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd.ni_vp, ef->symbase, symcnt, shdr[symtabindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error != 0) goto out; error = vn_rdwr(UIO_READ, nd.ni_vp, ef->strbase, strcnt, shdr[symstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error != 0) goto out; ef->ddbsymcnt = symcnt / sizeof(Elf_Sym); ef->ddbsymtab = (const Elf_Sym *)ef->symbase; ef->ddbstrcnt = strcnt; ef->ddbstrtab = ef->strbase; nosyms: error = link_elf_link_common_finish(lf); if (error != 0) goto out; *result = lf; out: VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); if (error != 0 && lf != NULL) linker_file_unload(lf, LINKER_UNLOAD_FORCE); free(shdr, M_LINKER); free(firstpage, M_LINKER); free(shstrs, M_LINKER); return (error); } Elf_Addr elf_relocaddr(linker_file_t lf, Elf_Addr x) { elf_file_t ef; ef = (elf_file_t)lf; if (x >= ef->pcpu_start && x < ef->pcpu_stop) return ((x - ef->pcpu_start) + ef->pcpu_base); #ifdef VIMAGE if (x >= ef->vnet_start && x < ef->vnet_stop) return ((x - ef->vnet_start) + ef->vnet_base); #endif return (x); } static void link_elf_unload_file(linker_file_t file) { elf_file_t ef = (elf_file_t) file; if (ef->pcpu_base != 0) { dpcpu_free((void *)ef->pcpu_base, ef->pcpu_stop - ef->pcpu_start); elf_set_delete(&set_pcpu_list, ef->pcpu_start); } #ifdef VIMAGE if (ef->vnet_base != 0) { vnet_data_free((void *)ef->vnet_base, ef->vnet_stop - ef->vnet_start); elf_set_delete(&set_vnet_list, ef->vnet_start); } #endif #ifdef GDB if (ef->gdb.l_ld != NULL) { GDB_STATE(RT_DELETE); free((void *)(uintptr_t)ef->gdb.l_name, M_LINKER); link_elf_delete_gdb(&ef->gdb); GDB_STATE(RT_CONSISTENT); } #endif /* Notify MD code that a module is being unloaded. 
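 */
/*
 * elf_relocaddr() above is a window translation: an address that
 * falls inside a pre-relocation [start, stop) range is rebased into
 * the runtime allocation, anything else passes through unchanged.  A
 * sketch with fabricated addresses.
 */
#include <assert.h>
#include <stdint.h>

static uintptr_t
sketch_relocaddr(uintptr_t x, uintptr_t start, uintptr_t stop,
    uintptr_t base)
{
	if (x >= start && x < stop)
		return ((x - start) + base);
	return (x);		/* outside the set: unchanged */
}

int
main(void)
{
	/* A pcpu-style set at 0x4000-0x5000 relocated to 0x9000. */
	assert(sketch_relocaddr(0x4010, 0x4000, 0x5000, 0x9000) == 0x9010);
	assert(sketch_relocaddr(0x6000, 0x4000, 0x5000, 0x9000) == 0x6000);
	return (0);
}
/*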
*/ elf_cpu_unload_file(file); if (ef->preloaded) { link_elf_unload_preload(file); return; } #ifdef SPARSE_MAPPING if (ef->object != NULL) { vm_map_remove(kernel_map, (vm_offset_t) ef->address, (vm_offset_t) ef->address + (ef->object->size << PAGE_SHIFT)); } #else free(ef->address, M_LINKER); #endif free(ef->symbase, M_LINKER); free(ef->strbase, M_LINKER); free(ef->ctftab, M_LINKER); free(ef->ctfoff, M_LINKER); free(ef->typoff, M_LINKER); } static void link_elf_unload_preload(linker_file_t file) { if (file->filename != NULL) preload_delete_name(file->filename); } static const char * symbol_name(elf_file_t ef, Elf_Size r_info) { const Elf_Sym *ref; if (ELF_R_SYM(r_info)) { ref = ef->symtab + ELF_R_SYM(r_info); return (ef->strtab + ref->st_name); } return (NULL); } static int relocate_file(elf_file_t ef) { const Elf_Rel *rellim; const Elf_Rel *rel; const Elf_Rela *relalim; const Elf_Rela *rela; const char *symname; /* Perform relocations without addend if there are any: */ rel = ef->rel; if (rel != NULL) { rellim = (const Elf_Rel *) ((const char *)ef->rel + ef->relsize); while (rel < rellim) { if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rel, ELF_RELOC_REL, elf_lookup)) { symname = symbol_name(ef, rel->r_info); printf("link_elf: symbol %s undefined\n", symname); return (ENOENT); } rel++; } } /* Perform relocations with addend if there are any: */ rela = ef->rela; if (rela != NULL) { relalim = (const Elf_Rela *) ((const char *)ef->rela + ef->relasize); while (rela < relalim) { if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rela, ELF_RELOC_RELA, elf_lookup)) { symname = symbol_name(ef, rela->r_info); printf("link_elf: symbol %s undefined\n", symname); return (ENOENT); } rela++; } } /* Perform PLT relocations without addend if there are any: */ rel = ef->pltrel; if (rel != NULL) { rellim = (const Elf_Rel *) ((const char *)ef->pltrel + ef->pltrelsize); while (rel < rellim) { if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rel, ELF_RELOC_REL, elf_lookup)) { symname = symbol_name(ef, rel->r_info); printf("link_elf: symbol %s undefined\n", symname); return (ENOENT); } rel++; } } /* Perform relocations with addend if there are any: */ rela = ef->pltrela; if (rela != NULL) { relalim = (const Elf_Rela *) ((const char *)ef->pltrela + ef->pltrelasize); while (rela < relalim) { if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rela, ELF_RELOC_RELA, elf_lookup)) { symname = symbol_name(ef, rela->r_info); printf("link_elf: symbol %s undefined\n", symname); return (ENOENT); } rela++; } } return (0); } /* * Hash function for symbol table lookup. Don't even think about changing * this. It is specified by the System V ABI. */ static unsigned long elf_hash(const char *name) { const unsigned char *p = (const unsigned char *) name; unsigned long h = 0; unsigned long g; while (*p != '\0') { h = (h << 4) + *p++; if ((g = h & 0xf0000000) != 0) h ^= g >> 24; h &= ~g; } return (h); } static int link_elf_lookup_symbol(linker_file_t lf, const char* name, c_linker_sym_t* sym) { elf_file_t ef = (elf_file_t) lf; unsigned long symnum; const Elf_Sym* symp; const char *strp; unsigned long hash; int i; /* If we don't have a hash, bail. 
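 */
/*
 * A self-contained userland sketch of the DT_HASH lookup performed
 * below: the SysV hash section is two words (nbucket, nchain)
 * followed by the bucket and chain arrays, and a name is found by
 * hashing it, picking a bucket, and walking that chain until the
 * names match or STN_UNDEF (0) ends it.  The table and names are
 * fabricated.
 */
#include <assert.h>
#include <stdint.h>
#include <string.h>

static unsigned long
sketch_elf_hash(const char *name)	/* same algorithm as above */
{
	const unsigned char *p = (const unsigned char *)name;
	unsigned long h = 0, g;

	while (*p != '\0') {
		h = (h << 4) + *p++;
		if ((g = h & 0xf0000000) != 0)
			h ^= g >> 24;
		h &= ~g;
	}
	return (h);
}

int
main(void)
{
	/* nbucket = 1, nchain = 3: every name lands in bucket 0. */
	static const uint32_t hashtab[] = { 1, 3, 1, 0, 2, 0 };
	static const char *names[] = { "", "foo", "bar" };
	const uint32_t *buckets = hashtab + 2;
	const uint32_t *chains = buckets + hashtab[0];
	uint32_t symnum;

	for (symnum = buckets[sketch_elf_hash("bar") % hashtab[0]];
	    symnum != 0; symnum = chains[symnum])
		if (strcmp(names[symnum], "bar") == 0)
			break;
	assert(symnum == 2);
	return (0);
}
/*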
*/ if (ef->buckets == NULL || ef->nbuckets == 0) { printf("link_elf_lookup_symbol: missing symbol hash table\n"); return (ENOENT); } /* First, search hashed global symbols */ hash = elf_hash(name); symnum = ef->buckets[hash % ef->nbuckets]; while (symnum != STN_UNDEF) { if (symnum >= ef->nchains) { printf("%s: corrupt symbol table\n", __func__); return (ENOENT); } symp = ef->symtab + symnum; if (symp->st_name == 0) { printf("%s: corrupt symbol table\n", __func__); return (ENOENT); } strp = ef->strtab + symp->st_name; if (strcmp(name, strp) == 0) { if (symp->st_shndx != SHN_UNDEF || (symp->st_value != 0 && ELF_ST_TYPE(symp->st_info) == STT_FUNC)) { *sym = (c_linker_sym_t) symp; return (0); } return (ENOENT); } symnum = ef->chains[symnum]; } /* If we have not found it, look at the full table (if loaded) */ if (ef->symtab == ef->ddbsymtab) return (ENOENT); /* Exhaustive search */ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { strp = ef->ddbstrtab + symp->st_name; if (strcmp(name, strp) == 0) { if (symp->st_shndx != SHN_UNDEF || (symp->st_value != 0 && ELF_ST_TYPE(symp->st_info) == STT_FUNC)) { *sym = (c_linker_sym_t) symp; return (0); } return (ENOENT); } } return (ENOENT); } static int link_elf_symbol_values(linker_file_t lf, c_linker_sym_t sym, linker_symval_t *symval) { elf_file_t ef = (elf_file_t) lf; const Elf_Sym* es = (const Elf_Sym*) sym; if (es >= ef->symtab && es < (ef->symtab + ef->nchains)) { symval->name = ef->strtab + es->st_name; symval->value = (caddr_t) ef->address + es->st_value; symval->size = es->st_size; return (0); } if (ef->symtab == ef->ddbsymtab) return (ENOENT); if (es >= ef->ddbsymtab && es < (ef->ddbsymtab + ef->ddbsymcnt)) { symval->name = ef->ddbstrtab + es->st_name; symval->value = (caddr_t) ef->address + es->st_value; symval->size = es->st_size; return (0); } return (ENOENT); } static int link_elf_search_symbol(linker_file_t lf, caddr_t value, c_linker_sym_t *sym, long *diffp) { elf_file_t ef = (elf_file_t) lf; u_long off = (uintptr_t) (void *) value; u_long diff = off; u_long st_value; const Elf_Sym* es; const Elf_Sym* best = NULL; int i; for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) { if (es->st_name == 0) continue; st_value = es->st_value + (uintptr_t) (void *) ef->address; if (off >= st_value) { if (off - st_value < diff) { diff = off - st_value; best = es; if (diff == 0) break; } else if (off - st_value == diff) { best = es; } } } if (best == NULL) *diffp = off; else *diffp = diff; *sym = (c_linker_sym_t) best; return (0); } /* * Look up a linker set on an ELF system. 
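 */
/*
 * link_elf_lookup_set() below leans on a toolchain feature: for any
 * section whose name is a valid C identifier, the static linker
 * defines __start_<name> and __stop_<name> symbols bracketing it.  A
 * userland demonstration for GCC/Clang on ELF; "set_demo" is a
 * made-up set name.
 */
#include <stdio.h>

static int a = 1, b = 2, c = 3;

static int *item_a __attribute__((section("set_demo"), used)) = &a;
static int *item_b __attribute__((section("set_demo"), used)) = &b;
static int *item_c __attribute__((section("set_demo"), used)) = &c;

extern int *__start_set_demo[], *__stop_set_demo[];

int
main(void)
{
	int **p;

	/* Walk the set much as the kernel's SET_FOREACH() does. */
	for (p = __start_set_demo; p < __stop_set_demo; p++)
		printf("set entry: %d\n", **p);
	return (0);
}
/*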
*/ static int link_elf_lookup_set(linker_file_t lf, const char *name, void ***startp, void ***stopp, int *countp) { c_linker_sym_t sym; linker_symval_t symval; char *setsym; void **start, **stop; int len, error = 0, count; len = strlen(name) + sizeof("__start_set_"); /* sizeof includes \0 */ setsym = malloc(len, M_LINKER, M_WAITOK); /* get address of first entry */ snprintf(setsym, len, "%s%s", "__start_set_", name); error = link_elf_lookup_symbol(lf, setsym, &sym); if (error != 0) goto out; link_elf_symbol_values(lf, sym, &symval); if (symval.value == 0) { error = ESRCH; goto out; } start = (void **)symval.value; /* get address of last entry */ snprintf(setsym, len, "%s%s", "__stop_set_", name); error = link_elf_lookup_symbol(lf, setsym, &sym); if (error != 0) goto out; link_elf_symbol_values(lf, sym, &symval); if (symval.value == 0) { error = ESRCH; goto out; } stop = (void **)symval.value; /* and the number of entries */ count = stop - start; /* and copy out */ if (startp != NULL) *startp = start; if (stopp != NULL) *stopp = stop; if (countp != NULL) *countp = count; out: free(setsym, M_LINKER); return (error); } static int link_elf_each_function_name(linker_file_t file, int (*callback)(const char *, void *), void *opaque) { elf_file_t ef = (elf_file_t)file; const Elf_Sym *symp; int i, error; /* Exhaustive search */ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { if (symp->st_value != 0 && ELF_ST_TYPE(symp->st_info) == STT_FUNC) { error = callback(ef->ddbstrtab + symp->st_name, opaque); if (error != 0) return (error); } } return (0); } static int link_elf_each_function_nameval(linker_file_t file, linker_function_nameval_callback_t callback, void *opaque) { linker_symval_t symval; elf_file_t ef = (elf_file_t)file; const Elf_Sym* symp; int i, error; /* Exhaustive search */ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { if (symp->st_value != 0 && ELF_ST_TYPE(symp->st_info) == STT_FUNC) { error = link_elf_symbol_values(file, (c_linker_sym_t) symp, &symval); if (error != 0) return (error); error = callback(file, i, &symval, opaque); if (error != 0) return (error); } } return (0); } const Elf_Sym * elf_get_sym(linker_file_t lf, Elf_Size symidx) { elf_file_t ef = (elf_file_t)lf; if (symidx >= ef->nchains) return (NULL); return (ef->symtab + symidx); } const char * elf_get_symname(linker_file_t lf, Elf_Size symidx) { elf_file_t ef = (elf_file_t)lf; const Elf_Sym *sym; if (symidx >= ef->nchains) return (NULL); sym = ef->symtab + symidx; return (ef->strtab + sym->st_name); } /* * Symbol lookup function that can be used when the symbol index is known (ie * in relocations). It uses the symbol index instead of doing a fully fledged * hash table based lookup when such is valid. For example for local symbols. * This is not only more efficient, it's also more correct. It's not always * the case that the symbol can be found through the hash table. */ static int elf_lookup(linker_file_t lf, Elf_Size symidx, int deps, Elf_Addr *res) { elf_file_t ef = (elf_file_t)lf; const Elf_Sym *sym; const char *symbol; Elf_Addr addr, start, base; /* Don't even try to lookup the symbol if the index is bogus. */ if (symidx >= ef->nchains) { *res = 0; return (EINVAL); } sym = ef->symtab + symidx; /* * Don't do a full lookup when the symbol is local. It may even * fail because it may not be found through the hash table. */ if (ELF_ST_BIND(sym->st_info) == STB_LOCAL) { /* Force lookup failure when we have an insanity. 
*/ if (sym->st_shndx == SHN_UNDEF || sym->st_value == 0) { *res = 0; return (EINVAL); } *res = ((Elf_Addr)ef->address + sym->st_value); return (0); } /* * XXX we can avoid doing a hash table based lookup for global * symbols as well. This however is not always valid, so we'll * just do it the hard way for now. Performance tweaks can * always be added. */ symbol = ef->strtab + sym->st_name; /* Force a lookup failure if the symbol name is bogus. */ if (*symbol == 0) { *res = 0; return (EINVAL); } addr = ((Elf_Addr)linker_file_lookup_symbol(lf, symbol, deps)); if (addr == 0 && ELF_ST_BIND(sym->st_info) != STB_WEAK) { *res = 0; return (EINVAL); } if (elf_set_find(&set_pcpu_list, addr, &start, &base)) addr = addr - start + base; #ifdef VIMAGE else if (elf_set_find(&set_vnet_list, addr, &start, &base)) addr = addr - start + base; #endif *res = addr; return (0); } static void link_elf_reloc_local(linker_file_t lf) { const Elf_Rel *rellim; const Elf_Rel *rel; const Elf_Rela *relalim; const Elf_Rela *rela; elf_file_t ef = (elf_file_t)lf; /* Perform relocations without addend if there are any: */ if ((rel = ef->rel) != NULL) { rellim = (const Elf_Rel *)((const char *)ef->rel + ef->relsize); while (rel < rellim) { elf_reloc_local(lf, (Elf_Addr)ef->address, rel, ELF_RELOC_REL, elf_lookup); rel++; } } /* Perform relocations with addend if there are any: */ if ((rela = ef->rela) != NULL) { relalim = (const Elf_Rela *) ((const char *)ef->rela + ef->relasize); while (rela < relalim) { elf_reloc_local(lf, (Elf_Addr)ef->address, rela, ELF_RELOC_RELA, elf_lookup); rela++; } } } static long link_elf_symtab_get(linker_file_t lf, const Elf_Sym **symtab) { elf_file_t ef = (elf_file_t)lf; *symtab = ef->ddbsymtab; if (*symtab == NULL) return (0); return (ef->ddbsymcnt); } static long link_elf_strtab_get(linker_file_t lf, caddr_t *strtab) { elf_file_t ef = (elf_file_t)lf; *strtab = ef->ddbstrtab; if (*strtab == NULL) return (0); return (ef->ddbstrcnt); } Index: head/sys/kern/link_elf_obj.c =================================================================== --- head/sys/kern/link_elf_obj.c (revision 326270) +++ head/sys/kern/link_elf_obj.c (revision 326271) @@ -1,1518 +1,1520 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1998-2000 Doug Rabson * Copyright (c) 2004 Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB_CTF #include #endif #include "linker_if.h" typedef struct { void *addr; Elf_Off size; int flags; int sec; /* Original section */ char *name; } Elf_progent; typedef struct { Elf_Rel *rel; int nrel; int sec; } Elf_relent; typedef struct { Elf_Rela *rela; int nrela; int sec; } Elf_relaent; typedef struct elf_file { struct linker_file lf; /* Common fields */ int preloaded; caddr_t address; /* Relocation address */ vm_object_t object; /* VM object to hold file pages */ Elf_Shdr *e_shdr; Elf_progent *progtab; int nprogtab; Elf_relaent *relatab; int nrelatab; Elf_relent *reltab; int nreltab; Elf_Sym *ddbsymtab; /* The symbol table we are using */ long ddbsymcnt; /* Number of symbols */ caddr_t ddbstrtab; /* String table */ long ddbstrcnt; /* number of bytes in string table */ caddr_t shstrtab; /* Section name string table */ long shstrcnt; /* number of bytes in string table */ caddr_t ctftab; /* CTF table */ long ctfcnt; /* number of bytes in CTF table */ caddr_t ctfoff; /* CTF offset table */ caddr_t typoff; /* Type offset table */ long typlen; /* Number of type entries. 
*/ } *elf_file_t; #include static int link_elf_link_preload(linker_class_t cls, const char *, linker_file_t *); static int link_elf_link_preload_finish(linker_file_t); static int link_elf_load_file(linker_class_t, const char *, linker_file_t *); static int link_elf_lookup_symbol(linker_file_t, const char *, c_linker_sym_t *); static int link_elf_symbol_values(linker_file_t, c_linker_sym_t, linker_symval_t *); static int link_elf_search_symbol(linker_file_t, caddr_t value, c_linker_sym_t *sym, long *diffp); static void link_elf_unload_file(linker_file_t); static int link_elf_lookup_set(linker_file_t, const char *, void ***, void ***, int *); static int link_elf_each_function_name(linker_file_t, int (*)(const char *, void *), void *); static int link_elf_each_function_nameval(linker_file_t, linker_function_nameval_callback_t, void *); static int link_elf_reloc_local(linker_file_t); static long link_elf_symtab_get(linker_file_t, const Elf_Sym **); static long link_elf_strtab_get(linker_file_t, caddr_t *); static int elf_obj_lookup(linker_file_t lf, Elf_Size symidx, int deps, Elf_Addr *); static kobj_method_t link_elf_methods[] = { KOBJMETHOD(linker_lookup_symbol, link_elf_lookup_symbol), KOBJMETHOD(linker_symbol_values, link_elf_symbol_values), KOBJMETHOD(linker_search_symbol, link_elf_search_symbol), KOBJMETHOD(linker_unload, link_elf_unload_file), KOBJMETHOD(linker_load_file, link_elf_load_file), KOBJMETHOD(linker_link_preload, link_elf_link_preload), KOBJMETHOD(linker_link_preload_finish, link_elf_link_preload_finish), KOBJMETHOD(linker_lookup_set, link_elf_lookup_set), KOBJMETHOD(linker_each_function_name, link_elf_each_function_name), KOBJMETHOD(linker_each_function_nameval, link_elf_each_function_nameval), KOBJMETHOD(linker_ctf_get, link_elf_ctf_get), KOBJMETHOD(linker_symtab_get, link_elf_symtab_get), KOBJMETHOD(linker_strtab_get, link_elf_strtab_get), { 0, 0 } }; static struct linker_class link_elf_class = { #if ELF_TARG_CLASS == ELFCLASS32 "elf32_obj", #else "elf64_obj", #endif link_elf_methods, sizeof(struct elf_file) }; static int relocate_file(elf_file_t ef); static void elf_obj_cleanup_globals_cache(elf_file_t); static void link_elf_error(const char *filename, const char *s) { if (filename == NULL) printf("kldload: %s\n", s); else printf("kldload: %s: %s\n", filename, s); } static void link_elf_init(void *arg) { linker_add_class(&link_elf_class); } SYSINIT(link_elf_obj, SI_SUB_KLD, SI_ORDER_SECOND, link_elf_init, 0); static int link_elf_link_preload(linker_class_t cls, const char *filename, linker_file_t *result) { Elf_Ehdr *hdr; Elf_Shdr *shdr; Elf_Sym *es; void *modptr, *baseptr, *sizeptr; char *type; elf_file_t ef; linker_file_t lf; Elf_Addr off; int error, i, j, pb, ra, rl, shstrindex, symstrindex, symtabindex; /* Look to see if we have the file preloaded */ modptr = preload_search_by_name(filename); if (modptr == NULL) return ENOENT; type = (char *)preload_search_info(modptr, MODINFO_TYPE); baseptr = preload_search_info(modptr, MODINFO_ADDR); sizeptr = preload_search_info(modptr, MODINFO_SIZE); hdr = (Elf_Ehdr *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_ELFHDR); shdr = (Elf_Shdr *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_SHDR); if (type == NULL || (strcmp(type, "elf" __XSTRING(__ELF_WORD_SIZE) " obj module") != 0 && strcmp(type, "elf obj module") != 0)) { return (EFTYPE); } if (baseptr == NULL || sizeptr == NULL || hdr == NULL || shdr == NULL) return (EINVAL); lf = linker_make_file(filename, &link_elf_class); if (lf == NULL) return (ENOMEM); ef 
= (elf_file_t)lf; ef->preloaded = 1; ef->address = *(caddr_t *)baseptr; lf->address = *(caddr_t *)baseptr; lf->size = *(size_t *)sizeptr; if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA || hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_version != EV_CURRENT || hdr->e_type != ET_REL || hdr->e_machine != ELF_TARG_MACH) { error = EFTYPE; goto out; } ef->e_shdr = shdr; /* Scan the section header for information and table sizing. */ symtabindex = -1; symstrindex = -1; for (i = 0; i < hdr->e_shnum; i++) { switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: #ifdef __amd64__ case SHT_X86_64_UNWIND: #endif ef->nprogtab++; break; case SHT_SYMTAB: symtabindex = i; symstrindex = shdr[i].sh_link; break; case SHT_REL: ef->nreltab++; break; case SHT_RELA: ef->nrelatab++; break; } } shstrindex = hdr->e_shstrndx; if (ef->nprogtab == 0 || symstrindex < 0 || symstrindex >= hdr->e_shnum || shdr[symstrindex].sh_type != SHT_STRTAB || shstrindex == 0 || shstrindex >= hdr->e_shnum || shdr[shstrindex].sh_type != SHT_STRTAB) { printf("%s: bad/missing section headers\n", filename); error = ENOEXEC; goto out; } /* Allocate space for tracking the load chunks */ if (ef->nprogtab != 0) ef->progtab = malloc(ef->nprogtab * sizeof(*ef->progtab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nreltab != 0) ef->reltab = malloc(ef->nreltab * sizeof(*ef->reltab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nrelatab != 0) ef->relatab = malloc(ef->nrelatab * sizeof(*ef->relatab), M_LINKER, M_WAITOK | M_ZERO); if ((ef->nprogtab != 0 && ef->progtab == NULL) || (ef->nreltab != 0 && ef->reltab == NULL) || (ef->nrelatab != 0 && ef->relatab == NULL)) { error = ENOMEM; goto out; } /* XXX, relocate the sh_addr fields saved by the loader. */ off = 0; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_addr != 0 && (off == 0 || shdr[i].sh_addr < off)) off = shdr[i].sh_addr; } for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_addr != 0) shdr[i].sh_addr = shdr[i].sh_addr - off + (Elf_Addr)ef->address; } ef->ddbsymcnt = shdr[symtabindex].sh_size / sizeof(Elf_Sym); ef->ddbsymtab = (Elf_Sym *)shdr[symtabindex].sh_addr; ef->ddbstrcnt = shdr[symstrindex].sh_size; ef->ddbstrtab = (char *)shdr[symstrindex].sh_addr; ef->shstrcnt = shdr[shstrindex].sh_size; ef->shstrtab = (char *)shdr[shstrindex].sh_addr; /* Now fill out progtab and the relocation tables. 
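 */
/*
 * A sketch of the sh_addr rebasing just performed: find the lowest
 * loader-assigned section address, then shift every non-zero sh_addr
 * so the lowest one lands on the module's load address.  All values
 * are fabricated.
 */
#include <assert.h>
#include <stdint.h>

int
main(void)
{
	uintptr_t sh_addr[3] = { 0x80002000, 0, 0x80001000 };
	uintptr_t address = 0xc0000000;	/* pretend load address */
	uintptr_t off = 0;
	int i;

	for (i = 0; i < 3; i++)
		if (sh_addr[i] != 0 && (off == 0 || sh_addr[i] < off))
			off = sh_addr[i];
	for (i = 0; i < 3; i++)
		if (sh_addr[i] != 0)
			sh_addr[i] = sh_addr[i] - off + address;

	assert(sh_addr[0] == 0xc0001000);	/* was lowest + 0x1000 */
	assert(sh_addr[1] == 0);		/* untouched */
	assert(sh_addr[2] == 0xc0000000);	/* lowest -> load address */
	return (0);
}
/*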
*/ pb = 0; rl = 0; ra = 0; for (i = 0; i < hdr->e_shnum; i++) { switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: #ifdef __amd64__ case SHT_X86_64_UNWIND: #endif ef->progtab[pb].addr = (void *)shdr[i].sh_addr; if (shdr[i].sh_type == SHT_PROGBITS) ef->progtab[pb].name = "<>"; #ifdef __amd64__ else if (shdr[i].sh_type == SHT_X86_64_UNWIND) ef->progtab[pb].name = "<>"; #endif else ef->progtab[pb].name = "<>"; ef->progtab[pb].size = shdr[i].sh_size; ef->progtab[pb].sec = i; if (ef->shstrtab && shdr[i].sh_name != 0) ef->progtab[pb].name = ef->shstrtab + shdr[i].sh_name; if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, DPCPU_SETNAME)) { void *dpcpu; dpcpu = dpcpu_alloc(shdr[i].sh_size); if (dpcpu == NULL) { error = ENOSPC; goto out; } memcpy(dpcpu, ef->progtab[pb].addr, ef->progtab[pb].size); dpcpu_copy(dpcpu, shdr[i].sh_size); ef->progtab[pb].addr = dpcpu; #ifdef VIMAGE } else if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, VNET_SETNAME)) { void *vnet_data; vnet_data = vnet_data_alloc(shdr[i].sh_size); if (vnet_data == NULL) { error = ENOSPC; goto out; } memcpy(vnet_data, ef->progtab[pb].addr, ef->progtab[pb].size); vnet_data_copy(vnet_data, shdr[i].sh_size); ef->progtab[pb].addr = vnet_data; #endif } else if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, ".ctors")) { lf->ctors_addr = ef->progtab[pb].addr; lf->ctors_size = shdr[i].sh_size; } /* Update all symbol values with the offset. */ for (j = 0; j < ef->ddbsymcnt; j++) { es = &ef->ddbsymtab[j]; if (es->st_shndx != i) continue; es->st_value += (Elf_Addr)ef->progtab[pb].addr; } pb++; break; case SHT_REL: ef->reltab[rl].rel = (Elf_Rel *)shdr[i].sh_addr; ef->reltab[rl].nrel = shdr[i].sh_size / sizeof(Elf_Rel); ef->reltab[rl].sec = shdr[i].sh_info; rl++; break; case SHT_RELA: ef->relatab[ra].rela = (Elf_Rela *)shdr[i].sh_addr; ef->relatab[ra].nrela = shdr[i].sh_size / sizeof(Elf_Rela); ef->relatab[ra].sec = shdr[i].sh_info; ra++; break; } } if (pb != ef->nprogtab) { printf("%s: lost progbits\n", filename); error = ENOEXEC; goto out; } if (rl != ef->nreltab) { printf("%s: lost reltab\n", filename); error = ENOEXEC; goto out; } if (ra != ef->nrelatab) { printf("%s: lost relatab\n", filename); error = ENOEXEC; goto out; } /* Local intra-module relocations */ error = link_elf_reloc_local(lf); if (error != 0) goto out; *result = lf; return (0); out: /* preload not done this way */ linker_file_unload(lf, LINKER_UNLOAD_FORCE); return (error); } static void link_elf_invoke_ctors(caddr_t addr, size_t size) { void (**ctor)(void); size_t i, cnt; if (addr == NULL || size == 0) return; cnt = size / sizeof(*ctor); ctor = (void *)addr; for (i = 0; i < cnt; i++) { if (ctor[i] != NULL) (*ctor[i])(); } } static int link_elf_link_preload_finish(linker_file_t lf) { elf_file_t ef; int error; ef = (elf_file_t)lf; error = relocate_file(ef); if (error) return error; /* Notify MD code that a module is being loaded. 
*/ error = elf_cpu_load_file(lf); if (error) return (error); /* Invoke .ctors */ link_elf_invoke_ctors(lf->ctors_addr, lf->ctors_size); return (0); } static int link_elf_load_file(linker_class_t cls, const char *filename, linker_file_t *result) { struct nameidata *nd; struct thread *td = curthread; /* XXX */ Elf_Ehdr *hdr; Elf_Shdr *shdr; Elf_Sym *es; int nbytes, i, j; vm_offset_t mapbase; size_t mapsize; int error = 0; ssize_t resid; int flags; elf_file_t ef; linker_file_t lf; int symtabindex; int symstrindex; int shstrindex; int nsym; int pb, rl, ra; int alignmask; shdr = NULL; lf = NULL; mapsize = 0; hdr = NULL; nd = malloc(sizeof(struct nameidata), M_TEMP, M_WAITOK); NDINIT(nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td); flags = FREAD; error = vn_open(nd, &flags, 0, NULL); if (error) { free(nd, M_TEMP); return error; } NDFREE(nd, NDF_ONLY_PNBUF); if (nd->ni_vp->v_type != VREG) { error = ENOEXEC; goto out; } #ifdef MAC error = mac_kld_check_load(td->td_ucred, nd->ni_vp); if (error) { goto out; } #endif /* Read the elf header from the file. */ hdr = malloc(sizeof(*hdr), M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd->ni_vp, (void *)hdr, sizeof(*hdr), 0, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = ENOEXEC; goto out; } if (!IS_ELF(*hdr)) { error = ENOEXEC; goto out; } if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) { link_elf_error(filename, "Unsupported file layout"); error = ENOEXEC; goto out; } if (hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_version != EV_CURRENT) { link_elf_error(filename, "Unsupported file version"); error = ENOEXEC; goto out; } if (hdr->e_type != ET_REL) { error = ENOSYS; goto out; } if (hdr->e_machine != ELF_TARG_MACH) { link_elf_error(filename, "Unsupported machine"); error = ENOEXEC; goto out; } lf = linker_make_file(filename, &link_elf_class); if (!lf) { error = ENOMEM; goto out; } ef = (elf_file_t) lf; ef->nprogtab = 0; ef->e_shdr = 0; ef->nreltab = 0; ef->nrelatab = 0; /* Allocate and read in the section header */ nbytes = hdr->e_shnum * hdr->e_shentsize; if (nbytes == 0 || hdr->e_shoff == 0 || hdr->e_shentsize != sizeof(Elf_Shdr)) { error = ENOEXEC; goto out; } shdr = malloc(nbytes, M_LINKER, M_WAITOK); ef->e_shdr = shdr; error = vn_rdwr(UIO_READ, nd->ni_vp, (caddr_t)shdr, nbytes, hdr->e_shoff, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid) { error = ENOEXEC; goto out; } /* Scan the section header for information and table sizing. 
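 */
/*
 * The two vn_rdwr() passes above (ELF header, then section headers)
 * are the kernel-mode analogue of a plain read-and-validate loop.  A
 * userland sketch using pread(2) and a host <elf.h>; the 64-bit types
 * and the /bin/sh fallback path are assumptions for illustration.
 */
#include <elf.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	Elf64_Ehdr hdr;
	Elf64_Shdr *shdr;
	size_t nbytes;
	int fd;

	fd = open(argc > 1 ? argv[1] : "/bin/sh", O_RDONLY);
	if (fd < 0 || pread(fd, &hdr, sizeof(hdr), 0) != (ssize_t)sizeof(hdr) ||
	    memcmp(hdr.e_ident, ELFMAG, SELFMAG) != 0) {
		fprintf(stderr, "not an ELF file\n");
		return (1);
	}
	/* Section headers live at e_shoff: e_shnum entries of e_shentsize. */
	nbytes = (size_t)hdr.e_shnum * hdr.e_shentsize;
	shdr = malloc(nbytes);
	if (shdr == NULL ||
	    pread(fd, shdr, nbytes, hdr.e_shoff) != (ssize_t)nbytes) {
		fprintf(stderr, "short read on section headers\n");
		return (1);
	}
	printf("%u sections\n", (unsigned)hdr.e_shnum);
	free(shdr);
	close(fd);
	return (0);
}
/*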
*/ nsym = 0; symtabindex = -1; symstrindex = -1; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: #ifdef __amd64__ case SHT_X86_64_UNWIND: #endif ef->nprogtab++; break; case SHT_SYMTAB: nsym++; symtabindex = i; symstrindex = shdr[i].sh_link; break; case SHT_REL: ef->nreltab++; break; case SHT_RELA: ef->nrelatab++; break; case SHT_STRTAB: break; } } if (ef->nprogtab == 0) { link_elf_error(filename, "file has no contents"); error = ENOEXEC; goto out; } if (nsym != 1) { /* Only allow one symbol table for now */ link_elf_error(filename, "file has no valid symbol table"); error = ENOEXEC; goto out; } if (symstrindex < 0 || symstrindex > hdr->e_shnum || shdr[symstrindex].sh_type != SHT_STRTAB) { link_elf_error(filename, "file has invalid symbol strings"); error = ENOEXEC; goto out; } /* Allocate space for tracking the load chunks */ if (ef->nprogtab != 0) ef->progtab = malloc(ef->nprogtab * sizeof(*ef->progtab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nreltab != 0) ef->reltab = malloc(ef->nreltab * sizeof(*ef->reltab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nrelatab != 0) ef->relatab = malloc(ef->nrelatab * sizeof(*ef->relatab), M_LINKER, M_WAITOK | M_ZERO); if (symtabindex == -1) { link_elf_error(filename, "lost symbol table index"); error = ENOEXEC; goto out; } /* Allocate space for and load the symbol table */ ef->ddbsymcnt = shdr[symtabindex].sh_size / sizeof(Elf_Sym); ef->ddbsymtab = malloc(shdr[symtabindex].sh_size, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd->ni_vp, (void *)ef->ddbsymtab, shdr[symtabindex].sh_size, shdr[symtabindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } if (symstrindex == -1) { link_elf_error(filename, "lost symbol string index"); error = ENOEXEC; goto out; } /* Allocate space for and load the symbol strings */ ef->ddbstrcnt = shdr[symstrindex].sh_size; ef->ddbstrtab = malloc(shdr[symstrindex].sh_size, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd->ni_vp, ef->ddbstrtab, shdr[symstrindex].sh_size, shdr[symstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } /* Do we have a string table for the section names? */ shstrindex = -1; if (hdr->e_shstrndx != 0 && shdr[hdr->e_shstrndx].sh_type == SHT_STRTAB) { shstrindex = hdr->e_shstrndx; ef->shstrcnt = shdr[shstrindex].sh_size; ef->shstrtab = malloc(shdr[shstrindex].sh_size, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd->ni_vp, ef->shstrtab, shdr[shstrindex].sh_size, shdr[shstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } } /* Size up code/data(progbits) and bss(nobits). */ alignmask = 0; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: #ifdef __amd64__ case SHT_X86_64_UNWIND: #endif alignmask = shdr[i].sh_addralign - 1; mapsize += alignmask; mapsize &= ~alignmask; mapsize += shdr[i].sh_size; break; } } /* * We know how much space we need for the text/data/bss/etc. 
* This stuff needs to be in a single chunk so that profiling etc * can get the bounds and gdb can associate offsets with modules */ ef->object = vm_object_allocate(OBJT_DEFAULT, round_page(mapsize) >> PAGE_SHIFT); if (ef->object == NULL) { error = ENOMEM; goto out; } ef->address = (caddr_t) vm_map_min(kernel_map); /* * In order to satisfy amd64's architectural requirements on the * location of code and data in the kernel's address space, request a * mapping that is above the kernel. */ #ifdef __amd64__ mapbase = KERNBASE; #else mapbase = VM_MIN_KERNEL_ADDRESS; #endif error = vm_map_find(kernel_map, ef->object, 0, &mapbase, round_page(mapsize), 0, VMFS_OPTIMAL_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) { vm_object_deallocate(ef->object); ef->object = 0; goto out; } /* Wire the pages */ error = vm_map_wire(kernel_map, mapbase, mapbase + round_page(mapsize), VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES); if (error != KERN_SUCCESS) { error = ENOMEM; goto out; } /* Inform the kld system about the situation */ lf->address = ef->address = (caddr_t)mapbase; lf->size = mapsize; /* * Now load code/data(progbits), zero bss(nobits), allocate space for * and load relocs */ pb = 0; rl = 0; ra = 0; alignmask = 0; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: #ifdef __amd64__ case SHT_X86_64_UNWIND: #endif alignmask = shdr[i].sh_addralign - 1; mapbase += alignmask; mapbase &= ~alignmask; if (ef->shstrtab != NULL && shdr[i].sh_name != 0) { ef->progtab[pb].name = ef->shstrtab + shdr[i].sh_name; if (!strcmp(ef->progtab[pb].name, ".ctors")) { lf->ctors_addr = (caddr_t)mapbase; lf->ctors_size = shdr[i].sh_size; } } else if (shdr[i].sh_type == SHT_PROGBITS) ef->progtab[pb].name = "<>"; #ifdef __amd64__ else if (shdr[i].sh_type == SHT_X86_64_UNWIND) ef->progtab[pb].name = "<>"; #endif else ef->progtab[pb].name = "<>"; if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, DPCPU_SETNAME)) ef->progtab[pb].addr = dpcpu_alloc(shdr[i].sh_size); #ifdef VIMAGE else if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, VNET_SETNAME)) ef->progtab[pb].addr = vnet_data_alloc(shdr[i].sh_size); #endif else ef->progtab[pb].addr = (void *)(uintptr_t)mapbase; if (ef->progtab[pb].addr == NULL) { error = ENOSPC; goto out; } ef->progtab[pb].size = shdr[i].sh_size; ef->progtab[pb].sec = i; if (shdr[i].sh_type == SHT_PROGBITS #ifdef __amd64__ || shdr[i].sh_type == SHT_X86_64_UNWIND #endif ) { error = vn_rdwr(UIO_READ, nd->ni_vp, ef->progtab[pb].addr, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } /* Initialize the per-cpu or vnet area. */ if (ef->progtab[pb].addr != (void *)mapbase && !strcmp(ef->progtab[pb].name, DPCPU_SETNAME)) dpcpu_copy(ef->progtab[pb].addr, shdr[i].sh_size); #ifdef VIMAGE else if (ef->progtab[pb].addr != (void *)mapbase && !strcmp(ef->progtab[pb].name, VNET_SETNAME)) vnet_data_copy(ef->progtab[pb].addr, shdr[i].sh_size); #endif } else bzero(ef->progtab[pb].addr, shdr[i].sh_size); /* Update all symbol values with the offset. 
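 */
/*
 * The sizing pass earlier and the placement loop here use the same
 * mask trick: round the running cursor up to each section's alignment
 * before consuming its size.  A worked sketch with three fabricated
 * sections.
 */
#include <assert.h>
#include <stdint.h>

int
main(void)
{
	struct { uintptr_t addralign, size; } sec[3] = {
		{ 16, 100 }, { 32, 10 }, { 8, 24 }
	};
	uintptr_t mapsize = 0, alignmask;
	int i;

	for (i = 0; i < 3; i++) {
		alignmask = sec[i].addralign - 1;
		mapsize += alignmask;
		mapsize &= ~alignmask;	/* round up to the alignment */
		mapsize += sec[i].size;	/* then consume the section */
	}
	/* 0 -> 100, round to 128 -> 138, round to 144 -> 168. */
	assert(mapsize == 168);
	return (0);
}
/*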
*/ for (j = 0; j < ef->ddbsymcnt; j++) { es = &ef->ddbsymtab[j]; if (es->st_shndx != i) continue; es->st_value += (Elf_Addr)ef->progtab[pb].addr; } mapbase += shdr[i].sh_size; pb++; break; case SHT_REL: ef->reltab[rl].rel = malloc(shdr[i].sh_size, M_LINKER, M_WAITOK); ef->reltab[rl].nrel = shdr[i].sh_size / sizeof(Elf_Rel); ef->reltab[rl].sec = shdr[i].sh_info; error = vn_rdwr(UIO_READ, nd->ni_vp, (void *)ef->reltab[rl].rel, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } rl++; break; case SHT_RELA: ef->relatab[ra].rela = malloc(shdr[i].sh_size, M_LINKER, M_WAITOK); ef->relatab[ra].nrela = shdr[i].sh_size / sizeof(Elf_Rela); ef->relatab[ra].sec = shdr[i].sh_info; error = vn_rdwr(UIO_READ, nd->ni_vp, (void *)ef->relatab[ra].rela, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } ra++; break; } } if (pb != ef->nprogtab) { link_elf_error(filename, "lost progbits"); error = ENOEXEC; goto out; } if (rl != ef->nreltab) { link_elf_error(filename, "lost reltab"); error = ENOEXEC; goto out; } if (ra != ef->nrelatab) { link_elf_error(filename, "lost relatab"); error = ENOEXEC; goto out; } if (mapbase != (vm_offset_t)ef->address + mapsize) { printf( "%s: mapbase 0x%lx != address %p + mapsize 0x%lx (0x%lx)\n", filename != NULL ? filename : "", (u_long)mapbase, ef->address, (u_long)mapsize, (u_long)(vm_offset_t)ef->address + mapsize); error = ENOMEM; goto out; } /* Local intra-module relocations */ error = link_elf_reloc_local(lf); if (error != 0) goto out; /* Pull in dependencies */ VOP_UNLOCK(nd->ni_vp, 0); error = linker_load_dependencies(lf); vn_lock(nd->ni_vp, LK_EXCLUSIVE | LK_RETRY); if (error) goto out; /* External relocations */ error = relocate_file(ef); if (error) goto out; /* Notify MD code that a module is being loaded. */ error = elf_cpu_load_file(lf); if (error) goto out; /* Invoke .ctors */ link_elf_invoke_ctors(lf->ctors_addr, lf->ctors_size); *result = lf; out: VOP_UNLOCK(nd->ni_vp, 0); vn_close(nd->ni_vp, FREAD, td->td_ucred, td); free(nd, M_TEMP); if (error && lf) linker_file_unload(lf, LINKER_UNLOAD_FORCE); free(hdr, M_LINKER); return error; } static void link_elf_unload_file(linker_file_t file) { elf_file_t ef = (elf_file_t) file; int i; /* Notify MD code that a module is being unloaded. */ elf_cpu_unload_file(file); if (ef->progtab) { for (i = 0; i < ef->nprogtab; i++) { if (ef->progtab[i].size == 0) continue; if (ef->progtab[i].name == NULL) continue; if (!strcmp(ef->progtab[i].name, DPCPU_SETNAME)) dpcpu_free(ef->progtab[i].addr, ef->progtab[i].size); #ifdef VIMAGE else if (!strcmp(ef->progtab[i].name, VNET_SETNAME)) vnet_data_free(ef->progtab[i].addr, ef->progtab[i].size); #endif } } if (ef->preloaded) { free(ef->reltab, M_LINKER); free(ef->relatab, M_LINKER); free(ef->progtab, M_LINKER); free(ef->ctftab, M_LINKER); free(ef->ctfoff, M_LINKER); free(ef->typoff, M_LINKER); if (file->filename != NULL) preload_delete_name(file->filename); /* XXX reclaim module memory? 
*/ return; } for (i = 0; i < ef->nreltab; i++) free(ef->reltab[i].rel, M_LINKER); for (i = 0; i < ef->nrelatab; i++) free(ef->relatab[i].rela, M_LINKER); free(ef->reltab, M_LINKER); free(ef->relatab, M_LINKER); free(ef->progtab, M_LINKER); if (ef->object) { vm_map_remove(kernel_map, (vm_offset_t) ef->address, (vm_offset_t) ef->address + (ef->object->size << PAGE_SHIFT)); } free(ef->e_shdr, M_LINKER); free(ef->ddbsymtab, M_LINKER); free(ef->ddbstrtab, M_LINKER); free(ef->shstrtab, M_LINKER); free(ef->ctftab, M_LINKER); free(ef->ctfoff, M_LINKER); free(ef->typoff, M_LINKER); } static const char * symbol_name(elf_file_t ef, Elf_Size r_info) { const Elf_Sym *ref; if (ELF_R_SYM(r_info)) { ref = ef->ddbsymtab + ELF_R_SYM(r_info); return ef->ddbstrtab + ref->st_name; } else return NULL; } static Elf_Addr findbase(elf_file_t ef, int sec) { int i; Elf_Addr base = 0; for (i = 0; i < ef->nprogtab; i++) { if (sec == ef->progtab[i].sec) { base = (Elf_Addr)ef->progtab[i].addr; break; } } return base; } static int relocate_file(elf_file_t ef) { const Elf_Rel *rellim; const Elf_Rel *rel; const Elf_Rela *relalim; const Elf_Rela *rela; const char *symname; const Elf_Sym *sym; int i; Elf_Size symidx; Elf_Addr base; /* Perform relocations without addend if there are any: */ for (i = 0; i < ef->nreltab; i++) { rel = ef->reltab[i].rel; if (rel == NULL) { link_elf_error(ef->lf.filename, "lost a reltab!"); return (ENOEXEC); } rellim = rel + ef->reltab[i].nrel; base = findbase(ef, ef->reltab[i].sec); if (base == 0) { link_elf_error(ef->lf.filename, "lost base for reltab"); return (ENOEXEC); } for ( ; rel < rellim; rel++) { symidx = ELF_R_SYM(rel->r_info); if (symidx >= ef->ddbsymcnt) continue; sym = ef->ddbsymtab + symidx; /* Local relocs are already done */ if (ELF_ST_BIND(sym->st_info) == STB_LOCAL) continue; if (elf_reloc(&ef->lf, base, rel, ELF_RELOC_REL, elf_obj_lookup)) { symname = symbol_name(ef, rel->r_info); printf("link_elf_obj: symbol %s undefined\n", symname); return (ENOENT); } } } /* Perform relocations with addend if there are any: */ for (i = 0; i < ef->nrelatab; i++) { rela = ef->relatab[i].rela; if (rela == NULL) { link_elf_error(ef->lf.filename, "lost a relatab!"); return (ENOEXEC); } relalim = rela + ef->relatab[i].nrela; base = findbase(ef, ef->relatab[i].sec); if (base == 0) { link_elf_error(ef->lf.filename, "lost base for relatab"); return (ENOEXEC); } for ( ; rela < relalim; rela++) { symidx = ELF_R_SYM(rela->r_info); if (symidx >= ef->ddbsymcnt) continue; sym = ef->ddbsymtab + symidx; /* Local relocs are already done */ if (ELF_ST_BIND(sym->st_info) == STB_LOCAL) continue; if (elf_reloc(&ef->lf, base, rela, ELF_RELOC_RELA, elf_obj_lookup)) { symname = symbol_name(ef, rela->r_info); printf("link_elf_obj: symbol %s undefined\n", symname); return (ENOENT); } } } /* * Only clean SHN_FBSD_CACHED for successful return. If we * modified symbol table for the object but found an * unresolved symbol, there is no reason to roll back. 
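 * (elf_obj_lookup() below memoizes each resolved global by stashing the
 * address in the symbol and marking the entry with the SHN_FBSD_CACHED
 * sentinel, which the cleanup pass then resets.  A minimal standalone
 * sketch of that memoization pattern; the names and the
 * expensive_resolve() helper are hypothetical.)
 */
#if 0	/* Illustrative sketch only; never compiled. */
#include <stdio.h>

#define SHN_UNDEF_X	0	/* unresolved; must look up */
#define SHN_CACHED_X	1	/* resolved earlier; value is valid */

struct sym {
	int shndx;
	unsigned long value;
};

/* Stand-in for the expensive global symbol search. */
static unsigned long
expensive_resolve(void)
{
	puts("slow path taken");
	return (0xdeadbeef);
}

static unsigned long
lookup(struct sym *s)
{
	if (s->shndx != SHN_UNDEF_X)	/* fast path: cached */
		return (s->value);
	s->value = expensive_resolve();
	s->shndx = SHN_CACHED_X;
	return (s->value);
}

int
main(void)
{
	struct sym s = { SHN_UNDEF_X, 0 };

	lookup(&s);	/* slow path, prints once */
	lookup(&s);	/* cached, silent */
	/* After relocation a cleanup pass resets SHN_CACHED_X entries. */
	return (0);
}
#endif
/*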
*/ elf_obj_cleanup_globals_cache(ef); return (0); } static int link_elf_lookup_symbol(linker_file_t lf, const char *name, c_linker_sym_t *sym) { elf_file_t ef = (elf_file_t) lf; const Elf_Sym *symp; const char *strp; int i; for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { strp = ef->ddbstrtab + symp->st_name; if (symp->st_shndx != SHN_UNDEF && strcmp(name, strp) == 0) { *sym = (c_linker_sym_t) symp; return 0; } } return ENOENT; } static int link_elf_symbol_values(linker_file_t lf, c_linker_sym_t sym, linker_symval_t *symval) { elf_file_t ef = (elf_file_t) lf; const Elf_Sym *es = (const Elf_Sym*) sym; if (es >= ef->ddbsymtab && es < (ef->ddbsymtab + ef->ddbsymcnt)) { symval->name = ef->ddbstrtab + es->st_name; symval->value = (caddr_t)es->st_value; symval->size = es->st_size; return 0; } return ENOENT; } static int link_elf_search_symbol(linker_file_t lf, caddr_t value, c_linker_sym_t *sym, long *diffp) { elf_file_t ef = (elf_file_t) lf; u_long off = (uintptr_t) (void *) value; u_long diff = off; u_long st_value; const Elf_Sym *es; const Elf_Sym *best = NULL; int i; for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) { if (es->st_name == 0) continue; st_value = es->st_value; if (off >= st_value) { if (off - st_value < diff) { diff = off - st_value; best = es; if (diff == 0) break; } else if (off - st_value == diff) { best = es; } } } if (best == NULL) *diffp = off; else *diffp = diff; *sym = (c_linker_sym_t) best; return 0; } /* * Look up a linker set on an ELF system. */ static int link_elf_lookup_set(linker_file_t lf, const char *name, void ***startp, void ***stopp, int *countp) { elf_file_t ef = (elf_file_t)lf; void **start, **stop; int i, count; /* Relative to section number */ for (i = 0; i < ef->nprogtab; i++) { if ((strncmp(ef->progtab[i].name, "set_", 4) == 0) && strcmp(ef->progtab[i].name + 4, name) == 0) { start = (void **)ef->progtab[i].addr; stop = (void **)((char *)ef->progtab[i].addr + ef->progtab[i].size); count = stop - start; if (startp) *startp = start; if (stopp) *stopp = stop; if (countp) *countp = count; return (0); } } return (ESRCH); } static int link_elf_each_function_name(linker_file_t file, int (*callback)(const char *, void *), void *opaque) { elf_file_t ef = (elf_file_t)file; const Elf_Sym *symp; int i, error; /* Exhaustive search */ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { if (symp->st_value != 0 && ELF_ST_TYPE(symp->st_info) == STT_FUNC) { error = callback(ef->ddbstrtab + symp->st_name, opaque); if (error) return (error); } } return (0); } static int link_elf_each_function_nameval(linker_file_t file, linker_function_nameval_callback_t callback, void *opaque) { linker_symval_t symval; elf_file_t ef = (elf_file_t)file; const Elf_Sym* symp; int i, error; /* Exhaustive search */ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { if (symp->st_value != 0 && ELF_ST_TYPE(symp->st_info) == STT_FUNC) { error = link_elf_symbol_values(file, (c_linker_sym_t) symp, &symval); if (error) return (error); error = callback(file, i, &symval, opaque); if (error) return (error); } } return (0); } static void elf_obj_cleanup_globals_cache(elf_file_t ef) { Elf_Sym *sym; Elf_Size i; for (i = 0; i < ef->ddbsymcnt; i++) { sym = ef->ddbsymtab + i; if (sym->st_shndx == SHN_FBSD_CACHED) { sym->st_shndx = SHN_UNDEF; sym->st_value = 0; } } } /* * Symbol lookup function that can be used when the symbol index is known (ie * in relocations). 
It uses the symbol index instead of doing a fully fledged * hash table based lookup when such is valid. For example for local symbols. * This is not only more efficient, it's also more correct. It's not always * the case that the symbol can be found through the hash table. */ static int elf_obj_lookup(linker_file_t lf, Elf_Size symidx, int deps, Elf_Addr *res) { elf_file_t ef = (elf_file_t)lf; Elf_Sym *sym; const char *symbol; Elf_Addr res1; /* Don't even try to lookup the symbol if the index is bogus. */ if (symidx >= ef->ddbsymcnt) { *res = 0; return (EINVAL); } sym = ef->ddbsymtab + symidx; /* Quick answer if there is a definition included. */ if (sym->st_shndx != SHN_UNDEF) { *res = sym->st_value; return (0); } /* If we get here, then it is undefined and needs a lookup. */ switch (ELF_ST_BIND(sym->st_info)) { case STB_LOCAL: /* Local, but undefined? huh? */ *res = 0; return (EINVAL); case STB_GLOBAL: case STB_WEAK: /* Relative to Data or Function name */ symbol = ef->ddbstrtab + sym->st_name; /* Force a lookup failure if the symbol name is bogus. */ if (*symbol == 0) { *res = 0; return (EINVAL); } res1 = (Elf_Addr)linker_file_lookup_symbol(lf, symbol, deps); /* * Cache global lookups during module relocation. The failure * case is particularly expensive for callers, who must scan * through the entire globals table doing strcmp(). Cache to * avoid doing such work repeatedly. * * After relocation is complete, undefined globals will be * restored to SHN_UNDEF in elf_obj_cleanup_globals_cache(), * above. */ if (res1 != 0) { sym->st_shndx = SHN_FBSD_CACHED; sym->st_value = res1; *res = res1; return (0); } else if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { sym->st_value = 0; *res = 0; return (0); } return (EINVAL); default: return (EINVAL); } } static void link_elf_fix_link_set(elf_file_t ef) { static const char startn[] = "__start_"; static const char stopn[] = "__stop_"; Elf_Sym *sym; const char *sym_name, *linkset_name; Elf_Addr startp, stopp; Elf_Size symidx; int start, i; startp = stopp = 0; for (symidx = 1 /* zero entry is special */; symidx < ef->ddbsymcnt; symidx++) { sym = ef->ddbsymtab + symidx; if (sym->st_shndx != SHN_UNDEF) continue; sym_name = ef->ddbstrtab + sym->st_name; if (strncmp(sym_name, startn, sizeof(startn) - 1) == 0) { start = 1; linkset_name = sym_name + sizeof(startn) - 1; } else if (strncmp(sym_name, stopn, sizeof(stopn) - 1) == 0) { start = 0; linkset_name = sym_name + sizeof(stopn) - 1; } else continue; for (i = 0; i < ef->nprogtab; i++) { if (strcmp(ef->progtab[i].name, linkset_name) == 0) { startp = (Elf_Addr)ef->progtab[i].addr; stopp = (Elf_Addr)(startp + ef->progtab[i].size); break; } } if (i == ef->nprogtab) continue; sym->st_value = start ? 
startp : stopp; sym->st_shndx = i; } } static int link_elf_reloc_local(linker_file_t lf) { elf_file_t ef = (elf_file_t)lf; const Elf_Rel *rellim; const Elf_Rel *rel; const Elf_Rela *relalim; const Elf_Rela *rela; const Elf_Sym *sym; Elf_Addr base; int i; Elf_Size symidx; link_elf_fix_link_set(ef); /* Perform relocations without addend if there are any: */ for (i = 0; i < ef->nreltab; i++) { rel = ef->reltab[i].rel; if (rel == NULL) { link_elf_error(ef->lf.filename, "lost a reltab"); return (ENOEXEC); } rellim = rel + ef->reltab[i].nrel; base = findbase(ef, ef->reltab[i].sec); if (base == 0) { link_elf_error(ef->lf.filename, "lost base for reltab"); return (ENOEXEC); } for ( ; rel < rellim; rel++) { symidx = ELF_R_SYM(rel->r_info); if (symidx >= ef->ddbsymcnt) continue; sym = ef->ddbsymtab + symidx; /* Only do local relocs */ if (ELF_ST_BIND(sym->st_info) != STB_LOCAL) continue; elf_reloc_local(lf, base, rel, ELF_RELOC_REL, elf_obj_lookup); } } /* Perform relocations with addend if there are any: */ for (i = 0; i < ef->nrelatab; i++) { rela = ef->relatab[i].rela; if (rela == NULL) { link_elf_error(ef->lf.filename, "lost a relatab!"); return (ENOEXEC); } relalim = rela + ef->relatab[i].nrela; base = findbase(ef, ef->relatab[i].sec); if (base == 0) { link_elf_error(ef->lf.filename, "lost base for reltab"); return (ENOEXEC); } for ( ; rela < relalim; rela++) { symidx = ELF_R_SYM(rela->r_info); if (symidx >= ef->ddbsymcnt) continue; sym = ef->ddbsymtab + symidx; /* Only do local relocs */ if (ELF_ST_BIND(sym->st_info) != STB_LOCAL) continue; elf_reloc_local(lf, base, rela, ELF_RELOC_RELA, elf_obj_lookup); } } return (0); } static long link_elf_symtab_get(linker_file_t lf, const Elf_Sym **symtab) { elf_file_t ef = (elf_file_t)lf; *symtab = ef->ddbsymtab; if (*symtab == NULL) return (0); return (ef->ddbsymcnt); } static long link_elf_strtab_get(linker_file_t lf, caddr_t *strtab) { elf_file_t ef = (elf_file_t)lf; *strtab = ef->ddbstrtab; if (*strtab == NULL) return (0); return (ef->ddbstrcnt); } Index: head/sys/kern/sched_ule.c =================================================================== --- head/sys/kern/sched_ule.c (revision 326270) +++ head/sys/kern/sched_ule.c (revision 326271) @@ -1,2956 +1,2958 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2002-2007, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * This file implements the ULE scheduler. ULE supports independent CPU * run queues and fine grain locking. It has superior interactive * performance under load even on uni-processor systems. * * etymology: * ULE is the last three letters in schedule. It owes its name to a * generic user created for a scheduling system by Paul Mikesell at * Isilon Systems and a general lack of creativity on the part of the author. */ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #ifdef KDTRACE_HOOKS #include int dtrace_vtime_active; dtrace_vtime_switch_func_t dtrace_vtime_switch_func; #endif #include #include #define KTR_ULE 0 #define TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX))) #define TDQ_NAME_LEN (sizeof("sched lock ") + sizeof(__XSTRING(MAXCPU))) #define TDQ_LOADNAME_LEN (sizeof("CPU ") + sizeof(__XSTRING(MAXCPU)) - 1 + sizeof(" load")) /* * Thread scheduler specific section. All fields are protected * by the thread lock. */ struct td_sched { struct runq *ts_runq; /* Run-queue we're queued on. */ short ts_flags; /* TSF_* flags. */ int ts_cpu; /* CPU that we have affinity for. */ int ts_rltick; /* Real last tick, for affinity. */ int ts_slice; /* Ticks of slice remaining. */ u_int ts_slptime; /* Number of ticks we vol. slept */ u_int ts_runtime; /* Number of ticks we were running */ int ts_ltick; /* Last tick that we were running on */ int ts_ftick; /* First tick that we were running on */ int ts_ticks; /* Tick count */ #ifdef KTR char ts_name[TS_NAME_LEN]; #endif }; /* flags kept in ts_flags */ #define TSF_BOUND 0x0001 /* Thread can not migrate. */ #define TSF_XFERABLE 0x0002 /* Thread was added as transferable. */ #define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0) #define THREAD_CAN_SCHED(td, cpu) \ CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask) _Static_assert(sizeof(struct thread) + sizeof(struct td_sched) <= sizeof(struct thread0_storage), "increase struct thread0_storage.t0st_sched size"); /* * Priority ranges used for interactive and non-interactive timeshare * threads. The timeshare priorities are split up into four ranges. * The first range handles interactive threads. The last three ranges * (NHALF, x, and NHALF) handle non-interactive threads with the outer * ranges supporting nice values. */ #define PRI_TIMESHARE_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1) #define PRI_INTERACT_RANGE ((PRI_TIMESHARE_RANGE - SCHED_PRI_NRESV) / 2) #define PRI_BATCH_RANGE (PRI_TIMESHARE_RANGE - PRI_INTERACT_RANGE) #define PRI_MIN_INTERACT PRI_MIN_TIMESHARE #define PRI_MAX_INTERACT (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE - 1) #define PRI_MIN_BATCH (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE) #define PRI_MAX_BATCH PRI_MAX_TIMESHARE /* * Cpu percentage computation macros and defines. 
* * SCHED_TICK_SECS: Number of seconds to average the cpu usage across. * SCHED_TICK_TARG: Number of hz ticks to average the cpu usage across. * SCHED_TICK_MAX: Maximum number of ticks before scaling back. * SCHED_TICK_SHIFT: Shift factor to avoid rounding away results. * SCHED_TICK_HZ: Compute the number of hz ticks for a given ticks count. * SCHED_TICK_TOTAL: Gives the amount of time we've been recording ticks. */ #define SCHED_TICK_SECS 10 #define SCHED_TICK_TARG (hz * SCHED_TICK_SECS) #define SCHED_TICK_MAX (SCHED_TICK_TARG + hz) #define SCHED_TICK_SHIFT 10 #define SCHED_TICK_HZ(ts) ((ts)->ts_ticks >> SCHED_TICK_SHIFT) #define SCHED_TICK_TOTAL(ts) (max((ts)->ts_ltick - (ts)->ts_ftick, hz)) /* * These macros determine priorities for non-interactive threads. They are * assigned a priority based on their recent cpu utilization as expressed * by the ratio of ticks to the tick total. NHALF priorities at the start * and end of the MIN to MAX timeshare range are only reachable with negative * or positive nice respectively. * * PRI_RANGE: Priority range for utilization dependent priorities. * PRI_NRESV: Number of nice values. * PRI_TICKS: Compute a priority in PRI_RANGE from the ticks count and total. * PRI_NICE: Determines the part of the priority inherited from nice. */ #define SCHED_PRI_NRESV (PRIO_MAX - PRIO_MIN) #define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2) #define SCHED_PRI_MIN (PRI_MIN_BATCH + SCHED_PRI_NHALF) #define SCHED_PRI_MAX (PRI_MAX_BATCH - SCHED_PRI_NHALF) #define SCHED_PRI_RANGE (SCHED_PRI_MAX - SCHED_PRI_MIN + 1) #define SCHED_PRI_TICKS(ts) \ (SCHED_TICK_HZ((ts)) / \ (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE)) #define SCHED_PRI_NICE(nice) (nice) /* * These determine the interactivity of a process. Interactivity differs from * cpu utilization in that it expresses the voluntary time slept vs time ran * while cpu utilization includes all time not running. This more accurately * models the intent of the thread. * * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate * before throttling back. * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time. * INTERACT_MAX: Maximum interactivity value. Smaller is better. * INTERACT_THRESH: Threshold for placement on the current runq. */ #define SCHED_SLP_RUN_MAX ((hz * 5) << SCHED_TICK_SHIFT) #define SCHED_SLP_RUN_FORK ((hz / 2) << SCHED_TICK_SHIFT) #define SCHED_INTERACT_MAX (100) #define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2) #define SCHED_INTERACT_THRESH (30) /* * These parameters determine the slice behavior for batch work. */ #define SCHED_SLICE_DEFAULT_DIVISOR 10 /* ~94 ms, 12 stathz ticks. */ #define SCHED_SLICE_MIN_DIVISOR 6 /* DEFAULT/MIN = ~16 ms. */ /* Flags kept in td_flags. */ #define TDF_SLICEEND TDF_SCHED2 /* Thread time slice is over. */ /* * tickincr: Converts a stathz tick into a hz domain scaled by * the shift factor. Without the shift the error rate * due to rounding would be unacceptably high. * realstathz: stathz is sometimes 0 and run off of hz. * sched_slice: Runtime of each thread before rescheduling. * preempt_thresh: Priority threshold for preemption and remote IPIs. */ static int sched_interact = SCHED_INTERACT_THRESH; static int tickincr = 8 << SCHED_TICK_SHIFT; static int realstathz = 127; /* reset during boot. */ static int sched_slice = 10; /* reset during boot. */ static int sched_slice_min = 1; /* reset during boot. 
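 *
 * (The tickincr scaling described above can be shown with standalone
 * arithmetic; hz = 1000 and stathz = 127 are illustrative values.)
 */
#if 0	/* Illustrative sketch only; never compiled. */
#include <stdio.h>

#define TICK_SHIFT	10	/* mirrors SCHED_TICK_SHIFT */

int
main(void)
{
	int hz = 1000, stathz = 127;

	/* Plain integer division discards the fraction entirely... */
	printf("unscaled: %d\n", hz / stathz);			/* 7 */
	/* ...while the shifted form keeps ~10 fractional bits. */
	printf("scaled: %d\n", (hz << TICK_SHIFT) / stathz);	/* 8062 */
	/* 8062 / 2^10 = 7.873..., close to the exact 7.874. */
	return (0);
}
#endif
/*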
*/ #ifdef PREEMPTION #ifdef FULL_PREEMPTION static int preempt_thresh = PRI_MAX_IDLE; #else static int preempt_thresh = PRI_MIN_KERN; #endif #else static int preempt_thresh = 0; #endif static int static_boost = PRI_MIN_BATCH; static int sched_idlespins = 10000; static int sched_idlespinthresh = -1; /* * tdq - per processor runqs and statistics. All fields are protected by the * tdq_lock. The load and lowpri may be read without the lock to avoid excess * locking in sched_pickcpu(). */ struct tdq { /* * Ordered to improve efficiency of cpu_search() and switch(). * tdq_lock is padded to avoid false sharing with tdq_load and * tdq_cpu_idle. */ struct mtx_padalign tdq_lock; /* run queue lock. */ struct cpu_group *tdq_cg; /* Pointer to cpu topology. */ volatile int tdq_load; /* Aggregate load. */ volatile int tdq_cpu_idle; /* cpu_idle() is active. */ int tdq_sysload; /* For loadavg, !ITHD load. */ int tdq_transferable; /* Transferable thread count. */ short tdq_switchcnt; /* Switches this tick. */ short tdq_oldswitchcnt; /* Switches last tick. */ u_char tdq_lowpri; /* Lowest priority thread. */ u_char tdq_ipipending; /* IPI pending. */ u_char tdq_idx; /* Current insert index. */ u_char tdq_ridx; /* Current removal index. */ struct runq tdq_realtime; /* real-time run queue. */ struct runq tdq_timeshare; /* timeshare run queue. */ struct runq tdq_idle; /* Queue of IDLE threads. */ char tdq_name[TDQ_NAME_LEN]; #ifdef KTR char tdq_loadname[TDQ_LOADNAME_LEN]; #endif } __aligned(64); /* Idle thread states and config. */ #define TDQ_RUNNING 1 #define TDQ_IDLE 2 #ifdef SMP struct cpu_group *cpu_top; /* CPU topology */ #define SCHED_AFFINITY_DEFAULT (max(1, hz / 1000)) #define SCHED_AFFINITY(ts, t) ((ts)->ts_rltick > ticks - ((t) * affinity)) /* * Run-time tunables. */ static int rebalance = 1; static int balance_interval = 128; /* Default set in sched_initticks(). */ static int affinity; static int steal_idle = 1; static int steal_thresh = 2; /* * One thread queue per processor.
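 *
 * (struct tdq above is __aligned(64) so one CPU updating its own queue
 * never invalidates a cache line that another CPU's queue occupies.  A
 * minimal standalone sketch of the padded per-CPU slot pattern;
 * MAXCPU_X and the load field are made up.)
 */
#if 0	/* Illustrative sketch only; never compiled. */
#include <stdio.h>

#define MAXCPU_X	4
#define CACHE_LINE	64

/* One slot per CPU, padded to a cache line, so CPU i writing its own
 * "load" never invalidates CPU j's line (no false sharing). */
struct percpu_q {
	int load;
} __attribute__((aligned(CACHE_LINE)));

static struct percpu_q qs[MAXCPU_X];

int
main(void)
{
	printf("slot stride: %zu bytes\n", sizeof(qs[0]));	/* 64 */
	qs[0].load++;		/* touches only CPU 0's line */
	return (0);
}
#endif
/*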
*/ static struct tdq tdq_cpu[MAXCPU]; static struct tdq *balance_tdq; static int balance_ticks; static DPCPU_DEFINE(uint32_t, randomval); #define TDQ_SELF() (&tdq_cpu[PCPU_GET(cpuid)]) #define TDQ_CPU(x) (&tdq_cpu[(x)]) #define TDQ_ID(x) ((int)((x) - tdq_cpu)) #else /* !SMP */ static struct tdq tdq_cpu; #define TDQ_ID(x) (0) #define TDQ_SELF() (&tdq_cpu) #define TDQ_CPU(x) (&tdq_cpu) #endif #define TDQ_LOCK_ASSERT(t, type) mtx_assert(TDQ_LOCKPTR((t)), (type)) #define TDQ_LOCK(t) mtx_lock_spin(TDQ_LOCKPTR((t))) #define TDQ_LOCK_FLAGS(t, f) mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f)) #define TDQ_UNLOCK(t) mtx_unlock_spin(TDQ_LOCKPTR((t))) #define TDQ_LOCKPTR(t) ((struct mtx *)(&(t)->tdq_lock)) static void sched_priority(struct thread *); static void sched_thread_priority(struct thread *, u_char); static int sched_interact_score(struct thread *); static void sched_interact_update(struct thread *); static void sched_interact_fork(struct thread *); static void sched_pctcpu_update(struct td_sched *, int); /* Operations on per processor queues */ static struct thread *tdq_choose(struct tdq *); static void tdq_setup(struct tdq *); static void tdq_load_add(struct tdq *, struct thread *); static void tdq_load_rem(struct tdq *, struct thread *); static __inline void tdq_runq_add(struct tdq *, struct thread *, int); static __inline void tdq_runq_rem(struct tdq *, struct thread *); static inline int sched_shouldpreempt(int, int, int); void tdq_print(int cpu); static void runq_print(struct runq *rq); static void tdq_add(struct tdq *, struct thread *, int); #ifdef SMP static int tdq_move(struct tdq *, struct tdq *); static int tdq_idled(struct tdq *); static void tdq_notify(struct tdq *, struct thread *); static struct thread *tdq_steal(struct tdq *, int); static struct thread *runq_steal(struct runq *, int); static int sched_pickcpu(struct thread *, int); static void sched_balance(void); static int sched_balance_pair(struct tdq *, struct tdq *); static inline struct tdq *sched_setcpu(struct thread *, int, int); static inline void thread_unblock_switch(struct thread *, struct mtx *); static struct mtx *sched_switch_migrate(struct tdq *, struct thread *, int); static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS); static int sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg, int indent); #endif static void sched_setup(void *dummy); SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL); static void sched_initticks(void *dummy); SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL); SDT_PROVIDER_DEFINE(sched); SDT_PROBE_DEFINE3(sched, , , change__pri, "struct thread *", "struct proc *", "uint8_t"); SDT_PROBE_DEFINE3(sched, , , dequeue, "struct thread *", "struct proc *", "void *"); SDT_PROBE_DEFINE4(sched, , , enqueue, "struct thread *", "struct proc *", "void *", "int"); SDT_PROBE_DEFINE4(sched, , , lend__pri, "struct thread *", "struct proc *", "uint8_t", "struct thread *"); SDT_PROBE_DEFINE2(sched, , , load__change, "int", "int"); SDT_PROBE_DEFINE2(sched, , , off__cpu, "struct thread *", "struct proc *"); SDT_PROBE_DEFINE(sched, , , on__cpu); SDT_PROBE_DEFINE(sched, , , remain__cpu); SDT_PROBE_DEFINE2(sched, , , surrender, "struct thread *", "struct proc *"); /* * Print the threads waiting on a run-queue. 
*/ static void runq_print(struct runq *rq) { struct rqhead *rqh; struct thread *td; int pri; int j; int i; for (i = 0; i < RQB_LEN; i++) { printf("\t\trunq bits %d 0x%zx\n", i, rq->rq_status.rqb_bits[i]); for (j = 0; j < RQB_BPW; j++) if (rq->rq_status.rqb_bits[i] & (1ul << j)) { pri = j + (i << RQB_L2BPW); rqh = &rq->rq_queues[pri]; TAILQ_FOREACH(td, rqh, td_runq) { printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n", td, td->td_name, td->td_priority, td->td_rqindex, pri); } } } } /* * Print the status of a per-cpu thread queue. Should be a ddb show cmd. */ void tdq_print(int cpu) { struct tdq *tdq; tdq = TDQ_CPU(cpu); printf("tdq %d:\n", TDQ_ID(tdq)); printf("\tlock %p\n", TDQ_LOCKPTR(tdq)); printf("\tLock name: %s\n", tdq->tdq_name); printf("\tload: %d\n", tdq->tdq_load); printf("\tswitch cnt: %d\n", tdq->tdq_switchcnt); printf("\told switch cnt: %d\n", tdq->tdq_oldswitchcnt); printf("\ttimeshare idx: %d\n", tdq->tdq_idx); printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx); printf("\tload transferable: %d\n", tdq->tdq_transferable); printf("\tlowest priority: %d\n", tdq->tdq_lowpri); printf("\trealtime runq:\n"); runq_print(&tdq->tdq_realtime); printf("\ttimeshare runq:\n"); runq_print(&tdq->tdq_timeshare); printf("\tidle runq:\n"); runq_print(&tdq->tdq_idle); } static inline int sched_shouldpreempt(int pri, int cpri, int remote) { /* * If the new priority is not better than the current priority there is * nothing to do. */ if (pri >= cpri) return (0); /* * Always preempt idle. */ if (cpri >= PRI_MIN_IDLE) return (1); /* * If preemption is disabled don't preempt others. */ if (preempt_thresh == 0) return (0); /* * Preempt if we exceed the threshold. */ if (pri <= preempt_thresh) return (1); /* * If we're interactive or better and there is non-interactive * or worse running preempt only remote processors. */ if (remote && pri <= PRI_MAX_INTERACT && cpri > PRI_MAX_INTERACT) return (1); return (0); } /* * Add a thread to the actual run-queue. Keeps transferable counts up to * date with what is actually on the run-queue. Selects the correct * queue position for timeshare threads. */ static __inline void tdq_runq_add(struct tdq *tdq, struct thread *td, int flags) { struct td_sched *ts; u_char pri; TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); pri = td->td_priority; ts = td_get_sched(td); TD_SET_RUNQ(td); if (THREAD_CAN_MIGRATE(td)) { tdq->tdq_transferable++; ts->ts_flags |= TSF_XFERABLE; } if (pri < PRI_MIN_BATCH) { ts->ts_runq = &tdq->tdq_realtime; } else if (pri <= PRI_MAX_BATCH) { ts->ts_runq = &tdq->tdq_timeshare; KASSERT(pri <= PRI_MAX_BATCH && pri >= PRI_MIN_BATCH, ("Invalid priority %d on timeshare runq", pri)); /* * This queue contains only priorities between MIN and MAX * realtime. Use the whole queue to represent these values. */ if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) { pri = RQ_NQS * (pri - PRI_MIN_BATCH) / PRI_BATCH_RANGE; pri = (pri + tdq->tdq_idx) % RQ_NQS; /* * This effectively shortens the queue by one so we * can have a one slot difference between idx and * ridx while we wait for threads to drain. */ if (tdq->tdq_ridx != tdq->tdq_idx && pri == tdq->tdq_ridx) pri = (unsigned char)(pri - 1) % RQ_NQS; } else pri = tdq->tdq_ridx; runq_add_pri(ts->ts_runq, td, pri, flags); return; } else ts->ts_runq = &tdq->tdq_idle; runq_add(ts->ts_runq, td, flags); } /* * Remove a thread from a run-queue. This typically happens when a thread * is selected to run. Running threads are not on the queue and the * transferable count does not reflect them. 
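 *
 * (tdq_runq_add() above maps a batch priority into one of RQ_NQS
 * buckets and rotates it by tdq_idx, so the timeshare queue behaves
 * like a calendar drained from tdq_ridx.  A standalone sketch of that
 * index arithmetic; NQS, MIN_BATCH and BATCH_RANGE stand in for the
 * kernel constants and the sample priority is arbitrary.)
 */
#if 0	/* Illustrative sketch only; never compiled. */
#include <stdio.h>

#define NQS		64	/* number of buckets (RQ_NQS) */
#define MIN_BATCH	88	/* illustrative PRI_MIN_BATCH */
#define BATCH_RANGE	48	/* illustrative PRI_BATCH_RANGE */

/* Scale the priority into a bucket, then rotate by the queue's current
 * insertion index so later arrivals land further from the drain head. */
static int
bucket(int pri, int idx)
{
	int b = NQS * (pri - MIN_BATCH) / BATCH_RANGE;

	return ((b + idx) % NQS);
}

int
main(void)
{
	/* The same priority inserted as the head advances: */
	printf("%d %d %d\n", bucket(100, 0), bucket(100, 8), bucket(100, 16));
	return (0);
}
#endif
/*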
*/ static __inline void tdq_runq_rem(struct tdq *tdq, struct thread *td) { struct td_sched *ts; ts = td_get_sched(td); TDQ_LOCK_ASSERT(tdq, MA_OWNED); KASSERT(ts->ts_runq != NULL, ("tdq_runq_remove: thread %p null ts_runq", td)); if (ts->ts_flags & TSF_XFERABLE) { tdq->tdq_transferable--; ts->ts_flags &= ~TSF_XFERABLE; } if (ts->ts_runq == &tdq->tdq_timeshare) { if (tdq->tdq_idx != tdq->tdq_ridx) runq_remove_idx(ts->ts_runq, td, &tdq->tdq_ridx); else runq_remove_idx(ts->ts_runq, td, NULL); } else runq_remove(ts->ts_runq, td); } /* * Load is maintained for all threads RUNNING and ON_RUNQ. Add the load * for this thread to the referenced thread queue. */ static void tdq_load_add(struct tdq *tdq, struct thread *td) { TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); tdq->tdq_load++; if ((td->td_flags & TDF_NOLOAD) == 0) tdq->tdq_sysload++; KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load); SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load); } /* * Remove the load from a thread that is transitioning to a sleep state or * exiting. */ static void tdq_load_rem(struct tdq *tdq, struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); TDQ_LOCK_ASSERT(tdq, MA_OWNED); KASSERT(tdq->tdq_load != 0, ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq))); tdq->tdq_load--; if ((td->td_flags & TDF_NOLOAD) == 0) tdq->tdq_sysload--; KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load); SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load); } /* * Bound timeshare latency by decreasing slice size as load increases. We * consider the maximum latency as the sum of the threads waiting to run * aside from curthread and target no more than sched_slice latency but * no less than sched_slice_min runtime. */ static inline int tdq_slice(struct tdq *tdq) { int load; /* * It is safe to use sys_load here because this is called from * contexts where timeshare threads are running and so there * cannot be higher priority load in the system. */ load = tdq->tdq_sysload - 1; if (load >= SCHED_SLICE_MIN_DIVISOR) return (sched_slice_min); if (load <= 1) return (sched_slice); return (sched_slice / load); } /* * Set lowpri to its exact value by searching the run-queue and * evaluating curthread. curthread may be passed as an optimization. */ static void tdq_setlowpri(struct tdq *tdq, struct thread *ctd) { struct thread *td; TDQ_LOCK_ASSERT(tdq, MA_OWNED); if (ctd == NULL) ctd = pcpu_find(TDQ_ID(tdq))->pc_curthread; td = tdq_choose(tdq); if (td == NULL || td->td_priority > ctd->td_priority) tdq->tdq_lowpri = ctd->td_priority; else tdq->tdq_lowpri = td->td_priority; } #ifdef SMP /* * We need some randomness. Implement a classic Linear Congruential * Generator X_{n+1}=(aX_n+c) mod m. These values are optimized for * m = 2^32, a = 69069 and c = 5. We only return the upper 16 bits * of the random state (in the low bits of our answer) to keep * the maximum randomness. */ static uint32_t sched_random(void) { uint32_t *rndptr; rndptr = DPCPU_PTR(randomval); *rndptr = *rndptr * 69069 + 5; return (*rndptr >> 16); } struct cpu_search { cpuset_t cs_mask; u_int cs_prefer; int cs_pri; /* Min priority for low. */ int cs_limit; /* Max load for low, min load for high. 
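 *
 * (sched_random() above is exactly the LCG described in its comment,
 * X_{n+1} = (69069 * X_n + 5) mod 2^32; the low-order bits of an LCG
 * have short periods, hence only the upper 16 bits are returned.  A
 * standalone sketch with a single state word instead of the per-CPU
 * variable.)
 */
#if 0	/* Illustrative sketch only; never compiled. */
#include <stdint.h>
#include <stdio.h>

static uint32_t state = 1;	/* per-CPU in the kernel; one suffices here */

static uint32_t
lcg16(void)
{
	state = state * 69069 + 5;	/* mod 2^32 is implicit */
	return (state >> 16);		/* keep only the high-quality bits */
}

int
main(void)
{
	for (int i = 0; i < 4; i++)
		printf("%u\n", lcg16());
	return (0);
}
#endif
/*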
*/ int cs_cpu; int cs_load; }; #define CPU_SEARCH_LOWEST 0x1 #define CPU_SEARCH_HIGHEST 0x2 #define CPU_SEARCH_BOTH (CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST) #define CPUSET_FOREACH(cpu, mask) \ for ((cpu) = 0; (cpu) <= mp_maxid; (cpu)++) \ if (CPU_ISSET(cpu, &mask)) static __always_inline int cpu_search(const struct cpu_group *cg, struct cpu_search *low, struct cpu_search *high, const int match); int __noinline cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low); int __noinline cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high); int __noinline cpu_search_both(const struct cpu_group *cg, struct cpu_search *low, struct cpu_search *high); /* * Search the tree of cpu_groups for the lowest or highest loaded cpu * according to the match argument. This routine actually compares the * load on all paths through the tree and finds the least loaded cpu on * the least loaded path, which may differ from the least loaded cpu in * the system. This balances work among caches and buses. * * This inline is instantiated in three forms below using constants for the * match argument. It is reduced to the minimum set for each case. It is * also recursive to the depth of the tree. */ static __always_inline int cpu_search(const struct cpu_group *cg, struct cpu_search *low, struct cpu_search *high, const int match) { struct cpu_search lgroup; struct cpu_search hgroup; cpuset_t cpumask; struct cpu_group *child; struct tdq *tdq; int cpu, i, hload, lload, load, total, rnd; total = 0; cpumask = cg->cg_mask; if (match & CPU_SEARCH_LOWEST) { lload = INT_MAX; lgroup = *low; } if (match & CPU_SEARCH_HIGHEST) { hload = INT_MIN; hgroup = *high; } /* Iterate through the child CPU groups and then remaining CPUs. */ for (i = cg->cg_children, cpu = mp_maxid; ; ) { if (i == 0) { #ifdef HAVE_INLINE_FFSL cpu = CPU_FFS(&cpumask) - 1; #else while (cpu >= 0 && !CPU_ISSET(cpu, &cpumask)) cpu--; #endif if (cpu < 0) break; child = NULL; } else child = &cg->cg_child[i - 1]; if (match & CPU_SEARCH_LOWEST) lgroup.cs_cpu = -1; if (match & CPU_SEARCH_HIGHEST) hgroup.cs_cpu = -1; if (child) { /* Handle child CPU group. */ CPU_NAND(&cpumask, &child->cg_mask); switch (match) { case CPU_SEARCH_LOWEST: load = cpu_search_lowest(child, &lgroup); break; case CPU_SEARCH_HIGHEST: load = cpu_search_highest(child, &hgroup); break; case CPU_SEARCH_BOTH: load = cpu_search_both(child, &lgroup, &hgroup); break; } } else { /* Handle child CPU. */ CPU_CLR(cpu, &cpumask); tdq = TDQ_CPU(cpu); load = tdq->tdq_load * 256; rnd = sched_random() % 32; if (match & CPU_SEARCH_LOWEST) { if (cpu == low->cs_prefer) load -= 64; /* If that CPU is allowed and get data. */ if (tdq->tdq_lowpri > lgroup.cs_pri && tdq->tdq_load <= lgroup.cs_limit && CPU_ISSET(cpu, &lgroup.cs_mask)) { lgroup.cs_cpu = cpu; lgroup.cs_load = load - rnd; } } if (match & CPU_SEARCH_HIGHEST) if (tdq->tdq_load >= hgroup.cs_limit && tdq->tdq_transferable && CPU_ISSET(cpu, &hgroup.cs_mask)) { hgroup.cs_cpu = cpu; hgroup.cs_load = load - rnd; } } total += load; /* We have info about child item. Compare it. 
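 *
 * (As the function header says, the recursion tracks the least loaded
 * cpu on the least loaded path, which need not be the globally least
 * loaded cpu.  A toy standalone sketch of that policy over a made-up
 * two-package topology, flattened to one level.)
 */
#if 0	/* Illustrative sketch only; never compiled. */
#include <stdio.h>

struct grp {
	int load[2];	/* loads of the two CPUs in this package */
};

/* Pick the lighter CPU inside the lighter package. */
static int
best_cpu(const struct grp *g, int ngrp)
{
	int besttotal = -1, bestcpu = -1;

	for (int i = 0; i < ngrp; i++) {
		int total = g[i].load[0] + g[i].load[1];
		int leaf = g[i].load[0] <= g[i].load[1] ? 0 : 1;

		if (besttotal == -1 || total < besttotal) {
			besttotal = total;
			bestcpu = i * 2 + leaf;
		}
	}
	return (bestcpu);
}

int
main(void)
{
	/* CPU 1 (load 0) is globally idlest, but its package totals 9;
	 * the path-based search prefers CPU 2 (load 2) in the lighter
	 * package, spreading work among caches and buses. */
	struct grp g[2] = { { { 9, 0 } }, { { 2, 3 } } };

	printf("chose cpu %d\n", best_cpu(g, 2));	/* 2 */
	return (0);
}
#endif
/*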
*/ if (match & CPU_SEARCH_LOWEST) { if (lgroup.cs_cpu >= 0 && (load < lload || (load == lload && lgroup.cs_load < low->cs_load))) { lload = load; low->cs_cpu = lgroup.cs_cpu; low->cs_load = lgroup.cs_load; } } if (match & CPU_SEARCH_HIGHEST) if (hgroup.cs_cpu >= 0 && (load > hload || (load == hload && hgroup.cs_load > high->cs_load))) { hload = load; high->cs_cpu = hgroup.cs_cpu; high->cs_load = hgroup.cs_load; } if (child) { i--; if (i == 0 && CPU_EMPTY(&cpumask)) break; } #ifndef HAVE_INLINE_FFSL else cpu--; #endif } return (total); } /* * cpu_search instantiations must pass constants to maintain the inline * optimization. */ int cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low) { return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST); } int cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high) { return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST); } int cpu_search_both(const struct cpu_group *cg, struct cpu_search *low, struct cpu_search *high) { return cpu_search(cg, low, high, CPU_SEARCH_BOTH); } /* * Find the cpu with the least load via the least loaded path that has a * lowpri greater than pri. A pri of -1 indicates any priority is * acceptable. */ static inline int sched_lowest(const struct cpu_group *cg, cpuset_t mask, int pri, int maxload, int prefer) { struct cpu_search low; low.cs_cpu = -1; low.cs_prefer = prefer; low.cs_mask = mask; low.cs_pri = pri; low.cs_limit = maxload; cpu_search_lowest(cg, &low); return low.cs_cpu; } /* * Find the cpu with the highest load via the highest loaded path. */ static inline int sched_highest(const struct cpu_group *cg, cpuset_t mask, int minload) { struct cpu_search high; high.cs_cpu = -1; high.cs_mask = mask; high.cs_limit = minload; cpu_search_highest(cg, &high); return high.cs_cpu; } static void sched_balance_group(struct cpu_group *cg) { cpuset_t hmask, lmask; int high, low, anylow; CPU_FILL(&hmask); for (;;) { high = sched_highest(cg, hmask, 1); /* Stop if there is no CPU left with transferable threads. */ if (high == -1) break; CPU_CLR(high, &hmask); CPU_COPY(&hmask, &lmask); /* Stop if there is no CPU left for low. */ if (CPU_EMPTY(&lmask)) break; anylow = 1; nextlow: low = sched_lowest(cg, lmask, -1, TDQ_CPU(high)->tdq_load - 1, high); /* Stop if we looked everywhere and found no less loaded CPU. */ if (anylow && low == -1) break; /* Go to next high if we found no less loaded CPU. */ if (low == -1) continue; /* Transfer thread from high to low. */ if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low))) { /* CPU that got thread can no longer be a donor. */ CPU_CLR(low, &hmask); } else { /* * If we failed, then there are no threads on high * that can run on this low. Drop low from the low * mask and look for a different one. */ CPU_CLR(low, &lmask); anylow = 0; goto nextlow; } } } static void sched_balance(void) { struct tdq *tdq; if (smp_started == 0 || rebalance == 0) return; balance_ticks = max(balance_interval / 2, 1) + (sched_random() % balance_interval); tdq = TDQ_SELF(); TDQ_UNLOCK(tdq); sched_balance_group(cpu_top); TDQ_LOCK(tdq); } /* * Lock two thread queues using their address to maintain lock order. */ static void tdq_lock_pair(struct tdq *one, struct tdq *two) { if (one < two) { TDQ_LOCK(one); TDQ_LOCK_FLAGS(two, MTX_DUPOK); } else { TDQ_LOCK(two); TDQ_LOCK_FLAGS(one, MTX_DUPOK); } } /* * Unlock two thread queues. Order is not important here.
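 *
 * (tdq_lock_pair() above takes the lower-addressed lock first, so any
 * two CPUs locking the same pair agree on the order and cannot
 * deadlock against each other.  A standalone pthread sketch of the
 * same discipline; the mutex names are hypothetical.)
 */
#if 0	/* Illustrative sketch only; never compiled. */
#include <pthread.h>

/* Always lock the lower address first: both parties to any pair pick
 * the same order, which rules out the classic AB/BA deadlock. */
static void
lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a < b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}

static void
unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	/* Unlock order is irrelevant, as the comment below notes. */
	pthread_mutex_unlock(a);
	pthread_mutex_unlock(b);
}

static pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

int
main(void)
{
	lock_pair(&m1, &m2);
	unlock_pair(&m1, &m2);
	return (0);
}
#endif
/*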
*/ static void tdq_unlock_pair(struct tdq *one, struct tdq *two) { TDQ_UNLOCK(one); TDQ_UNLOCK(two); } /* * Transfer load between two imbalanced thread queues. */ static int sched_balance_pair(struct tdq *high, struct tdq *low) { int moved; int cpu; tdq_lock_pair(high, low); moved = 0; /* * Determine what the imbalance is and then adjust that to how many * threads we actually have to give up (transferable). */ if (high->tdq_transferable != 0 && high->tdq_load > low->tdq_load && (moved = tdq_move(high, low)) > 0) { /* * In case the target isn't the current cpu IPI it to force a * reschedule with the new workload. */ cpu = TDQ_ID(low); if (cpu != PCPU_GET(cpuid)) ipi_cpu(cpu, IPI_PREEMPT); } tdq_unlock_pair(high, low); return (moved); } /* * Move a thread from one thread queue to another. */ static int tdq_move(struct tdq *from, struct tdq *to) { struct td_sched *ts; struct thread *td; struct tdq *tdq; int cpu; TDQ_LOCK_ASSERT(from, MA_OWNED); TDQ_LOCK_ASSERT(to, MA_OWNED); tdq = from; cpu = TDQ_ID(to); td = tdq_steal(tdq, cpu); if (td == NULL) return (0); ts = td_get_sched(td); /* * Although the run queue is locked the thread may be blocked. Lock * it to clear this and acquire the run-queue lock. */ thread_lock(td); /* Drop recursive lock on from acquired via thread_lock(). */ TDQ_UNLOCK(from); sched_rem(td); ts->ts_cpu = cpu; td->td_lock = TDQ_LOCKPTR(to); tdq_add(to, td, SRQ_YIELDING); return (1); } /* * This tdq has idled. Try to steal a thread from another cpu and switch * to it. */ static int tdq_idled(struct tdq *tdq) { struct cpu_group *cg; struct tdq *steal; cpuset_t mask; int thresh; int cpu; if (smp_started == 0 || steal_idle == 0) return (1); CPU_FILL(&mask); CPU_CLR(PCPU_GET(cpuid), &mask); /* We don't want to be preempted while we're iterating. */ spinlock_enter(); for (cg = tdq->tdq_cg; cg != NULL; ) { if ((cg->cg_flags & CG_FLAG_THREAD) == 0) thresh = steal_thresh; else thresh = 1; cpu = sched_highest(cg, mask, thresh); if (cpu == -1) { cg = cg->cg_parent; continue; } steal = TDQ_CPU(cpu); CPU_CLR(cpu, &mask); tdq_lock_pair(tdq, steal); if (steal->tdq_load < thresh || steal->tdq_transferable == 0) { tdq_unlock_pair(tdq, steal); continue; } /* * If a thread was added while interrupts were disabled don't * steal one here. If we fail to acquire one due to affinity * restrictions loop again with this cpu removed from the * set. */ if (tdq->tdq_load == 0 && tdq_move(steal, tdq) == 0) { tdq_unlock_pair(tdq, steal); continue; } spinlock_exit(); TDQ_UNLOCK(steal); mi_switch(SW_VOL | SWT_IDLE, NULL); thread_unlock(curthread); return (0); } spinlock_exit(); return (1); } /* * Notify a remote cpu of new work. Sends an IPI if criteria are met. */ static void tdq_notify(struct tdq *tdq, struct thread *td) { struct thread *ctd; int pri; int cpu; if (tdq->tdq_ipipending) return; cpu = td_get_sched(td)->ts_cpu; pri = td->td_priority; ctd = pcpu_find(cpu)->pc_curthread; if (!sched_shouldpreempt(pri, ctd->td_priority, 1)) return; /* * Make sure that our caller's earlier update to tdq_load is * globally visible before we read tdq_cpu_idle. Idle thread * accesses both of them without locks, and the order is important. */ atomic_thread_fence_seq_cst(); if (TD_IS_IDLETHREAD(ctd)) { /* * If the MD code has an idle wakeup routine try that before * falling back to IPI. */ if (!tdq->tdq_cpu_idle || cpu_idle_wakeup(cpu)) return; } tdq->tdq_ipipending = 1; ipi_cpu(cpu, IPI_PREEMPT); } /* * Steals load from a timeshare queue. Honors the rotating queue head * index. 
*/ static struct thread * runq_steal_from(struct runq *rq, int cpu, u_char start) { struct rqbits *rqb; struct rqhead *rqh; struct thread *td, *first; int bit; int i; rqb = &rq->rq_status; bit = start & (RQB_BPW -1); first = NULL; again: for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) { if (rqb->rqb_bits[i] == 0) continue; if (bit == 0) bit = RQB_FFS(rqb->rqb_bits[i]); for (; bit < RQB_BPW; bit++) { if ((rqb->rqb_bits[i] & (1ul << bit)) == 0) continue; rqh = &rq->rq_queues[bit + (i << RQB_L2BPW)]; TAILQ_FOREACH(td, rqh, td_runq) { if (first && THREAD_CAN_MIGRATE(td) && THREAD_CAN_SCHED(td, cpu)) return (td); first = td; } } } if (start != 0) { start = 0; goto again; } if (first && THREAD_CAN_MIGRATE(first) && THREAD_CAN_SCHED(first, cpu)) return (first); return (NULL); } /* * Steals load from a standard linear queue. */ static struct thread * runq_steal(struct runq *rq, int cpu) { struct rqhead *rqh; struct rqbits *rqb; struct thread *td; int word; int bit; rqb = &rq->rq_status; for (word = 0; word < RQB_LEN; word++) { if (rqb->rqb_bits[word] == 0) continue; for (bit = 0; bit < RQB_BPW; bit++) { if ((rqb->rqb_bits[word] & (1ul << bit)) == 0) continue; rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)]; TAILQ_FOREACH(td, rqh, td_runq) if (THREAD_CAN_MIGRATE(td) && THREAD_CAN_SCHED(td, cpu)) return (td); } } return (NULL); } /* * Attempt to steal a thread in priority order from a thread queue. */ static struct thread * tdq_steal(struct tdq *tdq, int cpu) { struct thread *td; TDQ_LOCK_ASSERT(tdq, MA_OWNED); if ((td = runq_steal(&tdq->tdq_realtime, cpu)) != NULL) return (td); if ((td = runq_steal_from(&tdq->tdq_timeshare, cpu, tdq->tdq_ridx)) != NULL) return (td); return (runq_steal(&tdq->tdq_idle, cpu)); } /* * Sets the thread lock and ts_cpu to match the requested cpu. Unlocks the * current lock and returns with the assigned queue locked. */ static inline struct tdq * sched_setcpu(struct thread *td, int cpu, int flags) { struct tdq *tdq; THREAD_LOCK_ASSERT(td, MA_OWNED); tdq = TDQ_CPU(cpu); td_get_sched(td)->ts_cpu = cpu; /* * If the lock matches just return the queue. */ if (td->td_lock == TDQ_LOCKPTR(tdq)) return (tdq); #ifdef notyet /* * If the thread isn't running its lockptr is a * turnstile or a sleepqueue. We can just lock_set without * blocking. */ if (TD_CAN_RUN(td)) { TDQ_LOCK(tdq); thread_lock_set(td, TDQ_LOCKPTR(tdq)); return (tdq); } #endif /* * The hard case, migration, we need to block the thread first to * prevent order reversals with other cpus locks. */ spinlock_enter(); thread_lock_block(td); TDQ_LOCK(tdq); thread_lock_unblock(td, TDQ_LOCKPTR(tdq)); spinlock_exit(); return (tdq); } SCHED_STAT_DEFINE(pickcpu_intrbind, "Soft interrupt binding"); SCHED_STAT_DEFINE(pickcpu_idle_affinity, "Picked idle cpu based on affinity"); SCHED_STAT_DEFINE(pickcpu_affinity, "Picked cpu based on affinity"); SCHED_STAT_DEFINE(pickcpu_lowest, "Selected lowest load"); SCHED_STAT_DEFINE(pickcpu_local, "Migrated to current cpu"); SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration"); static int sched_pickcpu(struct thread *td, int flags) { struct cpu_group *cg, *ccg; struct td_sched *ts; struct tdq *tdq; cpuset_t mask; int cpu, pri, self; self = PCPU_GET(cpuid); ts = td_get_sched(td); KASSERT(!CPU_ABSENT(ts->ts_cpu), ("sched_pickcpu: Start scheduler on " "absent CPU %d for thread %s.", ts->ts_cpu, td->td_name)); if (smp_started == 0) return (self); /* * Don't migrate a running thread from sched_switch(). 
*/ if ((flags & SRQ_OURSELF) || !THREAD_CAN_MIGRATE(td)) return (ts->ts_cpu); /* * Prefer to run interrupt threads on the processors that generate * the interrupt. */ pri = td->td_priority; if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) && curthread->td_intr_nesting_level && ts->ts_cpu != self) { SCHED_STAT_INC(pickcpu_intrbind); ts->ts_cpu = self; if (TDQ_CPU(self)->tdq_lowpri > pri) { SCHED_STAT_INC(pickcpu_affinity); return (ts->ts_cpu); } } /* * If the thread can run on the last cpu and the affinity has not * expired or it is idle run it there. */ tdq = TDQ_CPU(ts->ts_cpu); cg = tdq->tdq_cg; if (THREAD_CAN_SCHED(td, ts->ts_cpu) && tdq->tdq_lowpri >= PRI_MIN_IDLE && SCHED_AFFINITY(ts, CG_SHARE_L2)) { if (cg->cg_flags & CG_FLAG_THREAD) { CPUSET_FOREACH(cpu, cg->cg_mask) { if (TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) break; } } else cpu = INT_MAX; if (cpu > mp_maxid) { SCHED_STAT_INC(pickcpu_idle_affinity); return (ts->ts_cpu); } } /* * Search for the last level cache CPU group in the tree. * Skip caches with expired affinity time and SMT groups. * Affinity to higher level caches will be handled less aggressively. */ for (ccg = NULL; cg != NULL; cg = cg->cg_parent) { if (cg->cg_flags & CG_FLAG_THREAD) continue; if (!SCHED_AFFINITY(ts, cg->cg_level)) continue; ccg = cg; } if (ccg != NULL) cg = ccg; cpu = -1; /* Search the group for the less loaded idle CPU we can run now. */ mask = td->td_cpuset->cs_mask; if (cg != NULL && cg != cpu_top && CPU_CMP(&cg->cg_mask, &cpu_top->cg_mask) != 0) cpu = sched_lowest(cg, mask, max(pri, PRI_MAX_TIMESHARE), INT_MAX, ts->ts_cpu); /* Search globally for the less loaded CPU we can run now. */ if (cpu == -1) cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu); /* Search globally for the less loaded CPU. */ if (cpu == -1) cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu); KASSERT(cpu != -1, ("sched_pickcpu: Failed to find a cpu.")); KASSERT(!CPU_ABSENT(cpu), ("sched_pickcpu: Picked absent CPU %d.", cpu)); /* * Compare the lowest loaded cpu to current cpu. */ if (THREAD_CAN_SCHED(td, self) && TDQ_CPU(self)->tdq_lowpri > pri && TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE && TDQ_CPU(self)->tdq_load <= TDQ_CPU(cpu)->tdq_load + 1) { SCHED_STAT_INC(pickcpu_local); cpu = self; } else SCHED_STAT_INC(pickcpu_lowest); if (cpu != ts->ts_cpu) SCHED_STAT_INC(pickcpu_migration); return (cpu); } #endif /* * Pick the highest priority task we have and return it. */ static struct thread * tdq_choose(struct tdq *tdq) { struct thread *td; TDQ_LOCK_ASSERT(tdq, MA_OWNED); td = runq_choose(&tdq->tdq_realtime); if (td != NULL) return (td); td = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx); if (td != NULL) { KASSERT(td->td_priority >= PRI_MIN_BATCH, ("tdq_choose: Invalid priority on timeshare queue %d", td->td_priority)); return (td); } td = runq_choose(&tdq->tdq_idle); if (td != NULL) { KASSERT(td->td_priority >= PRI_MIN_IDLE, ("tdq_choose: Invalid priority on idle queue %d", td->td_priority)); return (td); } return (NULL); } /* * Initialize a thread queue. 
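 *
 * (tdq_choose() above consults the three queues strictly in order:
 * realtime, then timeshare at the rotating ridx, then idle.  A
 * compressed standalone sketch of that fall-through policy with the
 * queues reduced to plain counters.)
 */
#if 0	/* Illustrative sketch only; never compiled. */
#include <stdio.h>

/* Pick from the first non-empty tier; a lower tier only ever runs
 * when every higher tier is empty. */
static const char *
choose(int nrealtime, int ntimeshare, int nidle)
{
	if (nrealtime > 0)
		return ("realtime");
	if (ntimeshare > 0)
		return ("timeshare");	/* drained from the ridx bucket */
	if (nidle > 0)
		return ("idle");
	return (NULL);
}

int
main(void)
{
	printf("%s\n", choose(0, 3, 1));	/* timeshare */
	return (0);
}
#endif
/*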
*/ static void tdq_setup(struct tdq *tdq) { if (bootverbose) printf("ULE: setup cpu %d\n", TDQ_ID(tdq)); runq_init(&tdq->tdq_realtime); runq_init(&tdq->tdq_timeshare); runq_init(&tdq->tdq_idle); snprintf(tdq->tdq_name, sizeof(tdq->tdq_name), "sched lock %d", (int)TDQ_ID(tdq)); mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock", MTX_SPIN | MTX_RECURSE); #ifdef KTR snprintf(tdq->tdq_loadname, sizeof(tdq->tdq_loadname), "CPU %d load", (int)TDQ_ID(tdq)); #endif } #ifdef SMP static void sched_setup_smp(void) { struct tdq *tdq; int i; cpu_top = smp_topo(); CPU_FOREACH(i) { tdq = TDQ_CPU(i); tdq_setup(tdq); tdq->tdq_cg = smp_topo_find(cpu_top, i); if (tdq->tdq_cg == NULL) panic("Can't find cpu group for %d\n", i); } balance_tdq = TDQ_SELF(); sched_balance(); } #endif /* * Setup the thread queues and initialize the topology based on MD * information. */ static void sched_setup(void *dummy) { struct tdq *tdq; tdq = TDQ_SELF(); #ifdef SMP sched_setup_smp(); #else tdq_setup(tdq); #endif /* Add thread0's load since it's running. */ TDQ_LOCK(tdq); td_get_sched(&thread0)->ts_cpu = curcpu; /* Something valid to start */ thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF()); tdq_load_add(tdq, &thread0); tdq->tdq_lowpri = thread0.td_priority; TDQ_UNLOCK(tdq); } /* * This routine determines time constants after stathz and hz are setup. */ /* ARGSUSED */ static void sched_initticks(void *dummy) { int incr; realstathz = stathz ? stathz : hz; sched_slice = realstathz / SCHED_SLICE_DEFAULT_DIVISOR; sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR; hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) / realstathz); /* * tickincr is shifted out by 10 to avoid rounding errors due to * hz not being evenly divisible by stathz on all platforms. */ incr = (hz << SCHED_TICK_SHIFT) / realstathz; /* * This does not work for values of stathz that are more than * 1 << SCHED_TICK_SHIFT * hz. In practice this does not happen. */ if (incr == 0) incr = 1; tickincr = incr; #ifdef SMP /* * Set the default balance interval now that we know * what realstathz is. */ balance_interval = realstathz; affinity = SCHED_AFFINITY_DEFAULT; #endif if (sched_idlespinthresh < 0) sched_idlespinthresh = 2 * max(10000, 6 * hz) / realstathz; } /* * This is the core of the interactivity algorithm. Determines a score based * on past behavior. It is the ratio of sleep time to run time scaled to * a [0, 100] integer. This is the voluntary sleep time of a process, which * differs from the cpu usage because it does not account for time spent * waiting on a run-queue. Would be prettier if we had floating point. * * When a thread's sleep time is greater than its run time the * calculation is: * * scaling factor * interactivity score = --------------------- * sleep time / run time * * * When a thread's run time is greater than its sleep time the * calculation is: * * scaling factor * interactivity score = --------------------- + scaling factor * run time / sleep time */ static int sched_interact_score(struct thread *td) { struct td_sched *ts; int div; ts = td_get_sched(td); /* * The score is only needed if this is likely to be an interactive * task. Don't go through the expense of computing it if there's * no chance. 
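 *
 * (A worked standalone example of the two formulas from the function
 * header, using MAX = 100 and HALF = 50 as in the macros above; the
 * tick counts are made up.)
 */
#if 0	/* Illustrative sketch only; never compiled. */
#include <stdio.h>

#define MAX	100
#define HALF	(MAX / 2)

static int
score(unsigned run, unsigned slp)
{
	unsigned div;

	if (run > slp) {	/* CPU hog: upper half of the scale */
		div = run / HALF > 1 ? run / HALF : 1;
		return (HALF + (HALF - slp / div));
	}
	if (slp > run) {	/* sleeper: lower half of the scale */
		div = slp / HALF > 1 ? slp / HALF : 1;
		return (run / div);
	}
	return (run ? HALF : 0);
}

int
main(void)
{
	printf("%d\n", score(100, 400));	/* sleeps 4x as much: 12 */
	printf("%d\n", score(400, 100));	/* runs 4x as much: 88 */
	return (0);
}
#endif
/*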
*/ if (sched_interact <= SCHED_INTERACT_HALF && ts->ts_runtime >= ts->ts_slptime) return (SCHED_INTERACT_HALF); if (ts->ts_runtime > ts->ts_slptime) { div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF); return (SCHED_INTERACT_HALF + (SCHED_INTERACT_HALF - (ts->ts_slptime / div))); } if (ts->ts_slptime > ts->ts_runtime) { div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF); return (ts->ts_runtime / div); } /* runtime == slptime */ if (ts->ts_runtime) return (SCHED_INTERACT_HALF); /* * This can happen if slptime and runtime are 0. */ return (0); } /* * Scale the scheduling priority according to the "interactivity" of this * process. */ static void sched_priority(struct thread *td) { int score; int pri; if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE) return; /* * If the score is interactive we place the thread in the realtime * queue with a priority that is less than kernel and interrupt * priorities. These threads are not subject to nice restrictions. * * Scores greater than this are placed on the normal timeshare queue * where the priority is partially decided by the most recent cpu * utilization and the rest is decided by nice value. * * The nice value of the process has a linear effect on the calculated * score. Negative nice values make it easier for a thread to be * considered interactive. */ score = imax(0, sched_interact_score(td) + td->td_proc->p_nice); if (score < sched_interact) { pri = PRI_MIN_INTERACT; pri += ((PRI_MAX_INTERACT - PRI_MIN_INTERACT + 1) / sched_interact) * score; KASSERT(pri >= PRI_MIN_INTERACT && pri <= PRI_MAX_INTERACT, ("sched_priority: invalid interactive priority %d score %d", pri, score)); } else { pri = SCHED_PRI_MIN; if (td_get_sched(td)->ts_ticks) pri += min(SCHED_PRI_TICKS(td_get_sched(td)), SCHED_PRI_RANGE - 1); pri += SCHED_PRI_NICE(td->td_proc->p_nice); KASSERT(pri >= PRI_MIN_BATCH && pri <= PRI_MAX_BATCH, ("sched_priority: invalid priority %d: nice %d, " "ticks %d ftick %d ltick %d tick pri %d", pri, td->td_proc->p_nice, td_get_sched(td)->ts_ticks, td_get_sched(td)->ts_ftick, td_get_sched(td)->ts_ltick, SCHED_PRI_TICKS(td_get_sched(td)))); } sched_user_prio(td, pri); return; } /* * This routine enforces a maximum limit on the amount of scheduling history * kept. It is called after either the slptime or runtime is adjusted. This * function is ugly due to integer math. */ static void sched_interact_update(struct thread *td) { struct td_sched *ts; u_int sum; ts = td_get_sched(td); sum = ts->ts_runtime + ts->ts_slptime; if (sum < SCHED_SLP_RUN_MAX) return; /* * This only happens from two places: * 1) We have added an unusual amount of run time from fork_exit. * 2) We have added an unusual amount of sleep time from sched_sleep(). */ if (sum > SCHED_SLP_RUN_MAX * 2) { if (ts->ts_runtime > ts->ts_slptime) { ts->ts_runtime = SCHED_SLP_RUN_MAX; ts->ts_slptime = 1; } else { ts->ts_slptime = SCHED_SLP_RUN_MAX; ts->ts_runtime = 1; } return; } /* * If we have exceeded by more than 1/5th then the algorithm below * will not bring us back into range. Dividing by two here forces * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX] */ if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) { ts->ts_runtime /= 2; ts->ts_slptime /= 2; return; } ts->ts_runtime = (ts->ts_runtime / 5) * 4; ts->ts_slptime = (ts->ts_slptime / 5) * 4; } /* * Scale back the interactivity history when a child thread is created. The * history is inherited from the parent but the thread may behave totally * differently. For example, a shell spawning a compiler process. 
We want * to learn that the compiler is behaving badly very quickly. */ static void sched_interact_fork(struct thread *td) { struct td_sched *ts; int ratio; int sum; ts = td_get_sched(td); sum = ts->ts_runtime + ts->ts_slptime; if (sum > SCHED_SLP_RUN_FORK) { ratio = sum / SCHED_SLP_RUN_FORK; ts->ts_runtime /= ratio; ts->ts_slptime /= ratio; } } /* * Called from proc0_init() to setup the scheduler fields. */ void schedinit(void) { struct td_sched *ts0; /* * Set up the scheduler specific parts of thread0. */ ts0 = td_get_sched(&thread0); ts0->ts_ltick = ticks; ts0->ts_ftick = ticks; ts0->ts_slice = 0; } /* * This is only somewhat accurate since given many processes of the same * priority they will switch when their slices run out, which will be * at most sched_slice stathz ticks. */ int sched_rr_interval(void) { /* Convert sched_slice from stathz to hz. */ return (imax(1, (sched_slice * hz + realstathz / 2) / realstathz)); } /* * Update the percent cpu tracking information when it is requested or * the total history exceeds the maximum. We keep a sliding history of * tick counts that slowly decays. This is less precise than the 4BSD * mechanism since it happens with less regular and frequent events. */ static void sched_pctcpu_update(struct td_sched *ts, int run) { int t = ticks; /* * The signed difference may be negative if the thread hasn't run for * over half of the ticks rollover period. */ if ((u_int)(t - ts->ts_ltick) >= SCHED_TICK_TARG) { ts->ts_ticks = 0; ts->ts_ftick = t - SCHED_TICK_TARG; } else if (t - ts->ts_ftick >= SCHED_TICK_MAX) { ts->ts_ticks = (ts->ts_ticks / (ts->ts_ltick - ts->ts_ftick)) * (ts->ts_ltick - (t - SCHED_TICK_TARG)); ts->ts_ftick = t - SCHED_TICK_TARG; } if (run) ts->ts_ticks += (t - ts->ts_ltick) << SCHED_TICK_SHIFT; ts->ts_ltick = t; } /* * Adjust the priority of a thread. Move it to the appropriate run-queue * if necessary. This is the back-end for several priority related * functions. */ static void sched_thread_priority(struct thread *td, u_char prio) { struct td_sched *ts; struct tdq *tdq; int oldpri; KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "prio", "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED, sched_tdname(curthread)); SDT_PROBE3(sched, , , change__pri, td, td->td_proc, prio); if (td != curthread && prio < td->td_priority) { KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread), "lend prio", "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , lend__pri, td, td->td_proc, prio, curthread); } ts = td_get_sched(td); THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority == prio) return; /* * If the priority has been elevated due to priority * propagation, we may have to move ourselves to a new * queue. This could be optimized to not re-add in some * cases. */ if (TD_ON_RUNQ(td) && prio < td->td_priority) { sched_rem(td); td->td_priority = prio; sched_add(td, SRQ_BORROWING); return; } /* * If the thread is currently running we may have to adjust the lowpri * information so other cpus are aware of our current priority. */ if (TD_IS_RUNNING(td)) { tdq = TDQ_CPU(ts->ts_cpu); oldpri = td->td_priority; td->td_priority = prio; if (prio < tdq->tdq_lowpri) tdq->tdq_lowpri = prio; else if (tdq->tdq_lowpri == oldpri) tdq_setlowpri(tdq, td); return; } td->td_priority = prio; } /* * Update a thread's priority when it is lent another thread's * priority. 
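The fork-time history rescaling just divided both terms by the same integer ratio, which preserves the runtime:slptime ratio (and therefore the score) while shrinking the weight of the inherited history. A minimal sketch, assuming a hypothetical SLP_RUN_FORK cap of 1000 units:

/*
 * Userspace sketch of sched_interact_fork()'s history scaling.
 * SLP_RUN_FORK is an assumed illustrative cap, not the kernel value.
 */
#include <stdio.h>

#define SLP_RUN_FORK 1000

struct hist { unsigned int runtime, slptime; };

static void
interact_fork_scale(struct hist *h)
{
        unsigned int ratio, sum;

        sum = h->runtime + h->slptime;
        if (sum > SLP_RUN_FORK) {
                /* Shrink both terms by the same integer ratio... */
                ratio = sum / SLP_RUN_FORK;
                h->runtime /= ratio;
                h->slptime /= ratio;
                /* ...so the ratio, and thus the score, is preserved. */
        }
}

int
main(void)
{
        struct hist h = { 6000, 2000 };

        interact_fork_scale(&h);                /* ratio = 8 */
        printf("%u %u\n", h.runtime, h.slptime);        /* 750 250 */
        return (0);
}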
*/ void sched_lend_prio(struct thread *td, u_char prio) { td->td_flags |= TDF_BORROWING; sched_thread_priority(td, prio); } /* * Restore a thread's priority when priority propagation is * over. The prio argument is the minimum priority the thread * needs to have to satisfy other possible priority lending * requests. If the thread's regular priority is less * important than prio, the thread will keep a priority boost * of prio. */ void sched_unlend_prio(struct thread *td, u_char prio) { u_char base_pri; if (td->td_base_pri >= PRI_MIN_TIMESHARE && td->td_base_pri <= PRI_MAX_TIMESHARE) base_pri = td->td_user_pri; else base_pri = td->td_base_pri; if (prio >= base_pri) { td->td_flags &= ~TDF_BORROWING; sched_thread_priority(td, base_pri); } else sched_lend_prio(td, prio); } /* * Standard entry for setting the priority to an absolute value. */ void sched_prio(struct thread *td, u_char prio) { u_char oldprio; /* First, update the base priority. */ td->td_base_pri = prio; /* * If the thread is borrowing another thread's priority, don't * ever lower the priority. */ if (td->td_flags & TDF_BORROWING && td->td_priority < prio) return; /* Change the real priority. */ oldprio = td->td_priority; sched_thread_priority(td, prio); /* * If the thread is on a turnstile, then let the turnstile update * its state. */ if (TD_ON_LOCK(td) && oldprio != prio) turnstile_adjust(td, oldprio); } /* * Set the base user priority; does not affect the current running priority. */ void sched_user_prio(struct thread *td, u_char prio) { td->td_base_user_pri = prio; if (td->td_lend_user_pri <= prio) return; td->td_user_pri = prio; } void sched_lend_user_prio(struct thread *td, u_char prio) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_lend_user_pri = prio; td->td_user_pri = min(prio, td->td_base_user_pri); if (td->td_priority > td->td_user_pri) sched_prio(td, td->td_user_pri); else if (td->td_priority != td->td_user_pri) td->td_flags |= TDF_NEEDRESCHED; } /* * Handle migration from sched_switch(). This happens only for * cpu binding. */ static struct mtx * sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags) { struct tdq *tdn; KASSERT(!CPU_ABSENT(td_get_sched(td)->ts_cpu), ("sched_switch_migrate: " "thread %s queued on absent CPU %d.", td->td_name, td_get_sched(td)->ts_cpu)); tdn = TDQ_CPU(td_get_sched(td)->ts_cpu); #ifdef SMP tdq_load_rem(tdq, td); /* * Do the lock dance required to avoid LOR. We grab an extra * spinlock nesting to prevent preemption while we're * not holding either run-queue lock. */ spinlock_enter(); thread_lock_block(td); /* This releases the lock on tdq. */ /* * Acquire both run-queue locks before placing the thread on the new * run-queue to avoid deadlocks created by placing a thread with a * blocked lock on the run-queue of a remote processor. The deadlock * occurs when a third processor attempts to lock the two queues in * question while the target processor is spinning with its own * run-queue lock held while waiting for the blocked lock to clear. */ tdq_lock_pair(tdn, tdq); tdq_add(tdn, td, flags); tdq_notify(tdn, td); TDQ_UNLOCK(tdn); spinlock_exit(); #endif return (TDQ_LOCKPTR(tdn)); } /* * Variadic version of thread_lock_unblock() that does not assume td_lock * is blocked. */ static inline void thread_unblock_switch(struct thread *td, struct mtx *mtx) { atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock, (uintptr_t)mtx); } /* * Switch threads. This function has to handle threads coming in while * blocked for some reason, running, or idle.
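The lend/unlend protocol backed by sched_thread_priority() reduces to a small state machine: a borrowed priority is marked with a flag, and unlending falls back to the base priority only when the base satisfies the strongest remaining request. The sketch below is a simplified userspace model (it omits the timeshare/user-priority distinction) with illustrative types:

/*
 * Userspace sketch of priority lending. Lower numbers are more
 * important, as in the kernel; names here are illustrative.
 */
#include <assert.h>
#include <stdbool.h>

struct fake_td {
        unsigned char base_pri; /* priority the thread asked for */
        unsigned char pri;      /* effective priority */
        bool borrowing;
};

static void
lend_prio(struct fake_td *td, unsigned char prio)
{
        td->borrowing = true;
        td->pri = prio;
}

static void
unlend_prio(struct fake_td *td, unsigned char prio)
{
        /*
         * prio is the strongest (lowest) remaining lending request;
         * return to the base priority only if it satisfies that.
         */
        if (prio >= td->base_pri) {
                td->borrowing = false;
                td->pri = td->base_pri;
        } else
                lend_prio(td, prio);
}

int
main(void)
{
        struct fake_td td = { .base_pri = 120, .pri = 120 };

        lend_prio(&td, 80);     /* donated by a more important waiter */
        unlend_prio(&td, 100);  /* another request still outstanding */
        assert(td.pri == 100 && td.borrowing);
        unlend_prio(&td, 130);  /* all requests weaker than the base */
        assert(td.pri == 120 && !td.borrowing);
        return (0);
}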
It also must deal with * migrating a thread from one queue to another as running threads may * be assigned elsewhere via binding. */ void sched_switch(struct thread *td, struct thread *newtd, int flags) { struct tdq *tdq; struct td_sched *ts; struct mtx *mtx; int srqflag; int cpuid, preempted; THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(newtd == NULL, ("sched_switch: Unsupported newtd argument")); cpuid = PCPU_GET(cpuid); tdq = TDQ_CPU(cpuid); ts = td_get_sched(td); mtx = td->td_lock; sched_pctcpu_update(ts, 1); ts->ts_rltick = ticks; td->td_lastcpu = td->td_oncpu; td->td_oncpu = NOCPU; preempted = (td->td_flags & TDF_SLICEEND) == 0 && (flags & SW_PREEMPT) != 0; td->td_flags &= ~(TDF_NEEDRESCHED | TDF_SLICEEND); td->td_owepreempt = 0; if (!TD_IS_IDLETHREAD(td)) tdq->tdq_switchcnt++; /* * The lock pointer in an idle thread should never change. Reset it * to CAN_RUN as well. */ if (TD_IS_IDLETHREAD(td)) { MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); TD_SET_CAN_RUN(td); } else if (TD_IS_RUNNING(td)) { MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); srqflag = preempted ? SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : SRQ_OURSELF|SRQ_YIELDING; #ifdef SMP if (THREAD_CAN_MIGRATE(td) && !THREAD_CAN_SCHED(td, ts->ts_cpu)) ts->ts_cpu = sched_pickcpu(td, 0); #endif if (ts->ts_cpu == cpuid) tdq_runq_add(tdq, td, srqflag); else { KASSERT(THREAD_CAN_MIGRATE(td) || (ts->ts_flags & TSF_BOUND) != 0, ("Thread %p shouldn't migrate", td)); mtx = sched_switch_migrate(tdq, td, srqflag); } } else { /* This thread must be going to sleep. */ TDQ_LOCK(tdq); mtx = thread_lock_block(td); tdq_load_rem(tdq, td); } #if (KTR_COMPILE & KTR_SCHED) != 0 if (TD_IS_IDLETHREAD(td)) KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle", "prio:%d", td->td_priority); else KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td), "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg, "lockname:\"%s\"", td->td_lockname); #endif /* * We enter here with the thread blocked and assigned to the * appropriate cpu run-queue or sleep-queue and with the current * thread-queue locked. */ TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED); newtd = choosethread(); /* * Call the MD code to switch contexts if necessary. */ if (td != newtd) { #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc); lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object); TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd; sched_pctcpu_update(td_get_sched(newtd), 0); #ifdef KDTRACE_HOOKS /* * If DTrace has set the active vtime enum to anything * other than INACTIVE (0), then it should have set the * function to call. */ if (dtrace_vtime_active) (*dtrace_vtime_switch_func)(newtd); #endif cpu_switch(td, newtd, mtx); /* * We may return from cpu_switch on a different cpu. However, * we always return with td_lock pointing to the current cpu's * run queue lock. */ cpuid = PCPU_GET(cpuid); tdq = TDQ_CPU(cpuid); lock_profile_obtain_lock_success( &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__); SDT_PROBE0(sched, , , on__cpu); #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN); #endif } else { thread_unblock_switch(td, mtx); SDT_PROBE0(sched, , , remain__cpu); } KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running", "prio:%d", td->td_priority); /* * Assert that all went well and return. 
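One detail worth isolating from sched_switch() is how a preemption is classified: SW_PREEMPT counts only when the slice had not already expired, so quantum expiry is not misread as an involuntary preemption. A small sketch with illustrative flag values:

/*
 * Userspace sketch of the "preempted" test at the top of
 * sched_switch(). Flag values are illustrative, not the kernel's.
 */
#include <stdbool.h>
#include <stdio.h>

#define TDF_SLICEEND_X  0x1     /* slice ran out on the last tick */
#define SW_PREEMPT_X    0x2     /* switch requested by a preemption */

static bool
is_preempted(int td_flags, int sw_flags)
{
        return ((td_flags & TDF_SLICEEND_X) == 0 &&
            (sw_flags & SW_PREEMPT_X) != 0);
}

int
main(void)
{
        printf("%d\n", is_preempted(0, SW_PREEMPT_X));                  /* 1 */
        printf("%d\n", is_preempted(TDF_SLICEEND_X, SW_PREEMPT_X));     /* 0 */
        return (0);
}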
*/ TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED); MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); td->td_oncpu = cpuid; } /* * Adjust thread priorities as a result of a nice request. */ void sched_nice(struct proc *p, int nice) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_nice = nice; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); sched_priority(td); sched_prio(td, td->td_base_user_pri); thread_unlock(td); } } /* * Record the sleep time for the interactivity scorer. */ void sched_sleep(struct thread *td, int prio) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_slptick = ticks; if (TD_IS_SUSPENDED(td) || prio >= PSOCK) td->td_flags |= TDF_CANSWAP; if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE) return; if (static_boost == 1 && prio) sched_prio(td, prio); else if (static_boost && td->td_priority > static_boost) sched_prio(td, static_boost); } /* * Schedule a thread to resume execution and record how long it voluntarily * slept. We also update the pctcpu, interactivity, and priority. */ void sched_wakeup(struct thread *td) { struct td_sched *ts; int slptick; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td_get_sched(td); td->td_flags &= ~TDF_CANSWAP; /* * If we slept for more than a tick update our interactivity and * priority. */ slptick = td->td_slptick; td->td_slptick = 0; if (slptick && slptick != ticks) { ts->ts_slptime += (ticks - slptick) << SCHED_TICK_SHIFT; sched_interact_update(td); sched_pctcpu_update(ts, 0); } /* * Reset the slice value since we slept and advanced the round-robin. */ ts->ts_slice = 0; sched_add(td, SRQ_BORING); } /* * Penalize the parent for creating a new child and initialize the child's * priority. */ void sched_fork(struct thread *td, struct thread *child) { THREAD_LOCK_ASSERT(td, MA_OWNED); sched_pctcpu_update(td_get_sched(td), 1); sched_fork_thread(td, child); /* * Penalize the parent and child for forking. */ sched_interact_fork(child); sched_priority(child); td_get_sched(td)->ts_runtime += tickincr; sched_interact_update(td); sched_priority(td); } /* * Fork a new thread, may be within the same process. */ void sched_fork_thread(struct thread *td, struct thread *child) { struct td_sched *ts; struct td_sched *ts2; struct tdq *tdq; tdq = TDQ_SELF(); THREAD_LOCK_ASSERT(td, MA_OWNED); /* * Initialize child. */ ts = td_get_sched(td); ts2 = td_get_sched(child); child->td_oncpu = NOCPU; child->td_lastcpu = NOCPU; child->td_lock = TDQ_LOCKPTR(tdq); child->td_cpuset = cpuset_ref(td->td_cpuset); ts2->ts_cpu = ts->ts_cpu; ts2->ts_flags = 0; /* * Grab our parents cpu estimation information. */ ts2->ts_ticks = ts->ts_ticks; ts2->ts_ltick = ts->ts_ltick; ts2->ts_ftick = ts->ts_ftick; /* * Do not inherit any borrowed priority from the parent. */ child->td_priority = child->td_base_pri; /* * And update interactivity score. */ ts2->ts_slptime = ts->ts_slptime; ts2->ts_runtime = ts->ts_runtime; /* Attempt to quickly learn interactivity. */ ts2->ts_slice = tdq_slice(tdq) - sched_slice_min; #ifdef KTR bzero(ts2->ts_name, sizeof(ts2->ts_name)); #endif } /* * Adjust the priority class of a thread. */ void sched_class(struct thread *td, int class) { THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_pri_class == class) return; td->td_pri_class = class; } /* * Return some of the child's priority and interactivity to the parent. 
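The wakeup-side bookkeeping converts the voluntary sleep interval into the same fixed-point units as runtime with a left shift. A minimal sketch, treating a shift of 10 (the value the tickincr comment in this file mentions) as an assumption:

/*
 * Userspace sketch of the sched_wakeup() sleep-time accounting.
 * TICK_SHIFT is assumed to be 10, matching the tickincr comment.
 */
#include <stdio.h>

#define TICK_SHIFT 10

struct fake_ts { unsigned int slptime; };

static void
record_sleep(struct fake_ts *ts, int slptick, int now)
{
        /* Only charge sleep time if we slept for at least one tick. */
        if (slptick != 0 && slptick != now)
                ts->slptime += (unsigned int)(now - slptick) << TICK_SHIFT;
}

int
main(void)
{
        struct fake_ts ts = { 0 };

        record_sleep(&ts, 100, 250);            /* slept 150 ticks */
        printf("slptime = %u\n", ts.slptime);   /* 150 << 10 = 153600 */
        return (0);
}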
*/ void sched_exit(struct proc *p, struct thread *child) { struct thread *td; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "proc exit", "prio:%d", child->td_priority); PROC_LOCK_ASSERT(p, MA_OWNED); td = FIRST_THREAD_IN_PROC(p); sched_exit_thread(td, child); } /* * Penalize another thread for the time spent on this one. This helps to * worsen the priority and interactivity of processes which schedule batch * jobs such as make. This has little effect on the make process itself but * causes new processes spawned by it to receive worse scores immediately. */ void sched_exit_thread(struct thread *td, struct thread *child) { KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "thread exit", "prio:%d", child->td_priority); /* * Give the child's runtime to the parent without returning the * sleep time as a penalty to the parent. This causes shells that * launch expensive things to mark their children as expensive. */ thread_lock(td); td_get_sched(td)->ts_runtime += td_get_sched(child)->ts_runtime; sched_interact_update(td); sched_priority(td); thread_unlock(td); } void sched_preempt(struct thread *td) { struct tdq *tdq; SDT_PROBE2(sched, , , surrender, td, td->td_proc); thread_lock(td); tdq = TDQ_SELF(); TDQ_LOCK_ASSERT(tdq, MA_OWNED); tdq->tdq_ipipending = 0; if (td->td_priority > tdq->tdq_lowpri) { int flags; flags = SW_INVOL | SW_PREEMPT; if (td->td_critnest > 1) td->td_owepreempt = 1; else if (TD_IS_IDLETHREAD(td)) mi_switch(flags | SWT_REMOTEWAKEIDLE, NULL); else mi_switch(flags | SWT_REMOTEPREEMPT, NULL); } thread_unlock(td); } /* * Fix priorities on return to user-space. Priorities may be elevated due * to static priorities in msleep() or similar. */ void sched_userret(struct thread *td) { /* * XXX we cheat slightly on the locking here to avoid locking in * the usual case. Setting td_priority here is essentially an * incomplete workaround for not setting it properly elsewhere. * Now that some interrupt handlers are threads, not setting it * properly elsewhere can clobber it in the window between setting * it here and returning to user mode, so don't waste time setting * it perfectly here. */ KASSERT((td->td_flags & TDF_BORROWING) == 0, ("thread with borrowed priority returning to userland")); if (td->td_priority != td->td_user_pri) { thread_lock(td); td->td_priority = td->td_user_pri; td->td_base_pri = td->td_user_pri; tdq_setlowpri(TDQ_SELF(), td); thread_unlock(td); } } /* * Handle a stathz tick. This is really only relevant for timeshare * threads. */ void sched_clock(struct thread *td) { struct tdq *tdq; struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); tdq = TDQ_SELF(); #ifdef SMP /* * We run the long term load balancer infrequently on the first cpu. */ if (balance_tdq == tdq) { if (balance_ticks && --balance_ticks == 0) sched_balance(); } #endif /* * Save the old switch count so we have a record of the last ticks * activity. Initialize the new switch count based on our load. * If there is some activity seed it to reflect that. */ tdq->tdq_oldswitchcnt = tdq->tdq_switchcnt; tdq->tdq_switchcnt = tdq->tdq_load; /* * Advance the insert index once for each tick to ensure that all * threads get a chance to run. 
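The insert/removal index pair described here behaves like a calendar queue, as the code that follows shows: the insert index rotates once per tick, while the removal index may only catch up when its bucket has drained, so queued threads cannot be overtaken. A compact userspace model (RQ_NQS is 64 in FreeBSD's runq, assumed here):

/*
 * Userspace sketch of the timeshare calendar-queue indices.
 */
#include <stdio.h>

#define NQS 64

struct calq {
        int idx;                /* where new threads are inserted */
        int ridx;               /* where threads are removed */
        int count[NQS];         /* threads queued per bucket */
};

static void
tick(struct calq *q)
{
        if (q->idx == q->ridx) {
                q->idx = (q->idx + 1) % NQS;
                if (q->count[q->ridx] == 0)
                        q->ridx = q->idx;       /* bucket drained: follow */
        }
}

int
main(void)
{
        struct calq q = { .count = { [0] = 2 } };

        tick(&q);       /* bucket 0 still occupied: ridx stays behind */
        printf("idx=%d ridx=%d\n", q.idx, q.ridx);      /* idx=1 ridx=0 */
        return (0);
}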
*/ if (tdq->tdq_idx == tdq->tdq_ridx) { tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS; if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx])) tdq->tdq_ridx = tdq->tdq_idx; } ts = td_get_sched(td); sched_pctcpu_update(ts, 1); if (td->td_pri_class & PRI_FIFO_BIT) return; if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) { /* * We used a tick; charge it to the thread so * that we can compute our interactivity. */ td_get_sched(td)->ts_runtime += tickincr; sched_interact_update(td); sched_priority(td); } /* * Force a context switch if the current thread has used up a full * time slice (default is 100ms). */ if (!TD_IS_IDLETHREAD(td) && ++ts->ts_slice >= tdq_slice(tdq)) { ts->ts_slice = 0; td->td_flags |= TDF_NEEDRESCHED | TDF_SLICEEND; } } u_int sched_estcpu(struct thread *td __unused) { return (0); } /* * Return whether the current CPU has runnable tasks. Used for in-kernel * cooperative idle threads. */ int sched_runnable(void) { struct tdq *tdq; int load; load = 1; tdq = TDQ_SELF(); if ((curthread->td_flags & TDF_IDLETD) != 0) { if (tdq->tdq_load > 0) goto out; } else if (tdq->tdq_load - 1 > 0) goto out; load = 0; out: return (load); } /* * Choose the highest priority thread to run. The thread is removed from * the run-queue while running however the load remains. For SMP we set * the tdq in the global idle bitmask if it idles here. */ struct thread * sched_choose(void) { struct thread *td; struct tdq *tdq; tdq = TDQ_SELF(); TDQ_LOCK_ASSERT(tdq, MA_OWNED); td = tdq_choose(tdq); if (td) { tdq_runq_rem(tdq, td); tdq->tdq_lowpri = td->td_priority; return (td); } tdq->tdq_lowpri = PRI_MAX_IDLE; return (PCPU_GET(idlethread)); } /* * Set owepreempt if necessary. Preemption never happens directly in ULE, * we always request it once we exit a critical section. */ static inline void sched_setpreempt(struct thread *td) { struct thread *ctd; int cpri; int pri; THREAD_LOCK_ASSERT(curthread, MA_OWNED); ctd = curthread; pri = td->td_priority; cpri = ctd->td_priority; if (pri < cpri) ctd->td_flags |= TDF_NEEDRESCHED; if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd)) return; if (!sched_shouldpreempt(pri, cpri, 0)) return; ctd->td_owepreempt = 1; } /* * Add a thread to a thread queue. Select the appropriate runq and add the * thread to it. This is the internal function called when the tdq is * predetermined. */ void tdq_add(struct tdq *tdq, struct thread *td, int flags) { TDQ_LOCK_ASSERT(tdq, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), ("sched_add: bad thread state")); KASSERT(td->td_flags & TDF_INMEM, ("sched_add: thread swapped out")); if (td->td_priority < tdq->tdq_lowpri) tdq->tdq_lowpri = td->td_priority; tdq_runq_add(tdq, td, flags); tdq_load_add(tdq, td); } /* * Select the target thread queue and add a thread to it. Request * preemption or IPI a remote processor if required. */ void sched_add(struct thread *td, int flags) { struct tdq *tdq; #ifdef SMP int cpu; #endif KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add", "prio:%d", td->td_priority, KTR_ATTR_LINKED, sched_tdname(curthread)); KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup", KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, flags & SRQ_PREEMPTED); THREAD_LOCK_ASSERT(td, MA_OWNED); /* * Recalculate the priority before we select the target cpu or * run-queue. 
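The per-tick slice charge at the end of sched_clock() can be modeled in a few lines: crossing the quantum resets the slice and requests a reschedule with the slice-end marker. Illustrative flag values:

/*
 * Userspace sketch of the slice-expiry check in sched_clock().
 */
#include <stdio.h>

#define F_NEEDRESCHED   0x1
#define F_SLICEEND      0x2

struct fake_td { int slice, flags; };

static void
charge_tick(struct fake_td *td, int quantum)
{
        if (++td->slice >= quantum) {
                td->slice = 0;
                td->flags |= F_NEEDRESCHED | F_SLICEEND;
        }
}

int
main(void)
{
        struct fake_td td = { 0, 0 };
        int t, quantum = 10;    /* ~100ms at stathz=100, illustrative */

        for (t = 0; t < 10; t++)
                charge_tick(&td, quantum);
        printf("flags=%#x slice=%d\n", td.flags, td.slice);     /* 0x3 0 */
        return (0);
}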
*/ if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) sched_priority(td); #ifdef SMP /* * Pick the destination cpu and if it isn't ours transfer to the * target cpu. */ td_get_sched(td)->ts_cpu = curcpu; /* Pick something valid to start */ cpu = sched_pickcpu(td, flags); tdq = sched_setcpu(td, cpu, flags); tdq_add(tdq, td, flags); if (cpu != PCPU_GET(cpuid)) { tdq_notify(tdq, td); return; } #else tdq = TDQ_SELF(); TDQ_LOCK(tdq); /* * Now that the thread is moving to the run-queue, set the lock * to the scheduler's lock. */ thread_lock_set(td, TDQ_LOCKPTR(tdq)); tdq_add(tdq, td, flags); #endif if (!(flags & SRQ_YIELDING)) sched_setpreempt(td); } /* * Remove a thread from a run-queue without running it. This is used * when we're stealing a thread from a remote queue. Otherwise all threads * exit by calling sched_exit_thread() and sched_throw() themselves. */ void sched_rem(struct thread *td) { struct tdq *tdq; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "runq rem", "prio:%d", td->td_priority); SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL); tdq = TDQ_CPU(td_get_sched(td)->ts_cpu); TDQ_LOCK_ASSERT(tdq, MA_OWNED); MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); KASSERT(TD_ON_RUNQ(td), ("sched_rem: thread not on run queue")); tdq_runq_rem(tdq, td); tdq_load_rem(tdq, td); TD_SET_CAN_RUN(td); if (td->td_priority == tdq->tdq_lowpri) tdq_setlowpri(tdq, NULL); } /* * Fetch cpu utilization information. Updates on demand. */ fixpt_t sched_pctcpu(struct thread *td) { fixpt_t pctcpu; struct td_sched *ts; pctcpu = 0; ts = td_get_sched(td); THREAD_LOCK_ASSERT(td, MA_OWNED); sched_pctcpu_update(ts, TD_IS_RUNNING(td)); if (ts->ts_ticks) { int rtick; /* How many rtick per second ? */ rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz); pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT; } return (pctcpu); } /* * Enforce affinity settings for a thread. Called after adjustments to * cpumask. */ void sched_affinity(struct thread *td) { #ifdef SMP struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td_get_sched(td); if (THREAD_CAN_SCHED(td, ts->ts_cpu)) return; if (TD_ON_RUNQ(td)) { sched_rem(td); sched_add(td, SRQ_BORING); return; } if (!TD_IS_RUNNING(td)) return; /* * Force a switch before returning to userspace. If the * target thread is not running locally send an ipi to force * the issue. */ td->td_flags |= TDF_NEEDRESCHED; if (td != curthread) ipi_cpu(ts->ts_cpu, IPI_PREEMPT); #endif } /* * Bind a thread to a target cpu. */ void sched_bind(struct thread *td, int cpu) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED); KASSERT(td == curthread, ("sched_bind: can only bind curthread")); ts = td_get_sched(td); if (ts->ts_flags & TSF_BOUND) sched_unbind(td); KASSERT(THREAD_CAN_MIGRATE(td), ("%p must be migratable", td)); ts->ts_flags |= TSF_BOUND; sched_pin(); if (PCPU_GET(cpuid) == cpu) return; ts->ts_cpu = cpu; /* When we return from mi_switch we'll be on the correct cpu. */ mi_switch(SW_VOL, NULL); } /* * Release a bound thread. */ void sched_unbind(struct thread *td) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(td == curthread, ("sched_unbind: can only bind curthread")); ts = td_get_sched(td); if ((ts->ts_flags & TSF_BOUND) == 0) return; ts->ts_flags &= ~TSF_BOUND; sched_unpin(); } int sched_is_bound(struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); return (td_get_sched(td)->ts_flags & TSF_BOUND); } /* * Basic yield call. 
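The fixed-point conversion in sched_pctcpu() maps "charged stathz ticks per second" to a fixpt_t fraction of one CPU. A standalone sketch, mirroring FSHIFT/FSCALE from sys/param.h (FSHIFT is 11 there) and assuming hz=1000:

/*
 * Userspace sketch of the %CPU fixed-point conversion.
 */
#include <stdio.h>

#define FSHIFT  11
#define FSCALE  (1 << FSHIFT)

int
main(void)
{
        unsigned int hz = 1000, rtick, pctcpu;

        rtick = hz / 2;         /* charged half of the available ticks */
        pctcpu = ((unsigned long long)FSCALE *
            ((FSCALE * rtick) / hz)) >> FSHIFT;
        /* pctcpu / FSCALE ~= 0.5, i.e. 50% of one CPU. */
        printf("%u of %d (= %.2f)\n", pctcpu, FSCALE,
            (double)pctcpu / FSCALE);
        return (0);
}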
*/ void sched_relinquish(struct thread *td) { thread_lock(td); mi_switch(SW_VOL | SWT_RELINQUISH, NULL); thread_unlock(td); } /* * Return the total system load. */ int sched_load(void) { #ifdef SMP int total; int i; total = 0; CPU_FOREACH(i) total += TDQ_CPU(i)->tdq_sysload; return (total); #else return (TDQ_SELF()->tdq_sysload); #endif } int sched_sizeof_proc(void) { return (sizeof(struct proc)); } int sched_sizeof_thread(void) { return (sizeof(struct thread) + sizeof(struct td_sched)); } #ifdef SMP #define TDQ_IDLESPIN(tdq) \ ((tdq)->tdq_cg != NULL && ((tdq)->tdq_cg->cg_flags & CG_FLAG_THREAD) == 0) #else #define TDQ_IDLESPIN(tdq) 1 #endif /* * The actual idle process. */ void sched_idletd(void *dummy) { struct thread *td; struct tdq *tdq; int oldswitchcnt, switchcnt; int i; mtx_assert(&Giant, MA_NOTOWNED); td = curthread; tdq = TDQ_SELF(); THREAD_NO_SLEEPING(); oldswitchcnt = -1; for (;;) { if (tdq->tdq_load) { thread_lock(td); mi_switch(SW_VOL | SWT_IDLE, NULL); thread_unlock(td); } switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; #ifdef SMP if (switchcnt != oldswitchcnt) { oldswitchcnt = switchcnt; if (tdq_idled(tdq) == 0) continue; } switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; #else oldswitchcnt = switchcnt; #endif /* * If we're switching very frequently, spin while checking * for load rather than entering a low power state that * may require an IPI. However, don't do any busy * loops while on SMT machines as this simply steals * cycles from cores doing useful work. */ if (TDQ_IDLESPIN(tdq) && switchcnt > sched_idlespinthresh) { for (i = 0; i < sched_idlespins; i++) { if (tdq->tdq_load) break; cpu_spinwait(); } } /* If there was context switch during spin, restart it. */ switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; if (tdq->tdq_load != 0 || switchcnt != oldswitchcnt) continue; /* Run main MD idle handler. */ tdq->tdq_cpu_idle = 1; /* * Make sure that tdq_cpu_idle update is globally visible * before cpu_idle() read tdq_load. The order is important * to avoid race with tdq_notify. */ atomic_thread_fence_seq_cst(); cpu_idle(switchcnt * 4 > sched_idlespinthresh); tdq->tdq_cpu_idle = 0; /* * Account thread-less hardware interrupts and * other wakeup reasons equal to context switches. */ switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; if (switchcnt != oldswitchcnt) continue; tdq->tdq_switchcnt++; oldswitchcnt++; } } /* * A CPU is entering for the first time or a thread is exiting. */ void sched_throw(struct thread *td) { struct thread *newtd; struct tdq *tdq; tdq = TDQ_SELF(); if (td == NULL) { /* Correct spinlock nesting and acquire the correct lock. */ TDQ_LOCK(tdq); spinlock_exit(); PCPU_SET(switchtime, cpu_ticks()); PCPU_SET(switchticks, ticks); } else { MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); tdq_load_rem(tdq, td); lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object); td->td_lastcpu = td->td_oncpu; td->td_oncpu = NOCPU; } KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); newtd = choosethread(); TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd; cpu_throw(td, newtd); /* doesn't return */ } /* * This is called from fork_exit(). Just acquire the correct locks and * let fork do the rest of the work. */ void sched_fork_exit(struct thread *td) { struct tdq *tdq; int cpuid; /* * Finish setting up thread glue so that it begins execution in a * non-nested critical section with the scheduler lock held. 
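The ordering requirement called out before cpu_idle() is a classic store-before-load race: the idle flag must be globally visible before the final load check, or a concurrent waker could miss both the flag and the recheck, leaving the CPU asleep on a nonempty queue. A userspace sketch with C11 atomics; names are illustrative stand-ins for tdq_cpu_idle/tdq_load:

/*
 * Userspace sketch of the publish-then-recheck idle handshake.
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int cpu_idle_flag;
static atomic_int load;

static bool
may_sleep(void)
{
        atomic_store_explicit(&cpu_idle_flag, 1, memory_order_relaxed);
        /* Publish the flag before re-reading load (store/load ordering). */
        atomic_thread_fence(memory_order_seq_cst);
        if (atomic_load_explicit(&load, memory_order_relaxed) != 0) {
                atomic_store_explicit(&cpu_idle_flag, 0,
                    memory_order_relaxed);
                return (false); /* new work arrived: do not sleep */
        }
        return (true);          /* a waker will see the flag and IPI us */
}

int
main(void)
{
        return (may_sleep() ? 0 : 1);
}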
*/ cpuid = PCPU_GET(cpuid); tdq = TDQ_CPU(cpuid); if (TD_IS_IDLETHREAD(td)) td->td_lock = TDQ_LOCKPTR(tdq); MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); td->td_oncpu = cpuid; TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED); lock_profile_obtain_lock_success( &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running", "prio:%d", td->td_priority); SDT_PROBE0(sched, , , on__cpu); } /* * Create on first use to catch odd startup conditions. */ char * sched_tdname(struct thread *td) { #ifdef KTR struct td_sched *ts; ts = td_get_sched(td); if (ts->ts_name[0] == '\0') snprintf(ts->ts_name, sizeof(ts->ts_name), "%s tid %d", td->td_name, td->td_tid); return (ts->ts_name); #else return (td->td_name); #endif } #ifdef KTR void sched_clear_tdname(struct thread *td) { struct td_sched *ts; ts = td_get_sched(td); ts->ts_name[0] = '\0'; } #endif #ifdef SMP /* * Build the CPU topology dump string. Is recursively called to collect * the topology tree. */ static int sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg, int indent) { char cpusetbuf[CPUSETBUFSIZ]; int i, first; sbuf_printf(sb, "%*s<group level=\"%d\" cache-level=\"%d\">\n", indent, "", 1 + indent / 2, cg->cg_level); sbuf_printf(sb, "%*s <cpu count=\"%d\" mask=\"%s\">", indent, "", cg->cg_count, cpusetobj_strprint(cpusetbuf, &cg->cg_mask)); first = TRUE; for (i = 0; i < MAXCPU; i++) { if (CPU_ISSET(i, &cg->cg_mask)) { if (!first) sbuf_printf(sb, ", "); else first = FALSE; sbuf_printf(sb, "%d", i); } } sbuf_printf(sb, "</cpu>\n"); if (cg->cg_flags != 0) { sbuf_printf(sb, "%*s <flags>", indent, ""); if ((cg->cg_flags & CG_FLAG_HTT) != 0) sbuf_printf(sb, "<flag name=\"HTT\">HTT group</flag>"); if ((cg->cg_flags & CG_FLAG_THREAD) != 0) sbuf_printf(sb, "<flag name=\"THREAD\">THREAD group</flag>"); if ((cg->cg_flags & CG_FLAG_SMT) != 0) sbuf_printf(sb, "<flag name=\"SMT\">SMT group</flag>"); sbuf_printf(sb, "</flags>\n"); } if (cg->cg_children > 0) { sbuf_printf(sb, "%*s <children>\n", indent, ""); for (i = 0; i < cg->cg_children; i++) sysctl_kern_sched_topology_spec_internal(sb, &cg->cg_child[i], indent+2); sbuf_printf(sb, "%*s </children>\n", indent, ""); } sbuf_printf(sb, "%*s</group>\n", indent, ""); return (0); } /* * Sysctl handler for retrieving topology dump. It's a wrapper for * the recursive sysctl_kern_sched_topology_spec_internal().
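For reference, the tree this handler emits is small; on a hypothetical two-core SMT machine, sysctl kern.sched.topology_spec would produce output roughly like the following (illustrative, not captured from a real system):

<groups>
 <group level="1" cache-level="0">
  <cpu count="4" mask="f">0, 1, 2, 3</cpu>
  <children>
   <group level="2" cache-level="2">
    <cpu count="2" mask="3">0, 1</cpu>
    <flags><flag name="THREAD">THREAD group</flag><flag name="SMT">SMT group</flag></flags>
   </group>
   <group level="2" cache-level="2">
    <cpu count="2" mask="c">2, 3</cpu>
    <flags><flag name="THREAD">THREAD group</flag><flag name="SMT">SMT group</flag></flags>
   </group>
  </children>
 </group>
</groups>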
*/ static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS) { struct sbuf *topo; int err; KASSERT(cpu_top != NULL, ("cpu_top isn't initialized")); topo = sbuf_new_for_sysctl(NULL, NULL, 512, req); if (topo == NULL) return (ENOMEM); sbuf_printf(topo, "<groups>\n"); err = sysctl_kern_sched_topology_spec_internal(topo, cpu_top, 1); sbuf_printf(topo, "</groups>\n"); if (err == 0) { err = sbuf_finish(topo); } sbuf_delete(topo); return (err); } #endif static int sysctl_kern_quantum(SYSCTL_HANDLER_ARGS) { int error, new_val, period; period = 1000000 / realstathz; new_val = period * sched_slice; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val <= 0) return (EINVAL); sched_slice = imax(1, (new_val + period / 2) / period); sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR; hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) / realstathz); return (0); } SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler"); SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0, "Scheduler name"); SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, sysctl_kern_quantum, "I", "Quantum for timeshare threads in microseconds"); SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, "Quantum for timeshare threads in stathz ticks"); SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0, "Interactivity score threshold"); SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh, 0, "Maximal (lowest) priority for preemption"); SYSCTL_INT(_kern_sched, OID_AUTO, static_boost, CTLFLAG_RW, &static_boost, 0, "Assign static kernel priorities to sleeping threads"); SYSCTL_INT(_kern_sched, OID_AUTO, idlespins, CTLFLAG_RW, &sched_idlespins, 0, "Number of times idle thread will spin waiting for new work"); SYSCTL_INT(_kern_sched, OID_AUTO, idlespinthresh, CTLFLAG_RW, &sched_idlespinthresh, 0, "Threshold before we will permit idle thread spinning"); #ifdef SMP SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0, "Number of hz ticks to keep thread affinity for"); SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0, "Enables the long-term load balancer"); SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW, &balance_interval, 0, "Average period in stathz ticks to run the long-term balancer"); SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0, "Attempts to steal work from other cores before idling"); SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0, "Minimum load on remote CPU before we'll steal"); SYSCTL_PROC(_kern_sched, OID_AUTO, topology_spec, CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_kern_sched_topology_spec, "A", "XML dump of detected CPU topology"); #endif /* ps compat. All cpu percentages from ULE are weighted. */ static int ccpu = 0; SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); Index: head/sys/kern/subr_acl_nfs4.c =================================================================== --- head/sys/kern/subr_acl_nfs4.c (revision 326270) +++ head/sys/kern/subr_acl_nfs4.c (revision 326271) @@ -1,1418 +1,1420 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2008-2010 Edward Tomasz Napierała * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * ACL support routines specific to NFSv4 access control lists. These are * utility routines for code common across file systems implementing NFSv4 * ACLs. */ #ifdef _KERNEL #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #else #include #include #include #include #define KASSERT(a, b) assert(a) #define CTASSERT(a) #endif /* !_KERNEL */ #ifdef _KERNEL static void acl_nfs4_trivial_from_mode(struct acl *aclp, mode_t mode); static int acl_nfs4_old_semantics = 0; SYSCTL_INT(_vfs, OID_AUTO, acl_nfs4_old_semantics, CTLFLAG_RW, &acl_nfs4_old_semantics, 0, "Use pre-PSARC/2010/029 NFSv4 ACL semantics"); static struct { accmode_t accmode; int mask; } accmode2mask[] = {{VREAD, ACL_READ_DATA}, {VWRITE, ACL_WRITE_DATA}, {VAPPEND, ACL_APPEND_DATA}, {VEXEC, ACL_EXECUTE}, {VREAD_NAMED_ATTRS, ACL_READ_NAMED_ATTRS}, {VWRITE_NAMED_ATTRS, ACL_WRITE_NAMED_ATTRS}, {VDELETE_CHILD, ACL_DELETE_CHILD}, {VREAD_ATTRIBUTES, ACL_READ_ATTRIBUTES}, {VWRITE_ATTRIBUTES, ACL_WRITE_ATTRIBUTES}, {VDELETE, ACL_DELETE}, {VREAD_ACL, ACL_READ_ACL}, {VWRITE_ACL, ACL_WRITE_ACL}, {VWRITE_OWNER, ACL_WRITE_OWNER}, {VSYNCHRONIZE, ACL_SYNCHRONIZE}, {0, 0}}; static int _access_mask_from_accmode(accmode_t accmode) { int access_mask = 0, i; for (i = 0; accmode2mask[i].accmode != 0; i++) { if (accmode & accmode2mask[i].accmode) access_mask |= accmode2mask[i].mask; } /* * VAPPEND is just a modifier for VWRITE; if the caller asked * for 'VAPPEND | VWRITE', we want to check for ACL_APPEND_DATA only. */ if (access_mask & ACL_APPEND_DATA) access_mask &= ~ACL_WRITE_DATA; return (access_mask); } /* * Return 0, iff access is allowed, 1 otherwise. 
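The accmode_t translation above is a straight table walk with one special case for append. A minimal userspace sketch with illustrative bit values (not the kernel's):

/*
 * Userspace sketch of _access_mask_from_accmode()'s table walk.
 */
#include <stdio.h>

#define XVREAD          0x01
#define XVWRITE         0x02
#define XVAPPEND        0x04
#define XACL_READ       0x10
#define XACL_WRITE      0x20
#define XACL_APPEND     0x40

static const struct { int accmode, mask; } a2m[] = {
        { XVREAD, XACL_READ },
        { XVWRITE, XACL_WRITE },
        { XVAPPEND, XACL_APPEND },
        { 0, 0 }
};

static int
mask_from_accmode(int accmode)
{
        int i, mask = 0;

        for (i = 0; a2m[i].accmode != 0; i++)
                if (accmode & a2m[i].accmode)
                        mask |= a2m[i].mask;
        /* Append refines write: check only for the append ACL bit. */
        if (mask & XACL_APPEND)
                mask &= ~XACL_WRITE;
        return (mask);
}

int
main(void)
{
        printf("%#x\n", mask_from_accmode(XVWRITE | XVAPPEND)); /* 0x40 */
        return (0);
}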
*/ static int _acl_denies(const struct acl *aclp, int access_mask, struct ucred *cred, int file_uid, int file_gid, int *denied_explicitly) { int i; const struct acl_entry *entry; if (denied_explicitly != NULL) *denied_explicitly = 0; KASSERT(aclp->acl_cnt <= ACL_MAX_ENTRIES, ("aclp->acl_cnt <= ACL_MAX_ENTRIES")); for (i = 0; i < aclp->acl_cnt; i++) { entry = &(aclp->acl_entry[i]); if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW && entry->ae_entry_type != ACL_ENTRY_TYPE_DENY) continue; if (entry->ae_flags & ACL_ENTRY_INHERIT_ONLY) continue; switch (entry->ae_tag) { case ACL_USER_OBJ: if (file_uid != cred->cr_uid) continue; break; case ACL_USER: if (entry->ae_id != cred->cr_uid) continue; break; case ACL_GROUP_OBJ: if (!groupmember(file_gid, cred)) continue; break; case ACL_GROUP: if (!groupmember(entry->ae_id, cred)) continue; break; default: KASSERT(entry->ae_tag == ACL_EVERYONE, ("entry->ae_tag == ACL_EVERYONE")); } if (entry->ae_entry_type == ACL_ENTRY_TYPE_DENY) { if (entry->ae_perm & access_mask) { if (denied_explicitly != NULL) *denied_explicitly = 1; return (1); } } access_mask &= ~(entry->ae_perm); if (access_mask == 0) return (0); } if (access_mask == 0) return (0); return (1); } int vaccess_acl_nfs4(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *aclp, accmode_t accmode, struct ucred *cred, int *privused) { accmode_t priv_granted = 0; int denied, explicitly_denied, access_mask, is_directory, must_be_owner = 0; mode_t file_mode = 0; KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND | VEXPLICIT_DENY | VREAD_NAMED_ATTRS | VWRITE_NAMED_ATTRS | VDELETE_CHILD | VREAD_ATTRIBUTES | VWRITE_ATTRIBUTES | VDELETE | VREAD_ACL | VWRITE_ACL | VWRITE_OWNER | VSYNCHRONIZE)) == 0, ("invalid bit in accmode")); KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), ("VAPPEND without VWRITE")); if (privused != NULL) *privused = 0; if (accmode & VADMIN) must_be_owner = 1; /* * Ignore VSYNCHRONIZE permission. */ accmode &= ~VSYNCHRONIZE; access_mask = _access_mask_from_accmode(accmode); if (type == VDIR) is_directory = 1; else is_directory = 0; /* * File owner is always allowed to read and write the ACL * and basic attributes. This is to prevent a situation * where user would change ACL in a way that prevents him * from undoing the change. */ if (file_uid == cred->cr_uid) access_mask &= ~(ACL_READ_ACL | ACL_WRITE_ACL | ACL_READ_ATTRIBUTES | ACL_WRITE_ATTRIBUTES); /* * Ignore append permission for regular files; use write * permission instead. */ if (!is_directory && (access_mask & ACL_APPEND_DATA)) { access_mask &= ~ACL_APPEND_DATA; access_mask |= ACL_WRITE_DATA; } denied = _acl_denies(aclp, access_mask, cred, file_uid, file_gid, &explicitly_denied); if (must_be_owner) { if (file_uid != cred->cr_uid) denied = EPERM; } /* * For VEXEC, ensure that at least one execute bit is set for * non-directories. We have to check the mode here to stay * consistent with execve(2). See the test in * exec_check_permissions(). */ acl_nfs4_sync_mode_from_acl(&file_mode, aclp); if (!denied && !is_directory && (accmode & VEXEC) && (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) denied = EACCES; if (!denied) return (0); /* * Access failed. Iff it was not denied explicitly and * VEXPLICIT_DENY flag was specified, allow access. */ if ((accmode & VEXPLICIT_DENY) && explicitly_denied == 0) return (0); accmode &= ~VEXPLICIT_DENY; /* * No match. Try to use privileges, if there are any. 
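The evaluation order implemented by _acl_denies() is the essential NFSv4 semantic: entries are scanned first to last, an applicable DENY fails the request outright, matching entries clear the bits they cover, and leftover bits mean denial by default. The sketch below keeps only that control flow, dropping the who-matching (in the kernel, the mask clear after the deny check is equivalent to clearing on ALLOW entries only):

/*
 * Userspace sketch of the NFSv4 deny-walk. Simplified: no
 * who-matching, illustrative permission bits.
 */
#include <stdio.h>

enum etype { ALLOW, DENY };

struct ent { enum etype type; unsigned int perm; };

static int
denies(const struct ent *e, int cnt, unsigned int want)
{
        int i;

        for (i = 0; i < cnt; i++) {
                if (e[i].type == DENY && (e[i].perm & want))
                        return (1);     /* explicit deny wins */
                if (e[i].type == ALLOW) {
                        want &= ~e[i].perm;
                        if (want == 0)
                                return (0);     /* everything granted */
                }
        }
        return (want != 0);     /* leftover bits: denied by default */
}

int
main(void)
{
        struct ent acl[] = { { DENY, 0x2 }, { ALLOW, 0x3 } };

        printf("%d\n", denies(acl, 2, 0x1));    /* 0: read allowed */
        printf("%d\n", denies(acl, 2, 0x2));    /* 1: write denied first */
        return (0);
}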
*/ if (is_directory) { if ((accmode & VEXEC) && !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0)) priv_granted |= VEXEC; } else { /* * Ensure that at least one execute bit is on. Otherwise, * a privileged user will always succeed, and we don't want * this to happen unless the file really is executable. */ if ((accmode & VEXEC) && (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && !priv_check_cred(cred, PRIV_VFS_EXEC, 0)) priv_granted |= VEXEC; } if ((accmode & VREAD) && !priv_check_cred(cred, PRIV_VFS_READ, 0)) priv_granted |= VREAD; if ((accmode & (VWRITE | VAPPEND | VDELETE_CHILD)) && !priv_check_cred(cred, PRIV_VFS_WRITE, 0)) priv_granted |= (VWRITE | VAPPEND | VDELETE_CHILD); if ((accmode & VADMIN_PERMS) && !priv_check_cred(cred, PRIV_VFS_ADMIN, 0)) priv_granted |= VADMIN_PERMS; if ((accmode & VSTAT_PERMS) && !priv_check_cred(cred, PRIV_VFS_STAT, 0)) priv_granted |= VSTAT_PERMS; if ((accmode & priv_granted) == accmode) { if (privused != NULL) *privused = 1; return (0); } if (accmode & (VADMIN_PERMS | VDELETE_CHILD | VDELETE)) denied = EPERM; else denied = EACCES; return (denied); } #endif /* _KERNEL */ static int _acl_entry_matches(struct acl_entry *entry, acl_tag_t tag, acl_perm_t perm, acl_entry_type_t entry_type) { if (entry->ae_tag != tag) return (0); if (entry->ae_id != ACL_UNDEFINED_ID) return (0); if (entry->ae_perm != perm) return (0); if (entry->ae_entry_type != entry_type) return (0); if (entry->ae_flags != 0) return (0); return (1); } static struct acl_entry * _acl_append(struct acl *aclp, acl_tag_t tag, acl_perm_t perm, acl_entry_type_t entry_type) { struct acl_entry *entry; KASSERT(aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES, ("aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES")); entry = &(aclp->acl_entry[aclp->acl_cnt]); aclp->acl_cnt++; entry->ae_tag = tag; entry->ae_id = ACL_UNDEFINED_ID; entry->ae_perm = perm; entry->ae_entry_type = entry_type; entry->ae_flags = 0; return (entry); } static struct acl_entry * _acl_duplicate_entry(struct acl *aclp, int entry_index) { int i; KASSERT(aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES, ("aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES")); for (i = aclp->acl_cnt; i > entry_index; i--) aclp->acl_entry[i] = aclp->acl_entry[i - 1]; aclp->acl_cnt++; return (&(aclp->acl_entry[entry_index + 1])); } static void acl_nfs4_sync_acl_from_mode_draft(struct acl *aclp, mode_t mode, int file_owner_id) { int i, meets, must_append; struct acl_entry *entry, *copy, *previous, *a1, *a2, *a3, *a4, *a5, *a6; mode_t amode; const int READ = 04; const int WRITE = 02; const int EXEC = 01; KASSERT(aclp->acl_cnt <= ACL_MAX_ENTRIES, ("aclp->acl_cnt <= ACL_MAX_ENTRIES")); /* * NFSv4 Minor Version 1, draft-ietf-nfsv4-minorversion1-03.txt * * 3.16.6.3. Applying a Mode to an Existing ACL */ /* * 1. For each ACE: */ for (i = 0; i < aclp->acl_cnt; i++) { entry = &(aclp->acl_entry[i]); /* * 1.1. If the type is neither ALLOW or DENY - skip. */ if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW && entry->ae_entry_type != ACL_ENTRY_TYPE_DENY) continue; /* * 1.2. If ACL_ENTRY_INHERIT_ONLY is set - skip. */ if (entry->ae_flags & ACL_ENTRY_INHERIT_ONLY) continue; /* * 1.3. If ACL_ENTRY_FILE_INHERIT or ACL_ENTRY_DIRECTORY_INHERIT * are set: */ if (entry->ae_flags & (ACL_ENTRY_FILE_INHERIT | ACL_ENTRY_DIRECTORY_INHERIT)) { /* * 1.3.1. A copy of the current ACE is made, and placed * in the ACL immediately following the current * ACE. */ copy = _acl_duplicate_entry(aclp, i); /* * 1.3.2. In the first ACE, the flag * ACL_ENTRY_INHERIT_ONLY is set. */ entry->ae_flags |= ACL_ENTRY_INHERIT_ONLY; /* * 1.3.3. 
In the second ACE, the following flags * are cleared: * ACL_ENTRY_FILE_INHERIT, * ACL_ENTRY_DIRECTORY_INHERIT, * ACL_ENTRY_NO_PROPAGATE_INHERIT. */ copy->ae_flags &= ~(ACL_ENTRY_FILE_INHERIT | ACL_ENTRY_DIRECTORY_INHERIT | ACL_ENTRY_NO_PROPAGATE_INHERIT); /* * The algorithm continues on with the second ACE. */ i++; entry = copy; } /* * 1.4. If it's owner@, group@ or everyone@ entry, clear * ACL_READ_DATA, ACL_WRITE_DATA, ACL_APPEND_DATA * and ACL_EXECUTE. Continue to the next entry. */ if (entry->ae_tag == ACL_USER_OBJ || entry->ae_tag == ACL_GROUP_OBJ || entry->ae_tag == ACL_EVERYONE) { entry->ae_perm &= ~(ACL_READ_DATA | ACL_WRITE_DATA | ACL_APPEND_DATA | ACL_EXECUTE); continue; } /* * 1.5. Otherwise, if the "who" field did not match one * of OWNER@, GROUP@, EVERYONE@: * * 1.5.1. If the type is ALLOW, check the preceding ACE. * If it does not meet all of the following criteria: */ if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW) continue; meets = 0; if (i > 0) { meets = 1; previous = &(aclp->acl_entry[i - 1]); /* * 1.5.1.1. The type field is DENY, */ if (previous->ae_entry_type != ACL_ENTRY_TYPE_DENY) meets = 0; /* * 1.5.1.2. The "who" field is the same as the current * ACE, * * 1.5.1.3. The flag bit ACE4_IDENTIFIER_GROUP * is the same as it is in the current ACE, * and no other flag bits are set, */ if (previous->ae_id != entry->ae_id || previous->ae_tag != entry->ae_tag) meets = 0; if (previous->ae_flags) meets = 0; /* * 1.5.1.4. The mask bits are a subset of the mask bits * of the current ACE, and are also subset of * the following: ACL_READ_DATA, * ACL_WRITE_DATA, ACL_APPEND_DATA, ACL_EXECUTE */ if (previous->ae_perm & ~(entry->ae_perm)) meets = 0; if (previous->ae_perm & ~(ACL_READ_DATA | ACL_WRITE_DATA | ACL_APPEND_DATA | ACL_EXECUTE)) meets = 0; } if (!meets) { /* * Then the ACE of type DENY, with a who equal * to the current ACE, flag bits equal to * ( & ) * and no mask bits, is prepended. */ previous = entry; entry = _acl_duplicate_entry(aclp, i); /* Adjust counter, as we've just added an entry. */ i++; previous->ae_tag = entry->ae_tag; previous->ae_id = entry->ae_id; previous->ae_flags = entry->ae_flags; previous->ae_perm = 0; previous->ae_entry_type = ACL_ENTRY_TYPE_DENY; } /* * 1.5.2. The following modifications are made to the prepended * ACE. The intent is to mask the following ACE * to disallow ACL_READ_DATA, ACL_WRITE_DATA, * ACL_APPEND_DATA, or ACL_EXECUTE, based upon the group * permissions of the new mode. As a special case, * if the ACE matches the current owner of the file, * the owner bits are used, rather than the group bits. * This is reflected in the algorithm below. */ amode = mode >> 3; /* * If ACE4_IDENTIFIER_GROUP is not set, and the "who" field * in ACE matches the owner of the file, we shift amode three * more bits, in order to have the owner permission bits * placed in the three low order bits of amode. */ if (entry->ae_tag == ACL_USER && entry->ae_id == file_owner_id) amode = amode >> 3; if (entry->ae_perm & ACL_READ_DATA) { if (amode & READ) previous->ae_perm &= ~ACL_READ_DATA; else previous->ae_perm |= ACL_READ_DATA; } if (entry->ae_perm & ACL_WRITE_DATA) { if (amode & WRITE) previous->ae_perm &= ~ACL_WRITE_DATA; else previous->ae_perm |= ACL_WRITE_DATA; } if (entry->ae_perm & ACL_APPEND_DATA) { if (amode & WRITE) previous->ae_perm &= ~ACL_APPEND_DATA; else previous->ae_perm |= ACL_APPEND_DATA; } if (entry->ae_perm & ACL_EXECUTE) { if (amode & EXEC) previous->ae_perm &= ~ACL_EXECUTE; else previous->ae_perm |= ACL_EXECUTE; } /* * 1.5.3. 
If ACE4_IDENTIFIER_GROUP is set in the flags * of the ALLOW ace: * * XXX: This point is not there in the Falkner's draft. */ if (entry->ae_tag == ACL_GROUP && entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) { mode_t extramode, ownermode; extramode = (mode >> 3) & 07; ownermode = mode >> 6; extramode &= ~ownermode; if (extramode) { if (extramode & READ) { entry->ae_perm &= ~ACL_READ_DATA; previous->ae_perm &= ~ACL_READ_DATA; } if (extramode & WRITE) { entry->ae_perm &= ~(ACL_WRITE_DATA | ACL_APPEND_DATA); previous->ae_perm &= ~(ACL_WRITE_DATA | ACL_APPEND_DATA); } if (extramode & EXEC) { entry->ae_perm &= ~ACL_EXECUTE; previous->ae_perm &= ~ACL_EXECUTE; } } } } /* * 2. If there at least six ACEs, the final six ACEs are examined. * If they are not equal to what we want, append six ACEs. */ must_append = 0; if (aclp->acl_cnt < 6) { must_append = 1; } else { a6 = &(aclp->acl_entry[aclp->acl_cnt - 1]); a5 = &(aclp->acl_entry[aclp->acl_cnt - 2]); a4 = &(aclp->acl_entry[aclp->acl_cnt - 3]); a3 = &(aclp->acl_entry[aclp->acl_cnt - 4]); a2 = &(aclp->acl_entry[aclp->acl_cnt - 5]); a1 = &(aclp->acl_entry[aclp->acl_cnt - 6]); if (!_acl_entry_matches(a1, ACL_USER_OBJ, 0, ACL_ENTRY_TYPE_DENY)) must_append = 1; if (!_acl_entry_matches(a2, ACL_USER_OBJ, ACL_WRITE_ACL | ACL_WRITE_OWNER | ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS, ACL_ENTRY_TYPE_ALLOW)) must_append = 1; if (!_acl_entry_matches(a3, ACL_GROUP_OBJ, 0, ACL_ENTRY_TYPE_DENY)) must_append = 1; if (!_acl_entry_matches(a4, ACL_GROUP_OBJ, 0, ACL_ENTRY_TYPE_ALLOW)) must_append = 1; if (!_acl_entry_matches(a5, ACL_EVERYONE, ACL_WRITE_ACL | ACL_WRITE_OWNER | ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS, ACL_ENTRY_TYPE_DENY)) must_append = 1; if (!_acl_entry_matches(a6, ACL_EVERYONE, ACL_READ_ACL | ACL_READ_ATTRIBUTES | ACL_READ_NAMED_ATTRS | ACL_SYNCHRONIZE, ACL_ENTRY_TYPE_ALLOW)) must_append = 1; } if (must_append) { KASSERT(aclp->acl_cnt + 6 <= ACL_MAX_ENTRIES, ("aclp->acl_cnt <= ACL_MAX_ENTRIES")); a1 = _acl_append(aclp, ACL_USER_OBJ, 0, ACL_ENTRY_TYPE_DENY); a2 = _acl_append(aclp, ACL_USER_OBJ, ACL_WRITE_ACL | ACL_WRITE_OWNER | ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS, ACL_ENTRY_TYPE_ALLOW); a3 = _acl_append(aclp, ACL_GROUP_OBJ, 0, ACL_ENTRY_TYPE_DENY); a4 = _acl_append(aclp, ACL_GROUP_OBJ, 0, ACL_ENTRY_TYPE_ALLOW); a5 = _acl_append(aclp, ACL_EVERYONE, ACL_WRITE_ACL | ACL_WRITE_OWNER | ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS, ACL_ENTRY_TYPE_DENY); a6 = _acl_append(aclp, ACL_EVERYONE, ACL_READ_ACL | ACL_READ_ATTRIBUTES | ACL_READ_NAMED_ATTRS | ACL_SYNCHRONIZE, ACL_ENTRY_TYPE_ALLOW); KASSERT(a1 != NULL && a2 != NULL && a3 != NULL && a4 != NULL && a5 != NULL && a6 != NULL, ("couldn't append to ACL.")); } /* * 3. The final six ACEs are adjusted according to the incoming mode. 
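The "final six" test above reduces to comparing the ACL tail against six fixed (tag, permissions, type) tuples. A compact sketch with illustrative stand-ins for the ACL_* constants:

/*
 * Userspace sketch of the canonical-six tail check.
 */
#include <stdbool.h>
#include <stdio.h>

enum xtag { OWNER, GROUP, EVERYONE };
enum xtype { DENY, ALLOW };

struct ent { enum xtag tag; int perm; enum xtype type; int flags, id; };

#define UNDEF_ID        (-1)
#define P_WRITE_STUFF   0x1     /* illustrative: write acl/owner/attrs */
#define P_READ_STUFF    0x2     /* illustrative: read acl/attrs/sync */

static bool
entry_matches(const struct ent *e, enum xtag tag, int perm, enum xtype type)
{
        return (e->tag == tag && e->perm == perm && e->type == type &&
            e->flags == 0 && e->id == UNDEF_ID);
}

static bool
has_canonical_six(const struct ent *a, int cnt)
{
        if (cnt < 6)
                return (false);
        a += cnt - 6;   /* examine the final six entries */
        return (entry_matches(&a[0], OWNER, 0, DENY) &&
            entry_matches(&a[1], OWNER, P_WRITE_STUFF, ALLOW) &&
            entry_matches(&a[2], GROUP, 0, DENY) &&
            entry_matches(&a[3], GROUP, 0, ALLOW) &&
            entry_matches(&a[4], EVERYONE, P_WRITE_STUFF, DENY) &&
            entry_matches(&a[5], EVERYONE, P_READ_STUFF, ALLOW));
}

int
main(void)
{
        struct ent six[6] = {
                { OWNER, 0, DENY, 0, UNDEF_ID },
                { OWNER, P_WRITE_STUFF, ALLOW, 0, UNDEF_ID },
                { GROUP, 0, DENY, 0, UNDEF_ID },
                { GROUP, 0, ALLOW, 0, UNDEF_ID },
                { EVERYONE, P_WRITE_STUFF, DENY, 0, UNDEF_ID },
                { EVERYONE, P_READ_STUFF, ALLOW, 0, UNDEF_ID },
        };

        printf("%d\n", has_canonical_six(six, 6));      /* 1 */
        return (0);
}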
*/ if (mode & S_IRUSR) a2->ae_perm |= ACL_READ_DATA; else a1->ae_perm |= ACL_READ_DATA; if (mode & S_IWUSR) a2->ae_perm |= (ACL_WRITE_DATA | ACL_APPEND_DATA); else a1->ae_perm |= (ACL_WRITE_DATA | ACL_APPEND_DATA); if (mode & S_IXUSR) a2->ae_perm |= ACL_EXECUTE; else a1->ae_perm |= ACL_EXECUTE; if (mode & S_IRGRP) a4->ae_perm |= ACL_READ_DATA; else a3->ae_perm |= ACL_READ_DATA; if (mode & S_IWGRP) a4->ae_perm |= (ACL_WRITE_DATA | ACL_APPEND_DATA); else a3->ae_perm |= (ACL_WRITE_DATA | ACL_APPEND_DATA); if (mode & S_IXGRP) a4->ae_perm |= ACL_EXECUTE; else a3->ae_perm |= ACL_EXECUTE; if (mode & S_IROTH) a6->ae_perm |= ACL_READ_DATA; else a5->ae_perm |= ACL_READ_DATA; if (mode & S_IWOTH) a6->ae_perm |= (ACL_WRITE_DATA | ACL_APPEND_DATA); else a5->ae_perm |= (ACL_WRITE_DATA | ACL_APPEND_DATA); if (mode & S_IXOTH) a6->ae_perm |= ACL_EXECUTE; else a5->ae_perm |= ACL_EXECUTE; } #ifdef _KERNEL void acl_nfs4_sync_acl_from_mode(struct acl *aclp, mode_t mode, int file_owner_id) { if (acl_nfs4_old_semantics) acl_nfs4_sync_acl_from_mode_draft(aclp, mode, file_owner_id); else acl_nfs4_trivial_from_mode(aclp, mode); } #endif /* _KERNEL */ void acl_nfs4_sync_mode_from_acl(mode_t *_mode, const struct acl *aclp) { int i; mode_t old_mode = *_mode, mode = 0, seen = 0; const struct acl_entry *entry; KASSERT(aclp->acl_cnt <= ACL_MAX_ENTRIES, ("aclp->acl_cnt <= ACL_MAX_ENTRIES")); /* * NFSv4 Minor Version 1, draft-ietf-nfsv4-minorversion1-03.txt * * 3.16.6.1. Recomputing mode upon SETATTR of ACL */ for (i = 0; i < aclp->acl_cnt; i++) { entry = &(aclp->acl_entry[i]); if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW && entry->ae_entry_type != ACL_ENTRY_TYPE_DENY) continue; if (entry->ae_flags & ACL_ENTRY_INHERIT_ONLY) continue; if (entry->ae_tag == ACL_USER_OBJ) { if ((entry->ae_perm & ACL_READ_DATA) && ((seen & S_IRUSR) == 0)) { seen |= S_IRUSR; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IRUSR; } if ((entry->ae_perm & ACL_WRITE_DATA) && ((seen & S_IWUSR) == 0)) { seen |= S_IWUSR; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IWUSR; } if ((entry->ae_perm & ACL_EXECUTE) && ((seen & S_IXUSR) == 0)) { seen |= S_IXUSR; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IXUSR; } } else if (entry->ae_tag == ACL_GROUP_OBJ) { if ((entry->ae_perm & ACL_READ_DATA) && ((seen & S_IRGRP) == 0)) { seen |= S_IRGRP; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IRGRP; } if ((entry->ae_perm & ACL_WRITE_DATA) && ((seen & S_IWGRP) == 0)) { seen |= S_IWGRP; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IWGRP; } if ((entry->ae_perm & ACL_EXECUTE) && ((seen & S_IXGRP) == 0)) { seen |= S_IXGRP; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IXGRP; } } else if (entry->ae_tag == ACL_EVERYONE) { if (entry->ae_perm & ACL_READ_DATA) { if ((seen & S_IRUSR) == 0) { seen |= S_IRUSR; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IRUSR; } if ((seen & S_IRGRP) == 0) { seen |= S_IRGRP; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IRGRP; } if ((seen & S_IROTH) == 0) { seen |= S_IROTH; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IROTH; } } if (entry->ae_perm & ACL_WRITE_DATA) { if ((seen & S_IWUSR) == 0) { seen |= S_IWUSR; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IWUSR; } if ((seen & S_IWGRP) == 0) { seen |= S_IWGRP; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IWGRP; } if ((seen & S_IWOTH) == 0) { seen |= S_IWOTH; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IWOTH; } } if 
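The mode recomputation in acl_nfs4_sync_mode_from_acl() hinges on its "seen" mask: only the first entry mentioning a given rwx bit decides it, and everyone@ can settle owner, group, and other bits at once. Reduced here to the owner read bit for clarity:

/*
 * Userspace sketch of first-match-wins mode recomputation.
 */
#include <stdio.h>

enum etype { ALLOW, DENY };

struct ent { int is_owner; enum etype type; int reads; };

static int
owner_read_bit(const struct ent *e, int cnt)
{
        int i, seen = 0, mode = 0;

        for (i = 0; i < cnt; i++) {
                if (!e[i].is_owner || !e[i].reads || seen)
                        continue;
                seen = 1;       /* later entries cannot override */
                if (e[i].type == ALLOW)
                        mode = 1;
        }
        return (mode);
}

int
main(void)
{
        struct ent acl[] = {
                { 1, DENY, 1 },         /* owner@ denied read first... */
                { 1, ALLOW, 1 },        /* ...so this allow is ignored */
        };

        printf("%d\n", owner_read_bit(acl, 2)); /* 0 */
        return (0);
}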
(entry->ae_perm & ACL_EXECUTE) { if ((seen & S_IXUSR) == 0) { seen |= S_IXUSR; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IXUSR; } if ((seen & S_IXGRP) == 0) { seen |= S_IXGRP; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IXGRP; } if ((seen & S_IXOTH) == 0) { seen |= S_IXOTH; if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) mode |= S_IXOTH; } } } } *_mode = mode | (old_mode & ACL_PRESERVE_MASK); } #ifdef _KERNEL /* * Calculate inherited ACL in a manner compatible with NFSv4 Minor Version 1, * draft-ietf-nfsv4-minorversion1-03.txt. */ static void acl_nfs4_compute_inherited_acl_draft(const struct acl *parent_aclp, struct acl *child_aclp, mode_t mode, int file_owner_id, int is_directory) { int i, flags; const struct acl_entry *parent_entry; struct acl_entry *entry, *copy; KASSERT(child_aclp->acl_cnt == 0, ("child_aclp->acl_cnt == 0")); KASSERT(parent_aclp->acl_cnt <= ACL_MAX_ENTRIES, ("parent_aclp->acl_cnt <= ACL_MAX_ENTRIES")); /* * NFSv4 Minor Version 1, draft-ietf-nfsv4-minorversion1-03.txt * * 3.16.6.2. Applying the mode given to CREATE or OPEN * to an inherited ACL */ /* * 1. Form an ACL that is the concatenation of all inheritable ACEs. */ for (i = 0; i < parent_aclp->acl_cnt; i++) { parent_entry = &(parent_aclp->acl_entry[i]); flags = parent_entry->ae_flags; /* * Entry is not inheritable at all. */ if ((flags & (ACL_ENTRY_DIRECTORY_INHERIT | ACL_ENTRY_FILE_INHERIT)) == 0) continue; /* * We're creating a file, but entry is not inheritable * by files. */ if (!is_directory && (flags & ACL_ENTRY_FILE_INHERIT) == 0) continue; /* * Entry is inheritable only by files, but has NO_PROPAGATE * flag set, and we're creating a directory, so it wouldn't * propagate to any file in that directory anyway. */ if (is_directory && (flags & ACL_ENTRY_DIRECTORY_INHERIT) == 0 && (flags & ACL_ENTRY_NO_PROPAGATE_INHERIT)) continue; KASSERT(child_aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES, ("child_aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES")); child_aclp->acl_entry[child_aclp->acl_cnt] = *parent_entry; child_aclp->acl_cnt++; } /* * 2. For each entry in the new ACL, adjust its flags, possibly * creating two entries in place of one. */ for (i = 0; i < child_aclp->acl_cnt; i++) { entry = &(child_aclp->acl_entry[i]); /* * This is not in the specification, but SunOS * apparently does that. */ if (((entry->ae_flags & ACL_ENTRY_NO_PROPAGATE_INHERIT) || !is_directory) && entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER); /* * 2.A. If the ACL_ENTRY_NO_PROPAGATE_INHERIT is set, or if the object * being created is not a directory, then clear the * following flags: ACL_ENTRY_NO_PROPAGATE_INHERIT, * ACL_ENTRY_FILE_INHERIT, ACL_ENTRY_DIRECTORY_INHERIT, * ACL_ENTRY_INHERIT_ONLY. */ if (entry->ae_flags & ACL_ENTRY_NO_PROPAGATE_INHERIT || !is_directory) { entry->ae_flags &= ~(ACL_ENTRY_NO_PROPAGATE_INHERIT | ACL_ENTRY_FILE_INHERIT | ACL_ENTRY_DIRECTORY_INHERIT | ACL_ENTRY_INHERIT_ONLY); /* * Continue on to the next ACE. */ continue; } /* * 2.B. If the object is a directory and ACL_ENTRY_FILE_INHERIT * is set, but ACL_ENTRY_NO_PROPAGATE_INHERIT is not set, ensure * that ACL_ENTRY_INHERIT_ONLY is set. Continue to the * next ACE. Otherwise... */ /* * XXX: Read it again and make sure what does the "otherwise" * apply to. */ if (is_directory && (entry->ae_flags & ACL_ENTRY_FILE_INHERIT) && ((entry->ae_flags & ACL_ENTRY_DIRECTORY_INHERIT) == 0)) { entry->ae_flags |= ACL_ENTRY_INHERIT_ONLY; continue; } /* * 2.C. 
If the type of the ACE is neither ALLOW nor deny, * then continue. */ if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW && entry->ae_entry_type != ACL_ENTRY_TYPE_DENY) continue; /* * 2.D. Copy the original ACE into a second, adjacent ACE. */ copy = _acl_duplicate_entry(child_aclp, i); /* * 2.E. On the first ACE, ensure that ACL_ENTRY_INHERIT_ONLY * is set. */ entry->ae_flags |= ACL_ENTRY_INHERIT_ONLY; /* * 2.F. On the second ACE, clear the following flags: * ACL_ENTRY_NO_PROPAGATE_INHERIT, ACL_ENTRY_FILE_INHERIT, * ACL_ENTRY_DIRECTORY_INHERIT, ACL_ENTRY_INHERIT_ONLY. */ copy->ae_flags &= ~(ACL_ENTRY_NO_PROPAGATE_INHERIT | ACL_ENTRY_FILE_INHERIT | ACL_ENTRY_DIRECTORY_INHERIT | ACL_ENTRY_INHERIT_ONLY); /* * 2.G. On the second ACE, if the type is ALLOW, * an implementation MAY clear the following * mask bits: ACL_WRITE_ACL, ACL_WRITE_OWNER. */ if (copy->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) copy->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER); /* * Increment the counter to skip the copied entry. */ i++; } /* * 3. To ensure that the mode is honored, apply the algorithm describe * in Section 2.16.6.3, using the mode that is to be used for file * creation. */ acl_nfs4_sync_acl_from_mode(child_aclp, mode, file_owner_id); } #endif /* _KERNEL */ /* * Populate the ACL with entries inherited from parent_aclp. */ static void acl_nfs4_inherit_entries(const struct acl *parent_aclp, struct acl *child_aclp, mode_t mode, int file_owner_id, int is_directory) { int i, flags, tag; const struct acl_entry *parent_entry; struct acl_entry *entry; KASSERT(parent_aclp->acl_cnt <= ACL_MAX_ENTRIES, ("parent_aclp->acl_cnt <= ACL_MAX_ENTRIES")); for (i = 0; i < parent_aclp->acl_cnt; i++) { parent_entry = &(parent_aclp->acl_entry[i]); flags = parent_entry->ae_flags; tag = parent_entry->ae_tag; /* * Don't inherit owner@, group@, or everyone@ entries. */ if (tag == ACL_USER_OBJ || tag == ACL_GROUP_OBJ || tag == ACL_EVERYONE) continue; /* * Entry is not inheritable at all. */ if ((flags & (ACL_ENTRY_DIRECTORY_INHERIT | ACL_ENTRY_FILE_INHERIT)) == 0) continue; /* * We're creating a file, but entry is not inheritable * by files. */ if (!is_directory && (flags & ACL_ENTRY_FILE_INHERIT) == 0) continue; /* * Entry is inheritable only by files, but has NO_PROPAGATE * flag set, and we're creating a directory, so it wouldn't * propagate to any file in that directory anyway. */ if (is_directory && (flags & ACL_ENTRY_DIRECTORY_INHERIT) == 0 && (flags & ACL_ENTRY_NO_PROPAGATE_INHERIT)) continue; /* * Entry qualifies for being inherited. */ KASSERT(child_aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES, ("child_aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES")); entry = &(child_aclp->acl_entry[child_aclp->acl_cnt]); *entry = *parent_entry; child_aclp->acl_cnt++; entry->ae_flags &= ~ACL_ENTRY_INHERIT_ONLY; entry->ae_flags |= ACL_ENTRY_INHERITED; /* * If the type of the ACE is neither ALLOW nor DENY, * then leave it as it is and proceed to the next one. */ if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW && entry->ae_entry_type != ACL_ENTRY_TYPE_DENY) continue; /* * If the ACL_ENTRY_NO_PROPAGATE_INHERIT is set, or if * the object being created is not a directory, then clear * the following flags: ACL_ENTRY_NO_PROPAGATE_INHERIT, * ACL_ENTRY_FILE_INHERIT, ACL_ENTRY_DIRECTORY_INHERIT, * ACL_ENTRY_INHERIT_ONLY. 
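The three inheritability tests used here (and in acl_nfs4_inherit_entries()) compose into one predicate; a minimal sketch with illustrative flag values:

/*
 * Userspace sketch of the inheritability filter.
 */
#include <stdbool.h>
#include <stdio.h>

#define FL_FILE_INHERIT 0x1
#define FL_DIR_INHERIT  0x2
#define FL_NO_PROPAGATE 0x4

static bool
inheritable(int flags, bool is_directory)
{
        if ((flags & (FL_FILE_INHERIT | FL_DIR_INHERIT)) == 0)
                return (false); /* not inheritable at all */
        if (!is_directory && (flags & FL_FILE_INHERIT) == 0)
                return (false); /* files inherit only file entries */
        if (is_directory && (flags & FL_DIR_INHERIT) == 0 &&
            (flags & FL_NO_PROPAGATE))
                return (false); /* could never reach a file anyway */
        return (true);
}

int
main(void)
{
        printf("%d\n", inheritable(FL_FILE_INHERIT, false));    /* 1 */
        printf("%d\n", inheritable(FL_FILE_INHERIT | FL_NO_PROPAGATE,
            true));                                             /* 0 */
        return (0);
}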
*/ if (entry->ae_flags & ACL_ENTRY_NO_PROPAGATE_INHERIT || !is_directory) { entry->ae_flags &= ~(ACL_ENTRY_NO_PROPAGATE_INHERIT | ACL_ENTRY_FILE_INHERIT | ACL_ENTRY_DIRECTORY_INHERIT | ACL_ENTRY_INHERIT_ONLY); } /* * If the object is a directory and ACL_ENTRY_FILE_INHERIT * is set, but ACL_ENTRY_DIRECTORY_INHERIT is not set, ensure * that ACL_ENTRY_INHERIT_ONLY is set. */ if (is_directory && (entry->ae_flags & ACL_ENTRY_FILE_INHERIT) && ((entry->ae_flags & ACL_ENTRY_DIRECTORY_INHERIT) == 0)) { entry->ae_flags |= ACL_ENTRY_INHERIT_ONLY; } if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW && (entry->ae_flags & ACL_ENTRY_INHERIT_ONLY) == 0) { /* * Some permissions must never be inherited. */ entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER | ACL_WRITE_NAMED_ATTRS | ACL_WRITE_ATTRIBUTES); /* * Others must be masked according to the file mode. */ if ((mode & S_IRGRP) == 0) entry->ae_perm &= ~ACL_READ_DATA; if ((mode & S_IWGRP) == 0) entry->ae_perm &= ~(ACL_WRITE_DATA | ACL_APPEND_DATA); if ((mode & S_IXGRP) == 0) entry->ae_perm &= ~ACL_EXECUTE; } } } /* * Calculate inherited ACL in a manner compatible with PSARC/2010/029. * It's also being used to calculate a trivial ACL, by inheriting from * a NULL ACL. */ static void acl_nfs4_compute_inherited_acl_psarc(const struct acl *parent_aclp, struct acl *aclp, mode_t mode, int file_owner_id, int is_directory) { acl_perm_t user_allow_first = 0, user_deny = 0, group_deny = 0; acl_perm_t user_allow, group_allow, everyone_allow; KASSERT(aclp->acl_cnt == 0, ("aclp->acl_cnt == 0")); user_allow = group_allow = everyone_allow = ACL_READ_ACL | ACL_READ_ATTRIBUTES | ACL_READ_NAMED_ATTRS | ACL_SYNCHRONIZE; user_allow |= ACL_WRITE_ACL | ACL_WRITE_OWNER | ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS; if (mode & S_IRUSR) user_allow |= ACL_READ_DATA; if (mode & S_IWUSR) user_allow |= (ACL_WRITE_DATA | ACL_APPEND_DATA); if (mode & S_IXUSR) user_allow |= ACL_EXECUTE; if (mode & S_IRGRP) group_allow |= ACL_READ_DATA; if (mode & S_IWGRP) group_allow |= (ACL_WRITE_DATA | ACL_APPEND_DATA); if (mode & S_IXGRP) group_allow |= ACL_EXECUTE; if (mode & S_IROTH) everyone_allow |= ACL_READ_DATA; if (mode & S_IWOTH) everyone_allow |= (ACL_WRITE_DATA | ACL_APPEND_DATA); if (mode & S_IXOTH) everyone_allow |= ACL_EXECUTE; user_deny = ((group_allow | everyone_allow) & ~user_allow); group_deny = everyone_allow & ~group_allow; user_allow_first = group_deny & ~user_deny; if (user_allow_first != 0) _acl_append(aclp, ACL_USER_OBJ, user_allow_first, ACL_ENTRY_TYPE_ALLOW); if (user_deny != 0) _acl_append(aclp, ACL_USER_OBJ, user_deny, ACL_ENTRY_TYPE_DENY); if (group_deny != 0) _acl_append(aclp, ACL_GROUP_OBJ, group_deny, ACL_ENTRY_TYPE_DENY); if (parent_aclp != NULL) acl_nfs4_inherit_entries(parent_aclp, aclp, mode, file_owner_id, is_directory); _acl_append(aclp, ACL_USER_OBJ, user_allow, ACL_ENTRY_TYPE_ALLOW); _acl_append(aclp, ACL_GROUP_OBJ, group_allow, ACL_ENTRY_TYPE_ALLOW); _acl_append(aclp, ACL_EVERYONE, everyone_allow, ACL_ENTRY_TYPE_ALLOW); } #ifdef _KERNEL void acl_nfs4_compute_inherited_acl(const struct acl *parent_aclp, struct acl *child_aclp, mode_t mode, int file_owner_id, int is_directory) { if (acl_nfs4_old_semantics) acl_nfs4_compute_inherited_acl_draft(parent_aclp, child_aclp, mode, file_owner_id, is_directory); else acl_nfs4_compute_inherited_acl_psarc(parent_aclp, child_aclp, mode, file_owner_id, is_directory); } #endif /* _KERNEL */ /* * Calculate trivial ACL in a manner compatible with PSARC/2010/029. 
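To see why acl_nfs4_compute_inherited_acl_psarc() above may need deny entries, and sometimes an owner@ allow ahead of them, consider mode 0647 (owner rw-, group r--, other rwx). Since ACL entries are evaluated in order, the group@ deny of write would also hit the owner unless the owner is allowed write first; that is exactly what user_allow_first captures. A standalone re-run of the arithmetic, with local stand-ins for the ACL_* permission bits:

	#include <stdio.h>

	#define READ_DATA	0x01
	#define WRITE_DATA	0x02
	#define APPEND_DATA	0x04
	#define EXECUTE		0x08

	int
	main(void)
	{
		/* mode 0647: owner rw-, group r--, other rwx */
		int user_allow = READ_DATA | WRITE_DATA | APPEND_DATA;
		int group_allow = READ_DATA;
		int everyone_allow = READ_DATA | WRITE_DATA | APPEND_DATA |
		    EXECUTE;
		int user_deny, group_deny, user_allow_first;

		/* Same arithmetic as acl_nfs4_compute_inherited_acl_psarc(). */
		user_deny = (group_allow | everyone_allow) & ~user_allow;
		group_deny = everyone_allow & ~group_allow;
		user_allow_first = group_deny & ~user_deny;

		printf("user_deny %#x group_deny %#x user_allow_first %#x\n",
		    user_deny, group_deny, user_allow_first);
		/* Prints: user_deny 0x8 group_deny 0xe user_allow_first 0x6 */
		return (0);
	}

So the owner is denied execute, the group is denied write/append/execute, and write/append must be allowed to the owner before the group@ deny entry is appended.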
* Note that this results in an ACL different from (but semantically * equal to) the "canonical six" trivial ACL computed using algorithm * described in draft-ietf-nfsv4-minorversion1-03.txt, 3.16.6.2. */ static void acl_nfs4_trivial_from_mode(struct acl *aclp, mode_t mode) { aclp->acl_cnt = 0; acl_nfs4_compute_inherited_acl_psarc(NULL, aclp, mode, -1, -1); } #ifndef _KERNEL /* * This routine is used by libc to implement acl_strip_np(3) * and acl_is_trivial_np(3). */ void acl_nfs4_trivial_from_mode_libc(struct acl *aclp, int mode, int canonical_six) { aclp->acl_cnt = 0; if (canonical_six) acl_nfs4_sync_acl_from_mode_draft(aclp, mode, -1); else acl_nfs4_trivial_from_mode(aclp, mode); } #endif /* !_KERNEL */ #ifdef _KERNEL static int _acls_are_equal(const struct acl *a, const struct acl *b) { int i; const struct acl_entry *entrya, *entryb; if (a->acl_cnt != b->acl_cnt) return (0); for (i = 0; i < b->acl_cnt; i++) { entrya = &(a->acl_entry[i]); entryb = &(b->acl_entry[i]); if (entrya->ae_tag != entryb->ae_tag || entrya->ae_id != entryb->ae_id || entrya->ae_perm != entryb->ae_perm || entrya->ae_entry_type != entryb->ae_entry_type || entrya->ae_flags != entryb->ae_flags) return (0); } return (1); } /* * This routine is used to determine whether to remove extended attribute * that stores ACL contents. */ int acl_nfs4_is_trivial(const struct acl *aclp, int file_owner_id) { int trivial; mode_t tmpmode = 0; struct acl *tmpaclp; if (aclp->acl_cnt > 6) return (0); /* * Compute the mode from the ACL, then compute new ACL from that mode. * If the ACLs are identical, then the ACL is trivial. * * XXX: I guess there is a faster way to do this. However, even * this slow implementation significantly speeds things up * for files that don't have non-trivial ACLs - it's critical * for performance to not use EA when they are not needed. * * First try the PSARC/2010/029 semantics. */ tmpaclp = acl_alloc(M_WAITOK | M_ZERO); acl_nfs4_sync_mode_from_acl(&tmpmode, aclp); acl_nfs4_trivial_from_mode(tmpaclp, tmpmode); trivial = _acls_are_equal(aclp, tmpaclp); if (trivial) { acl_free(tmpaclp); return (trivial); } /* * Check if it's a draft-ietf-nfsv4-minorversion1-03.txt trivial ACL. */ tmpaclp->acl_cnt = 0; acl_nfs4_sync_acl_from_mode_draft(tmpaclp, tmpmode, file_owner_id); trivial = _acls_are_equal(aclp, tmpaclp); acl_free(tmpaclp); return (trivial); } #endif /* _KERNEL */ int acl_nfs4_check(const struct acl *aclp, int is_directory) { int i; const struct acl_entry *entry; /* * The spec doesn't seem to say anything about ACL validity. * It seems there is not much to do here. There is even no need * to count "owner@" or "everyone@" (ACL_USER_OBJ and ACL_EVERYONE) * entries, as there can be several of them and that's perfectly * valid. There can be none of them too. Really. */ if (aclp->acl_cnt > ACL_MAX_ENTRIES || aclp->acl_cnt <= 0) return (EINVAL); for (i = 0; i < aclp->acl_cnt; i++) { entry = &(aclp->acl_entry[i]); switch (entry->ae_tag) { case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_EVERYONE: if (entry->ae_id != ACL_UNDEFINED_ID) return (EINVAL); break; case ACL_USER: case ACL_GROUP: if (entry->ae_id == ACL_UNDEFINED_ID) return (EINVAL); break; default: return (EINVAL); } if ((entry->ae_perm | ACL_NFS4_PERM_BITS) != ACL_NFS4_PERM_BITS) return (EINVAL); /* * Disallow ACL_ENTRY_TYPE_AUDIT and ACL_ENTRY_TYPE_ALARM for now. 
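The permission check in acl_nfs4_check() above uses the subset idiom (x | mask) != mask, which is true exactly when x carries a bit outside the mask. A quick standalone check of the idiom:

	#include <assert.h>

	int
	main(void)
	{
		const unsigned mask = 0x0f;

		assert((0x05u | mask) == mask);	/* 0x05 lies within the mask */
		assert((0x15u | mask) != mask);	/* 0x10 falls outside it */
		return (0);
	}

The same pattern recurs below for ae_flags against ACL_FLAGS_BITS, and again in acl_posix1e_check() against ACL_PERM_BITS.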
*/ if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW && entry->ae_entry_type != ACL_ENTRY_TYPE_DENY) return (EINVAL); if ((entry->ae_flags | ACL_FLAGS_BITS) != ACL_FLAGS_BITS) return (EINVAL); /* Disallow unimplemented flags. */ if (entry->ae_flags & (ACL_ENTRY_SUCCESSFUL_ACCESS | ACL_ENTRY_FAILED_ACCESS)) return (EINVAL); /* Disallow flags not allowed for ordinary files. */ if (!is_directory) { if (entry->ae_flags & (ACL_ENTRY_FILE_INHERIT | ACL_ENTRY_DIRECTORY_INHERIT | ACL_ENTRY_NO_PROPAGATE_INHERIT | ACL_ENTRY_INHERIT_ONLY)) return (EINVAL); } } return (0); } #ifdef _KERNEL static int acl_nfs4_modload(module_t module, int what, void *arg) { int ret; ret = 0; switch (what) { case MOD_LOAD: case MOD_SHUTDOWN: break; case MOD_QUIESCE: /* XXX TODO */ ret = 0; break; case MOD_UNLOAD: /* XXX TODO */ ret = 0; break; default: ret = EINVAL; break; } return (ret); } static moduledata_t acl_nfs4_mod = { "acl_nfs4", acl_nfs4_modload, NULL }; /* * XXX TODO: which subsystem, order? */ DECLARE_MODULE(acl_nfs4, acl_nfs4_mod, SI_SUB_VFS, SI_ORDER_FIRST); MODULE_VERSION(acl_nfs4, 1); #endif /* _KERNEL */ Index: head/sys/kern/subr_acl_posix1e.c =================================================================== --- head/sys/kern/subr_acl_posix1e.c (revision 326270) +++ head/sys/kern/subr_acl_posix1e.c (revision 326271) @@ -1,691 +1,693 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1999-2006 Robert N. M. Watson * All rights reserved. * * This software was developed by Robert Watson for the TrustedBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Developed by the TrustedBSD Project. * * ACL support routines specific to POSIX.1e access control lists. These are * utility routines for code common across file systems implementing POSIX.1e * ACLs. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include /* * Implement a version of vaccess() that understands POSIX.1e ACL semantics; * the access ACL has already been prepared for evaluation by the file system * and is passed via 'uid', 'gid', and 'acl'. Return 0 on success, else an * errno value. 
*/ int vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *acl, accmode_t accmode, struct ucred *cred, int *privused) { struct acl_entry *acl_other, *acl_mask; accmode_t dac_granted; accmode_t priv_granted; accmode_t acl_mask_granted; int group_matched, i; KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, ("invalid bit in accmode")); KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), ("VAPPEND without VWRITE")); /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. Otherwise, attempt to * use privileges granted via priv_granted. In some cases, which * privileges to use may be ambiguous due to "best match", in which * case fall back on first match for the time being. */ if (privused != NULL) *privused = 0; /* * Determine privileges now, but don't apply until we've found a DAC * entry that matches but has failed to allow access. * * XXXRW: Ideally, we'd determine the privileges required before * asking for them. */ priv_granted = 0; if (type == VDIR) { if ((accmode & VEXEC) && !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0)) priv_granted |= VEXEC; } else { /* * Ensure that at least one execute bit is on. Otherwise, * a privileged user will always succeed, and we don't want * this to happen unless the file really is executable. */ if ((accmode & VEXEC) && (acl_posix1e_acl_to_mode(acl) & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && !priv_check_cred(cred, PRIV_VFS_EXEC, 0)) priv_granted |= VEXEC; } if ((accmode & VREAD) && !priv_check_cred(cred, PRIV_VFS_READ, 0)) priv_granted |= VREAD; if (((accmode & VWRITE) || (accmode & VAPPEND)) && !priv_check_cred(cred, PRIV_VFS_WRITE, 0)) priv_granted |= (VWRITE | VAPPEND); if ((accmode & VADMIN) && !priv_check_cred(cred, PRIV_VFS_ADMIN, 0)) priv_granted |= VADMIN; /* * The owner matches if the effective uid associated with the * credential matches that of the ACL_USER_OBJ entry. While we're * doing the first scan, also cache the location of the ACL_MASK and * ACL_OTHER entries, preventing some future iterations. */ acl_mask = acl_other = NULL; for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_USER_OBJ: if (file_uid != cred->cr_uid) break; dac_granted = 0; dac_granted |= VADMIN; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); /* * XXXRW: Do privilege lookup here. */ if ((accmode & (dac_granted | priv_granted)) == accmode) { if (privused != NULL) *privused = 1; return (0); } goto error; case ACL_MASK: acl_mask = &acl->acl_entry[i]; break; case ACL_OTHER: acl_other = &acl->acl_entry[i]; break; default: break; } } /* * An ACL_OTHER entry should always exist in a valid access ACL. If * it doesn't, then generate a serious failure. For now, this means * a debugging message and EPERM, but in the future should probably * be a panic. */ if (acl_other == NULL) { /* * XXX This should never happen */ printf("vaccess_acl_posix1e: ACL_OTHER missing\n"); return (EPERM); } /* * Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields are * masked by an ACL_MASK entry, if any. As such, first identify the * ACL_MASK field, then iterate through identifying potential user * matches, then group matches. If there is no ACL_MASK, assume that * the mask allows all requests to succeed. 
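The masking that follows is a plain bitwise AND clamping each user/group grant: ACL_USER_OBJ and ACL_OTHER are never clamped. For instance, an ACL_USER entry granting rw- under an ACL_MASK of r-- yields an effective grant of r--, so a request for rw fails on that entry. A minimal standalone illustration, with local stand-ins for the V* access bits:

	#include <stdio.h>

	#define VEXEC	0x1
	#define VREAD	0x2
	#define VWRITE	0x4

	int
	main(void)
	{
		int dac_granted = VREAD | VWRITE;	/* ACL_USER entry: rw- */
		int acl_mask_granted = VREAD;		/* ACL_MASK entry:  r-- */
		int accmode = VREAD | VWRITE;		/* caller wants rw */

		dac_granted &= acl_mask_granted;	/* effective grant: r-- */
		printf("%s\n", (accmode & dac_granted) == accmode ?
		    "granted" : "denied");		/* prints "denied" */
		return (0);
	}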
*/ if (acl_mask != NULL) { acl_mask_granted = 0; if (acl_mask->ae_perm & ACL_EXECUTE) acl_mask_granted |= VEXEC; if (acl_mask->ae_perm & ACL_READ) acl_mask_granted |= VREAD; if (acl_mask->ae_perm & ACL_WRITE) acl_mask_granted |= (VWRITE | VAPPEND); } else acl_mask_granted = VEXEC | VREAD | VWRITE | VAPPEND; /* * Check ACL_USER ACL entries. There will either be one or no * matches; if there is one, we accept or reject based on the * match; otherwise, we continue on to groups. */ for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_USER: if (acl->acl_entry[i].ae_id != cred->cr_uid) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= (VWRITE | VAPPEND); dac_granted &= acl_mask_granted; if ((accmode & dac_granted) == accmode) return (0); /* * XXXRW: Do privilege lookup here. */ if ((accmode & (dac_granted | priv_granted)) != accmode) goto error; if (privused != NULL) *privused = 1; return (0); } } /* * Group match is best-match, not first-match, so find a "best" * match. Iterate across, testing each potential group match. Make * sure we keep track of whether we found a match or not, so that we * know if we should try again with any available privilege, or if we * should move on to ACL_OTHER. */ group_matched = 0; for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_GROUP_OBJ: if (!groupmember(file_gid, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= (VWRITE | VAPPEND); dac_granted &= acl_mask_granted; if ((accmode & dac_granted) == accmode) return (0); group_matched = 1; break; case ACL_GROUP: if (!groupmember(acl->acl_entry[i].ae_id, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= (VWRITE | VAPPEND); dac_granted &= acl_mask_granted; if ((accmode & dac_granted) == accmode) return (0); group_matched = 1; break; default: break; } } if (group_matched == 1) { /* * There was a match, but it did not grant rights via pure * DAC. Try again, this time with privilege. */ for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_GROUP_OBJ: if (!groupmember(file_gid, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= (VWRITE | VAPPEND); dac_granted &= acl_mask_granted; /* * XXXRW: Do privilege lookup here. */ if ((accmode & (dac_granted | priv_granted)) != accmode) break; if (privused != NULL) *privused = 1; return (0); case ACL_GROUP: if (!groupmember(acl->acl_entry[i].ae_id, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= (VWRITE | VAPPEND); dac_granted &= acl_mask_granted; /* * XXXRW: Do privilege lookup here. 
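A consequence of the "best-match" group loops above is that each matching entry must satisfy the whole request by itself; permissions are never accumulated across entries. A request that only the union of two group grants would cover is therefore denied, as this standalone sketch shows:

	#include <stdio.h>

	#define VREAD	0x2
	#define VWRITE	0x4

	int
	main(void)
	{
		/* Two matching ACL_GROUP entries: one grants r, one w. */
		int grants[2] = { VREAD, VWRITE };
		int accmode = VREAD | VWRITE;
		int i, ok = 0;

		for (i = 0; i < 2; i++)
			if ((accmode & grants[i]) == accmode)
				ok = 1;	/* never taken: no single entry grants rw */
		printf("%s\n", ok ? "granted" : "denied");	/* "denied" */
		return (0);
	}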
*/ if ((accmode & (dac_granted | priv_granted)) != accmode) break; if (privused != NULL) *privused = 1; return (0); default: break; } } /* * Even with privilege, group membership was not sufficient. * Return failure. */ goto error; } /* * Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER. */ dac_granted = 0; if (acl_other->ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl_other->ae_perm & ACL_READ) dac_granted |= VREAD; if (acl_other->ae_perm & ACL_WRITE) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); /* * XXXRW: Do privilege lookup here. */ if ((accmode & (dac_granted | priv_granted)) == accmode) { if (privused != NULL) *privused = 1; return (0); } error: return ((accmode & VADMIN) ? EPERM : EACCES); } /* * For the purposes of filesystems maintaining the _OBJ entries in an inode * with a mode_t field, this routine converts a mode_t entry to an * acl_perm_t. */ acl_perm_t acl_posix1e_mode_to_perm(acl_tag_t tag, mode_t mode) { acl_perm_t perm = 0; switch(tag) { case ACL_USER_OBJ: if (mode & S_IXUSR) perm |= ACL_EXECUTE; if (mode & S_IRUSR) perm |= ACL_READ; if (mode & S_IWUSR) perm |= ACL_WRITE; return (perm); case ACL_GROUP_OBJ: if (mode & S_IXGRP) perm |= ACL_EXECUTE; if (mode & S_IRGRP) perm |= ACL_READ; if (mode & S_IWGRP) perm |= ACL_WRITE; return (perm); case ACL_OTHER: if (mode & S_IXOTH) perm |= ACL_EXECUTE; if (mode & S_IROTH) perm |= ACL_READ; if (mode & S_IWOTH) perm |= ACL_WRITE; return (perm); default: printf("acl_posix1e_mode_to_perm: invalid tag (%d)\n", tag); return (0); } } /* * Given inode information (uid, gid, mode), return an acl entry of the * appropriate type. */ struct acl_entry acl_posix1e_mode_to_entry(acl_tag_t tag, uid_t uid, gid_t gid, mode_t mode) { struct acl_entry acl_entry; acl_entry.ae_tag = tag; acl_entry.ae_perm = acl_posix1e_mode_to_perm(tag, mode); acl_entry.ae_entry_type = 0; acl_entry.ae_flags = 0; switch(tag) { case ACL_USER_OBJ: acl_entry.ae_id = uid; break; case ACL_GROUP_OBJ: acl_entry.ae_id = gid; break; case ACL_OTHER: acl_entry.ae_id = ACL_UNDEFINED_ID; break; default: acl_entry.ae_id = ACL_UNDEFINED_ID; printf("acl_posix1e_mode_to_entry: invalid tag (%d)\n", tag); } return (acl_entry); } /* * Utility function to generate a file mode given appropriate ACL entries. */ mode_t acl_posix1e_perms_to_mode(struct acl_entry *acl_user_obj_entry, struct acl_entry *acl_group_obj_entry, struct acl_entry *acl_other_entry) { mode_t mode; mode = 0; if (acl_user_obj_entry->ae_perm & ACL_EXECUTE) mode |= S_IXUSR; if (acl_user_obj_entry->ae_perm & ACL_READ) mode |= S_IRUSR; if (acl_user_obj_entry->ae_perm & ACL_WRITE) mode |= S_IWUSR; if (acl_group_obj_entry->ae_perm & ACL_EXECUTE) mode |= S_IXGRP; if (acl_group_obj_entry->ae_perm & ACL_READ) mode |= S_IRGRP; if (acl_group_obj_entry->ae_perm & ACL_WRITE) mode |= S_IWGRP; if (acl_other_entry->ae_perm & ACL_EXECUTE) mode |= S_IXOTH; if (acl_other_entry->ae_perm & ACL_READ) mode |= S_IROTH; if (acl_other_entry->ae_perm & ACL_WRITE) mode |= S_IWOTH; return (mode); } /* * Utility function to generate a file mode given a complete POSIX.1e access * ACL. Note that if the ACL is improperly formed, this may result in a * panic. */ mode_t acl_posix1e_acl_to_mode(struct acl *acl) { struct acl_entry *acl_mask, *acl_user_obj, *acl_group_obj, *acl_other; int i; /* * Find the ACL entries relevant to a POSIX permission mode. 
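The three conversion helpers above compose naturally: applying acl_posix1e_mode_to_entry() once per base tag synthesizes the trivial access ACL for an inode. A kernel-context sketch (the function name acl_from_inode_sketch is made up for illustration):

	/*
	 * Build the three mandatory base entries of a POSIX.1e access
	 * ACL from an inode's uid/gid/mode.
	 */
	static void
	acl_from_inode_sketch(struct acl *aclp, uid_t uid, gid_t gid,
	    mode_t mode)
	{
		aclp->acl_cnt = 3;
		aclp->acl_entry[0] = acl_posix1e_mode_to_entry(ACL_USER_OBJ,
		    uid, gid, mode);
		aclp->acl_entry[1] = acl_posix1e_mode_to_entry(ACL_GROUP_OBJ,
		    uid, gid, mode);
		aclp->acl_entry[2] = acl_posix1e_mode_to_entry(ACL_OTHER,
		    uid, gid, mode);
	}

Feeding the result back through acl_posix1e_acl_to_mode() reproduces the permission bits of the original mode, which is the round-trip property these helpers are built around.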
*/ acl_user_obj = acl_group_obj = acl_other = acl_mask = NULL; for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_USER_OBJ: acl_user_obj = &acl->acl_entry[i]; break; case ACL_GROUP_OBJ: acl_group_obj = &acl->acl_entry[i]; break; case ACL_OTHER: acl_other = &acl->acl_entry[i]; break; case ACL_MASK: acl_mask = &acl->acl_entry[i]; break; case ACL_USER: case ACL_GROUP: break; default: panic("acl_posix1e_acl_to_mode: bad ae_tag"); } } if (acl_user_obj == NULL || acl_group_obj == NULL || acl_other == NULL) panic("acl_posix1e_acl_to_mode: missing base ae_tags"); /* * POSIX.1e specifies that if there is an ACL_MASK entry, we replace * the mode "group" bits with its permissions. If there isn't, we * use the ACL_GROUP_OBJ permissions. */ if (acl_mask != NULL) return (acl_posix1e_perms_to_mode(acl_user_obj, acl_mask, acl_other)); else return (acl_posix1e_perms_to_mode(acl_user_obj, acl_group_obj, acl_other)); } /* * Perform a syntactic check of the ACL, sufficient to allow an implementing * filesystem to determine if it should accept this and rely on the POSIX.1e * ACL properties. */ int acl_posix1e_check(struct acl *acl) { int num_acl_user_obj, num_acl_user, num_acl_group_obj, num_acl_group; int num_acl_mask, num_acl_other, i; /* * Verify that the number of entries does not exceed the maximum * defined for acl_t. * * Verify that the correct number of various sorts of ae_tags are * present: * Exactly one ACL_USER_OBJ * Exactly one ACL_GROUP_OBJ * Exactly one ACL_OTHER * If any ACL_USER or ACL_GROUP entries appear, then exactly one * ACL_MASK entry must also appear. * * Verify that all ae_perm entries are in ACL_PERM_BITS. * * Verify all ae_tag entries are understood by this implementation. * * Note: Does not check for uniqueness of qualifier (ae_id) field. */ num_acl_user_obj = num_acl_user = num_acl_group_obj = num_acl_group = num_acl_mask = num_acl_other = 0; if (acl->acl_cnt > ACL_MAX_ENTRIES) return (EINVAL); for (i = 0; i < acl->acl_cnt; i++) { /* * Check for a valid tag. */ switch(acl->acl_entry[i].ae_tag) { case ACL_USER_OBJ: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_user_obj++; break; case ACL_GROUP_OBJ: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_group_obj++; break; case ACL_USER: if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) return (EINVAL); num_acl_user++; break; case ACL_GROUP: if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) return (EINVAL); num_acl_group++; break; case ACL_OTHER: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_other++; break; case ACL_MASK: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_mask++; break; default: return (EINVAL); } /* * Check for valid perm entries. */ if ((acl->acl_entry[i].ae_perm | ACL_PERM_BITS) != ACL_PERM_BITS) return (EINVAL); } if ((num_acl_user_obj != 1) || (num_acl_group_obj != 1) || (num_acl_other != 1) || (num_acl_mask != 0 && num_acl_mask != 1)) return (EINVAL); if (((num_acl_group != 0) || (num_acl_user != 0)) && (num_acl_mask != 1)) return (EINVAL); return (0); } /* * Given a requested mode for a new object, and a default ACL, combine the * two to produce a new mode. Be careful not to clear any bits that aren't * intended to be affected by the POSIX.1e ACL. 
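The composition described above amounts to an AND of the creation mode and the default ACL's mode within the override range, with everything else preserved. A worked standalone example, assuming (as the mask names suggest) that ACL_OVERRIDE_MASK covers exactly the 0777 permission bits; the OVERRIDE_MASK/PRESERVE_MASK names below are local stand-ins:

	#include <stdio.h>
	#include <sys/stat.h>

	#define OVERRIDE_MASK	(S_IRWXU | S_IRWXG | S_IRWXO)	/* 0777 */
	#define PRESERVE_MASK	(~OVERRIDE_MASK)

	int
	main(void)
	{
		mode_t cmode = 0666;		/* mode requested by the caller */
		mode_t dacl_mode = 0750;	/* mode implied by the default ACL */
		mode_t mode;

		mode = (cmode & PRESERVE_MASK) |
		    (OVERRIDE_MASK & cmode & dacl_mode);
		printf("0%o\n", (unsigned)mode);	/* prints 0640 */
		return (0);
	}

A bit appears in the result only when it is present in both inputs: 0666 & 0750 leaves the owner rw, group r, other nothing.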
Eventually, this might also * take the cmask as an argument, if we push that down into * per-filesystem code. */ mode_t acl_posix1e_newfilemode(mode_t cmode, struct acl *dacl) { mode_t mode; mode = cmode; /* * The current composition policy is that a permission bit must be * set in *both* the ACL and the requested creation mode for it to * appear in the resulting mode/ACL. First clear any possibly * affected bits, then reconstruct. */ mode &= ACL_PRESERVE_MASK; mode |= (ACL_OVERRIDE_MASK & cmode & acl_posix1e_acl_to_mode(dacl)); return (mode); } static int acl_posix1e_modload(module_t mod, int what, void *arg) { int ret; ret = 0; switch (what) { case MOD_LOAD: case MOD_SHUTDOWN: break; case MOD_QUIESCE: /* XXX TODO */ ret = 0; break; case MOD_UNLOAD: /* XXX TODO */ ret = 0; break; default: ret = EINVAL; break; } return (ret); } static moduledata_t acl_posix1e_mod = { "acl_posix1e", acl_posix1e_modload, NULL }; DECLARE_MODULE(acl_posix1e, acl_posix1e_mod, SI_SUB_VFS, SI_ORDER_FIRST); MODULE_VERSION(acl_posix1e, 1); Index: head/sys/kern/subr_bufring.c =================================================================== --- head/sys/kern/subr_bufring.c (revision 326270) +++ head/sys/kern/subr_bufring.c (revision 326271) @@ -1,65 +1,67 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2007, 2008 Kip Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include struct buf_ring * buf_ring_alloc(int count, struct malloc_type *type, int flags, struct mtx *lock) { struct buf_ring *br; KASSERT(powerof2(count), ("buf ring must be size power of 2")); br = malloc(sizeof(struct buf_ring) + count*sizeof(caddr_t), type, flags|M_ZERO); if (br == NULL) return (NULL); #ifdef DEBUG_BUFRING br->br_lock = lock; #endif br->br_prod_size = br->br_cons_size = count; br->br_prod_mask = br->br_cons_mask = count-1; br->br_prod_head = br->br_cons_head = 0; br->br_prod_tail = br->br_cons_tail = 0; return (br); } void buf_ring_free(struct buf_ring *br, struct malloc_type *type) { free(br, type); } Index: head/sys/kern/subr_bus.c =================================================================== --- head/sys/kern/subr_bus.c (revision 326270) +++ head/sys/kern/subr_bus.c (revision 326271) @@ -1,5625 +1,5627 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1997,1998,2003 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_bus.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_NODE(_hw, OID_AUTO, bus, CTLFLAG_RW, NULL, NULL); SYSCTL_ROOT_NODE(OID_AUTO, dev, CTLFLAG_RW, NULL, NULL); /* * Used to attach drivers to devclasses. 
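Stepping back to buf_ring_alloc() above: the powerof2() KASSERT exists so that producer and consumer indices can wrap with a cheap AND against br_prod_mask/br_cons_mask (count - 1) instead of a modulo. A quick standalone check of that equivalence:

	#include <assert.h>

	int
	main(void)
	{
		const int count = 8;		/* must be a power of two */
		const int mask = count - 1;	/* br_prod_mask / br_cons_mask */
		int i;

		/*
		 * With a power-of-two size, AND-ing with the mask equals
		 * taking the index modulo the ring size.
		 */
		for (i = 0; i < 4 * count; i++)
			assert((i & mask) == i % count);
		return (0);
	}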
*/ typedef struct driverlink *driverlink_t; struct driverlink { kobj_class_t driver; TAILQ_ENTRY(driverlink) link; /* list of drivers in devclass */ int pass; TAILQ_ENTRY(driverlink) passlink; }; /* * Forward declarations */ typedef TAILQ_HEAD(devclass_list, devclass) devclass_list_t; typedef TAILQ_HEAD(driver_list, driverlink) driver_list_t; typedef TAILQ_HEAD(device_list, device) device_list_t; struct devclass { TAILQ_ENTRY(devclass) link; devclass_t parent; /* parent in devclass hierarchy */ driver_list_t drivers; /* bus devclasses store drivers for bus */ char *name; device_t *devices; /* array of devices indexed by unit */ int maxunit; /* size of devices array */ int flags; #define DC_HAS_CHILDREN 1 struct sysctl_ctx_list sysctl_ctx; struct sysctl_oid *sysctl_tree; }; /** * @brief Implementation of device. */ struct device { /* * A device is a kernel object. The first field must be the * current ops table for the object. */ KOBJ_FIELDS; /* * Device hierarchy. */ TAILQ_ENTRY(device) link; /**< list of devices in parent */ TAILQ_ENTRY(device) devlink; /**< global device list membership */ device_t parent; /**< parent of this device */ device_list_t children; /**< list of child devices */ /* * Details of this device. */ driver_t *driver; /**< current driver */ devclass_t devclass; /**< current device class */ int unit; /**< current unit number */ char* nameunit; /**< name+unit e.g. foodev0 */ char* desc; /**< driver specific description */ int busy; /**< count of calls to device_busy() */ device_state_t state; /**< current device state */ uint32_t devflags; /**< api level flags for device_get_flags() */ u_int flags; /**< internal device flags */ u_int order; /**< order from device_add_child_ordered() */ void *ivars; /**< instance variables */ void *softc; /**< current driver's variables */ struct sysctl_ctx_list sysctl_ctx; /**< state for sysctl variables */ struct sysctl_oid *sysctl_tree; /**< state for sysctl variables */ }; static MALLOC_DEFINE(M_BUS, "bus", "Bus data structures"); static MALLOC_DEFINE(M_BUS_SC, "bus-sc", "Bus data structures, softc"); static void devctl2_init(void); #define DRIVERNAME(d) ((d)? d->name : "no driver") #define DEVCLANAME(d) ((d)? d->name : "no devclass") #ifdef BUS_DEBUG static int bus_debug = 1; SYSCTL_INT(_debug, OID_AUTO, bus_debug, CTLFLAG_RWTUN, &bus_debug, 0, "Bus debug level"); #define PDEBUG(a) if (bus_debug) {printf("%s:%d: ", __func__, __LINE__), printf a; printf("\n");} #define DEVICENAME(d) ((d)? device_get_name(d): "no device") /** * Produce the indenting, indent*2 spaces plus a '.' ahead of that to * prevent syslog from deleting initial spaces */ #define indentprintf(p) do { int iJ; printf("."); for (iJ=0; iJparent ? 
dc->parent->name : ""; break; default: return (EINVAL); } return (SYSCTL_OUT_STR(req, value)); } static void devclass_sysctl_init(devclass_t dc) { if (dc->sysctl_tree != NULL) return; sysctl_ctx_init(&dc->sysctl_ctx); dc->sysctl_tree = SYSCTL_ADD_NODE(&dc->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dev), OID_AUTO, dc->name, CTLFLAG_RD, NULL, ""); SYSCTL_ADD_PROC(&dc->sysctl_ctx, SYSCTL_CHILDREN(dc->sysctl_tree), OID_AUTO, "%parent", CTLTYPE_STRING | CTLFLAG_RD, dc, DEVCLASS_SYSCTL_PARENT, devclass_sysctl_handler, "A", "parent class"); } enum { DEVICE_SYSCTL_DESC, DEVICE_SYSCTL_DRIVER, DEVICE_SYSCTL_LOCATION, DEVICE_SYSCTL_PNPINFO, DEVICE_SYSCTL_PARENT, }; static int device_sysctl_handler(SYSCTL_HANDLER_ARGS) { device_t dev = (device_t)arg1; const char *value; char *buf; int error; buf = NULL; switch (arg2) { case DEVICE_SYSCTL_DESC: value = dev->desc ? dev->desc : ""; break; case DEVICE_SYSCTL_DRIVER: value = dev->driver ? dev->driver->name : ""; break; case DEVICE_SYSCTL_LOCATION: value = buf = malloc(1024, M_BUS, M_WAITOK | M_ZERO); bus_child_location_str(dev, buf, 1024); break; case DEVICE_SYSCTL_PNPINFO: value = buf = malloc(1024, M_BUS, M_WAITOK | M_ZERO); bus_child_pnpinfo_str(dev, buf, 1024); break; case DEVICE_SYSCTL_PARENT: value = dev->parent ? dev->parent->nameunit : ""; break; default: return (EINVAL); } error = SYSCTL_OUT_STR(req, value); if (buf != NULL) free(buf, M_BUS); return (error); } static void device_sysctl_init(device_t dev) { devclass_t dc = dev->devclass; int domain; if (dev->sysctl_tree != NULL) return; devclass_sysctl_init(dc); sysctl_ctx_init(&dev->sysctl_ctx); dev->sysctl_tree = SYSCTL_ADD_NODE_WITH_LABEL(&dev->sysctl_ctx, SYSCTL_CHILDREN(dc->sysctl_tree), OID_AUTO, dev->nameunit + strlen(dc->name), CTLFLAG_RD, NULL, "", "device_index"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%desc", CTLTYPE_STRING | CTLFLAG_RD, dev, DEVICE_SYSCTL_DESC, device_sysctl_handler, "A", "device description"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%driver", CTLTYPE_STRING | CTLFLAG_RD, dev, DEVICE_SYSCTL_DRIVER, device_sysctl_handler, "A", "device driver name"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%location", CTLTYPE_STRING | CTLFLAG_RD, dev, DEVICE_SYSCTL_LOCATION, device_sysctl_handler, "A", "device location relative to parent"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%pnpinfo", CTLTYPE_STRING | CTLFLAG_RD, dev, DEVICE_SYSCTL_PNPINFO, device_sysctl_handler, "A", "device identification"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%parent", CTLTYPE_STRING | CTLFLAG_RD, dev, DEVICE_SYSCTL_PARENT, device_sysctl_handler, "A", "parent device"); if (bus_get_domain(dev, &domain) == 0) SYSCTL_ADD_INT(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%domain", CTLFLAG_RD, NULL, domain, "NUMA domain"); } static void device_sysctl_update(device_t dev) { devclass_t dc = dev->devclass; if (dev->sysctl_tree == NULL) return; sysctl_rename_oid(dev->sysctl_tree, dev->nameunit + strlen(dc->name)); } static void device_sysctl_fini(device_t dev) { if (dev->sysctl_tree == NULL) return; sysctl_ctx_free(&dev->sysctl_ctx); dev->sysctl_tree = NULL; } /* * /dev/devctl implementation */ /* * This design allows only one reader for /dev/devctl. This is not desirable * in the long run, but will get a lot of hair out of this implementation. * Maybe we should make this device a clonable device. 
* * Also note: we specifically do not attach a device to the device_t tree * to avoid potential chicken and egg problems. One could argue that all * of this belongs to the root node. One could also further argue that the * sysctl interface that we have now might more properly be an ioctl * interface, but at this stage of the game, I'm not inclined to rock that * boat. * * I'm also not sure whether the SIGIO support is done correctly, as * I copied it from a driver that had SIGIO support that likely hasn't been * tested since 3.4 or 2.2.8! */ /* Deprecated way to adjust queue length */ static int sysctl_devctl_disable(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_hw_bus, OID_AUTO, devctl_disable, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 0, sysctl_devctl_disable, "I", "devctl disable -- deprecated"); #define DEVCTL_DEFAULT_QUEUE_LEN 1000 static int sysctl_devctl_queue(SYSCTL_HANDLER_ARGS); static int devctl_queue_length = DEVCTL_DEFAULT_QUEUE_LEN; SYSCTL_PROC(_hw_bus, OID_AUTO, devctl_queue, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 0, sysctl_devctl_queue, "I", "devctl queue length"); static d_open_t devopen; static d_close_t devclose; static d_read_t devread; static d_ioctl_t devioctl; static d_poll_t devpoll; static d_kqfilter_t devkqfilter; static struct cdevsw dev_cdevsw = { .d_version = D_VERSION, .d_open = devopen, .d_close = devclose, .d_read = devread, .d_ioctl = devioctl, .d_poll = devpoll, .d_kqfilter = devkqfilter, .d_name = "devctl", }; struct dev_event_info { char *dei_data; TAILQ_ENTRY(dev_event_info) dei_link; }; TAILQ_HEAD(devq, dev_event_info); static struct dev_softc { int inuse; int nonblock; int queued; int async; struct mtx mtx; struct cv cv; struct selinfo sel; struct devq devq; struct sigio *sigio; } devsoftc; static void filt_devctl_detach(struct knote *kn); static int filt_devctl_read(struct knote *kn, long hint); struct filterops devctl_rfiltops = { .f_isfd = 1, .f_detach = filt_devctl_detach, .f_event = filt_devctl_read, }; static struct cdev *devctl_dev; static void devinit(void) { devctl_dev = make_dev_credf(MAKEDEV_ETERNAL, &dev_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600, "devctl"); mtx_init(&devsoftc.mtx, "dev mtx", "devd", MTX_DEF); cv_init(&devsoftc.cv, "dev cv"); TAILQ_INIT(&devsoftc.devq); knlist_init_mtx(&devsoftc.sel.si_note, &devsoftc.mtx); devctl2_init(); } static int devopen(struct cdev *dev, int oflags, int devtype, struct thread *td) { mtx_lock(&devsoftc.mtx); if (devsoftc.inuse) { mtx_unlock(&devsoftc.mtx); return (EBUSY); } /* move to init */ devsoftc.inuse = 1; mtx_unlock(&devsoftc.mtx); return (0); } static int devclose(struct cdev *dev, int fflag, int devtype, struct thread *td) { mtx_lock(&devsoftc.mtx); devsoftc.inuse = 0; devsoftc.nonblock = 0; devsoftc.async = 0; cv_broadcast(&devsoftc.cv); funsetown(&devsoftc.sigio); mtx_unlock(&devsoftc.mtx); return (0); } /* * The read channel for this device is used to report changes to * userland in realtime. We are required to free the data as well as * the n1 object because we allocate them separately. Also note that * we return one record at a time. If you try to read this device a * character at a time, you will lose the rest of the data. Listening * programs are expected to cope. 
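Because each read(2) returns exactly one record and the records are built in 1024-byte buffers (see devaddq() further down), a listener simply reads with a generously sized buffer; a short one silently loses the tail of the record. A minimal userland sketch of such a listener, assuming a FreeBSD system and sufficient privilege to open the node; note that only one process may hold /dev/devctl open at a time, per devopen() above:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		char buf[1025];
		ssize_t n;
		int fd;

		fd = open("/dev/devctl", O_RDONLY);
		if (fd == -1) {
			perror("open");
			return (1);
		}
		/* One event record per read, e.g. "+uart0 at ... on acpi0". */
		while ((n = read(fd, buf, sizeof(buf) - 1)) > 0) {
			buf[n] = '\0';
			fputs(buf, stdout);
		}
		close(fd);
		return (0);
	}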
*/ static int devread(struct cdev *dev, struct uio *uio, int ioflag) { struct dev_event_info *n1; int rv; mtx_lock(&devsoftc.mtx); while (TAILQ_EMPTY(&devsoftc.devq)) { if (devsoftc.nonblock) { mtx_unlock(&devsoftc.mtx); return (EAGAIN); } rv = cv_wait_sig(&devsoftc.cv, &devsoftc.mtx); if (rv) { /* * Need to translate ERESTART to EINTR here? -- jake */ mtx_unlock(&devsoftc.mtx); return (rv); } } n1 = TAILQ_FIRST(&devsoftc.devq); TAILQ_REMOVE(&devsoftc.devq, n1, dei_link); devsoftc.queued--; mtx_unlock(&devsoftc.mtx); rv = uiomove(n1->dei_data, strlen(n1->dei_data), uio); free(n1->dei_data, M_BUS); free(n1, M_BUS); return (rv); } static int devioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { switch (cmd) { case FIONBIO: if (*(int*)data) devsoftc.nonblock = 1; else devsoftc.nonblock = 0; return (0); case FIOASYNC: if (*(int*)data) devsoftc.async = 1; else devsoftc.async = 0; return (0); case FIOSETOWN: return fsetown(*(int *)data, &devsoftc.sigio); case FIOGETOWN: *(int *)data = fgetown(&devsoftc.sigio); return (0); /* (un)Support for other fcntl() calls. */ case FIOCLEX: case FIONCLEX: case FIONREAD: default: break; } return (ENOTTY); } static int devpoll(struct cdev *dev, int events, struct thread *td) { int revents = 0; mtx_lock(&devsoftc.mtx); if (events & (POLLIN | POLLRDNORM)) { if (!TAILQ_EMPTY(&devsoftc.devq)) revents = events & (POLLIN | POLLRDNORM); else selrecord(td, &devsoftc.sel); } mtx_unlock(&devsoftc.mtx); return (revents); } static int devkqfilter(struct cdev *dev, struct knote *kn) { int error; if (kn->kn_filter == EVFILT_READ) { kn->kn_fop = &devctl_rfiltops; knlist_add(&devsoftc.sel.si_note, kn, 0); error = 0; } else error = EINVAL; return (error); } static void filt_devctl_detach(struct knote *kn) { knlist_remove(&devsoftc.sel.si_note, kn, 0); } static int filt_devctl_read(struct knote *kn, long hint) { kn->kn_data = devsoftc.queued; return (kn->kn_data != 0); } /** * @brief Return whether the userland process is running */ boolean_t devctl_process_running(void) { return (devsoftc.inuse == 1); } /** * @brief Queue data to be read from the devctl device * * Generic interface to queue data to the devctl device. It is * assumed that @p data is properly formatted. It is further assumed * that @p data is allocated using the M_BUS malloc type. */ void devctl_queue_data_f(char *data, int flags) { struct dev_event_info *n1 = NULL, *n2 = NULL; if (strlen(data) == 0) goto out; if (devctl_queue_length == 0) goto out; n1 = malloc(sizeof(*n1), M_BUS, flags); if (n1 == NULL) goto out; n1->dei_data = data; mtx_lock(&devsoftc.mtx); if (devctl_queue_length == 0) { mtx_unlock(&devsoftc.mtx); free(n1->dei_data, M_BUS); free(n1, M_BUS); return; } /* Leave at least one spot in the queue... */ while (devsoftc.queued > devctl_queue_length - 1) { n2 = TAILQ_FIRST(&devsoftc.devq); TAILQ_REMOVE(&devsoftc.devq, n2, dei_link); free(n2->dei_data, M_BUS); free(n2, M_BUS); devsoftc.queued--; } TAILQ_INSERT_TAIL(&devsoftc.devq, n1, dei_link); devsoftc.queued++; cv_broadcast(&devsoftc.cv); KNOTE_LOCKED(&devsoftc.sel.si_note, 0); mtx_unlock(&devsoftc.mtx); selwakeup(&devsoftc.sel); if (devsoftc.async && devsoftc.sigio != NULL) pgsigio(&devsoftc.sigio, SIGIO, 0); return; out: /* * We have to free data on all error paths since the caller * assumes it will be free'd when this item is dequeued. 
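The ownership contract stated above is worth spelling out: the string handed to devctl_queue_data() must come from malloc(9) with type M_BUS, because the queue frees it with that type either on dequeue or on its error paths, and the caller must not touch it afterwards. A kernel-context sketch (queue_example_event and its message are invented for illustration):

	static void
	queue_example_event(void)
	{
		char *msg;

		msg = malloc(64, M_BUS, M_NOWAIT);
		if (msg == NULL)
			return;		/* event is simply dropped */
		snprintf(msg, 64, "!system=EXAMPLE subsystem=DEMO type=HELLO\n");
		devctl_queue_data(msg);	/* ownership passes here; do not free msg */
	}

For most callers the devctl_notify() wrapper below is preferable, since it performs the allocation and formatting itself.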
*/ free(data, M_BUS); return; } void devctl_queue_data(char *data) { devctl_queue_data_f(data, M_NOWAIT); } /** * @brief Send a 'notification' to userland, using standard ways */ void devctl_notify_f(const char *system, const char *subsystem, const char *type, const char *data, int flags) { int len = 0; char *msg; if (system == NULL) return; /* BOGUS! Must specify system. */ if (subsystem == NULL) return; /* BOGUS! Must specify subsystem. */ if (type == NULL) return; /* BOGUS! Must specify type. */ len += strlen(" system=") + strlen(system); len += strlen(" subsystem=") + strlen(subsystem); len += strlen(" type=") + strlen(type); /* add in the data message plus newline. */ if (data != NULL) len += strlen(data); len += 3; /* '!', '\n', and NUL */ msg = malloc(len, M_BUS, flags); if (msg == NULL) return; /* Drop it on the floor */ if (data != NULL) snprintf(msg, len, "!system=%s subsystem=%s type=%s %s\n", system, subsystem, type, data); else snprintf(msg, len, "!system=%s subsystem=%s type=%s\n", system, subsystem, type); devctl_queue_data_f(msg, flags); } void devctl_notify(const char *system, const char *subsystem, const char *type, const char *data) { devctl_notify_f(system, subsystem, type, data, M_NOWAIT); } /* * Common routine that tries to make sending messages as easy as possible. * We allocate memory for the data, copy strings into that, but do not * free it unless there's an error. The dequeue part of the driver should * free the data. We don't send data when the device is disabled. We do * send data, even when we have no listeners, because we wish to avoid * races relating to startup and restart of listening applications. * * devaddq is designed to string together the type of event, with the * object of that event, plus the plug and play info and location info * for that event. This is likely most useful for devices, but less * useful for other consumers of this interface. Those should use * the devctl_queue_data() interface instead. */ static void devaddq(const char *type, const char *what, device_t dev) { char *data = NULL; char *loc = NULL; char *pnp = NULL; const char *parstr; if (!devctl_queue_length)/* Rare race, but lost races safely discard */ return; data = malloc(1024, M_BUS, M_NOWAIT); if (data == NULL) goto bad; /* get the bus specific location of this device */ loc = malloc(1024, M_BUS, M_NOWAIT); if (loc == NULL) goto bad; *loc = '\0'; bus_child_location_str(dev, loc, 1024); /* Get the bus specific pnp info of this device */ pnp = malloc(1024, M_BUS, M_NOWAIT); if (pnp == NULL) goto bad; *pnp = '\0'; bus_child_pnpinfo_str(dev, pnp, 1024); /* Get the parent of this device, or / if high enough in the tree. */ if (device_get_parent(dev) == NULL) parstr = "."; /* Or '/' ? */ else parstr = device_get_nameunit(device_get_parent(dev)); /* String it all together. */ snprintf(data, 1024, "%s%s at %s %s on %s\n", type, what, loc, pnp, parstr); free(loc, M_BUS); free(pnp, M_BUS); devctl_queue_data(data); return; bad: free(pnp, M_BUS); free(loc, M_BUS); free(data, M_BUS); return; } /* * A device was added to the tree. We are called just after it successfully * attaches (that is, probe and attach success for this device). No call * is made if a device is merely parented into the tree. See devnomatch * if probe fails. If attach fails, no notification is sent (but maybe * we should have a different message for this). */ static void devadded(device_t dev) { devaddq("+", device_get_nameunit(dev), dev); } /* * A device was removed from the tree. 
We are called just before this * happens. */ static void devremoved(device_t dev) { devaddq("-", device_get_nameunit(dev), dev); } /* * Called when there's no match for this device. This is only called * the first time that no match happens, so we don't keep getting this * message. Should that prove to be undesirable, we can change it. * This is called when all drivers that can attach to a given bus * decline to accept this device. Other errors may not be detected. */ static void devnomatch(device_t dev) { devaddq("?", "", dev); } static int sysctl_devctl_disable(SYSCTL_HANDLER_ARGS) { struct dev_event_info *n1; int dis, error; dis = (devctl_queue_length == 0); error = sysctl_handle_int(oidp, &dis, 0, req); if (error || !req->newptr) return (error); if (mtx_initialized(&devsoftc.mtx)) mtx_lock(&devsoftc.mtx); if (dis) { while (!TAILQ_EMPTY(&devsoftc.devq)) { n1 = TAILQ_FIRST(&devsoftc.devq); TAILQ_REMOVE(&devsoftc.devq, n1, dei_link); free(n1->dei_data, M_BUS); free(n1, M_BUS); } devsoftc.queued = 0; devctl_queue_length = 0; } else { devctl_queue_length = DEVCTL_DEFAULT_QUEUE_LEN; } if (mtx_initialized(&devsoftc.mtx)) mtx_unlock(&devsoftc.mtx); return (0); } static int sysctl_devctl_queue(SYSCTL_HANDLER_ARGS) { struct dev_event_info *n1; int q, error; q = devctl_queue_length; error = sysctl_handle_int(oidp, &q, 0, req); if (error || !req->newptr) return (error); if (q < 0) return (EINVAL); if (mtx_initialized(&devsoftc.mtx)) mtx_lock(&devsoftc.mtx); devctl_queue_length = q; while (devsoftc.queued > devctl_queue_length) { n1 = TAILQ_FIRST(&devsoftc.devq); TAILQ_REMOVE(&devsoftc.devq, n1, dei_link); free(n1->dei_data, M_BUS); free(n1, M_BUS); devsoftc.queued--; } if (mtx_initialized(&devsoftc.mtx)) mtx_unlock(&devsoftc.mtx); return (0); } /** * @brief safely quotes strings that might have double quotes in them. * * The devctl protocol relies on quoted strings having matching quotes. * This routine quotes any internal quotes so the resulting string * is safe to pass to snprintf to construct, for example pnp info strings. * Strings are always terminated with a NUL, but may be truncated if longer * than @p len bytes after quotes. * * @param dst Buffer to hold the string. Must be at least @p len bytes long * @param src Original buffer. * @param len Length of buffer pointed to by @dst, including trailing NUL */ void devctl_safe_quote(char *dst, const char *src, size_t len) { char *walker = dst, *ep = dst + len - 1; if (len == 0) return; while (src != NULL && walker < ep) { if (*src == '"' || *src == '\\') { if (ep - walker < 2) break; *walker++ = '\\'; } *walker++ = *src++; } *walker = '\0'; } /* End of /dev/devctl code */ static TAILQ_HEAD(,device) bus_data_devices; static int bus_data_generation = 1; static kobj_method_t null_methods[] = { KOBJMETHOD_END }; DEFINE_CLASS(null, null_methods, 0); /* * Bus pass implementation */ static driver_list_t passes = TAILQ_HEAD_INITIALIZER(passes); int bus_current_pass = BUS_PASS_ROOT; /** * @internal * @brief Register the pass level of a new driver attachment * * Register a new driver attachment's pass level. If no driver * attachment with the same pass level has been added, then @p new * will be added to the global passes list. * * @param new the new driver attachment */ static void driver_register_pass(struct driverlink *new) { struct driverlink *dl; /* We only consider pass numbers during boot. */ if (bus_current_pass == BUS_PASS_DEFAULT) return; /* * Walk the passes list. If we already know about this pass * then there is nothing to do. 
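The pass level being recorded here originates with the driver's registration: most drivers register implicitly at BUS_PASS_DEFAULT, while those that must attach during an earlier scan pass an explicit level. A sketch, assuming the EARLY_DRIVER_MODULE() macro signature of this vintage; the "foo" driver, its methods, and its softc are entirely hypothetical:

	static device_method_t foo_methods[] = {
		DEVMETHOD(device_probe,		foo_probe),
		DEVMETHOD(device_attach,	foo_attach),
		DEVMETHOD_END
	};

	static driver_t foo_driver = {
		"foo",
		foo_methods,
		sizeof(struct foo_softc)
	};

	static devclass_t foo_devclass;

	/*
	 * Participate as soon as bus_set_pass() raises the level to
	 * BUS_PASS_BUS, instead of waiting for the final
	 * BUS_PASS_DEFAULT scan.
	 */
	EARLY_DRIVER_MODULE(foo, nexus, foo_driver, foo_devclass, 0, 0,
	    BUS_PASS_BUS);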
If we don't, then insert this * driver link into the list. */ TAILQ_FOREACH(dl, &passes, passlink) { if (dl->pass < new->pass) continue; if (dl->pass == new->pass) return; TAILQ_INSERT_BEFORE(dl, new, passlink); return; } TAILQ_INSERT_TAIL(&passes, new, passlink); } /** * @brief Raise the current bus pass * * Raise the current bus pass level to @p pass. Call the BUS_NEW_PASS() * method on the root bus to kick off a new device tree scan for each * new pass level that has at least one driver. */ void bus_set_pass(int pass) { struct driverlink *dl; if (bus_current_pass > pass) panic("Attempt to lower bus pass level"); TAILQ_FOREACH(dl, &passes, passlink) { /* Skip pass values below the current pass level. */ if (dl->pass <= bus_current_pass) continue; /* * Bail once we hit a driver with a pass level that is * too high. */ if (dl->pass > pass) break; /* * Raise the pass level to the next level and rescan * the tree. */ bus_current_pass = dl->pass; BUS_NEW_PASS(root_bus); } /* * If there isn't a driver registered for the requested pass, * then bus_current_pass might still be less than 'pass'. Set * it to 'pass' in that case. */ if (bus_current_pass < pass) bus_current_pass = pass; KASSERT(bus_current_pass == pass, ("Failed to update bus pass level")); } /* * Devclass implementation */ static devclass_list_t devclasses = TAILQ_HEAD_INITIALIZER(devclasses); /** * @internal * @brief Find or create a device class * * If a device class with the name @p classname exists, return it, * otherwise if @p create is non-zero create and return a new device * class. * * If @p parentname is non-NULL, the parent of the devclass is set to * the devclass of that name. * * @param classname the devclass name to find or create * @param parentname the parent devclass name or @c NULL * @param create non-zero to create a devclass */ static devclass_t devclass_find_internal(const char *classname, const char *parentname, int create) { devclass_t dc; PDEBUG(("looking for %s", classname)); if (!classname) return (NULL); TAILQ_FOREACH(dc, &devclasses, link) { if (!strcmp(dc->name, classname)) break; } if (create && !dc) { PDEBUG(("creating %s", classname)); dc = malloc(sizeof(struct devclass) + strlen(classname) + 1, M_BUS, M_NOWAIT | M_ZERO); if (!dc) return (NULL); dc->parent = NULL; dc->name = (char*) (dc + 1); strcpy(dc->name, classname); TAILQ_INIT(&dc->drivers); TAILQ_INSERT_TAIL(&devclasses, dc, link); bus_data_generation_update(); } /* * If a parent class is specified, then set that as our parent so * that this devclass will support drivers for the parent class as * well. If the parent class has the same name don't do this though * as it creates a cycle that can trigger an infinite loop in * device_probe_child() if a device exists for which there is no * suitable driver. */ if (parentname && dc && !dc->parent && strcmp(classname, parentname) != 0) { dc->parent = devclass_find_internal(parentname, NULL, TRUE); dc->parent->flags |= DC_HAS_CHILDREN; } return (dc); } /** * @brief Create a device class * * If a device class with the name @p classname exists, return it, * otherwise create and return a new device class. * * @param classname the devclass name to find or create */ devclass_t devclass_create(const char *classname) { return (devclass_find_internal(classname, NULL, TRUE)); } /** * @brief Find a device class * * If a device class with the name @p classname exists, return it, * otherwise return @c NULL. 
* * @param classname the devclass name to find */ devclass_t devclass_find(const char *classname) { return (devclass_find_internal(classname, NULL, FALSE)); } /** * @brief Register that a device driver has been added to a devclass * * Register that a device driver has been added to a devclass. This * is called by devclass_add_driver to accomplish the recursive * notification of all the children classes of dc, as well as dc. * Each layer will have BUS_DRIVER_ADDED() called for all instances of * the devclass. * * We do a full search here of the devclass list at each iteration * level to save storing children-lists in the devclass structure. If * we ever move beyond a few dozen devices doing this, we may need to * reevaluate... * * @param dc the devclass to edit * @param driver the driver that was just added */ static void devclass_driver_added(devclass_t dc, driver_t *driver) { devclass_t parent; int i; /* * Call BUS_DRIVER_ADDED for any existing buses in this class. */ for (i = 0; i < dc->maxunit; i++) if (dc->devices[i] && device_is_attached(dc->devices[i])) BUS_DRIVER_ADDED(dc->devices[i], driver); /* * Walk through the children classes. Since we only keep a * single parent pointer around, we walk the entire list of * devclasses looking for children. We set the * DC_HAS_CHILDREN flag when a child devclass is created on * the parent, so we only walk the list for those devclasses * that have children. */ if (!(dc->flags & DC_HAS_CHILDREN)) return; parent = dc; TAILQ_FOREACH(dc, &devclasses, link) { if (dc->parent == parent) devclass_driver_added(dc, driver); } } /** * @brief Add a device driver to a device class * * Add a device driver to a devclass. This is normally called * automatically by DRIVER_MODULE(). The BUS_DRIVER_ADDED() method of * all devices in the devclass will be called to allow them to attempt * to re-probe any unmatched children. * * @param dc the devclass to edit * @param driver the driver to register */ int devclass_add_driver(devclass_t dc, driver_t *driver, int pass, devclass_t *dcp) { driverlink_t dl; const char *parentname; PDEBUG(("%s", DRIVERNAME(driver))); /* Don't allow invalid pass values. */ if (pass <= BUS_PASS_ROOT) return (EINVAL); dl = malloc(sizeof *dl, M_BUS, M_NOWAIT|M_ZERO); if (!dl) return (ENOMEM); /* * Compile the driver's methods. Also increase the reference count * so that the class doesn't get freed when the last instance * goes. This means we can safely use static methods and avoids a * double-free in devclass_delete_driver. */ kobj_class_compile((kobj_class_t) driver); /* * If the driver has any base classes, make the * devclass inherit from the devclass of the driver's * first base class. This will allow the system to * search for drivers in both devclasses for children * of a device using this driver. */ if (driver->baseclasses) parentname = driver->baseclasses[0]->name; else parentname = NULL; *dcp = devclass_find_internal(driver->name, parentname, TRUE); dl->driver = driver; TAILQ_INSERT_TAIL(&dc->drivers, dl, link); driver->refs++; /* XXX: kobj_mtx */ dl->pass = pass; driver_register_pass(dl); devclass_driver_added(dc, driver); bus_data_generation_update(); return (0); } /** * @brief Register that a device driver has been deleted from a devclass * * Register that a device driver has been removed from a devclass. * This is called by devclass_delete_driver to accomplish the * recursive notification of all the children classes of busclass, as * well as busclass. 
Each layer will attempt to detach the driver * from any devices that are children of the bus's devclass. The function * will return an error if a device fails to detach. * * We do a full search here of the devclass list at each iteration * level to save storing children-lists in the devclass structure. If * we ever move beyond a few dozen devices doing this, we may need to * reevaluate... * * @param busclass the devclass of the parent bus * @param dc the devclass of the driver being deleted * @param driver the driver being deleted */ static int devclass_driver_deleted(devclass_t busclass, devclass_t dc, driver_t *driver) { devclass_t parent; device_t dev; int error, i; /* * Disassociate from any devices. We iterate through all the * devices in the devclass of the driver and detach any which are * using the driver and which have a parent in the devclass which * we are deleting from. * * Note that since a driver can be in multiple devclasses, we * should not detach devices which are not children of devices in * the affected devclass. */ for (i = 0; i < dc->maxunit; i++) { if (dc->devices[i]) { dev = dc->devices[i]; if (dev->driver == driver && dev->parent && dev->parent->devclass == busclass) { if ((error = device_detach(dev)) != 0) return (error); BUS_PROBE_NOMATCH(dev->parent, dev); devnomatch(dev); dev->flags |= DF_DONENOMATCH; } } } /* * Walk through the children classes. Since we only keep a * single parent pointer around, we walk the entire list of * devclasses looking for children. We set the * DC_HAS_CHILDREN flag when a child devclass is created on * the parent, so we only walk the list for those devclasses * that have children. */ if (!(busclass->flags & DC_HAS_CHILDREN)) return (0); parent = busclass; TAILQ_FOREACH(busclass, &devclasses, link) { if (busclass->parent == parent) { error = devclass_driver_deleted(busclass, dc, driver); if (error) return (error); } } return (0); } /** * @brief Delete a device driver from a device class * * Delete a device driver from a devclass. This is normally called * automatically by DRIVER_MODULE(). * * If the driver is currently attached to any devices, * devclass_delete_driver() will first attempt to detach from each * device. If one of the detach calls fails, the driver will not be * deleted. * * @param dc the devclass to edit * @param driver the driver to unregister */ int devclass_delete_driver(devclass_t busclass, driver_t *driver) { devclass_t dc = devclass_find(driver->name); driverlink_t dl; int error; PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass))); if (!dc) return (0); /* * Find the link structure in the bus' list of drivers. */ TAILQ_FOREACH(dl, &busclass->drivers, link) { if (dl->driver == driver) break; } if (!dl) { PDEBUG(("%s not found in %s list", driver->name, busclass->name)); return (ENOENT); } error = devclass_driver_deleted(busclass, dc, driver); if (error != 0) return (error); TAILQ_REMOVE(&busclass->drivers, dl, link); free(dl, M_BUS); /* XXX: kobj_mtx */ driver->refs--; if (driver->refs == 0) kobj_class_free((kobj_class_t) driver); bus_data_generation_update(); return (0); } /** * @brief Quiesces a set of device drivers from a device class * * Quiesce a device driver from a devclass. This is normally called * automatically by DRIVER_MODULE(). * * If the driver is currently attached to any devices, * devclass_quiesce_driver() will first attempt to quiesce each * device. 
* * @param dc the devclass to edit * @param driver the driver to unregister */ static int devclass_quiesce_driver(devclass_t busclass, driver_t *driver) { devclass_t dc = devclass_find(driver->name); driverlink_t dl; device_t dev; int i; int error; PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass))); if (!dc) return (0); /* * Find the link structure in the bus' list of drivers. */ TAILQ_FOREACH(dl, &busclass->drivers, link) { if (dl->driver == driver) break; } if (!dl) { PDEBUG(("%s not found in %s list", driver->name, busclass->name)); return (ENOENT); } /* * Quiesce all devices. We iterate through all the devices in * the devclass of the driver and quiesce any which are using * the driver and which have a parent in the devclass which we * are quiescing. * * Note that since a driver can be in multiple devclasses, we * should not quiesce devices which are not children of * devices in the affected devclass. */ for (i = 0; i < dc->maxunit; i++) { if (dc->devices[i]) { dev = dc->devices[i]; if (dev->driver == driver && dev->parent && dev->parent->devclass == busclass) { if ((error = device_quiesce(dev)) != 0) return (error); } } } return (0); } /** * @internal */ static driverlink_t devclass_find_driver_internal(devclass_t dc, const char *classname) { driverlink_t dl; PDEBUG(("%s in devclass %s", classname, DEVCLANAME(dc))); TAILQ_FOREACH(dl, &dc->drivers, link) { if (!strcmp(dl->driver->name, classname)) return (dl); } PDEBUG(("not found")); return (NULL); } /** * @brief Return the name of the devclass */ const char * devclass_get_name(devclass_t dc) { return (dc->name); } /** * @brief Find a device given a unit number * * @param dc the devclass to search * @param unit the unit number to search for * * @returns the device with the given unit number or @c * NULL if there is no such device */ device_t devclass_get_device(devclass_t dc, int unit) { if (dc == NULL || unit < 0 || unit >= dc->maxunit) return (NULL); return (dc->devices[unit]); } /** * @brief Find the softc field of a device given a unit number * * @param dc the devclass to search * @param unit the unit number to search for * * @returns the softc field of the device with the given * unit number or @c NULL if there is no such * device */ void * devclass_get_softc(devclass_t dc, int unit) { device_t dev; dev = devclass_get_device(dc, unit); if (!dev) return (NULL); return (device_get_softc(dev)); } /** * @brief Get a list of devices in the devclass * * An array containing a list of all the devices in the given devclass * is allocated and returned in @p *devlistp. The number of devices * in the array is returned in @p *devcountp. The caller should free * the array using @c free(p, M_TEMP), even if @p *devcountp is 0. 
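 *
 * A usage sketch (the "foo" devclass name is hypothetical, not taken
 * from this file):
 *
 * @code
 * devclass_t dc;
 * device_t *devs;
 * int count, i;
 *
 * dc = devclass_find("foo");
 * if (dc != NULL && devclass_get_devices(dc, &devs, &count) == 0) {
 *	for (i = 0; i < count; i++)
 *		device_printf(devs[i], "found\n");
 *	free(devs, M_TEMP);
 * }
 * @endcode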
* * @param dc the devclass to examine * @param devlistp points at location for array pointer return * value * @param devcountp points at location for array size return value * * @retval 0 success * @retval ENOMEM the array allocation failed */ int devclass_get_devices(devclass_t dc, device_t **devlistp, int *devcountp) { int count, i; device_t *list; count = devclass_get_count(dc); list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO); if (!list) return (ENOMEM); count = 0; for (i = 0; i < dc->maxunit; i++) { if (dc->devices[i]) { list[count] = dc->devices[i]; count++; } } *devlistp = list; *devcountp = count; return (0); } /** * @brief Get a list of drivers in the devclass * * An array containing a list of pointers to all the drivers in the * given devclass is allocated and returned in @p *listp. The number * of drivers in the array is returned in @p *countp. The caller should * free the array using @c free(p, M_TEMP). * * @param dc the devclass to examine * @param listp gives location for array pointer return value * @param countp gives location for number of array elements * return value * * @retval 0 success * @retval ENOMEM the array allocation failed */ int devclass_get_drivers(devclass_t dc, driver_t ***listp, int *countp) { driverlink_t dl; driver_t **list; int count; count = 0; TAILQ_FOREACH(dl, &dc->drivers, link) count++; list = malloc(count * sizeof(driver_t *), M_TEMP, M_NOWAIT); if (list == NULL) return (ENOMEM); count = 0; TAILQ_FOREACH(dl, &dc->drivers, link) { list[count] = dl->driver; count++; } *listp = list; *countp = count; return (0); } /** * @brief Get the number of devices in a devclass * * @param dc the devclass to examine */ int devclass_get_count(devclass_t dc) { int count, i; count = 0; for (i = 0; i < dc->maxunit; i++) if (dc->devices[i]) count++; return (count); } /** * @brief Get the maximum unit number used in a devclass * * Note that this is one greater than the highest currently-allocated * unit. If a null devclass_t is passed in, -1 is returned to indicate * that not even the devclass has been allocated yet. * * @param dc the devclass to examine */ int devclass_get_maxunit(devclass_t dc) { if (dc == NULL) return (-1); return (dc->maxunit); } /** * @brief Find a free unit number in a devclass * * This function searches for the first unused unit number greater * than or equal to @p unit. * * @param dc the devclass to examine * @param unit the first unit number to check */ int devclass_find_free_unit(devclass_t dc, int unit) { if (dc == NULL) return (unit); while (unit < dc->maxunit && dc->devices[unit] != NULL) unit++; return (unit); } /** * @brief Set the parent of a devclass * * The parent class is normally initialised automatically by * DRIVER_MODULE(). * * @param dc the devclass to edit * @param pdc the new parent devclass */ void devclass_set_parent(devclass_t dc, devclass_t pdc) { dc->parent = pdc; } /** * @brief Get the parent of a devclass * * @param dc the devclass to examine */ devclass_t devclass_get_parent(devclass_t dc) { return (dc->parent); } struct sysctl_ctx_list * devclass_get_sysctl_ctx(devclass_t dc) { return (&dc->sysctl_ctx); } struct sysctl_oid * devclass_get_sysctl_tree(devclass_t dc) { return (dc->sysctl_tree); } /** * @internal * @brief Allocate a unit number * * On entry, @p *unitp is the desired unit number (or @c -1 if any * will do). The allocated unit number is returned in @p *unitp.
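 *
 * Unit selection interacts with wiring hints: a hypothetical
 * /boot/device.hints entry such as
 *
 * @code
 * hint.foo.0.at="pci0"
 * @endcode
 *
 * wires unit 0 of a "foo" devclass, so the allocator will skip that
 * slot when choosing a unit for an unwired device (sketch; the "foo"
 * name is illustrative only).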
* @param dc the devclass to allocate from * @param unitp points at the location for the allocated unit * number * * @retval 0 success * @retval EEXIST the requested unit number is already allocated * @retval ENOMEM memory allocation failure */ static int devclass_alloc_unit(devclass_t dc, device_t dev, int *unitp) { const char *s; int unit = *unitp; PDEBUG(("unit %d in devclass %s", unit, DEVCLANAME(dc))); /* Ask the parent bus if it wants to wire this device. */ if (unit == -1) BUS_HINT_DEVICE_UNIT(device_get_parent(dev), dev, dc->name, &unit); /* If we were given a wired unit number, check for existing device */ /* XXX imp XXX */ if (unit != -1) { if (unit >= 0 && unit < dc->maxunit && dc->devices[unit] != NULL) { if (bootverbose) printf("%s: %s%d already exists; skipping it\n", dc->name, dc->name, *unitp); return (EEXIST); } } else { /* Unwired device, find the next available slot for it */ unit = 0; for (unit = 0;; unit++) { /* If there is an "at" hint for a unit then skip it. */ if (resource_string_value(dc->name, unit, "at", &s) == 0) continue; /* If this device slot is already in use, skip it. */ if (unit < dc->maxunit && dc->devices[unit] != NULL) continue; break; } } /* * We've selected a unit beyond the length of the table, so let's * extend the table to make room for all units up to and including * this one. */ if (unit >= dc->maxunit) { device_t *newlist, *oldlist; int newsize; oldlist = dc->devices; newsize = roundup((unit + 1), MINALLOCSIZE / sizeof(device_t)); newlist = malloc(sizeof(device_t) * newsize, M_BUS, M_NOWAIT); if (!newlist) return (ENOMEM); if (oldlist != NULL) bcopy(oldlist, newlist, sizeof(device_t) * dc->maxunit); bzero(newlist + dc->maxunit, sizeof(device_t) * (newsize - dc->maxunit)); dc->devices = newlist; dc->maxunit = newsize; if (oldlist != NULL) free(oldlist, M_BUS); } PDEBUG(("now: unit %d in devclass %s", unit, DEVCLANAME(dc))); *unitp = unit; return (0); } /** * @internal * @brief Add a device to a devclass * * A unit number is allocated for the device (using the device's * preferred unit number if any) and the device is registered in the * devclass. This allows the device to be looked up by its unit * number, e.g. by decoding a dev_t minor number. * * @param dc the devclass to add to * @param dev the device to add * * @retval 0 success * @retval EEXIST the requested unit number is already allocated * @retval ENOMEM memory allocation failure */ static int devclass_add_device(devclass_t dc, device_t dev) { int buflen, error; PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc))); buflen = snprintf(NULL, 0, "%s%d$", dc->name, INT_MAX); if (buflen < 0) return (ENOMEM); dev->nameunit = malloc(buflen, M_BUS, M_NOWAIT|M_ZERO); if (!dev->nameunit) return (ENOMEM); if ((error = devclass_alloc_unit(dc, dev, &dev->unit)) != 0) { free(dev->nameunit, M_BUS); dev->nameunit = NULL; return (error); } dc->devices[dev->unit] = dev; dev->devclass = dc; snprintf(dev->nameunit, buflen, "%s%d", dc->name, dev->unit); return (0); } /** * @internal * @brief Delete a device from a devclass * * The device is removed from the devclass's device list and its unit * number is freed. 
* @param dc the devclass to delete from * @param dev the device to delete * * @retval 0 success */ static int devclass_delete_device(devclass_t dc, device_t dev) { if (!dc || !dev) return (0); PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc))); if (dev->devclass != dc || dc->devices[dev->unit] != dev) panic("devclass_delete_device: inconsistent device class"); dc->devices[dev->unit] = NULL; if (dev->flags & DF_WILDCARD) dev->unit = -1; dev->devclass = NULL; free(dev->nameunit, M_BUS); dev->nameunit = NULL; return (0); } /** * @internal * @brief Make a new device and add it as a child of @p parent * * @param parent the parent of the new device * @param name the devclass name of the new device or @c NULL * to leave the devclass unspecified * @param unit the unit number of the new device or @c -1 to * leave the unit number unspecified * * @returns the new device */ static device_t make_device(device_t parent, const char *name, int unit) { device_t dev; devclass_t dc; PDEBUG(("%s at %s as unit %d", name, DEVICENAME(parent), unit)); if (name) { dc = devclass_find_internal(name, NULL, TRUE); if (!dc) { printf("make_device: can't find device class %s\n", name); return (NULL); } } else { dc = NULL; } dev = malloc(sizeof(*dev), M_BUS, M_NOWAIT|M_ZERO); if (!dev) return (NULL); dev->parent = parent; TAILQ_INIT(&dev->children); kobj_init((kobj_t) dev, &null_class); dev->driver = NULL; dev->devclass = NULL; dev->unit = unit; dev->nameunit = NULL; dev->desc = NULL; dev->busy = 0; dev->devflags = 0; dev->flags = DF_ENABLED; dev->order = 0; if (unit == -1) dev->flags |= DF_WILDCARD; if (name) { dev->flags |= DF_FIXEDCLASS; if (devclass_add_device(dc, dev)) { kobj_delete((kobj_t) dev, M_BUS); return (NULL); } } dev->ivars = NULL; dev->softc = NULL; dev->state = DS_NOTPRESENT; TAILQ_INSERT_TAIL(&bus_data_devices, dev, devlink); bus_data_generation_update(); return (dev); } /** * @internal * @brief Print a description of a device. */ static int device_print_child(device_t dev, device_t child) { int retval = 0; if (device_is_alive(child)) retval += BUS_PRINT_CHILD(dev, child); else retval += device_printf(child, " not found\n"); return (retval); } /** * @brief Create a new device * * This creates a new device and adds it as a child of an existing * parent device. The new device will be added after the last existing * child with order zero. * * @param dev the device which will be the parent of the * new child device * @param name devclass name for new device or @c NULL if not * specified * @param unit unit number for new device or @c -1 if not * specified * * @returns the new device */ device_t device_add_child(device_t dev, const char *name, int unit) { return (device_add_child_ordered(dev, 0, name, unit)); } /** * @brief Create a new device * * This creates a new device and adds it as a child of an existing * parent device. The new device will be added after the last existing * child with the same order.
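 *
 * A sketch of typical use from a bus driver's DEVICE_IDENTIFY()
 * method (the "foo" names are hypothetical, not part of this file):
 *
 * @code
 * static void
 * foo_identify(driver_t *driver, device_t parent)
 * {
 *	if (device_find_child(parent, "foo", -1) == NULL)
 *		device_add_child(parent, "foo", -1);
 * }
 * @endcode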
* * @param dev the device which will be the parent of the * new child device * @param order a value which is used to partially sort the * children of @p dev - devices created using * lower values of @p order appear first in @p * dev's list of children * @param name devclass name for new device or @c NULL if not * specified * @param unit unit number for new device or @c -1 if not * specified * * @returns the new device */ device_t device_add_child_ordered(device_t dev, u_int order, const char *name, int unit) { device_t child; device_t place; PDEBUG(("%s at %s with order %u as unit %d", name, DEVICENAME(dev), order, unit)); KASSERT(name != NULL || unit == -1, ("child device with wildcard name and specific unit number")); child = make_device(dev, name, unit); if (child == NULL) return (child); child->order = order; TAILQ_FOREACH(place, &dev->children, link) { if (place->order > order) break; } if (place) { /* * The device 'place' is the first device whose order is * greater than the new child. */ TAILQ_INSERT_BEFORE(place, child, link); } else { /* * The new child's order is greater than or equal to the order * of any existing device. Add the child to the tail of the list. */ TAILQ_INSERT_TAIL(&dev->children, child, link); } bus_data_generation_update(); return (child); } /** * @brief Delete a device * * This function deletes a device along with all of its children. If * the device currently has a driver attached to it, the device is * detached first using device_detach(). * * @param dev the parent device * @param child the device to delete * * @retval 0 success * @retval non-zero a unix error code describing the error */ int device_delete_child(device_t dev, device_t child) { int error; device_t grandchild; PDEBUG(("%s from %s", DEVICENAME(child), DEVICENAME(dev))); /* detach the child (the grandchildren's parent) before deleting them */ if ((error = device_detach(child)) != 0) return (error); /* remove children second */ while ((grandchild = TAILQ_FIRST(&child->children)) != NULL) { error = device_delete_child(child, grandchild); if (error) return (error); } if (child->devclass) devclass_delete_device(child->devclass, child); if (child->parent) BUS_CHILD_DELETED(dev, child); TAILQ_REMOVE(&dev->children, child, link); TAILQ_REMOVE(&bus_data_devices, child, devlink); kobj_delete((kobj_t) child, M_BUS); bus_data_generation_update(); return (0); } /** * @brief Delete all children devices of the given device, if any. * * This function deletes all children devices of the given device, if * any, using the device_delete_child() function for each device it * finds. If a child device cannot be deleted, this function will * return an error code. * * @param dev the parent device * * @retval 0 success * @retval non-zero a device would not detach */ int device_delete_children(device_t dev) { device_t child; int error; PDEBUG(("Deleting all children of %s", DEVICENAME(dev))); error = 0; while ((child = TAILQ_FIRST(&dev->children)) != NULL) { error = device_delete_child(dev, child); if (error) { PDEBUG(("Failed deleting %s", DEVICENAME(child))); break; } } return (error); } /** * @brief Find a device given a unit number * * This is similar to devclass_get_devices() but only searches for * devices which have @p dev as a parent. * * @param dev the parent device to search * @param unit the unit number to search for. If the unit is -1, * return the first child of @p dev which has name * @p classname (that is, the one with the lowest unit.)
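 *
 * For example (sketch, with a hypothetical "foo" class), the call
 *
 * @code
 * child = device_find_child(bus, "foo", 0);
 * @endcode
 *
 * returns foo0 if it is a child of @p bus, and @c NULL otherwise.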
* * @returns the device with the given unit number or @c * NULL if there is no such device */ device_t device_find_child(device_t dev, const char *classname, int unit) { devclass_t dc; device_t child; dc = devclass_find(classname); if (!dc) return (NULL); if (unit != -1) { child = devclass_get_device(dc, unit); if (child && child->parent == dev) return (child); } else { for (unit = 0; unit < devclass_get_maxunit(dc); unit++) { child = devclass_get_device(dc, unit); if (child && child->parent == dev) return (child); } } return (NULL); } /** * @internal */ static driverlink_t first_matching_driver(devclass_t dc, device_t dev) { if (dev->devclass) return (devclass_find_driver_internal(dc, dev->devclass->name)); return (TAILQ_FIRST(&dc->drivers)); } /** * @internal */ static driverlink_t next_matching_driver(devclass_t dc, device_t dev, driverlink_t last) { if (dev->devclass) { driverlink_t dl; for (dl = TAILQ_NEXT(last, link); dl; dl = TAILQ_NEXT(dl, link)) if (!strcmp(dev->devclass->name, dl->driver->name)) return (dl); return (NULL); } return (TAILQ_NEXT(last, link)); } /** * @internal */ int device_probe_child(device_t dev, device_t child) { devclass_t dc; driverlink_t best = NULL; driverlink_t dl; int result, pri = 0; int hasclass = (child->devclass != NULL); GIANT_REQUIRED; dc = dev->devclass; if (!dc) panic("device_probe_child: parent device has no devclass"); /* * If the state is already probed, then return. However, don't * return if we can rebid this object. */ if (child->state == DS_ALIVE && (child->flags & DF_REBID) == 0) return (0); for (; dc; dc = dc->parent) { for (dl = first_matching_driver(dc, child); dl; dl = next_matching_driver(dc, child, dl)) { /* If this driver's pass is too high, then ignore it. */ if (dl->pass > bus_current_pass) continue; PDEBUG(("Trying %s", DRIVERNAME(dl->driver))); result = device_set_driver(child, dl->driver); if (result == ENOMEM) return (result); else if (result != 0) continue; if (!hasclass) { if (device_set_devclass(child, dl->driver->name) != 0) { char const * devname = device_get_name(child); if (devname == NULL) devname = "(unknown)"; printf("driver bug: Unable to set " "devclass (class: %s " "devname: %s)\n", dl->driver->name, devname); (void)device_set_driver(child, NULL); continue; } } /* Fetch any flags for the device before probing. */ resource_int_value(dl->driver->name, child->unit, "flags", &child->devflags); result = DEVICE_PROBE(child); /* Reset flags and devclass before the next probe. */ child->devflags = 0; if (!hasclass) (void)device_set_devclass(child, NULL); /* * If the driver returns SUCCESS, there can be * no higher match for this device. */ if (result == 0) { best = dl; pri = 0; break; } /* * Reset DF_QUIET in case this driver doesn't * end up as the best driver. */ device_verbose(child); /* * Probes that return BUS_PROBE_NOWILDCARD or lower * only match on devices whose driver was explicitly * specified. */ if (result <= BUS_PROBE_NOWILDCARD && !(child->flags & DF_FIXEDCLASS)) { result = ENXIO; } /* * The driver returned an error so it * certainly doesn't match. */ if (result > 0) { (void)device_set_driver(child, NULL); continue; } /* * A priority lower than SUCCESS, remember the * best matching driver. Initialise the value * of pri for the first match. */ if (best == NULL || result > pri) { best = dl; pri = result; continue; } } /* * If we have an unambiguous match in this devclass, * don't look in the parent. */ if (best && pri == 0) break; } /* * If we found a driver, change state and initialise the devclass. 
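 *
 * For reference, the selection above is driven by each candidate
 * driver's DEVICE_PROBE() method returning one of the BUS_PROBE_*
 * priorities (or an errno).  A hypothetical probe method (sketch
 * only; FOO_DEVID and the foo names are not part of this file):
 *
 *	static int
 *	foo_probe(device_t dev)
 *	{
 *		if (pci_get_devid(dev) != FOO_DEVID)
 *			return (ENXIO);
 *		device_set_desc(dev, "Foo controller");
 *		return (BUS_PROBE_DEFAULT);
 *	}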
*/ /* XXX What happens if we rebid and got no best? */ if (best) { /* * If this device was attached, and we were asked to * rescan, and it is a different driver, then we have * to detach the old driver and reattach this new one. * Note, we don't have to check for DF_REBID here * because if the state is > DS_ALIVE, we know it must * be. * * This assumes that all DF_REBID drivers can have * their probe routine called at any time and that * they are idempotent as well as completely benign in * normal operations. * * We also have to make sure that the detach * succeeded, otherwise we fail the operation (or * maybe it should just fail silently? I'm torn). */ if (child->state > DS_ALIVE && best->driver != child->driver) if ((result = device_detach(child)) != 0) return (result); /* Set the winning driver, devclass, and flags. */ if (!child->devclass) { result = device_set_devclass(child, best->driver->name); if (result != 0) return (result); } result = device_set_driver(child, best->driver); if (result != 0) return (result); resource_int_value(best->driver->name, child->unit, "flags", &child->devflags); if (pri < 0) { /* * A bit bogus. Call the probe method again to make * sure that we have the right description. */ DEVICE_PROBE(child); #if 0 child->flags |= DF_REBID; #endif } else child->flags &= ~DF_REBID; child->state = DS_ALIVE; bus_data_generation_update(); return (0); } return (ENXIO); } /** * @brief Return the parent of a device */ device_t device_get_parent(device_t dev) { return (dev->parent); } /** * @brief Get a list of children of a device * * An array containing a list of all the children of the given device * is allocated and returned in @p *devlistp. The number of devices * in the array is returned in @p *devcountp. The caller should free * the array using @c free(p, M_TEMP). * * @param dev the device to examine * @param devlistp points at location for array pointer return * value * @param devcountp points at location for array size return value * * @retval 0 success * @retval ENOMEM the array allocation failed */ int device_get_children(device_t dev, device_t **devlistp, int *devcountp) { int count; device_t child; device_t *list; count = 0; TAILQ_FOREACH(child, &dev->children, link) { count++; } if (count == 0) { *devlistp = NULL; *devcountp = 0; return (0); } list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO); if (!list) return (ENOMEM); count = 0; TAILQ_FOREACH(child, &dev->children, link) { list[count] = child; count++; } *devlistp = list; *devcountp = count; return (0); } /** * @brief Return the current driver for the device or @c NULL if there * is no driver currently attached */ driver_t * device_get_driver(device_t dev) { return (dev->driver); } /** * @brief Return the current devclass for the device or @c NULL if * there is none. */ devclass_t device_get_devclass(device_t dev) { return (dev->devclass); } /** * @brief Return the name of the device's devclass or @c NULL if there * is none. */ const char * device_get_name(device_t dev) { if (dev != NULL && dev->devclass) return (devclass_get_name(dev->devclass)); return (NULL); } /** * @brief Return a string containing the device's devclass name * followed by an ascii representation of the device's unit number * (e.g. @c "foo2"). */ const char * device_get_nameunit(device_t dev) { return (dev->nameunit); } /** * @brief Return the device's unit number.
*/ int device_get_unit(device_t dev) { return (dev->unit); } /** * @brief Return the device's description string */ const char * device_get_desc(device_t dev) { return (dev->desc); } /** * @brief Return the device's flags */ uint32_t device_get_flags(device_t dev) { return (dev->devflags); } struct sysctl_ctx_list * device_get_sysctl_ctx(device_t dev) { return (&dev->sysctl_ctx); } struct sysctl_oid * device_get_sysctl_tree(device_t dev) { return (dev->sysctl_tree); } /** * @brief Print the name of the device followed by a colon and a space * * @returns the number of characters printed */ int device_print_prettyname(device_t dev) { const char *name = device_get_name(dev); if (name == NULL) return (printf("unknown: ")); return (printf("%s%d: ", name, device_get_unit(dev))); } /** * @brief Print the name of the device followed by a colon, a space * and the result of calling vprintf() with the value of @p fmt and * the following arguments. * * @returns the number of characters printed */ int device_printf(device_t dev, const char * fmt, ...) { va_list ap; int retval; retval = device_print_prettyname(dev); va_start(ap, fmt); retval += vprintf(fmt, ap); va_end(ap); return (retval); } /** * @internal */ static void device_set_desc_internal(device_t dev, const char* desc, int copy) { if (dev->desc && (dev->flags & DF_DESCMALLOCED)) { free(dev->desc, M_BUS); dev->flags &= ~DF_DESCMALLOCED; dev->desc = NULL; } if (copy && desc) { dev->desc = malloc(strlen(desc) + 1, M_BUS, M_NOWAIT); if (dev->desc) { strcpy(dev->desc, desc); dev->flags |= DF_DESCMALLOCED; } } else { /* Avoid a -Wcast-qual warning */ dev->desc = (char *)(uintptr_t) desc; } bus_data_generation_update(); } /** * @brief Set the device's description * * The value of @c desc should be a string constant that will not * change (at least until the description is changed in a subsequent * call to device_set_desc() or device_set_desc_copy()). */ void device_set_desc(device_t dev, const char* desc) { device_set_desc_internal(dev, desc, FALSE); } /** * @brief Set the device's description * * The string pointed to by @c desc is copied. Use this function if * the device description is generated, (e.g. with sprintf()). */ void device_set_desc_copy(device_t dev, const char* desc) { device_set_desc_internal(dev, desc, TRUE); } /** * @brief Set the device's flags */ void device_set_flags(device_t dev, uint32_t flags) { dev->devflags = flags; } /** * @brief Return the device's softc field * * The softc is allocated and zeroed when a driver is attached, based * on the size field of the driver. */ void * device_get_softc(device_t dev) { return (dev->softc); } /** * @brief Set the device's softc field * * Most drivers do not need to use this since the softc is allocated * automatically when the driver is attached. */ void device_set_softc(device_t dev, void *softc) { if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC)) free(dev->softc, M_BUS_SC); dev->softc = softc; if (dev->softc) dev->flags |= DF_EXTERNALSOFTC; else dev->flags &= ~DF_EXTERNALSOFTC; } /** * @brief Free claimed softc * * Most drivers do not need to use this since the softc is freed * automatically when the driver is detached. */ void device_free_softc(void *softc) { free(softc, M_BUS_SC); } /** * @brief Claim softc * * This function can be used to let the driver free the automatically * allocated softc using "device_free_softc()". This function is * useful when the driver is refcounting the softc and the softc * cannot be freed when the "device_detach" method is called. 
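 *
 * A sketch of the intended pattern (hypothetical "foo" softc with a
 * "refs" refcount field; not part of this file): DEVICE_ATTACH()
 * calls device_claim_softc(dev), and whoever drops the last
 * reference then frees the softc itself:
 *
 * @code
 * static void
 * foo_softc_release(struct foo_softc *sc)
 * {
 *	if (refcount_release(&sc->refs))
 *		device_free_softc(sc);
 * }
 * @endcode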
*/ void device_claim_softc(device_t dev) { if (dev->softc) dev->flags |= DF_EXTERNALSOFTC; else dev->flags &= ~DF_EXTERNALSOFTC; } /** * @brief Get the device's ivars field * * The ivars field is used by the parent device to store per-device * state (e.g. the physical location of the device or a list of * resources). */ void * device_get_ivars(device_t dev) { KASSERT(dev != NULL, ("device_get_ivars(NULL, ...)")); return (dev->ivars); } /** * @brief Set the device's ivars field */ void device_set_ivars(device_t dev, void * ivars) { KASSERT(dev != NULL, ("device_set_ivars(NULL, ...)")); dev->ivars = ivars; } /** * @brief Return the device's state */ device_state_t device_get_state(device_t dev) { return (dev->state); } /** * @brief Set the DF_ENABLED flag for the device */ void device_enable(device_t dev) { dev->flags |= DF_ENABLED; } /** * @brief Clear the DF_ENABLED flag for the device */ void device_disable(device_t dev) { dev->flags &= ~DF_ENABLED; } /** * @brief Increment the busy counter for the device */ void device_busy(device_t dev) { if (dev->state < DS_ATTACHING) panic("device_busy: called for unattached device"); if (dev->busy == 0 && dev->parent) device_busy(dev->parent); dev->busy++; if (dev->state == DS_ATTACHED) dev->state = DS_BUSY; } /** * @brief Decrement the busy counter for the device */ void device_unbusy(device_t dev) { if (dev->busy != 0 && dev->state != DS_BUSY && dev->state != DS_ATTACHING) panic("device_unbusy: called for non-busy device %s", device_get_nameunit(dev)); dev->busy--; if (dev->busy == 0) { if (dev->parent) device_unbusy(dev->parent); if (dev->state == DS_BUSY) dev->state = DS_ATTACHED; } } /** * @brief Set the DF_QUIET flag for the device */ void device_quiet(device_t dev) { dev->flags |= DF_QUIET; } /** * @brief Clear the DF_QUIET flag for the device */ void device_verbose(device_t dev) { dev->flags &= ~DF_QUIET; } /** * @brief Return non-zero if the DF_QUIET flag is set on the device */ int device_is_quiet(device_t dev) { return ((dev->flags & DF_QUIET) != 0); } /** * @brief Return non-zero if the DF_ENABLED flag is set on the device */ int device_is_enabled(device_t dev) { return ((dev->flags & DF_ENABLED) != 0); } /** * @brief Return non-zero if the device was successfully probed */ int device_is_alive(device_t dev) { return (dev->state >= DS_ALIVE); } /** * @brief Return non-zero if the device currently has a driver * attached to it */ int device_is_attached(device_t dev) { return (dev->state >= DS_ATTACHED); } /** * @brief Return non-zero if the device is currently suspended. */ int device_is_suspended(device_t dev) { return ((dev->flags & DF_SUSPENDED) != 0); } /** * @brief Set the devclass of a device * @see devclass_add_device(). */ int device_set_devclass(device_t dev, const char *classname) { devclass_t dc; int error; if (!classname) { if (dev->devclass) devclass_delete_device(dev->devclass, dev); return (0); } if (dev->devclass) { printf("device_set_devclass: device class already set\n"); return (EINVAL); } dc = devclass_find_internal(classname, NULL, TRUE); if (!dc) return (ENOMEM); error = devclass_add_device(dc, dev); bus_data_generation_update(); return (error); } /** * @brief Set the devclass of a device and mark the devclass fixed. 
* @see device_set_devclass() */ int device_set_devclass_fixed(device_t dev, const char *classname) { int error; if (classname == NULL) return (EINVAL); error = device_set_devclass(dev, classname); if (error) return (error); dev->flags |= DF_FIXEDCLASS; return (0); } /** * @brief Set the driver of a device * * @retval 0 success * @retval EBUSY the device already has a driver attached * @retval ENOMEM a memory allocation failure occurred */ int device_set_driver(device_t dev, driver_t *driver) { if (dev->state >= DS_ATTACHED) return (EBUSY); if (dev->driver == driver) return (0); if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC)) { free(dev->softc, M_BUS_SC); dev->softc = NULL; } device_set_desc(dev, NULL); kobj_delete((kobj_t) dev, NULL); dev->driver = driver; if (driver) { kobj_init((kobj_t) dev, (kobj_class_t) driver); if (!(dev->flags & DF_EXTERNALSOFTC) && driver->size > 0) { dev->softc = malloc(driver->size, M_BUS_SC, M_NOWAIT | M_ZERO); if (!dev->softc) { kobj_delete((kobj_t) dev, NULL); kobj_init((kobj_t) dev, &null_class); dev->driver = NULL; return (ENOMEM); } } } else { kobj_init((kobj_t) dev, &null_class); } bus_data_generation_update(); return (0); } /** * @brief Probe a device and return the status. * * This function is the core of the device autoconfiguration * system. Its purpose is to select a suitable driver for a device and * then call that driver to initialise the hardware appropriately. The * driver is selected by calling the DEVICE_PROBE() method of a set of * candidate drivers and then choosing the driver which returned the * best value. This driver is then attached to the device using * device_attach(). * * The set of suitable drivers is taken from the list of drivers in * the parent device's devclass. If the device was originally created * with a specific class name (see device_add_child()), only drivers * with that name are probed, otherwise all drivers in the devclass * are probed. If no drivers return successful probe values in the * parent devclass, the search continues in the parent of that * devclass (see devclass_get_parent()) if any. * * @param dev the device to initialise * * @retval 0 success * @retval ENXIO no driver was found * @retval ENOMEM memory allocation failure * @retval non-zero some other unix error code * @retval -1 Device already attached */ int device_probe(device_t dev) { int error; GIANT_REQUIRED; if (dev->state >= DS_ALIVE && (dev->flags & DF_REBID) == 0) return (-1); if (!(dev->flags & DF_ENABLED)) { if (bootverbose && device_get_name(dev) != NULL) { device_print_prettyname(dev); printf("not probed (disabled)\n"); } return (-1); } if ((error = device_probe_child(dev->parent, dev)) != 0) { if (bus_current_pass == BUS_PASS_DEFAULT && !(dev->flags & DF_DONENOMATCH)) { BUS_PROBE_NOMATCH(dev->parent, dev); devnomatch(dev); dev->flags |= DF_DONENOMATCH; } return (error); } return (0); } /** * @brief Probe a device and attach a driver if possible * * Calls device_probe() and attaches if that was successful. */ int device_probe_and_attach(device_t dev) { int error; GIANT_REQUIRED; error = device_probe(dev); if (error == -1) return (0); else if (error != 0) return (error); CURVNET_SET_QUIET(vnet0); error = device_attach(dev); CURVNET_RESTORE(); return (error); } /** * @brief Attach a device driver to a device * * This function is a wrapper around the DEVICE_ATTACH() driver * method.
In addition to calling DEVICE_ATTACH(), it initialises the * device's sysctl tree, optionally prints a description of the device * and queues a notification event for user-based device management * services. * * Normally this function is only called internally from * device_probe_and_attach(). * * @param dev the device to initialise * * @retval 0 success * @retval ENXIO no driver was found * @retval ENOMEM memory allocation failure * @retval non-zero some other unix error code */ int device_attach(device_t dev) { uint64_t attachtime; int error; if (resource_disabled(dev->driver->name, dev->unit)) { device_disable(dev); if (bootverbose) device_printf(dev, "disabled via hints entry\n"); return (ENXIO); } device_sysctl_init(dev); if (!device_is_quiet(dev)) device_print_child(dev->parent, dev); attachtime = get_cyclecount(); dev->state = DS_ATTACHING; if ((error = DEVICE_ATTACH(dev)) != 0) { printf("device_attach: %s%d attach returned %d\n", dev->driver->name, dev->unit, error); if (!(dev->flags & DF_FIXEDCLASS)) devclass_delete_device(dev->devclass, dev); (void)device_set_driver(dev, NULL); device_sysctl_fini(dev); KASSERT(dev->busy == 0, ("attach failed but busy")); dev->state = DS_NOTPRESENT; return (error); } attachtime = get_cyclecount() - attachtime; /* * 4 bits per device is a reasonable value for desktop and server * hardware with good get_cyclecount() implementations, but WILL * need to be adjusted on other platforms. */ #define RANDOM_PROBE_BIT_GUESS 4 if (bootverbose) printf("random: harvesting attach, %zu bytes (%d bits) from %s%d\n", sizeof(attachtime), RANDOM_PROBE_BIT_GUESS, dev->driver->name, dev->unit); random_harvest_direct(&attachtime, sizeof(attachtime), RANDOM_PROBE_BIT_GUESS, RANDOM_ATTACH); device_sysctl_update(dev); if (dev->busy) dev->state = DS_BUSY; else dev->state = DS_ATTACHED; dev->flags &= ~DF_DONENOMATCH; EVENTHANDLER_INVOKE(device_attach, dev); devadded(dev); return (0); } /** * @brief Detach a driver from a device * * This function is a wrapper around the DEVICE_DETACH() driver * method. If the call to DEVICE_DETACH() succeeds, it calls * BUS_CHILD_DETACHED() for the parent of @p dev, queues a * notification event for user-based device management services and * cleans up the device's sysctl tree. * * @param dev the device to un-initialise * * @retval 0 success * @retval ENXIO no driver was found * @retval ENOMEM memory allocation failure * @retval non-zero some other unix error code */ int device_detach(device_t dev) { int error; GIANT_REQUIRED; PDEBUG(("%s", DEVICENAME(dev))); if (dev->state == DS_BUSY) return (EBUSY); if (dev->state != DS_ATTACHED) return (0); EVENTHANDLER_INVOKE(device_detach, dev, EVHDEV_DETACH_BEGIN); if ((error = DEVICE_DETACH(dev)) != 0) { EVENTHANDLER_INVOKE(device_detach, dev, EVHDEV_DETACH_FAILED); return (error); } else { EVENTHANDLER_INVOKE(device_detach, dev, EVHDEV_DETACH_COMPLETE); } devremoved(dev); if (!device_is_quiet(dev)) device_printf(dev, "detached\n"); if (dev->parent) BUS_CHILD_DETACHED(dev->parent, dev); if (!(dev->flags & DF_FIXEDCLASS)) devclass_delete_device(dev->devclass, dev); device_verbose(dev); dev->state = DS_NOTPRESENT; (void)device_set_driver(dev, NULL); device_sysctl_fini(dev); return (0); } /** * @brief Tells a driver to quiesce itself. * * This function is a wrapper around the DEVICE_QUIESCE() driver * method. If the call to DEVICE_QUIESCE() succeeds, the driver has * stopped generating new activity and it should be safe to detach the * device afterwards.
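 *
 * A sketch of typical use, quiescing before a detach is attempted
 * (hypothetical flow, not taken from this file):
 *
 * @code
 * error = device_quiesce(dev);
 * if (error == 0)
 *	error = device_detach(dev);
 * @endcode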
* * @param dev the device to quiesce * * @retval 0 success * @retval ENXIO no driver was found * @retval ENOMEM memory allocation failure * @retval non-zero some other unix error code */ int device_quiesce(device_t dev) { PDEBUG(("%s", DEVICENAME(dev))); if (dev->state == DS_BUSY) return (EBUSY); if (dev->state != DS_ATTACHED) return (0); return (DEVICE_QUIESCE(dev)); } /** * @brief Notify a device of system shutdown * * This function calls the DEVICE_SHUTDOWN() driver method if the * device currently has an attached driver. * * @returns the value returned by DEVICE_SHUTDOWN() */ int device_shutdown(device_t dev) { if (dev->state < DS_ATTACHED) return (0); return (DEVICE_SHUTDOWN(dev)); } /** * @brief Set the unit number of a device * * This function can be used to override the unit number used for a * device (e.g. to wire a device to a pre-configured unit number). */ int device_set_unit(device_t dev, int unit) { devclass_t dc; int err; dc = device_get_devclass(dev); if (unit < dc->maxunit && dc->devices[unit]) return (EBUSY); err = devclass_delete_device(dc, dev); if (err) return (err); dev->unit = unit; err = devclass_add_device(dc, dev); if (err) return (err); bus_data_generation_update(); return (0); } /*======================================*/ /* * Some useful method implementations to make life easier for bus drivers. */ void resource_init_map_request_impl(struct resource_map_request *args, size_t sz) { bzero(args, sz); args->size = sz; args->memattr = VM_MEMATTR_UNCACHEABLE; } /** * @brief Initialise a resource list. * * @param rl the resource list to initialise */ void resource_list_init(struct resource_list *rl) { STAILQ_INIT(rl); } /** * @brief Reclaim memory used by a resource list. * * This function frees the memory for all resource entries on the list * (if any). * * @param rl the resource list to free */ void resource_list_free(struct resource_list *rl) { struct resource_list_entry *rle; while ((rle = STAILQ_FIRST(rl)) != NULL) { if (rle->res) panic("resource_list_free: resource entry is busy"); STAILQ_REMOVE_HEAD(rl, link); free(rle, M_BUS); } } /** * @brief Add a resource entry. * * This function adds a resource entry using the given @p type, @p * start, @p end and @p count values. A rid value is chosen by * searching sequentially for the first unused rid starting at zero. * * @param rl the resource list to edit * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param start the start address of the resource * @param end the end address of the resource * @param count XXX end-start+1 */ int resource_list_add_next(struct resource_list *rl, int type, rman_res_t start, rman_res_t end, rman_res_t count) { int rid; rid = 0; while (resource_list_find(rl, type, rid) != NULL) rid++; resource_list_add(rl, type, rid, start, end, count); return (rid); } /** * @brief Add or modify a resource entry. * * If an existing entry exists with the same type and rid, it will be * modified using the given values of @p start, @p end and @p * count. If no entry exists, a new one will be created using the * given values. The resource list entry that matches is then returned. * * @param rl the resource list to edit * @param type the resource entry type (e.g. 
SYS_RES_MEMORY) * @param rid the resource identifier * @param start the start address of the resource * @param end the end address of the resource * @param count XXX end-start+1 */ struct resource_list_entry * resource_list_add(struct resource_list *rl, int type, int rid, rman_res_t start, rman_res_t end, rman_res_t count) { struct resource_list_entry *rle; rle = resource_list_find(rl, type, rid); if (!rle) { rle = malloc(sizeof(struct resource_list_entry), M_BUS, M_NOWAIT); if (!rle) panic("resource_list_add: can't record entry"); STAILQ_INSERT_TAIL(rl, rle, link); rle->type = type; rle->rid = rid; rle->res = NULL; rle->flags = 0; } if (rle->res) panic("resource_list_add: resource entry is busy"); rle->start = start; rle->end = end; rle->count = count; return (rle); } /** * @brief Determine if a resource entry is busy. * * Returns true if a resource entry is busy meaning that it has an * associated resource that is not an unallocated "reserved" resource. * * @param rl the resource list to search * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param rid the resource identifier * * @returns Non-zero if the entry is busy, zero otherwise. */ int resource_list_busy(struct resource_list *rl, int type, int rid) { struct resource_list_entry *rle; rle = resource_list_find(rl, type, rid); if (rle == NULL || rle->res == NULL) return (0); if ((rle->flags & (RLE_RESERVED | RLE_ALLOCATED)) == RLE_RESERVED) { KASSERT(!(rman_get_flags(rle->res) & RF_ACTIVE), ("reserved resource is active")); return (0); } return (1); } /** * @brief Determine if a resource entry is reserved. * * Returns true if a resource entry is reserved meaning that it has an * associated "reserved" resource. The resource can either be * allocated or unallocated. * * @param rl the resource list to search * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param rid the resource identifier * * @returns Non-zero if the entry is reserved, zero otherwise. */ int resource_list_reserved(struct resource_list *rl, int type, int rid) { struct resource_list_entry *rle; rle = resource_list_find(rl, type, rid); if (rle != NULL && rle->flags & RLE_RESERVED) return (1); return (0); } /** * @brief Find a resource entry by type and rid. * * @param rl the resource list to search * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param rid the resource identifier * * @returns the resource entry pointer or NULL if there is no such * entry. */ struct resource_list_entry * resource_list_find(struct resource_list *rl, int type, int rid) { struct resource_list_entry *rle; STAILQ_FOREACH(rle, rl, link) { if (rle->type == type && rle->rid == rid) return (rle); } return (NULL); } /** * @brief Delete a resource entry. * * @param rl the resource list to edit * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param rid the resource identifier */ void resource_list_delete(struct resource_list *rl, int type, int rid) { struct resource_list_entry *rle = resource_list_find(rl, type, rid); if (rle) { if (rle->res != NULL) panic("resource_list_delete: resource has not been released"); STAILQ_REMOVE(rl, rle, resource_list_entry, link); free(rle, M_BUS); } } /** * @brief Allocate a reserved resource * * This can be used by buses to force the allocation of resources * that are always active in the system even if they are not allocated * by a driver (e.g. PCI BARs). This function is usually called when * adding a new child to the bus. The resource is allocated from the * parent bus when it is reserved. 
The resource list entry is marked * with RLE_RESERVED to note that it is a reserved resource. * * Subsequent attempts to allocate the resource with * resource_list_alloc() will succeed the first time and will set * RLE_ALLOCATED to note that it has been allocated. When a reserved * resource that has been allocated is released with * resource_list_release(), the RLE_ALLOCATED flag is cleared, but * the actual resource remains allocated. The resource can be released to * the parent bus by calling resource_list_unreserve(). * * @param rl the resource list to allocate from * @param bus the parent device of @p child * @param child the device for which the resource is being reserved * @param type the type of resource to allocate * @param rid a pointer to the resource identifier * @param start hint at the start of the resource range - pass * @c 0 for any start address * @param end hint at the end of the resource range - pass * @c ~0 for any end address * @param count hint at the size of range required - pass @c 1 * for any size * @param flags any extra flags to control the resource * allocation - see @c RF_XXX flags in * for details * * @returns the resource which was allocated or @c NULL if no * resource could be allocated */ struct resource * resource_list_reserve(struct resource_list *rl, device_t bus, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct resource_list_entry *rle = NULL; int passthrough = (device_get_parent(child) != bus); struct resource *r; if (passthrough) panic( "resource_list_reserve() should only be called for direct children"); if (flags & RF_ACTIVE) panic( "resource_list_reserve() should only reserve inactive resources"); r = resource_list_alloc(rl, bus, child, type, rid, start, end, count, flags); if (r != NULL) { rle = resource_list_find(rl, type, *rid); rle->flags |= RLE_RESERVED; } return (r); } /** * @brief Helper function for implementing BUS_ALLOC_RESOURCE() * * Implement BUS_ALLOC_RESOURCE() by looking up a resource from the list * and passing the allocation up to the parent of @p bus. This assumes * that the first entry of @c device_get_ivars(child) is a struct * resource_list. This also handles 'passthrough' allocations where a * child is a remote descendant of bus by passing the allocation up to * the parent of bus. * * Typically, a bus driver would store a list of child resources * somewhere in the child device's ivars (see device_get_ivars()) and * its implementation of BUS_ALLOC_RESOURCE() would find that list and * then call resource_list_alloc() to perform the allocation.
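 *
 * A sketch of such a method (the "foo" bus and its ivars layout are
 * hypothetical, not part of this file):
 *
 * @code
 * static struct resource *
 * foo_alloc_resource(device_t bus, device_t child, int type, int *rid,
 *     rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
 * {
 *	struct foo_ivars *ivars = device_get_ivars(child);
 *
 *	return (resource_list_alloc(&ivars->resources, bus, child,
 *	    type, rid, start, end, count, flags));
 * }
 * @endcode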
* * @param rl the resource list to allocate from * @param bus the parent device of @p child * @param child the device which is requesting an allocation * @param type the type of resource to allocate * @param rid a pointer to the resource identifier * @param start hint at the start of the resource range - pass * @c 0 for any start address * @param end hint at the end of the resource range - pass * @c ~0 for any end address * @param count hint at the size of range required - pass @c 1 * for any size * @param flags any extra flags to control the resource * allocation - see @c RF_XXX flags in * for details * * @returns the resource which was allocated or @c NULL if no * resource could be allocated */ struct resource * resource_list_alloc(struct resource_list *rl, device_t bus, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct resource_list_entry *rle = NULL; int passthrough = (device_get_parent(child) != bus); int isdefault = RMAN_IS_DEFAULT_RANGE(start, end); if (passthrough) { return (BUS_ALLOC_RESOURCE(device_get_parent(bus), child, type, rid, start, end, count, flags)); } rle = resource_list_find(rl, type, *rid); if (!rle) return (NULL); /* no resource of that type/rid */ if (rle->res) { if (rle->flags & RLE_RESERVED) { if (rle->flags & RLE_ALLOCATED) return (NULL); if ((flags & RF_ACTIVE) && bus_activate_resource(child, type, *rid, rle->res) != 0) return (NULL); rle->flags |= RLE_ALLOCATED; return (rle->res); } device_printf(bus, "resource entry %#x type %d for child %s is busy\n", *rid, type, device_get_nameunit(child)); return (NULL); } if (isdefault) { start = rle->start; count = ulmax(count, rle->count); end = ulmax(rle->end, start + count - 1); } rle->res = BUS_ALLOC_RESOURCE(device_get_parent(bus), child, type, rid, start, end, count, flags); /* * Record the new range. */ if (rle->res) { rle->start = rman_get_start(rle->res); rle->end = rman_get_end(rle->res); rle->count = count; } return (rle->res); } /** * @brief Helper function for implementing BUS_RELEASE_RESOURCE() * * Implement BUS_RELEASE_RESOURCE() using a resource list. Normally * used with resource_list_alloc(). 
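 *
 * The matching BUS_RELEASE_RESOURCE() method for the hypothetical
 * "foo" bus sketched above might be:
 *
 * @code
 * static int
 * foo_release_resource(device_t bus, device_t child, int type, int rid,
 *     struct resource *r)
 * {
 *	struct foo_ivars *ivars = device_get_ivars(child);
 *
 *	return (resource_list_release(&ivars->resources, bus, child,
 *	    type, rid, r));
 * }
 * @endcode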
* * @param rl the resource list which was allocated from * @param bus the parent device of @p child * @param child the device which is requesting a release * @param type the type of resource to release * @param rid the resource identifier * @param res the resource to release * * @retval 0 success * @retval non-zero a standard unix error code indicating what * error condition prevented the operation */ int resource_list_release(struct resource_list *rl, device_t bus, device_t child, int type, int rid, struct resource *res) { struct resource_list_entry *rle = NULL; int passthrough = (device_get_parent(child) != bus); int error; if (passthrough) { return (BUS_RELEASE_RESOURCE(device_get_parent(bus), child, type, rid, res)); } rle = resource_list_find(rl, type, rid); if (!rle) panic("resource_list_release: can't find resource"); if (!rle->res) panic("resource_list_release: resource entry is not busy"); if (rle->flags & RLE_RESERVED) { if (rle->flags & RLE_ALLOCATED) { if (rman_get_flags(res) & RF_ACTIVE) { error = bus_deactivate_resource(child, type, rid, res); if (error) return (error); } rle->flags &= ~RLE_ALLOCATED; return (0); } return (EINVAL); } error = BUS_RELEASE_RESOURCE(device_get_parent(bus), child, type, rid, res); if (error) return (error); rle->res = NULL; return (0); } /** * @brief Release all active resources of a given type * * Release all active resources of a specified type. This is intended * to be used to cleanup resources leaked by a driver after detach or * a failed attach. * * @param rl the resource list which was allocated from * @param bus the parent device of @p child * @param child the device whose active resources are being released * @param type the type of resources to release * * @retval 0 success * @retval EBUSY at least one resource was active */ int resource_list_release_active(struct resource_list *rl, device_t bus, device_t child, int type) { struct resource_list_entry *rle; int error, retval; retval = 0; STAILQ_FOREACH(rle, rl, link) { if (rle->type != type) continue; if (rle->res == NULL) continue; if ((rle->flags & (RLE_RESERVED | RLE_ALLOCATED)) == RLE_RESERVED) continue; retval = EBUSY; error = resource_list_release(rl, bus, child, type, rman_get_rid(rle->res), rle->res); if (error != 0) device_printf(bus, "Failed to release active resource: %d\n", error); } return (retval); } /** * @brief Fully release a reserved resource * * Fully releases a resource reserved via resource_list_reserve(). 
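 *
 * A sketch of the pairing (hypothetical, not taken from this file):
 * a bus that earlier reserved a child's memory range with
 * resource_list_reserve(rl, bus, child, SYS_RES_MEMORY, &rid, start,
 * end, count, 0) gives the range back to the parent with:
 *
 * @code
 * resource_list_unreserve(rl, bus, child, SYS_RES_MEMORY, rid);
 * @endcode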
* * @param rl the resource list which was allocated from * @param bus the parent device of @p child * @param child the device whose reserved resource is being released * @param type the type of resource to release * @param rid the resource identifier * * @retval 0 success * @retval non-zero a standard unix error code indicating what * error condition prevented the operation */ int resource_list_unreserve(struct resource_list *rl, device_t bus, device_t child, int type, int rid) { struct resource_list_entry *rle = NULL; int passthrough = (device_get_parent(child) != bus); if (passthrough) panic( "resource_list_unreserve() should only be called for direct children"); rle = resource_list_find(rl, type, rid); if (!rle) panic("resource_list_unreserve: can't find resource"); if (!(rle->flags & RLE_RESERVED)) return (EINVAL); if (rle->flags & RLE_ALLOCATED) return (EBUSY); rle->flags &= ~RLE_RESERVED; return (resource_list_release(rl, bus, child, type, rid, rle->res)); } /** * @brief Print a description of resources in a resource list * * Print all resources of a specified type, for use in BUS_PRINT_CHILD(). * The name is printed if at least one resource of the given type is available. * The format is used to print resource start and end. * * @param rl the resource list to print * @param name the name of @p type, e.g. @c "memory" * @param type the type of resource entry to print * @param format printf(9) format string to print resource * start and end values * * @returns the number of characters printed */ int resource_list_print_type(struct resource_list *rl, const char *name, int type, const char *format) { struct resource_list_entry *rle; int printed, retval; printed = 0; retval = 0; /* Yes, this is kinda cheating */ STAILQ_FOREACH(rle, rl, link) { if (rle->type == type) { if (printed == 0) retval += printf(" %s ", name); else retval += printf(","); printed++; retval += printf(format, rle->start); if (rle->count > 1) { retval += printf("-"); retval += printf(format, rle->start + rle->count - 1); } } } return (retval); } /** * @brief Releases all the resources in a list. * * @param rl The resource list to purge. * * @returns nothing */ void resource_list_purge(struct resource_list *rl) { struct resource_list_entry *rle; while ((rle = STAILQ_FIRST(rl)) != NULL) { if (rle->res) bus_release_resource(rman_get_device(rle->res), rle->type, rle->rid, rle->res); STAILQ_REMOVE_HEAD(rl, link); free(rle, M_BUS); } } device_t bus_generic_add_child(device_t dev, u_int order, const char *name, int unit) { return (device_add_child_ordered(dev, order, name, unit)); } /** * @brief Helper function for implementing DEVICE_PROBE() * * This function can be used to help implement the DEVICE_PROBE() for * a bus (i.e. a device which has other devices attached to it). It * calls the DEVICE_IDENTIFY() method of each driver in the device's * devclass. */ int bus_generic_probe(device_t dev) { devclass_t dc = dev->devclass; driverlink_t dl; TAILQ_FOREACH(dl, &dc->drivers, link) { /* * If this driver's pass is too high, then ignore it. * For most drivers in the default pass, this will * never be true. For early-pass buses, only the * identify routines of eligible drivers are called * when this routine runs. Drivers for later * passes should have their identify routines called * on early-pass buses during BUS_NEW_PASS().
*/ if (dl->pass > bus_current_pass) continue; DEVICE_IDENTIFY(dl->driver, dev); } return (0); } /** * @brief Helper function for implementing DEVICE_ATTACH() * * This function can be used to help implement the DEVICE_ATTACH() for * a bus. It calls device_probe_and_attach() for each of the device's * children. */ int bus_generic_attach(device_t dev) { device_t child; TAILQ_FOREACH(child, &dev->children, link) { device_probe_and_attach(child); } return (0); } /** * @brief Helper function for implementing DEVICE_DETACH() * * This function can be used to help implement the DEVICE_DETACH() for * a bus. It calls device_detach() for each of the device's * children. */ int bus_generic_detach(device_t dev) { device_t child; int error; if (dev->state != DS_ATTACHED) return (EBUSY); TAILQ_FOREACH(child, &dev->children, link) { if ((error = device_detach(child)) != 0) return (error); } return (0); } /** * @brief Helper function for implementing DEVICE_SHUTDOWN() * * This function can be used to help implement the DEVICE_SHUTDOWN() * for a bus. It calls device_shutdown() for each of the device's * children. */ int bus_generic_shutdown(device_t dev) { device_t child; TAILQ_FOREACH(child, &dev->children, link) { device_shutdown(child); } return (0); } /** * @brief Default function for suspending a child device. * * This function is to be used by a bus's DEVICE_SUSPEND_CHILD(). */ int bus_generic_suspend_child(device_t dev, device_t child) { int error; error = DEVICE_SUSPEND(child); if (error == 0) child->flags |= DF_SUSPENDED; return (error); } /** * @brief Default function for resuming a child device. * * This function is to be used by a bus's DEVICE_RESUME_CHILD(). */ int bus_generic_resume_child(device_t dev, device_t child) { DEVICE_RESUME(child); child->flags &= ~DF_SUSPENDED; return (0); } /** * @brief Helper function for implementing DEVICE_SUSPEND() * * This function can be used to help implement the DEVICE_SUSPEND() * for a bus. It calls DEVICE_SUSPEND() for each of the device's * children. If any call to DEVICE_SUSPEND() fails, the suspend * operation is aborted and any devices which were suspended are * resumed immediately by calling their DEVICE_RESUME() methods. */ int bus_generic_suspend(device_t dev) { int error; device_t child, child2; TAILQ_FOREACH(child, &dev->children, link) { error = BUS_SUSPEND_CHILD(dev, child); if (error) { for (child2 = TAILQ_FIRST(&dev->children); child2 && child2 != child; child2 = TAILQ_NEXT(child2, link)) BUS_RESUME_CHILD(dev, child2); return (error); } } return (0); } /** * @brief Helper function for implementing DEVICE_RESUME() * * This function can be used to help implement the DEVICE_RESUME() for * a bus. It calls DEVICE_RESUME() on each of the device's children. */ int bus_generic_resume(device_t dev) { device_t child; TAILQ_FOREACH(child, &dev->children, link) { BUS_RESUME_CHILD(dev, child); /* if resume fails, there's nothing we can usefully do... */ } return (0); } /** * @brief Helper function for implementing BUS_PRINT_CHILD(). * * This function prints the first part of the ascii representation of * @p child, including its name, unit and description (if any - see * device_set_desc()). * * @returns the number of characters printed */ int bus_print_child_header(device_t dev, device_t child) { int retval = 0; if (device_get_desc(child)) { retval += device_printf(child, "<%s>", device_get_desc(child)); } else { retval += printf("%s", device_get_nameunit(child)); } return (retval); } /** * @brief Helper function for implementing BUS_PRINT_CHILD(). 
* * This function prints the last part of the ascii representation of * @p child, which consists of the string @c " on " followed by the * name and unit of @p dev. * * @returns the number of characters printed */ int bus_print_child_footer(device_t dev, device_t child) { return (printf(" on %s\n", device_get_nameunit(dev))); } /** * @brief Helper function for implementing BUS_PRINT_CHILD(). * * This function prints out the VM domain for the given device. * * @returns the number of characters printed */ int bus_print_child_domain(device_t dev, device_t child) { int domain; /* No domain? Don't print anything */ if (BUS_GET_DOMAIN(dev, child, &domain) != 0) return (0); return (printf(" numa-domain %d", domain)); } /** * @brief Helper function for implementing BUS_PRINT_CHILD(). * * This function simply calls bus_print_child_header() followed by * bus_print_child_domain() and bus_print_child_footer(). * * @returns the number of characters printed */ int bus_generic_print_child(device_t dev, device_t child) { int retval = 0; retval += bus_print_child_header(dev, child); retval += bus_print_child_domain(dev, child); retval += bus_print_child_footer(dev, child); return (retval); } /** * @brief Stub function for implementing BUS_READ_IVAR(). * * @returns ENOENT */ int bus_generic_read_ivar(device_t dev, device_t child, int index, uintptr_t * result) { return (ENOENT); } /** * @brief Stub function for implementing BUS_WRITE_IVAR(). * * @returns ENOENT */ int bus_generic_write_ivar(device_t dev, device_t child, int index, uintptr_t value) { return (ENOENT); } /** * @brief Stub function for implementing BUS_GET_RESOURCE_LIST(). * * @returns NULL */ struct resource_list * bus_generic_get_resource_list(device_t dev, device_t child) { return (NULL); } /** * @brief Helper function for implementing BUS_DRIVER_ADDED(). * * This implementation of BUS_DRIVER_ADDED() simply calls the driver's * DEVICE_IDENTIFY() method to allow it to add new children to the bus * and then calls device_probe_and_attach() for each unattached child. */ void bus_generic_driver_added(device_t dev, driver_t *driver) { device_t child; DEVICE_IDENTIFY(driver, dev); TAILQ_FOREACH(child, &dev->children, link) { if (child->state == DS_NOTPRESENT || (child->flags & DF_REBID)) device_probe_and_attach(child); } } /** * @brief Helper function for implementing BUS_NEW_PASS(). * * This implementation of BUS_NEW_PASS() first calls the identify * routines for any drivers that probe at the current pass. Then it * walks the list of devices for this bus. If a device is already * attached, then it calls BUS_NEW_PASS() on that device. If the * device is not already attached, it attempts to attach a driver to * it. */ void bus_generic_new_pass(device_t dev) { driverlink_t dl; devclass_t dc; device_t child; dc = dev->devclass; TAILQ_FOREACH(dl, &dc->drivers, link) { if (dl->pass == bus_current_pass) DEVICE_IDENTIFY(dl->driver, dev); } TAILQ_FOREACH(child, &dev->children, link) { if (child->state >= DS_ATTACHED) BUS_NEW_PASS(child); else if (child->state == DS_NOTPRESENT) device_probe_and_attach(child); } } /** * @brief Helper function for implementing BUS_SETUP_INTR(). * * This simple implementation of BUS_SETUP_INTR() simply calls the * BUS_SETUP_INTR() method of the parent of @p dev. */ int bus_generic_setup_intr(device_t dev, device_t child, struct resource *irq, int flags, driver_filter_t *filter, driver_intr_t *intr, void *arg, void **cookiep) { /* Propagate up the bus hierarchy until someone handles it.
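	 * Bridge drivers with no special interrupt needs can simply
	 * list this helper in their method table (a sketch):
	 *
	 *	DEVMETHOD(bus_setup_intr, bus_generic_setup_intr),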
*/ if (dev->parent) return (BUS_SETUP_INTR(dev->parent, child, irq, flags, filter, intr, arg, cookiep)); return (EINVAL); } /** * @brief Helper function for implementing BUS_TEARDOWN_INTR(). * * This simple implementation of BUS_TEARDOWN_INTR() simply calls the * BUS_TEARDOWN_INTR() method of the parent of @p dev. */ int bus_generic_teardown_intr(device_t dev, device_t child, struct resource *irq, void *cookie) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_TEARDOWN_INTR(dev->parent, child, irq, cookie)); return (EINVAL); } /** * @brief Helper function for implementing BUS_ADJUST_RESOURCE(). * * This simple implementation of BUS_ADJUST_RESOURCE() simply calls the * BUS_ADJUST_RESOURCE() method of the parent of @p dev. */ int bus_generic_adjust_resource(device_t dev, device_t child, int type, struct resource *r, rman_res_t start, rman_res_t end) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_ADJUST_RESOURCE(dev->parent, child, type, r, start, end)); return (EINVAL); } /** * @brief Helper function for implementing BUS_ALLOC_RESOURCE(). * * This simple implementation of BUS_ALLOC_RESOURCE() simply calls the * BUS_ALLOC_RESOURCE() method of the parent of @p dev. */ struct resource * bus_generic_alloc_resource(device_t dev, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_ALLOC_RESOURCE(dev->parent, child, type, rid, start, end, count, flags)); return (NULL); } /** * @brief Helper function for implementing BUS_RELEASE_RESOURCE(). * * This simple implementation of BUS_RELEASE_RESOURCE() simply calls the * BUS_RELEASE_RESOURCE() method of the parent of @p dev. */ int bus_generic_release_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_RELEASE_RESOURCE(dev->parent, child, type, rid, r)); return (EINVAL); } /** * @brief Helper function for implementing BUS_ACTIVATE_RESOURCE(). * * This simple implementation of BUS_ACTIVATE_RESOURCE() simply calls the * BUS_ACTIVATE_RESOURCE() method of the parent of @p dev. */ int bus_generic_activate_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_ACTIVATE_RESOURCE(dev->parent, child, type, rid, r)); return (EINVAL); } /** * @brief Helper function for implementing BUS_DEACTIVATE_RESOURCE(). * * This simple implementation of BUS_DEACTIVATE_RESOURCE() simply calls the * BUS_DEACTIVATE_RESOURCE() method of the parent of @p dev. */ int bus_generic_deactivate_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_DEACTIVATE_RESOURCE(dev->parent, child, type, rid, r)); return (EINVAL); } /** * @brief Helper function for implementing BUS_MAP_RESOURCE(). * * This simple implementation of BUS_MAP_RESOURCE() simply calls the * BUS_MAP_RESOURCE() method of the parent of @p dev. */ int bus_generic_map_resource(device_t dev, device_t child, int type, struct resource *r, struct resource_map_request *args, struct resource_map *map) { /* Propagate up the bus hierarchy until someone handles it. 
*/ if (dev->parent) return (BUS_MAP_RESOURCE(dev->parent, child, type, r, args, map)); return (EINVAL); } /** * @brief Helper function for implementing BUS_UNMAP_RESOURCE(). * * This simple implementation of BUS_UNMAP_RESOURCE() simply calls the * BUS_UNMAP_RESOURCE() method of the parent of @p dev. */ int bus_generic_unmap_resource(device_t dev, device_t child, int type, struct resource *r, struct resource_map *map) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_UNMAP_RESOURCE(dev->parent, child, type, r, map)); return (EINVAL); } /** * @brief Helper function for implementing BUS_BIND_INTR(). * * This simple implementation of BUS_BIND_INTR() simply calls the * BUS_BIND_INTR() method of the parent of @p dev. */ int bus_generic_bind_intr(device_t dev, device_t child, struct resource *irq, int cpu) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_BIND_INTR(dev->parent, child, irq, cpu)); return (EINVAL); } /** * @brief Helper function for implementing BUS_CONFIG_INTR(). * * This simple implementation of BUS_CONFIG_INTR() simply calls the * BUS_CONFIG_INTR() method of the parent of @p dev. */ int bus_generic_config_intr(device_t dev, int irq, enum intr_trigger trig, enum intr_polarity pol) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_CONFIG_INTR(dev->parent, irq, trig, pol)); return (EINVAL); } /** * @brief Helper function for implementing BUS_DESCRIBE_INTR(). * * This simple implementation of BUS_DESCRIBE_INTR() simply calls the * BUS_DESCRIBE_INTR() method of the parent of @p dev. */ int bus_generic_describe_intr(device_t dev, device_t child, struct resource *irq, void *cookie, const char *descr) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_DESCRIBE_INTR(dev->parent, child, irq, cookie, descr)); return (EINVAL); } /** * @brief Helper function for implementing BUS_GET_CPUS(). * * This simple implementation of BUS_GET_CPUS() simply calls the * BUS_GET_CPUS() method of the parent of @p dev. */ int bus_generic_get_cpus(device_t dev, device_t child, enum cpu_sets op, size_t setsize, cpuset_t *cpuset) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent != NULL) return (BUS_GET_CPUS(dev->parent, child, op, setsize, cpuset)); return (EINVAL); } /** * @brief Helper function for implementing BUS_GET_DMA_TAG(). * * This simple implementation of BUS_GET_DMA_TAG() simply calls the * BUS_GET_DMA_TAG() method of the parent of @p dev. */ bus_dma_tag_t bus_generic_get_dma_tag(device_t dev, device_t child) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent != NULL) return (BUS_GET_DMA_TAG(dev->parent, child)); return (NULL); } /** * @brief Helper function for implementing BUS_GET_BUS_TAG(). * * This simple implementation of BUS_GET_BUS_TAG() simply calls the * BUS_GET_BUS_TAG() method of the parent of @p dev. */ bus_space_tag_t bus_generic_get_bus_tag(device_t dev, device_t child) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent != NULL) return (BUS_GET_BUS_TAG(dev->parent, child)); return ((bus_space_tag_t)0); } /** * @brief Helper function for implementing BUS_GET_RESOURCE(). * * This implementation of BUS_GET_RESOURCE() uses the * resource_list_find() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list to * search. 
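 *
 * A minimal consumer sketch (hypothetical driver code, shown only to
 * illustrate the path from bus_get_resource(9) down to this helper):
 *
 * @code
 *	rman_res_t start, count;
 *
 *	if (bus_get_resource(dev, SYS_RES_MEMORY, 0, &start, &count) == 0)
 *		device_printf(dev, "mem at %#jx (%ju bytes)\n",
 *		    (uintmax_t)start, (uintmax_t)count);
 * @endcode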
*/ int bus_generic_rl_get_resource(device_t dev, device_t child, int type, int rid, rman_res_t *startp, rman_res_t *countp) { struct resource_list * rl = NULL; struct resource_list_entry * rle = NULL; rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return (EINVAL); rle = resource_list_find(rl, type, rid); if (!rle) return (ENOENT); if (startp) *startp = rle->start; if (countp) *countp = rle->count; return (0); } /** * @brief Helper function for implementing BUS_SET_RESOURCE(). * * This implementation of BUS_SET_RESOURCE() uses the * resource_list_add() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list to * edit. */ int bus_generic_rl_set_resource(device_t dev, device_t child, int type, int rid, rman_res_t start, rman_res_t count) { struct resource_list * rl = NULL; rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return (EINVAL); resource_list_add(rl, type, rid, start, (start + count - 1), count); return (0); } /** * @brief Helper function for implementing BUS_DELETE_RESOURCE(). * * This implementation of BUS_DELETE_RESOURCE() uses the * resource_list_delete() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list to * edit. */ void bus_generic_rl_delete_resource(device_t dev, device_t child, int type, int rid) { struct resource_list * rl = NULL; rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return; resource_list_delete(rl, type, rid); return; } /** * @brief Helper function for implementing BUS_RELEASE_RESOURCE(). * * This implementation of BUS_RELEASE_RESOURCE() uses the * resource_list_release() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list. */ int bus_generic_rl_release_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { struct resource_list * rl = NULL; if (device_get_parent(child) != dev) return (BUS_RELEASE_RESOURCE(device_get_parent(dev), child, type, rid, r)); rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return (EINVAL); return (resource_list_release(rl, dev, child, type, rid, r)); } /** * @brief Helper function for implementing BUS_ALLOC_RESOURCE(). * * This implementation of BUS_ALLOC_RESOURCE() uses the * resource_list_alloc() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list. */ struct resource * bus_generic_rl_alloc_resource(device_t dev, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct resource_list * rl = NULL; if (device_get_parent(child) != dev) return (BUS_ALLOC_RESOURCE(device_get_parent(dev), child, type, rid, start, end, count, flags)); rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return (NULL); return (resource_list_alloc(rl, dev, child, type, rid, start, end, count, flags)); } /** * @brief Helper function for implementing BUS_CHILD_PRESENT(). * * This simple implementation of BUS_CHILD_PRESENT() simply calls the * BUS_CHILD_PRESENT() method of the parent of @p dev. */ int bus_generic_child_present(device_t dev, device_t child) { return (BUS_CHILD_PRESENT(device_get_parent(dev), dev)); } int bus_generic_get_domain(device_t dev, device_t child, int *domain) { if (dev->parent) return (BUS_GET_DOMAIN(dev->parent, dev, domain)); return (ENOENT); } /** * @brief Helper function for implementing BUS_RESCAN(). * * This null implementation of BUS_RESCAN() always fails to indicate * the bus does not support rescanning. 
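 *
 * A bus driver that cannot rescan lists this stub directly in its
 * method table (a sketch):
 *
 * @code
 *	DEVMETHOD(bus_rescan, bus_null_rescan),
 * @endcode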
*/ int bus_null_rescan(device_t dev) { return (ENXIO); } /* * Some convenience functions to make it easier for drivers to use the * resource-management functions. All these really do is hide the * indirection through the parent's method table, making for slightly * less-wordy code. In the future, it might make sense for this code * to maintain some sort of a list of resources allocated by each device. */ int bus_alloc_resources(device_t dev, struct resource_spec *rs, struct resource **res) { int i; for (i = 0; rs[i].type != -1; i++) res[i] = NULL; for (i = 0; rs[i].type != -1; i++) { res[i] = bus_alloc_resource_any(dev, rs[i].type, &rs[i].rid, rs[i].flags); if (res[i] == NULL && !(rs[i].flags & RF_OPTIONAL)) { bus_release_resources(dev, rs, res); return (ENXIO); } } return (0); } void bus_release_resources(device_t dev, const struct resource_spec *rs, struct resource **res) { int i; for (i = 0; rs[i].type != -1; i++) if (res[i] != NULL) { bus_release_resource( dev, rs[i].type, rs[i].rid, res[i]); res[i] = NULL; } } /** * @brief Wrapper function for BUS_ALLOC_RESOURCE(). * * This function simply calls the BUS_ALLOC_RESOURCE() method of the * parent of @p dev. */ struct resource * bus_alloc_resource(device_t dev, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct resource *res; if (dev->parent == NULL) return (NULL); res = BUS_ALLOC_RESOURCE(dev->parent, dev, type, rid, start, end, count, flags); return (res); } /** * @brief Wrapper function for BUS_ADJUST_RESOURCE(). * * This function simply calls the BUS_ADJUST_RESOURCE() method of the * parent of @p dev. */ int bus_adjust_resource(device_t dev, int type, struct resource *r, rman_res_t start, rman_res_t end) { if (dev->parent == NULL) return (EINVAL); return (BUS_ADJUST_RESOURCE(dev->parent, dev, type, r, start, end)); } /** * @brief Wrapper function for BUS_ACTIVATE_RESOURCE(). * * This function simply calls the BUS_ACTIVATE_RESOURCE() method of the * parent of @p dev. */ int bus_activate_resource(device_t dev, int type, int rid, struct resource *r) { if (dev->parent == NULL) return (EINVAL); return (BUS_ACTIVATE_RESOURCE(dev->parent, dev, type, rid, r)); } /** * @brief Wrapper function for BUS_DEACTIVATE_RESOURCE(). * * This function simply calls the BUS_DEACTIVATE_RESOURCE() method of the * parent of @p dev. */ int bus_deactivate_resource(device_t dev, int type, int rid, struct resource *r) { if (dev->parent == NULL) return (EINVAL); return (BUS_DEACTIVATE_RESOURCE(dev->parent, dev, type, rid, r)); } /** * @brief Wrapper function for BUS_MAP_RESOURCE(). * * This function simply calls the BUS_MAP_RESOURCE() method of the * parent of @p dev. */ int bus_map_resource(device_t dev, int type, struct resource *r, struct resource_map_request *args, struct resource_map *map) { if (dev->parent == NULL) return (EINVAL); return (BUS_MAP_RESOURCE(dev->parent, dev, type, r, args, map)); } /** * @brief Wrapper function for BUS_UNMAP_RESOURCE(). * * This function simply calls the BUS_UNMAP_RESOURCE() method of the * parent of @p dev. */ int bus_unmap_resource(device_t dev, int type, struct resource *r, struct resource_map *map) { if (dev->parent == NULL) return (EINVAL); return (BUS_UNMAP_RESOURCE(dev->parent, dev, type, r, map)); } /** * @brief Wrapper function for BUS_RELEASE_RESOURCE(). * * This function simply calls the BUS_RELEASE_RESOURCE() method of the * parent of @p dev. 
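 *
 * In a driver this is typically paired with bus_alloc_resource_any()
 * (a sketch; the "sc" softc fields are hypothetical):
 *
 * @code
 *	sc->mem_rid = 0;
 *	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
 *	    &sc->mem_rid, RF_ACTIVE);
 *	...
 *	if (sc->mem_res != NULL)
 *		bus_release_resource(dev, SYS_RES_MEMORY, sc->mem_rid,
 *		    sc->mem_res);
 * @endcode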
*/ int bus_release_resource(device_t dev, int type, int rid, struct resource *r) { int rv; if (dev->parent == NULL) return (EINVAL); rv = BUS_RELEASE_RESOURCE(dev->parent, dev, type, rid, r); return (rv); } /** * @brief Wrapper function for BUS_SETUP_INTR(). * * This function simply calls the BUS_SETUP_INTR() method of the * parent of @p dev. */ int bus_setup_intr(device_t dev, struct resource *r, int flags, driver_filter_t filter, driver_intr_t handler, void *arg, void **cookiep) { int error; if (dev->parent == NULL) return (EINVAL); error = BUS_SETUP_INTR(dev->parent, dev, r, flags, filter, handler, arg, cookiep); if (error != 0) return (error); if (handler != NULL && !(flags & INTR_MPSAFE)) device_printf(dev, "[GIANT-LOCKED]\n"); return (0); } /** * @brief Wrapper function for BUS_TEARDOWN_INTR(). * * This function simply calls the BUS_TEARDOWN_INTR() method of the * parent of @p dev. */ int bus_teardown_intr(device_t dev, struct resource *r, void *cookie) { if (dev->parent == NULL) return (EINVAL); return (BUS_TEARDOWN_INTR(dev->parent, dev, r, cookie)); } /** * @brief Wrapper function for BUS_BIND_INTR(). * * This function simply calls the BUS_BIND_INTR() method of the * parent of @p dev. */ int bus_bind_intr(device_t dev, struct resource *r, int cpu) { if (dev->parent == NULL) return (EINVAL); return (BUS_BIND_INTR(dev->parent, dev, r, cpu)); } /** * @brief Wrapper function for BUS_DESCRIBE_INTR(). * * This function first formats the requested description into a * temporary buffer and then calls the BUS_DESCRIBE_INTR() method of * the parent of @p dev. */ int bus_describe_intr(device_t dev, struct resource *irq, void *cookie, const char *fmt, ...) { va_list ap; char descr[MAXCOMLEN + 1]; if (dev->parent == NULL) return (EINVAL); va_start(ap, fmt); vsnprintf(descr, sizeof(descr), fmt, ap); va_end(ap); return (BUS_DESCRIBE_INTR(dev->parent, dev, irq, cookie, descr)); } /** * @brief Wrapper function for BUS_SET_RESOURCE(). * * This function simply calls the BUS_SET_RESOURCE() method of the * parent of @p dev. */ int bus_set_resource(device_t dev, int type, int rid, rman_res_t start, rman_res_t count) { return (BUS_SET_RESOURCE(device_get_parent(dev), dev, type, rid, start, count)); } /** * @brief Wrapper function for BUS_GET_RESOURCE(). * * This function simply calls the BUS_GET_RESOURCE() method of the * parent of @p dev. */ int bus_get_resource(device_t dev, int type, int rid, rman_res_t *startp, rman_res_t *countp) { return (BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid, startp, countp)); } /** * @brief Wrapper function for BUS_GET_RESOURCE(). * * This function simply calls the BUS_GET_RESOURCE() method of the * parent of @p dev and returns the start value. */ rman_res_t bus_get_resource_start(device_t dev, int type, int rid) { rman_res_t start; rman_res_t count; int error; error = BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid, &start, &count); if (error) return (0); return (start); } /** * @brief Wrapper function for BUS_GET_RESOURCE(). * * This function simply calls the BUS_GET_RESOURCE() method of the * parent of @p dev and returns the count value. */ rman_res_t bus_get_resource_count(device_t dev, int type, int rid) { rman_res_t start; rman_res_t count; int error; error = BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid, &start, &count); if (error) return (0); return (count); } /** * @brief Wrapper function for BUS_DELETE_RESOURCE(). * * This function simply calls the BUS_DELETE_RESOURCE() method of the * parent of @p dev. 
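 *
 * Together with bus_set_resource() this lets a bus edit a child's
 * resource list, e.g. assign IRQ 5 to rid 0 and later remove it
 * (a sketch):
 *
 * @code
 *	bus_set_resource(child, SYS_RES_IRQ, 0, 5, 1);
 *	...
 *	bus_delete_resource(child, SYS_RES_IRQ, 0);
 * @endcode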
*/ void bus_delete_resource(device_t dev, int type, int rid) { BUS_DELETE_RESOURCE(device_get_parent(dev), dev, type, rid); } /** * @brief Wrapper function for BUS_CHILD_PRESENT(). * * This function simply calls the BUS_CHILD_PRESENT() method of the * parent of @p child. */ int bus_child_present(device_t child) { return (BUS_CHILD_PRESENT(device_get_parent(child), child)); } /** * @brief Wrapper function for BUS_CHILD_PNPINFO_STR(). * * This function simply calls the BUS_CHILD_PNPINFO_STR() method of the * parent of @p child. */ int bus_child_pnpinfo_str(device_t child, char *buf, size_t buflen) { device_t parent; parent = device_get_parent(child); if (parent == NULL) { *buf = '\0'; return (0); } return (BUS_CHILD_PNPINFO_STR(parent, child, buf, buflen)); } /** * @brief Wrapper function for BUS_CHILD_LOCATION_STR(). * * This function simply calls the BUS_CHILD_LOCATION_STR() method of the * parent of @p child. */ int bus_child_location_str(device_t child, char *buf, size_t buflen) { device_t parent; parent = device_get_parent(child); if (parent == NULL) { *buf = '\0'; return (0); } return (BUS_CHILD_LOCATION_STR(parent, child, buf, buflen)); } /** * @brief Wrapper function for BUS_GET_CPUS(). * * This function simply calls the BUS_GET_CPUS() method of the * parent of @p dev. */ int bus_get_cpus(device_t dev, enum cpu_sets op, size_t setsize, cpuset_t *cpuset) { device_t parent; parent = device_get_parent(dev); if (parent == NULL) return (EINVAL); return (BUS_GET_CPUS(parent, dev, op, setsize, cpuset)); } /** * @brief Wrapper function for BUS_GET_DMA_TAG(). * * This function simply calls the BUS_GET_DMA_TAG() method of the * parent of @p dev. */ bus_dma_tag_t bus_get_dma_tag(device_t dev) { device_t parent; parent = device_get_parent(dev); if (parent == NULL) return (NULL); return (BUS_GET_DMA_TAG(parent, dev)); } /** * @brief Wrapper function for BUS_GET_BUS_TAG(). * * This function simply calls the BUS_GET_BUS_TAG() method of the * parent of @p dev. */ bus_space_tag_t bus_get_bus_tag(device_t dev) { device_t parent; parent = device_get_parent(dev); if (parent == NULL) return ((bus_space_tag_t)0); return (BUS_GET_BUS_TAG(parent, dev)); } /** * @brief Wrapper function for BUS_GET_DOMAIN(). * * This function simply calls the BUS_GET_DOMAIN() method of the * parent of @p dev. */ int bus_get_domain(device_t dev, int *domain) { return (BUS_GET_DOMAIN(device_get_parent(dev), dev, domain)); } /* Resume all devices and then notify userland that we're up again. */ static int root_resume(device_t dev) { int error; error = bus_generic_resume(dev); if (error == 0) devctl_notify("kern", "power", "resume", NULL); return (error); } static int root_print_child(device_t dev, device_t child) { int retval = 0; retval += bus_print_child_header(dev, child); retval += printf("\n"); return (retval); } static int root_setup_intr(device_t dev, device_t child, struct resource *irq, int flags, driver_filter_t *filter, driver_intr_t *intr, void *arg, void **cookiep) { /* * If an interrupt mapping gets to here something bad has happened. */ panic("root_setup_intr"); } /* * If we get here, assume that the device is permanent and really is * present in the system. Removable bus drivers are expected to intercept * this call long before it gets here. We return -1 so that drivers that * really care can check vs -1 or some ERRNO returned higher in the food * chain.
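 *
 * A caller that cares can test explicitly (a sketch): after
 *
 *	int present = bus_child_present(dev);
 *
 * a value of -1 means the bus cannot tell and the device should be
 * assumed present, 0 means the device is definitely absent, and any
 * other non-zero value means it is definitely present.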
*/ static int root_child_present(device_t dev, device_t child) { return (-1); } static int root_get_cpus(device_t dev, device_t child, enum cpu_sets op, size_t setsize, cpuset_t *cpuset) { switch (op) { case INTR_CPUS: /* Default to returning the set of all CPUs. */ if (setsize != sizeof(cpuset_t)) return (EINVAL); *cpuset = all_cpus; return (0); default: return (EINVAL); } } static kobj_method_t root_methods[] = { /* Device interface */ KOBJMETHOD(device_shutdown, bus_generic_shutdown), KOBJMETHOD(device_suspend, bus_generic_suspend), KOBJMETHOD(device_resume, root_resume), /* Bus interface */ KOBJMETHOD(bus_print_child, root_print_child), KOBJMETHOD(bus_read_ivar, bus_generic_read_ivar), KOBJMETHOD(bus_write_ivar, bus_generic_write_ivar), KOBJMETHOD(bus_setup_intr, root_setup_intr), KOBJMETHOD(bus_child_present, root_child_present), KOBJMETHOD(bus_get_cpus, root_get_cpus), KOBJMETHOD_END }; static driver_t root_driver = { "root", root_methods, 1, /* no softc */ }; device_t root_bus; devclass_t root_devclass; static int root_bus_module_handler(module_t mod, int what, void* arg) { switch (what) { case MOD_LOAD: TAILQ_INIT(&bus_data_devices); kobj_class_compile((kobj_class_t) &root_driver); root_bus = make_device(NULL, "root", 0); root_bus->desc = "System root bus"; kobj_init((kobj_t) root_bus, (kobj_class_t) &root_driver); root_bus->driver = &root_driver; root_bus->state = DS_ATTACHED; root_devclass = devclass_find_internal("root", NULL, FALSE); devinit(); return (0); case MOD_SHUTDOWN: device_shutdown(root_bus); return (0); default: return (EOPNOTSUPP); } return (0); } static moduledata_t root_bus_mod = { "rootbus", root_bus_module_handler, NULL }; DECLARE_MODULE(rootbus, root_bus_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST); /** * @brief Automatically configure devices * * This function begins the autoconfiguration process by calling * device_probe_and_attach() for each child of the @c root0 device. */ void root_bus_configure(void) { PDEBUG((".")); /* Eventually this will be split up, but this is sufficient for now. */ bus_set_pass(BUS_PASS_DEFAULT); } /** * @brief Module handler for registering device drivers * * This module handler is used to automatically register device * drivers when modules are loaded. 
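 * Drivers normally reach this handler through the DRIVER_MODULE()
 * macro rather than by calling it directly; e.g. a hypothetical
 * "foo" driver on pci would declare
 *
 *	DRIVER_MODULE(foo, pci, foo_driver, foo_devclass, NULL, NULL);
 *
 * which generates the driver_module_data passed in via @p arg.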
If @p what is MOD_LOAD, it calls * devclass_add_driver() for the driver described by the * driver_module_data structure pointed to by @p arg. */ int driver_module_handler(module_t mod, int what, void *arg) { struct driver_module_data *dmd; devclass_t bus_devclass; kobj_class_t driver; int error, pass; dmd = (struct driver_module_data *)arg; bus_devclass = devclass_find_internal(dmd->dmd_busname, NULL, TRUE); error = 0; switch (what) { case MOD_LOAD: if (dmd->dmd_chainevh) error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg); pass = dmd->dmd_pass; driver = dmd->dmd_driver; PDEBUG(("Loading module: driver %s on bus %s (pass %d)", DRIVERNAME(driver), dmd->dmd_busname, pass)); error = devclass_add_driver(bus_devclass, driver, pass, dmd->dmd_devclass); break; case MOD_UNLOAD: PDEBUG(("Unloading module: driver %s from bus %s", DRIVERNAME(dmd->dmd_driver), dmd->dmd_busname)); error = devclass_delete_driver(bus_devclass, dmd->dmd_driver); if (!error && dmd->dmd_chainevh) error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg); break; case MOD_QUIESCE: PDEBUG(("Quiesce module: driver %s from bus %s", DRIVERNAME(dmd->dmd_driver), dmd->dmd_busname)); error = devclass_quiesce_driver(bus_devclass, dmd->dmd_driver); if (!error && dmd->dmd_chainevh) error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg); break; default: error = EOPNOTSUPP; break; } return (error); } /** * @brief Enumerate all hinted devices for this bus. * * Walks through the hints for this bus and calls the bus_hinted_child * routine for each one it finds. It searches first for the specific * bus that's being probed for hinted children (e.g. isa0), and then for * generic children (e.g. isa). * * @param bus the bus device to enumerate */ void bus_enumerate_hinted_children(device_t bus) { int i; const char *dname, *busname; int dunit; /* * enumerate all devices on the specific bus */ busname = device_get_nameunit(bus); i = 0; while (resource_find_match(&i, &dname, &dunit, "at", busname) == 0) BUS_HINTED_CHILD(bus, dname, dunit); /* * and all the generic ones. */ busname = device_get_name(bus); i = 0; while (resource_find_match(&i, &dname, &dunit, "at", busname) == 0) BUS_HINTED_CHILD(bus, dname, dunit); } #ifdef BUS_DEBUG /* the _short versions avoid iteration by not calling anything that prints * more than oneliners. I love oneliners. */ static void print_device_short(device_t dev, int indent) { if (!dev) return; indentprintf(("device %d: <%s> %sparent,%schildren,%s%s%s%s%s,%sivars,%ssoftc,busy=%d\n", dev->unit, dev->desc, (dev->parent? "":"no "), (TAILQ_EMPTY(&dev->children)? "no ":""), (dev->flags&DF_ENABLED? "enabled,":"disabled,"), (dev->flags&DF_FIXEDCLASS? "fixed,":""), (dev->flags&DF_WILDCARD? "wildcard,":""), (dev->flags&DF_DESCMALLOCED? "descmalloced,":""), (dev->flags&DF_REBID? "rebiddable,":""), (dev->ivars? "":"no "), (dev->softc?
"":"no "), dev->busy)); } static void print_device(device_t dev, int indent) { if (!dev) return; print_device_short(dev, indent); indentprintf(("Parent:\n")); print_device_short(dev->parent, indent+1); indentprintf(("Driver:\n")); print_driver_short(dev->driver, indent+1); indentprintf(("Devclass:\n")); print_devclass_short(dev->devclass, indent+1); } void print_device_tree_short(device_t dev, int indent) /* print the device and all its children (indented) */ { device_t child; if (!dev) return; print_device_short(dev, indent); TAILQ_FOREACH(child, &dev->children, link) { print_device_tree_short(child, indent+1); } } void print_device_tree(device_t dev, int indent) /* print the device and all its children (indented) */ { device_t child; if (!dev) return; print_device(dev, indent); TAILQ_FOREACH(child, &dev->children, link) { print_device_tree(child, indent+1); } } static void print_driver_short(driver_t *driver, int indent) { if (!driver) return; indentprintf(("driver %s: softc size = %zd\n", driver->name, driver->size)); } static void print_driver(driver_t *driver, int indent) { if (!driver) return; print_driver_short(driver, indent); } static void print_driver_list(driver_list_t drivers, int indent) { driverlink_t driver; TAILQ_FOREACH(driver, &drivers, link) { print_driver(driver->driver, indent); } } static void print_devclass_short(devclass_t dc, int indent) { if ( !dc ) return; indentprintf(("devclass %s: max units = %d\n", dc->name, dc->maxunit)); } static void print_devclass(devclass_t dc, int indent) { int i; if ( !dc ) return; print_devclass_short(dc, indent); indentprintf(("Drivers:\n")); print_driver_list(dc->drivers, indent+1); indentprintf(("Devices:\n")); for (i = 0; i < dc->maxunit; i++) if (dc->devices[i]) print_device(dc->devices[i], indent+1); } void print_devclass_list_short(void) { devclass_t dc; printf("Short listing of devclasses, drivers & devices:\n"); TAILQ_FOREACH(dc, &devclasses, link) { print_devclass_short(dc, 0); } } void print_devclass_list(void) { devclass_t dc; printf("Full listing of devclasses, drivers & devices:\n"); TAILQ_FOREACH(dc, &devclasses, link) { print_devclass(dc, 0); } } #endif /* * User-space access to the device tree. * * We implement a small set of nodes: * * hw.bus Single integer read method to obtain the * current generation count. * hw.bus.devices Reads the entire device tree in flat space. * hw.bus.rman Resource manager interface * * We might like to add the ability to scan devclasses and/or drivers to * determine what else is currently loaded/available. */ static int sysctl_bus(SYSCTL_HANDLER_ARGS) { struct u_businfo ubus; ubus.ub_version = BUS_USER_VERSION; ubus.ub_generation = bus_data_generation; return (SYSCTL_OUT(req, &ubus, sizeof(ubus))); } SYSCTL_NODE(_hw_bus, OID_AUTO, info, CTLFLAG_RW, sysctl_bus, "bus-related data"); static int sysctl_devices(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; int index; device_t dev; struct u_device udev; /* XXX this is a bit big */ int error; if (namelen != 2) return (EINVAL); if (bus_data_generation_check(name[0])) return (EINVAL); index = name[1]; /* * Scan the list of devices, looking for the requested index. */ TAILQ_FOREACH(dev, &bus_data_devices, devlink) { if (index-- == 0) break; } if (dev == NULL) return (ENOENT); /* * Populate the return array. 
*/ bzero(&udev, sizeof(udev)); udev.dv_handle = (uintptr_t)dev; udev.dv_parent = (uintptr_t)dev->parent; if (dev->nameunit != NULL) strlcpy(udev.dv_name, dev->nameunit, sizeof(udev.dv_name)); if (dev->desc != NULL) strlcpy(udev.dv_desc, dev->desc, sizeof(udev.dv_desc)); if (dev->driver != NULL && dev->driver->name != NULL) strlcpy(udev.dv_drivername, dev->driver->name, sizeof(udev.dv_drivername)); bus_child_pnpinfo_str(dev, udev.dv_pnpinfo, sizeof(udev.dv_pnpinfo)); bus_child_location_str(dev, udev.dv_location, sizeof(udev.dv_location)); udev.dv_devflags = dev->devflags; udev.dv_flags = dev->flags; udev.dv_state = dev->state; error = SYSCTL_OUT(req, &udev, sizeof(udev)); return (error); } SYSCTL_NODE(_hw_bus, OID_AUTO, devices, CTLFLAG_RD, sysctl_devices, "system device tree"); int bus_data_generation_check(int generation) { if (generation != bus_data_generation) return (1); /* XXX generate optimised lists here? */ return (0); } void bus_data_generation_update(void) { bus_data_generation++; } int bus_free_resource(device_t dev, int type, struct resource *r) { if (r == NULL) return (0); return (bus_release_resource(dev, type, rman_get_rid(r), r)); } device_t device_lookup_by_name(const char *name) { device_t dev; TAILQ_FOREACH(dev, &bus_data_devices, devlink) { if (dev->nameunit != NULL && strcmp(dev->nameunit, name) == 0) return (dev); } return (NULL); } /* * /dev/devctl2 implementation. The existing /dev/devctl device has * implicit semantics on open, so it could not be reused for this. * Another option would be to call this /dev/bus? */ static int find_device(struct devreq *req, device_t *devp) { device_t dev; /* * First, ensure that the name is nul terminated. */ if (memchr(req->dr_name, '\0', sizeof(req->dr_name)) == NULL) return (EINVAL); /* * Second, try to find an attached device whose name matches * 'name'. */ dev = device_lookup_by_name(req->dr_name); if (dev != NULL) { *devp = dev; return (0); } /* Finally, give device enumerators a chance. */ dev = NULL; EVENTHANDLER_INVOKE(dev_lookup, req->dr_name, &dev); if (dev == NULL) return (ENOENT); *devp = dev; return (0); } static bool driver_exists(device_t bus, const char *driver) { devclass_t dc; for (dc = bus->devclass; dc != NULL; dc = dc->parent) { if (devclass_find_driver_internal(dc, driver) != NULL) return (true); } return (false); } static int devctl2_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct devreq *req; device_t dev; int error, old; /* Locate the device to control. */ mtx_lock(&Giant); req = (struct devreq *)data; switch (cmd) { case DEV_ATTACH: case DEV_DETACH: case DEV_ENABLE: case DEV_DISABLE: case DEV_SUSPEND: case DEV_RESUME: case DEV_SET_DRIVER: case DEV_CLEAR_DRIVER: case DEV_RESCAN: case DEV_DELETE: error = priv_check(td, PRIV_DRIVER); if (error == 0) error = find_device(req, &dev); break; default: error = ENOTTY; break; } if (error) { mtx_unlock(&Giant); return (error); } /* Perform the requested operation. */ switch (cmd) { case DEV_ATTACH: if (device_is_attached(dev) && (dev->flags & DF_REBID) == 0) error = EBUSY; else if (!device_is_enabled(dev)) error = ENXIO; else error = device_probe_and_attach(dev); break; case DEV_DETACH: if (!device_is_attached(dev)) { error = ENXIO; break; } if (!(req->dr_flags & DEVF_FORCE_DETACH)) { error = device_quiesce(dev); if (error) break; } error = device_detach(dev); break; case DEV_ENABLE: if (device_is_enabled(dev)) { error = EBUSY; break; } /* * If the device has been probed but not attached (e.g. 
* when it has been disabled by a loader hint), just * attach the device rather than doing a full probe. */ device_enable(dev); if (device_is_alive(dev)) { /* * If the device was disabled via a hint, clear * the hint. */ if (resource_disabled(dev->driver->name, dev->unit)) resource_unset_value(dev->driver->name, dev->unit, "disabled"); error = device_attach(dev); } else error = device_probe_and_attach(dev); break; case DEV_DISABLE: if (!device_is_enabled(dev)) { error = ENXIO; break; } if (!(req->dr_flags & DEVF_FORCE_DETACH)) { error = device_quiesce(dev); if (error) break; } /* * Force DF_FIXEDCLASS on around detach to preserve * the existing name. */ old = dev->flags; dev->flags |= DF_FIXEDCLASS; error = device_detach(dev); if (!(old & DF_FIXEDCLASS)) dev->flags &= ~DF_FIXEDCLASS; if (error == 0) device_disable(dev); break; case DEV_SUSPEND: if (device_is_suspended(dev)) { error = EBUSY; break; } if (device_get_parent(dev) == NULL) { error = EINVAL; break; } error = BUS_SUSPEND_CHILD(device_get_parent(dev), dev); break; case DEV_RESUME: if (!device_is_suspended(dev)) { error = EINVAL; break; } if (device_get_parent(dev) == NULL) { error = EINVAL; break; } error = BUS_RESUME_CHILD(device_get_parent(dev), dev); break; case DEV_SET_DRIVER: { devclass_t dc; char driver[128]; error = copyinstr(req->dr_data, driver, sizeof(driver), NULL); if (error) break; if (driver[0] == '\0') { error = EINVAL; break; } if (dev->devclass != NULL && strcmp(driver, dev->devclass->name) == 0) /* XXX: Could possibly force DF_FIXEDCLASS on? */ break; /* * Scan drivers for this device's bus looking for at * least one matching driver. */ if (dev->parent == NULL) { error = EINVAL; break; } if (!driver_exists(dev->parent, driver)) { error = ENOENT; break; } dc = devclass_create(driver); if (dc == NULL) { error = ENOMEM; break; } /* Detach device if necessary. */ if (device_is_attached(dev)) { if (req->dr_flags & DEVF_SET_DRIVER_DETACH) error = device_detach(dev); else error = EBUSY; if (error) break; } /* Clear any previously-fixed device class and unit. */ if (dev->flags & DF_FIXEDCLASS) devclass_delete_device(dev->devclass, dev); dev->flags |= DF_WILDCARD; dev->unit = -1; /* Force the new device class. 
*/ error = devclass_add_device(dc, dev); if (error) break; dev->flags |= DF_FIXEDCLASS; error = device_probe_and_attach(dev); break; } case DEV_CLEAR_DRIVER: if (!(dev->flags & DF_FIXEDCLASS)) { error = 0; break; } if (device_is_attached(dev)) { if (req->dr_flags & DEVF_CLEAR_DRIVER_DETACH) error = device_detach(dev); else error = EBUSY; if (error) break; } dev->flags &= ~DF_FIXEDCLASS; dev->flags |= DF_WILDCARD; devclass_delete_device(dev->devclass, dev); error = device_probe_and_attach(dev); break; case DEV_RESCAN: if (!device_is_attached(dev)) { error = ENXIO; break; } error = BUS_RESCAN(dev); break; case DEV_DELETE: { device_t parent; parent = device_get_parent(dev); if (parent == NULL) { error = EINVAL; break; } if (!(req->dr_flags & DEVF_FORCE_DELETE)) { if (bus_child_present(dev) != 0) { error = EBUSY; break; } } error = device_delete_child(parent, dev); break; } } mtx_unlock(&Giant); return (error); } static struct cdevsw devctl2_cdevsw = { .d_version = D_VERSION, .d_ioctl = devctl2_ioctl, .d_name = "devctl2", }; static void devctl2_init(void) { make_dev_credf(MAKEDEV_ETERNAL, &devctl2_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600, "devctl2"); } #ifdef DDB DB_SHOW_COMMAND(device, db_show_device) { device_t dev; if (!have_addr) return; dev = (device_t)addr; db_printf("name: %s\n", device_get_nameunit(dev)); db_printf(" driver: %s\n", DRIVERNAME(dev->driver)); db_printf(" class: %s\n", DEVCLANAME(dev->devclass)); db_printf(" addr: %p\n", dev); db_printf(" parent: %p\n", dev->parent); db_printf(" softc: %p\n", dev->softc); db_printf(" ivars: %p\n", dev->ivars); } DB_SHOW_ALL_COMMAND(devices, db_show_all_devices) { device_t dev; TAILQ_FOREACH(dev, &bus_data_devices, devlink) { db_show_device((db_expr_t)dev, true, count, modif); } } #endif Index: head/sys/kern/subr_bus_dma.c =================================================================== --- head/sys/kern/subr_bus_dma.c (revision 326270) +++ head/sys/kern/subr_bus_dma.c (revision 326271) @@ -1,569 +1,571 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 EMC Corp. * All rights reserved. * * Copyright (c) 1997, 1998 Justin T. Gibbs. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_bus.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Load up data starting at offset within a region specified by a * list of virtual address ranges until either the length or the region * is exhausted. */ static int _bus_dmamap_load_vlist(bus_dma_tag_t dmat, bus_dmamap_t map, bus_dma_segment_t *list, int sglist_cnt, struct pmap *pmap, int *nsegs, int flags, size_t offset, size_t length) { int error; error = 0; for (; sglist_cnt > 0 && length != 0; sglist_cnt--, list++) { char *addr; size_t ds_len; KASSERT((offset < list->ds_len), ("Invalid mid-segment offset")); addr = (char *)(uintptr_t)list->ds_addr + offset; ds_len = list->ds_len - offset; offset = 0; if (ds_len > length) ds_len = length; length -= ds_len; KASSERT((ds_len != 0), ("Segment length is zero")); error = _bus_dmamap_load_buffer(dmat, map, addr, ds_len, pmap, flags, NULL, nsegs); if (error) break; } return (error); } /* * Load a list of physical addresses. */ static int _bus_dmamap_load_plist(bus_dma_tag_t dmat, bus_dmamap_t map, bus_dma_segment_t *list, int sglist_cnt, int *nsegs, int flags) { int error; error = 0; for (; sglist_cnt > 0; sglist_cnt--, list++) { error = _bus_dmamap_load_phys(dmat, map, (vm_paddr_t)list->ds_addr, list->ds_len, flags, NULL, nsegs); if (error) break; } return (error); } /* * Load an mbuf chain. */ static int _bus_dmamap_load_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map, struct mbuf *m0, bus_dma_segment_t *segs, int *nsegs, int flags) { struct mbuf *m; int error; error = 0; for (m = m0; m != NULL && error == 0; m = m->m_next) { if (m->m_len > 0) { error = _bus_dmamap_load_buffer(dmat, map, m->m_data, m->m_len, kernel_pmap, flags | BUS_DMA_LOAD_MBUF, segs, nsegs); } } CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d", __func__, dmat, flags, error, *nsegs); return (error); } /* * Load from block io. */ static int _bus_dmamap_load_bio(bus_dma_tag_t dmat, bus_dmamap_t map, struct bio *bio, int *nsegs, int flags) { if ((bio->bio_flags & BIO_VLIST) != 0) { bus_dma_segment_t *segs = (bus_dma_segment_t *)bio->bio_data; return (_bus_dmamap_load_vlist(dmat, map, segs, bio->bio_ma_n, kernel_pmap, nsegs, flags, bio->bio_ma_offset, bio->bio_bcount)); } if ((bio->bio_flags & BIO_UNMAPPED) != 0) return (_bus_dmamap_load_ma(dmat, map, bio->bio_ma, bio->bio_bcount, bio->bio_ma_offset, flags, NULL, nsegs)); return (_bus_dmamap_load_buffer(dmat, map, bio->bio_data, bio->bio_bcount, kernel_pmap, flags, NULL, nsegs)); } int bus_dmamap_load_ma_triv(bus_dma_tag_t dmat, bus_dmamap_t map, struct vm_page **ma, bus_size_t tlen, int ma_offs, int flags, bus_dma_segment_t *segs, int *segp) { vm_paddr_t paddr; bus_size_t len; int error, i; error = 0; for (i = 0; tlen > 0; i++, tlen -= len) { len = min(PAGE_SIZE - ma_offs, tlen); paddr = VM_PAGE_TO_PHYS(ma[i]) + ma_offs; error = _bus_dmamap_load_phys(dmat, map, paddr, len, flags, segs, segp); if (error != 0) break; ma_offs = 0; } return (error); } /* * Load a cam control block.
*/ static int _bus_dmamap_load_ccb(bus_dma_tag_t dmat, bus_dmamap_t map, union ccb *ccb, int *nsegs, int flags) { struct ccb_hdr *ccb_h; void *data_ptr; int error; uint32_t dxfer_len; uint16_t sglist_cnt; error = 0; ccb_h = &ccb->ccb_h; switch (ccb_h->func_code) { case XPT_SCSI_IO: { struct ccb_scsiio *csio; csio = &ccb->csio; data_ptr = csio->data_ptr; dxfer_len = csio->dxfer_len; sglist_cnt = csio->sglist_cnt; break; } case XPT_CONT_TARGET_IO: { struct ccb_scsiio *ctio; ctio = &ccb->ctio; data_ptr = ctio->data_ptr; dxfer_len = ctio->dxfer_len; sglist_cnt = ctio->sglist_cnt; break; } case XPT_ATA_IO: { struct ccb_ataio *ataio; ataio = &ccb->ataio; data_ptr = ataio->data_ptr; dxfer_len = ataio->dxfer_len; sglist_cnt = 0; break; } case XPT_NVME_IO: case XPT_NVME_ADMIN: { struct ccb_nvmeio *nvmeio; nvmeio = &ccb->nvmeio; data_ptr = nvmeio->data_ptr; dxfer_len = nvmeio->dxfer_len; sglist_cnt = nvmeio->sglist_cnt; break; } default: panic("_bus_dmamap_load_ccb: Unsupported func code %d", ccb_h->func_code); } switch ((ccb_h->flags & CAM_DATA_MASK)) { case CAM_DATA_VADDR: error = _bus_dmamap_load_buffer(dmat, map, data_ptr, dxfer_len, kernel_pmap, flags, NULL, nsegs); break; case CAM_DATA_PADDR: error = _bus_dmamap_load_phys(dmat, map, (vm_paddr_t)(uintptr_t)data_ptr, dxfer_len, flags, NULL, nsegs); break; case CAM_DATA_SG: error = _bus_dmamap_load_vlist(dmat, map, (bus_dma_segment_t *)data_ptr, sglist_cnt, kernel_pmap, nsegs, flags, 0, dxfer_len); break; case CAM_DATA_SG_PADDR: error = _bus_dmamap_load_plist(dmat, map, (bus_dma_segment_t *)data_ptr, sglist_cnt, nsegs, flags); break; case CAM_DATA_BIO: error = _bus_dmamap_load_bio(dmat, map, (struct bio *)data_ptr, nsegs, flags); break; default: panic("_bus_dmamap_load_ccb: flags 0x%X unimplemented", ccb_h->flags); } return (error); } /* * Load a uio. */ static int _bus_dmamap_load_uio(bus_dma_tag_t dmat, bus_dmamap_t map, struct uio *uio, int *nsegs, int flags) { bus_size_t resid; bus_size_t minlen; struct iovec *iov; pmap_t pmap; caddr_t addr; int error, i; if (uio->uio_segflg == UIO_USERSPACE) { KASSERT(uio->uio_td != NULL, ("bus_dmamap_load_uio: USERSPACE but no proc")); pmap = vmspace_pmap(uio->uio_td->td_proc->p_vmspace); } else pmap = kernel_pmap; resid = uio->uio_resid; iov = uio->uio_iov; error = 0; for (i = 0; i < uio->uio_iovcnt && resid != 0 && !error; i++) { /* * Now at the first iovec to load. Load each iovec * until we have exhausted the residual count. */ addr = (caddr_t) iov[i].iov_base; minlen = resid < iov[i].iov_len ? resid : iov[i].iov_len; if (minlen > 0) { error = _bus_dmamap_load_buffer(dmat, map, addr, minlen, pmap, flags, NULL, nsegs); resid -= minlen; } } return (error); } /* * Map the buffer buf into bus space using the dmamap map. 
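 *
 * A minimal caller sketch (the "mydev" and "sc" names are
 * hypothetical, not part of this file); the callback receives the
 * resulting segment list:
 *
 *	static void
 *	mydev_dma_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
 *	{
 *		bus_addr_t *busaddrp = arg;
 *
 *		if (error == 0 && nseg == 1)
 *			*busaddrp = segs[0].ds_addr;
 *	}
 *
 *	error = bus_dmamap_load(sc->dma_tag, sc->dma_map, buf, buflen,
 *	    mydev_dma_cb, &sc->busaddr, BUS_DMA_NOWAIT);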
*/ int bus_dmamap_load(bus_dma_tag_t dmat, bus_dmamap_t map, void *buf, bus_size_t buflen, bus_dmamap_callback_t *callback, void *callback_arg, int flags) { bus_dma_segment_t *segs; struct memdesc mem; int error; int nsegs; if ((flags & BUS_DMA_NOWAIT) == 0) { mem = memdesc_vaddr(buf, buflen); _bus_dmamap_waitok(dmat, map, &mem, callback, callback_arg); } nsegs = -1; error = _bus_dmamap_load_buffer(dmat, map, buf, buflen, kernel_pmap, flags, NULL, &nsegs); nsegs++; CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d", __func__, dmat, flags, error, nsegs); if (error == EINPROGRESS) return (error); segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error); if (error) (*callback)(callback_arg, segs, 0, error); else (*callback)(callback_arg, segs, nsegs, 0); /* * Return ENOMEM to the caller so that it can pass it up the stack. * This error only happens when NOWAIT is set, so deferral is disabled. */ if (error == ENOMEM) return (error); return (0); } int bus_dmamap_load_mbuf(bus_dma_tag_t dmat, bus_dmamap_t map, struct mbuf *m0, bus_dmamap_callback2_t *callback, void *callback_arg, int flags) { bus_dma_segment_t *segs; int nsegs, error; M_ASSERTPKTHDR(m0); flags |= BUS_DMA_NOWAIT; nsegs = -1; error = _bus_dmamap_load_mbuf_sg(dmat, map, m0, NULL, &nsegs, flags); ++nsegs; segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error); if (error) (*callback)(callback_arg, segs, 0, 0, error); else (*callback)(callback_arg, segs, nsegs, m0->m_pkthdr.len, error); CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d", __func__, dmat, flags, error, nsegs); return (error); } int bus_dmamap_load_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map, struct mbuf *m0, bus_dma_segment_t *segs, int *nsegs, int flags) { int error; flags |= BUS_DMA_NOWAIT; *nsegs = -1; error = _bus_dmamap_load_mbuf_sg(dmat, map, m0, segs, nsegs, flags); ++*nsegs; _bus_dmamap_complete(dmat, map, segs, *nsegs, error); return (error); } int bus_dmamap_load_uio(bus_dma_tag_t dmat, bus_dmamap_t map, struct uio *uio, bus_dmamap_callback2_t *callback, void *callback_arg, int flags) { bus_dma_segment_t *segs; int nsegs, error; flags |= BUS_DMA_NOWAIT; nsegs = -1; error = _bus_dmamap_load_uio(dmat, map, uio, &nsegs, flags); nsegs++; segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error); if (error) (*callback)(callback_arg, segs, 0, 0, error); else (*callback)(callback_arg, segs, nsegs, uio->uio_resid, error); CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d", __func__, dmat, flags, error, nsegs); return (error); } int bus_dmamap_load_ccb(bus_dma_tag_t dmat, bus_dmamap_t map, union ccb *ccb, bus_dmamap_callback_t *callback, void *callback_arg, int flags) { bus_dma_segment_t *segs; struct ccb_hdr *ccb_h; struct memdesc mem; int error; int nsegs; ccb_h = &ccb->ccb_h; if ((ccb_h->flags & CAM_DIR_MASK) == CAM_DIR_NONE) { callback(callback_arg, NULL, 0, 0); return (0); } if ((flags & BUS_DMA_NOWAIT) == 0) { mem = memdesc_ccb(ccb); _bus_dmamap_waitok(dmat, map, &mem, callback, callback_arg); } nsegs = -1; error = _bus_dmamap_load_ccb(dmat, map, ccb, &nsegs, flags); nsegs++; CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d", __func__, dmat, flags, error, nsegs); if (error == EINPROGRESS) return (error); segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error); if (error) (*callback)(callback_arg, segs, 0, error); else (*callback)(callback_arg, segs, nsegs, error); /* * Return ENOMEM to the caller so that it can pass it up the stack. * This error only happens when NOWAIT is set, so deferral is disabled. 
*/ if (error == ENOMEM) return (error); return (0); } int bus_dmamap_load_bio(bus_dma_tag_t dmat, bus_dmamap_t map, struct bio *bio, bus_dmamap_callback_t *callback, void *callback_arg, int flags) { bus_dma_segment_t *segs; struct memdesc mem; int error; int nsegs; if ((flags & BUS_DMA_NOWAIT) == 0) { mem = memdesc_bio(bio); _bus_dmamap_waitok(dmat, map, &mem, callback, callback_arg); } nsegs = -1; error = _bus_dmamap_load_bio(dmat, map, bio, &nsegs, flags); nsegs++; CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d", __func__, dmat, flags, error, nsegs); if (error == EINPROGRESS) return (error); segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error); if (error) (*callback)(callback_arg, segs, 0, error); else (*callback)(callback_arg, segs, nsegs, error); /* * Return ENOMEM to the caller so that it can pass it up the stack. * This error only happens when NOWAIT is set, so deferral is disabled. */ if (error == ENOMEM) return (error); return (0); } int bus_dmamap_load_mem(bus_dma_tag_t dmat, bus_dmamap_t map, struct memdesc *mem, bus_dmamap_callback_t *callback, void *callback_arg, int flags) { bus_dma_segment_t *segs; int error; int nsegs; if ((flags & BUS_DMA_NOWAIT) == 0) _bus_dmamap_waitok(dmat, map, mem, callback, callback_arg); nsegs = -1; error = 0; switch (mem->md_type) { case MEMDESC_VADDR: error = _bus_dmamap_load_buffer(dmat, map, mem->u.md_vaddr, mem->md_opaque, kernel_pmap, flags, NULL, &nsegs); break; case MEMDESC_PADDR: error = _bus_dmamap_load_phys(dmat, map, mem->u.md_paddr, mem->md_opaque, flags, NULL, &nsegs); break; case MEMDESC_VLIST: error = _bus_dmamap_load_vlist(dmat, map, mem->u.md_list, mem->md_opaque, kernel_pmap, &nsegs, flags, 0, SIZE_T_MAX); break; case MEMDESC_PLIST: error = _bus_dmamap_load_plist(dmat, map, mem->u.md_list, mem->md_opaque, &nsegs, flags); break; case MEMDESC_BIO: error = _bus_dmamap_load_bio(dmat, map, mem->u.md_bio, &nsegs, flags); break; case MEMDESC_UIO: error = _bus_dmamap_load_uio(dmat, map, mem->u.md_uio, &nsegs, flags); break; case MEMDESC_MBUF: error = _bus_dmamap_load_mbuf_sg(dmat, map, mem->u.md_mbuf, NULL, &nsegs, flags); break; case MEMDESC_CCB: error = _bus_dmamap_load_ccb(dmat, map, mem->u.md_ccb, &nsegs, flags); break; } nsegs++; CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d", __func__, dmat, flags, error, nsegs); if (error == EINPROGRESS) return (error); segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error); if (error) (*callback)(callback_arg, segs, 0, error); else (*callback)(callback_arg, segs, nsegs, 0); /* * Return ENOMEM to the caller so that it can pass it up the stack. * This error only happens when NOWAIT is set, so deferral is disabled. */ if (error == ENOMEM) return (error); return (0); } Index: head/sys/kern/subr_busdma_bufalloc.c =================================================================== --- head/sys/kern/subr_busdma_bufalloc.c (revision 326270) +++ head/sys/kern/subr_busdma_bufalloc.c (revision 326271) @@ -1,174 +1,176 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 Ian Lepore * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Buffer allocation support routines for bus_dmamem_alloc implementations. */ #include #include #include #include #include #include #include #include #include /* * We manage buffer zones up to a page in size. Buffers larger than a page can * be managed by one of the kernel's page-oriented memory allocation routines as * efficiently as what we can do here. Also, a page is the largest size for * which we can guarantee contiguity when using uma, and contiguity is one of the * requirements we have to fulfill. */ #define MIN_ZONE_BUFSIZE 32 #define MAX_ZONE_BUFSIZE PAGE_SIZE /* * The static array of 12 bufzones is big enough to handle all the zones for the * smallest supported allocation size of 32 through the largest supported page * size of 64K. If you up the biggest page size number, up the array size too. * Basically the size of the array needs to be log2(maxsize)-log2(minsize)+1, * but I don't know of an easy way to express that as a compile-time constant. */ #if PAGE_SIZE > 65536 #error Unsupported page size #endif struct busdma_bufalloc { bus_size_t min_size; size_t num_zones; struct busdma_bufzone buf_zones[12]; }; busdma_bufalloc_t busdma_bufalloc_create(const char *name, bus_size_t minimum_alignment, uma_alloc alloc_func, uma_free free_func, u_int32_t zcreate_flags) { struct busdma_bufalloc *ba; struct busdma_bufzone *bz; int i; bus_size_t cursize; ba = malloc(sizeof(struct busdma_bufalloc), M_DEVBUF, M_ZERO | M_WAITOK); ba->min_size = MAX(MIN_ZONE_BUFSIZE, minimum_alignment); /* * Each uma zone is created with an alignment of size-1, meaning that * the alignment is equal to the size (i.e., 64 byte buffers are aligned * to 64 byte boundaries, etc). This allows for a fast, efficient test * when deciding whether a pool buffer meets the constraints of a given * tag used for allocation: the buffer is usable if tag->alignment <= * bufzone->size.
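	 *
	 * For example, the 64-byte zone is created with a uma
	 * alignment mask of 63, so every buffer it returns sits on a
	 * 64-byte boundary; a tag requiring 16- or 32-byte alignment
	 * can draw from it, while a tag requiring 128-byte alignment
	 * must be satisfied from a larger zone.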
*/ for (i = 0, bz = ba->buf_zones, cursize = ba->min_size; i < nitems(ba->buf_zones) && cursize <= MAX_ZONE_BUFSIZE; ++i, ++bz, cursize <<= 1) { snprintf(bz->name, sizeof(bz->name), "dma %.10s %ju", name, (uintmax_t)cursize); bz->size = cursize; bz->umazone = uma_zcreate(bz->name, bz->size, NULL, NULL, NULL, NULL, bz->size - 1, zcreate_flags); if (bz->umazone == NULL) { busdma_bufalloc_destroy(ba); return (NULL); } if (alloc_func != NULL) uma_zone_set_allocf(bz->umazone, alloc_func); if (free_func != NULL) uma_zone_set_freef(bz->umazone, free_func); ++ba->num_zones; } return (ba); } void busdma_bufalloc_destroy(busdma_bufalloc_t ba) { struct busdma_bufzone *bz; int i; if (ba == NULL) return; for (i = 0, bz = ba->buf_zones; i < ba->num_zones; ++i, ++bz) { uma_zdestroy(bz->umazone); } free(ba, M_DEVBUF); } struct busdma_bufzone * busdma_bufalloc_findzone(busdma_bufalloc_t ba, bus_size_t size) { struct busdma_bufzone *bz; int i; if (size > MAX_ZONE_BUFSIZE) return (NULL); for (i = 0, bz = ba->buf_zones; i < ba->num_zones; ++i, ++bz) { if (bz->size >= size) return (bz); } panic("Didn't find a buffer zone of the right size"); } void * busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, vm_size_t size, uint8_t *pflag, int wait) { #ifdef VM_MEMATTR_UNCACHEABLE /* Inform UMA that this allocator uses kernel_arena/object. */ *pflag = UMA_SLAB_KERNEL; return ((void *)kmem_alloc_attr(kernel_arena, size, wait, 0, BUS_SPACE_MAXADDR, VM_MEMATTR_UNCACHEABLE)); #else panic("VM_MEMATTR_UNCACHEABLE unavailable"); #endif /* VM_MEMATTR_UNCACHEABLE */ } void busdma_bufalloc_free_uncacheable(void *item, vm_size_t size, uint8_t pflag) { kmem_free(kernel_arena, (vm_offset_t)item, size); } Index: head/sys/kern/subr_capability.c =================================================================== --- head/sys/kern/subr_capability.c (revision 326270) +++ head/sys/kern/subr_capability.c (revision 326271) @@ -1,307 +1,309 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 FreeBSD Foundation * All rights reserved. * * This software was developed by Pawel Jakub Dawidek under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Note that this file is compiled into the kernel and into libc. 
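 * (Hence the assert() definition just below: it becomes KASSERT() in
 * kernel builds, while userland builds into libc get the ordinary
 * assert(3).)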
*/ #include #include #ifdef _KERNEL #include #include #else /* !_KERNEL */ #include #include #include #include #include #endif #ifdef _KERNEL #define assert(exp) KASSERT((exp), ("%s:%u", __func__, __LINE__)) #endif #define CAPARSIZE_MIN (CAP_RIGHTS_VERSION_00 + 2) #define CAPARSIZE_MAX (CAP_RIGHTS_VERSION + 2) static __inline int right_to_index(uint64_t right) { static const int bit2idx[] = { -1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; int idx; idx = CAPIDXBIT(right); assert(idx >= 0 && idx < sizeof(bit2idx) / sizeof(bit2idx[0])); return (bit2idx[idx]); } static void cap_rights_vset(cap_rights_t *rights, va_list ap) { uint64_t right; int i, n; assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00); n = CAPARSIZE(rights); assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX); for (;;) { right = (uint64_t)va_arg(ap, unsigned long long); if (right == 0) break; assert(CAPRVER(right) == 0); i = right_to_index(right); assert(i >= 0); assert(i < n); assert(CAPIDXBIT(rights->cr_rights[i]) == CAPIDXBIT(right)); rights->cr_rights[i] |= right; assert(CAPIDXBIT(rights->cr_rights[i]) == CAPIDXBIT(right)); } } static void cap_rights_vclear(cap_rights_t *rights, va_list ap) { uint64_t right; int i, n; assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00); n = CAPARSIZE(rights); assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX); for (;;) { right = (uint64_t)va_arg(ap, unsigned long long); if (right == 0) break; assert(CAPRVER(right) == 0); i = right_to_index(right); assert(i >= 0); assert(i < n); assert(CAPIDXBIT(rights->cr_rights[i]) == CAPIDXBIT(right)); rights->cr_rights[i] &= ~(right & 0x01FFFFFFFFFFFFFFULL); assert(CAPIDXBIT(rights->cr_rights[i]) == CAPIDXBIT(right)); } } static bool cap_rights_is_vset(const cap_rights_t *rights, va_list ap) { uint64_t right; int i, n; assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00); n = CAPARSIZE(rights); assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX); for (;;) { right = (uint64_t)va_arg(ap, unsigned long long); if (right == 0) break; assert(CAPRVER(right) == 0); i = right_to_index(right); assert(i >= 0); assert(i < n); assert(CAPIDXBIT(rights->cr_rights[i]) == CAPIDXBIT(right)); if ((rights->cr_rights[i] & right) != right) return (false); } return (true); } cap_rights_t * __cap_rights_init(int version, cap_rights_t *rights, ...) { unsigned int n; va_list ap; assert(version == CAP_RIGHTS_VERSION_00); n = version + 2; assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX); CAP_NONE(rights); va_start(ap, rights); cap_rights_vset(rights, ap); va_end(ap); return (rights); } cap_rights_t * __cap_rights_set(cap_rights_t *rights, ...) { va_list ap; assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00); va_start(ap, rights); cap_rights_vset(rights, ap); va_end(ap); return (rights); } cap_rights_t * __cap_rights_clear(cap_rights_t *rights, ...) { va_list ap; assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00); va_start(ap, rights); cap_rights_vclear(rights, ap); va_end(ap); return (rights); } bool __cap_rights_is_set(const cap_rights_t *rights, ...) 
{ va_list ap; bool ret; assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00); va_start(ap, rights); ret = cap_rights_is_vset(rights, ap); va_end(ap); return (ret); } bool cap_rights_is_valid(const cap_rights_t *rights) { cap_rights_t allrights; int i, j; if (CAPVER(rights) != CAP_RIGHTS_VERSION_00) return (false); if (CAPARSIZE(rights) < CAPARSIZE_MIN || CAPARSIZE(rights) > CAPARSIZE_MAX) { return (false); } CAP_ALL(&allrights); if (!cap_rights_contains(&allrights, rights)) return (false); for (i = 0; i < CAPARSIZE(rights); i++) { j = right_to_index(rights->cr_rights[i]); if (i != j) return (false); if (i > 0) { if (CAPRVER(rights->cr_rights[i]) != 0) return (false); } } return (true); } cap_rights_t * cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src) { unsigned int i, n; assert(CAPVER(dst) == CAP_RIGHTS_VERSION_00); assert(CAPVER(src) == CAP_RIGHTS_VERSION_00); assert(CAPVER(dst) == CAPVER(src)); assert(cap_rights_is_valid(src)); assert(cap_rights_is_valid(dst)); n = CAPARSIZE(dst); assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX); for (i = 0; i < n; i++) dst->cr_rights[i] |= src->cr_rights[i]; assert(cap_rights_is_valid(src)); assert(cap_rights_is_valid(dst)); return (dst); } cap_rights_t * cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src) { unsigned int i, n; assert(CAPVER(dst) == CAP_RIGHTS_VERSION_00); assert(CAPVER(src) == CAP_RIGHTS_VERSION_00); assert(CAPVER(dst) == CAPVER(src)); assert(cap_rights_is_valid(src)); assert(cap_rights_is_valid(dst)); n = CAPARSIZE(dst); assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX); for (i = 0; i < n; i++) { dst->cr_rights[i] &= ~(src->cr_rights[i] & 0x01FFFFFFFFFFFFFFULL); } assert(cap_rights_is_valid(src)); assert(cap_rights_is_valid(dst)); return (dst); } bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little) { unsigned int i, n; assert(CAPVER(big) == CAP_RIGHTS_VERSION_00); assert(CAPVER(little) == CAP_RIGHTS_VERSION_00); assert(CAPVER(big) == CAPVER(little)); n = CAPARSIZE(big); assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX); for (i = 0; i < n; i++) { if ((big->cr_rights[i] & little->cr_rights[i]) != little->cr_rights[i]) { return (false); } } return (true); } Index: head/sys/kern/subr_counter.c =================================================================== --- head/sys/kern/subr_counter.c (revision 326270) +++ head/sys/kern/subr_counter.c (revision 326271) @@ -1,177 +1,179 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 Gleb Smirnoff * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #define IN_SUBR_COUNTER_C #include void counter_u64_zero(counter_u64_t c) { counter_u64_zero_inline(c); } uint64_t counter_u64_fetch(counter_u64_t c) { return (counter_u64_fetch_inline(c)); } counter_u64_t counter_u64_alloc(int flags) { counter_u64_t r; r = uma_zalloc(pcpu_zone_64, flags); if (r != NULL) counter_u64_zero(r); return (r); } void counter_u64_free(counter_u64_t c) { uma_zfree(pcpu_zone_64, c); } int sysctl_handle_counter_u64(SYSCTL_HANDLER_ARGS) { uint64_t out; int error; out = counter_u64_fetch(*(counter_u64_t *)arg1); error = SYSCTL_OUT(req, &out, sizeof(uint64_t)); if (error || !req->newptr) return (error); /* * Any write attempt to a counter zeroes it. */ counter_u64_zero(*(counter_u64_t *)arg1); return (0); } int sysctl_handle_counter_u64_array(SYSCTL_HANDLER_ARGS) { uint64_t *out; int error; out = malloc(arg2 * sizeof(uint64_t), M_TEMP, M_WAITOK); for (int i = 0; i < arg2; i++) out[i] = counter_u64_fetch(((counter_u64_t *)arg1)[i]); error = SYSCTL_OUT(req, out, arg2 * sizeof(uint64_t)); free(out, M_TEMP); if (error || !req->newptr) return (error); /* * Any write attempt to a counter zeroes it. */ for (int i = 0; i < arg2; i++) counter_u64_zero(((counter_u64_t *)arg1)[i]); return (0); } /* * MP-friendly version of ppsratecheck(). * * Returns non-negative if we are in the rate, negative otherwise. * 0 - rate limit not reached. * -1 - rate limit reached. * >0 - rate limit was reached before, and was just reset. The return value * is number of events since last reset. */ int64_t counter_ratecheck(struct counter_rate *cr, int64_t limit) { int64_t val; int now; val = cr->cr_over; now = ticks; if (now - cr->cr_ticks >= hz) { /* * Time to clear the structure, we are in the next second. * First try unlocked read, and then proceed with atomic. */ if ((cr->cr_lock == 0) && atomic_cmpset_acq_int(&cr->cr_lock, 0, 1)) { /* * Check if another thread has just gone through the * reset sequence before us. */ if (now - cr->cr_ticks >= hz) { val = counter_u64_fetch(cr->cr_rate); counter_u64_zero(cr->cr_rate); cr->cr_over = 0; cr->cr_ticks = now; if (val <= limit) val = 0; } atomic_store_rel_int(&cr->cr_lock, 0); } else /* * We failed to lock; in this case another thread may * be running counter_u64_zero(), so it is not safe * to do an update and we skip it. */ return (val); } counter_u64_add(cr->cr_rate, 1); if (cr->cr_over != 0) return (-1); if (counter_u64_fetch(cr->cr_rate) > limit) val = cr->cr_over = -1; return (val); } Index: head/sys/kern/subr_devstat.c =================================================================== --- head/sys/kern/subr_devstat.c (revision 326270) +++ head/sys/kern/subr_devstat.c (revision 326271) @@ -1,580 +1,582 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 1997, 1998, 1999 Kenneth D. Merry. * All rights reserved.
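A short consumer sketch for the counter_ratecheck() routine above may help; this is a hedged illustration only (the event source, the limit of 100 events per second, and the init-time allocation of cr_rate with counter_u64_alloc() are assumptions, not code from this file):

	static struct counter_rate my_rate;	/* cr_rate allocated at init */

	static void
	my_event(void)
	{
		/* A negative return means the per-second limit was hit. */
		if (counter_ratecheck(&my_rate, 100) < 0)
			return;
		/* ... handle the event ... */
	}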
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include SDT_PROVIDER_DEFINE(io); SDT_PROBE_DEFINE2(io, , , start, "struct bio *", "struct devstat *"); SDT_PROBE_DEFINE2(io, , , done, "struct bio *", "struct devstat *"); SDT_PROBE_DEFINE2(io, , , wait__start, "struct bio *", "struct devstat *"); SDT_PROBE_DEFINE2(io, , , wait__done, "struct bio *", "struct devstat *"); #define DTRACE_DEVSTAT_START() SDT_PROBE2(io, , , start, NULL, ds) #define DTRACE_DEVSTAT_BIO_START() SDT_PROBE2(io, , , start, bp, ds) #define DTRACE_DEVSTAT_DONE() SDT_PROBE2(io, , , done, NULL, ds) #define DTRACE_DEVSTAT_BIO_DONE() SDT_PROBE2(io, , , done, bp, ds) #define DTRACE_DEVSTAT_WAIT_START() SDT_PROBE2(io, , , wait__start, NULL, ds) #define DTRACE_DEVSTAT_WAIT_DONE() SDT_PROBE2(io, , , wait__done, NULL, ds) static int devstat_num_devs; static long devstat_generation = 1; static int devstat_version = DEVSTAT_VERSION; static int devstat_current_devnumber; static struct mtx devstat_mutex; MTX_SYSINIT(devstat_mutex, &devstat_mutex, "devstat", MTX_DEF); static struct devstatlist device_statq = STAILQ_HEAD_INITIALIZER(device_statq); static struct devstat *devstat_alloc(void); static void devstat_free(struct devstat *); static void devstat_add_entry(struct devstat *ds, const void *dev_name, int unit_number, uint32_t block_size, devstat_support_flags flags, devstat_type_flags device_type, devstat_priority priority); /* * Allocate a devstat and initialize it */ struct devstat * devstat_new_entry(const void *dev_name, int unit_number, uint32_t block_size, devstat_support_flags flags, devstat_type_flags device_type, devstat_priority priority) { struct devstat *ds; mtx_assert(&devstat_mutex, MA_NOTOWNED); ds = devstat_alloc(); mtx_lock(&devstat_mutex); if (unit_number == -1) { ds->unit_number = unit_number; ds->id = dev_name; binuptime(&ds->creation_time); devstat_generation++; } else { devstat_add_entry(ds, dev_name, unit_number, block_size, flags, device_type, priority); } mtx_unlock(&devstat_mutex); return (ds); } /* * Take a malloced and zeroed 
devstat structure given to us, fill it in * and add it to the queue of devices. */ static void devstat_add_entry(struct devstat *ds, const void *dev_name, int unit_number, uint32_t block_size, devstat_support_flags flags, devstat_type_flags device_type, devstat_priority priority) { struct devstatlist *devstat_head; struct devstat *ds_tmp; mtx_assert(&devstat_mutex, MA_OWNED); devstat_num_devs++; devstat_head = &device_statq; /* * Priority sort. Each driver passes in its priority when it adds * its devstat entry. Drivers are sorted first by priority, and * then by probe order. * * For the first device, we just insert it, since the priority * doesn't really matter yet. Subsequent devices are inserted into * the list using the order outlined above. */ if (devstat_num_devs == 1) STAILQ_INSERT_TAIL(devstat_head, ds, dev_links); else { STAILQ_FOREACH(ds_tmp, devstat_head, dev_links) { struct devstat *ds_next; ds_next = STAILQ_NEXT(ds_tmp, dev_links); /* * If we find a break between higher and lower * priority items, and if this item fits in the * break, insert it. This also applies if the * "lower priority item" is the end of the list. */ if ((priority <= ds_tmp->priority) && ((ds_next == NULL) || (priority > ds_next->priority))) { STAILQ_INSERT_AFTER(devstat_head, ds_tmp, ds, dev_links); break; } else if (priority > ds_tmp->priority) { /* * If this is the case, we should be able * to insert ourselves at the head of the * list. If we can't, something is wrong. */ if (ds_tmp == STAILQ_FIRST(devstat_head)) { STAILQ_INSERT_HEAD(devstat_head, ds, dev_links); break; } else { STAILQ_INSERT_TAIL(devstat_head, ds, dev_links); printf("devstat_add_entry: HELP! " "sorting problem detected " "for name %p unit %d\n", dev_name, unit_number); break; } } } } ds->device_number = devstat_current_devnumber++; ds->unit_number = unit_number; strlcpy(ds->device_name, dev_name, DEVSTAT_NAME_LEN); ds->block_size = block_size; ds->flags = flags; ds->device_type = device_type; ds->priority = priority; binuptime(&ds->creation_time); devstat_generation++; } /* * Remove a devstat structure from the list of devices. */ void devstat_remove_entry(struct devstat *ds) { struct devstatlist *devstat_head; mtx_assert(&devstat_mutex, MA_NOTOWNED); if (ds == NULL) return; mtx_lock(&devstat_mutex); devstat_head = &device_statq; /* Remove this entry from the devstat queue */ atomic_add_acq_int(&ds->sequence1, 1); if (ds->unit_number != -1) { devstat_num_devs--; STAILQ_REMOVE(devstat_head, ds, devstat, dev_links); } devstat_free(ds); devstat_generation++; mtx_unlock(&devstat_mutex); } /* * Record a transaction start. * * See comments for devstat_end_transaction(). Ordering is very important * here. */ void devstat_start_transaction(struct devstat *ds, struct bintime *now) { mtx_assert(&devstat_mutex, MA_NOTOWNED); /* sanity check */ if (ds == NULL) return; atomic_add_acq_int(&ds->sequence1, 1); /* * We only want to set the start time when we are going from idle * to busy. The start time is really the start of the latest busy * period. 
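 * (Illustrative timeline, not part of the original comment: if I/O A is
 * started at t=0 and completed at t=4 while I/O B runs from t=1 to t=3,
 * busy_from is set only at t=0.  Each completion accumulates the time
 * since busy_from and then advances it, so busy_time grows by 3 at t=3
 * and by 1 at t=4: four time units of busy time in total, not six.)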
*/ if (ds->start_count == ds->end_count) { if (now != NULL) ds->busy_from = *now; else binuptime(&ds->busy_from); } ds->start_count++; atomic_add_rel_int(&ds->sequence0, 1); DTRACE_DEVSTAT_START(); } void devstat_start_transaction_bio(struct devstat *ds, struct bio *bp) { mtx_assert(&devstat_mutex, MA_NOTOWNED); /* sanity check */ if (ds == NULL) return; binuptime(&bp->bio_t0); devstat_start_transaction(ds, &bp->bio_t0); DTRACE_DEVSTAT_BIO_START(); } /* * Record the ending of a transaction, and increment the various counters. * * Ordering in this function, and in devstat_start_transaction(), is VERY * important. The idea here is to run without locks, so we are very * careful to only modify some fields on the way "down" (i.e. at * transaction start) and some fields on the way "up" (i.e. at transaction * completion). One exception is busy_from, which we only modify in * devstat_start_transaction() when there are no outstanding transactions, * and thus it can't be modified in devstat_end_transaction() * simultaneously. * * The sequence0 and sequence1 fields are provided to enable an application * spying on the structures with mmap(2) to tell when a structure is in a * consistent state or not. * * For this to work 100% reliably, it is important that the two fields * are at opposite ends of the structure and that they are incremented * in the opposite order of how a memcpy(3) in userland would copy them. * We assume that the copying happens front to back, but there is actually * no way short of writing your own memcpy(3) replacement to guarantee * this will be the case. * * In addition to this, being a kind of lock, they must be updated with * atomic instructions using appropriate memory barriers. */ void devstat_end_transaction(struct devstat *ds, uint32_t bytes, devstat_tag_type tag_type, devstat_trans_flags flags, struct bintime *now, struct bintime *then) { struct bintime dt, lnow; /* sanity check */ if (ds == NULL) return; if (now == NULL) { now = &lnow; binuptime(now); } atomic_add_acq_int(&ds->sequence1, 1); /* Update byte and operations counts */ ds->bytes[flags] += bytes; ds->operations[flags]++; /* * Keep a count of the various tag types sent. */ if ((ds->flags & DEVSTAT_NO_ORDERED_TAGS) == 0 && tag_type != DEVSTAT_TAG_NONE) ds->tag_types[tag_type]++; if (then != NULL) { /* Update duration of operations */ dt = *now; bintime_sub(&dt, then); bintime_add(&ds->duration[flags], &dt); } /* Accumulate busy time */ dt = *now; bintime_sub(&dt, &ds->busy_from); bintime_add(&ds->busy_time, &dt); ds->busy_from = *now; ds->end_count++; atomic_add_rel_int(&ds->sequence0, 1); DTRACE_DEVSTAT_DONE(); } void devstat_end_transaction_bio(struct devstat *ds, struct bio *bp) { devstat_end_transaction_bio_bt(ds, bp, NULL); } void devstat_end_transaction_bio_bt(struct devstat *ds, struct bio *bp, struct bintime *now) { devstat_trans_flags flg; /* sanity check */ if (ds == NULL) return; if (bp->bio_cmd == BIO_DELETE) flg = DEVSTAT_FREE; else if ((bp->bio_cmd == BIO_READ) || ((bp->bio_cmd == BIO_ZONE) && (bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES))) flg = DEVSTAT_READ; else if (bp->bio_cmd == BIO_WRITE) flg = DEVSTAT_WRITE; else flg = DEVSTAT_NO_DATA; devstat_end_transaction(ds, bp->bio_bcount - bp->bio_resid, DEVSTAT_TAG_SIMPLE, flg, now, &bp->bio_t0); DTRACE_DEVSTAT_BIO_DONE(); } /* * This is the sysctl handler for the devstat package.
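 * (Hedged aside on the sequence0/sequence1 scheme described above, not
 * code from this file: a userland reader of the mmap(2)-exported
 * structures can simply retry until it observes a stable copy, e.g.
 *
 *	struct devstat snap;
 *
 *	do {
 *		memcpy(&snap, mapped_ds, sizeof(snap));
 *	} while (snap.sequence0 != snap.sequence1);
 *
 * where mapped_ds is an assumed pointer into the mapped statistics page.)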
The data pushed out * on the kern.devstat.all sysctl variable consists of the current devstat * generation number, and then an array of devstat structures, one for each * device in the system. * * This is more cryptic than obvious, but basically we neither can nor * want to hold the devstat_mutex for any amount of time, so we grab it * only when we need to and keep an eye on devstat_generation all the time. */ static int sysctl_devstat(SYSCTL_HANDLER_ARGS) { int error; long mygen; struct devstat *nds; mtx_assert(&devstat_mutex, MA_NOTOWNED); /* * XXX devstat_generation should really be "volatile" but that * XXX freaks out the sysctl macro below. The places where we * XXX change it and inspect it are bracketed in the mutex which * XXX guarantees us proper write barriers. I don't believe the * XXX compiler is allowed to optimize mygen away across calls * XXX to other functions, so the following is believed to be safe. */ mygen = devstat_generation; error = SYSCTL_OUT(req, &mygen, sizeof(mygen)); if (devstat_num_devs == 0) return(0); if (error != 0) return (error); mtx_lock(&devstat_mutex); nds = STAILQ_FIRST(&device_statq); if (mygen != devstat_generation) error = EBUSY; mtx_unlock(&devstat_mutex); if (error != 0) return (error); for (;nds != NULL;) { error = SYSCTL_OUT(req, nds, sizeof(struct devstat)); if (error != 0) return (error); mtx_lock(&devstat_mutex); if (mygen != devstat_generation) error = EBUSY; else nds = STAILQ_NEXT(nds, dev_links); mtx_unlock(&devstat_mutex); if (error != 0) return (error); } return(error); } /* * Sysctl entries for devstat. The first one is a node that all the rest * hang off of. */ static SYSCTL_NODE(_kern, OID_AUTO, devstat, CTLFLAG_RD, NULL, "Device Statistics"); SYSCTL_PROC(_kern_devstat, OID_AUTO, all, CTLFLAG_RD|CTLTYPE_OPAQUE, NULL, 0, sysctl_devstat, "S,devstat", "All devices in the devstat list"); /* * Export the number of devices in the system so that userland utilities * can determine how much memory to allocate to hold all the devices. */ SYSCTL_INT(_kern_devstat, OID_AUTO, numdevs, CTLFLAG_RD, &devstat_num_devs, 0, "Number of devices in the devstat list"); SYSCTL_LONG(_kern_devstat, OID_AUTO, generation, CTLFLAG_RD, &devstat_generation, 0, "Devstat list generation"); SYSCTL_INT(_kern_devstat, OID_AUTO, version, CTLFLAG_RD, &devstat_version, 0, "Devstat list version number"); /* * Allocator for struct devstat structures. We sub-allocate these from pages * which we get from malloc.
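 * (Worked example with assumed numbers, not taken from this file: if
 * PAGE_SIZE is 4096 and sizeof(struct devstat) happened to be 288,
 * statsperpage below would be 4096 / 288 = 14, leaving the final 64
 * bytes of each page unused.)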
These pages are exported for mmap(2)'ing through * a miniature device driver. */ #define statsperpage (PAGE_SIZE / sizeof(struct devstat)) static d_mmap_t devstat_mmap; static struct cdevsw devstat_cdevsw = { .d_version = D_VERSION, .d_mmap = devstat_mmap, .d_name = "devstat", }; struct statspage { TAILQ_ENTRY(statspage) list; struct devstat *stat; u_int nfree; }; static TAILQ_HEAD(, statspage) pagelist = TAILQ_HEAD_INITIALIZER(pagelist); static MALLOC_DEFINE(M_DEVSTAT, "devstat", "Device statistics"); static int devstat_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, int nprot, vm_memattr_t *memattr) { struct statspage *spp; if (nprot != VM_PROT_READ) return (-1); mtx_lock(&devstat_mutex); TAILQ_FOREACH(spp, &pagelist, list) { if (offset == 0) { *paddr = vtophys(spp->stat); mtx_unlock(&devstat_mutex); return (0); } offset -= PAGE_SIZE; } mtx_unlock(&devstat_mutex); return (-1); } static struct devstat * devstat_alloc(void) { struct devstat *dsp; struct statspage *spp, *spp2; u_int u; static int once; mtx_assert(&devstat_mutex, MA_NOTOWNED); if (!once) { make_dev_credf(MAKEDEV_ETERNAL | MAKEDEV_CHECKNAME, &devstat_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0444, DEVSTAT_DEVICE_NAME); once = 1; } spp2 = NULL; mtx_lock(&devstat_mutex); for (;;) { TAILQ_FOREACH(spp, &pagelist, list) { if (spp->nfree > 0) break; } if (spp != NULL) break; mtx_unlock(&devstat_mutex); spp2 = malloc(sizeof *spp, M_DEVSTAT, M_ZERO | M_WAITOK); spp2->stat = malloc(PAGE_SIZE, M_DEVSTAT, M_ZERO | M_WAITOK); spp2->nfree = statsperpage; /* * If free statspages were added while the lock was released * just reuse them. */ mtx_lock(&devstat_mutex); TAILQ_FOREACH(spp, &pagelist, list) if (spp->nfree > 0) break; if (spp == NULL) { spp = spp2; /* * It would make more sense to add the new page at the * head, but the order of the list determines the * sequence of the mapping, so we can't do that. */ TAILQ_INSERT_TAIL(&pagelist, spp, list); } else break; } dsp = spp->stat; for (u = 0; u < statsperpage; u++) { if (dsp->allocated == 0) break; dsp++; } spp->nfree--; dsp->allocated = 1; mtx_unlock(&devstat_mutex); if (spp2 != NULL && spp2 != spp) { free(spp2->stat, M_DEVSTAT); free(spp2, M_DEVSTAT); } return (dsp); } static void devstat_free(struct devstat *dsp) { struct statspage *spp; mtx_assert(&devstat_mutex, MA_OWNED); bzero(dsp, sizeof *dsp); TAILQ_FOREACH(spp, &pagelist, list) { if (dsp >= spp->stat && dsp < (spp->stat + statsperpage)) { spp->nfree++; return; } } } SYSCTL_INT(_debug_sizeof, OID_AUTO, devstat, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct devstat), "sizeof(struct devstat)"); Index: head/sys/kern/subr_dummy_vdso_tc.c =================================================================== --- head/sys/kern/subr_dummy_vdso_tc.c (revision 326270) +++ head/sys/kern/subr_dummy_vdso_tc.c (revision 326271) @@ -1,50 +1,52 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright 2012 Konstantin Belousov . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include uint32_t cpu_fill_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc) { return (0); } #ifdef COMPAT_FREEBSD32 uint32_t cpu_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32, struct timecounter *tc) { return (0); } #endif Index: head/sys/kern/subr_eventhandler.c =================================================================== --- head/sys/kern/subr_eventhandler.c (revision 326270) +++ head/sys/kern/subr_eventhandler.c (revision 326271) @@ -1,315 +1,317 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1999 Michael Smith * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_EVENTHANDLER, "eventhandler", "Event handler records"); /* List of all eventhandler lists */ static TAILQ_HEAD(, eventhandler_list) eventhandler_lists; static int eventhandler_lists_initted = 0; static struct mtx eventhandler_mutex; struct eventhandler_entry_generic { struct eventhandler_entry ee; void (* func)(void); }; static struct eventhandler_list *_eventhandler_find_list(const char *name); /* * Initialize the eventhandler mutex and list. 
*/ static void eventhandler_init(void *dummy __unused) { TAILQ_INIT(&eventhandler_lists); mtx_init(&eventhandler_mutex, "eventhandler", NULL, MTX_DEF); atomic_store_rel_int(&eventhandler_lists_initted, 1); } SYSINIT(eventhandlers, SI_SUB_EVENTHANDLER, SI_ORDER_FIRST, eventhandler_init, NULL); static struct eventhandler_list * eventhandler_find_or_create_list(const char *name) { struct eventhandler_list *list, *new_list; /* look for a matching, existing list */ list = _eventhandler_find_list(name); /* Do we need to create the list? */ if (list == NULL) { mtx_unlock(&eventhandler_mutex); new_list = malloc(sizeof(*new_list) + strlen(name) + 1, M_EVENTHANDLER, M_WAITOK | M_ZERO); /* If someone else created it already, then use that one. */ mtx_lock(&eventhandler_mutex); list = _eventhandler_find_list(name); if (list != NULL) { free(new_list, M_EVENTHANDLER); } else { CTR2(KTR_EVH, "%s: creating list \"%s\"", __func__, name); list = new_list; TAILQ_INIT(&list->el_entries); mtx_init(&list->el_lock, name, "eventhandler list", MTX_DEF); list->el_name = (char *)(list + 1); strcpy(list->el_name, name); TAILQ_INSERT_HEAD(&eventhandler_lists, list, el_link); } } return (list); } /* * Insertion is O(n) due to the priority scan, but optimises to O(1) * if all priorities are identical. */ static eventhandler_tag eventhandler_register_internal(struct eventhandler_list *list, const char *name, eventhandler_tag epn) { struct eventhandler_entry *ep; KASSERT(eventhandler_lists_initted, ("eventhandler registered too early")); KASSERT(epn != NULL, ("%s: cannot register NULL event", __func__)); /* Do we need to find/create the list? */ if (list == NULL) { mtx_lock(&eventhandler_mutex); list = eventhandler_find_or_create_list(name); mtx_unlock(&eventhandler_mutex); } KASSERT(epn->ee_priority != EHE_DEAD_PRIORITY, ("%s: handler for %s registered with dead priority", __func__, name)); /* sort it into the list */ CTR4(KTR_EVH, "%s: adding item %p (function %p) to \"%s\"", __func__, epn, ((struct eventhandler_entry_generic *)epn)->func, name); EHL_LOCK(list); TAILQ_FOREACH(ep, &list->el_entries, ee_link) { if (ep->ee_priority != EHE_DEAD_PRIORITY && epn->ee_priority < ep->ee_priority) { TAILQ_INSERT_BEFORE(ep, epn, ee_link); break; } } if (ep == NULL) TAILQ_INSERT_TAIL(&list->el_entries, epn, ee_link); EHL_UNLOCK(list); return(epn); } eventhandler_tag eventhandler_register(struct eventhandler_list *list, const char *name, void *func, void *arg, int priority) { struct eventhandler_entry_generic *eg; /* allocate an entry for this handler, populate it */ eg = malloc(sizeof(struct eventhandler_entry_generic), M_EVENTHANDLER, M_WAITOK | M_ZERO); eg->func = func; eg->ee.ee_arg = arg; eg->ee.ee_priority = priority; return (eventhandler_register_internal(list, name, &eg->ee)); } #ifdef VIMAGE struct eventhandler_entry_generic_vimage { struct eventhandler_entry ee; vimage_iterator_func_t func; /* Vimage iterator function. */ struct eventhandler_entry_vimage v_ee; /* Original func, arg. 
*/ }; eventhandler_tag vimage_eventhandler_register(struct eventhandler_list *list, const char *name, void *func, void *arg, int priority, vimage_iterator_func_t iterfunc) { struct eventhandler_entry_generic_vimage *eg; /* allocate an entry for this handler, populate it */ eg = malloc(sizeof(struct eventhandler_entry_generic_vimage), M_EVENTHANDLER, M_WAITOK | M_ZERO); eg->func = iterfunc; eg->v_ee.func = func; eg->v_ee.ee_arg = arg; eg->ee.ee_arg = &eg->v_ee; eg->ee.ee_priority = priority; return (eventhandler_register_internal(list, name, &eg->ee)); } #endif static void _eventhandler_deregister(struct eventhandler_list *list, eventhandler_tag tag, bool wait) { struct eventhandler_entry *ep = tag; EHL_LOCK_ASSERT(list, MA_OWNED); if (ep != NULL) { /* remove just this entry */ if (list->el_runcount == 0) { CTR3(KTR_EVH, "%s: removing item %p from \"%s\"", __func__, ep, list->el_name); TAILQ_REMOVE(&list->el_entries, ep, ee_link); free(ep, M_EVENTHANDLER); } else { CTR3(KTR_EVH, "%s: marking item %p from \"%s\" as dead", __func__, ep, list->el_name); ep->ee_priority = EHE_DEAD_PRIORITY; } } else { /* remove entire list */ if (list->el_runcount == 0) { CTR2(KTR_EVH, "%s: removing all items from \"%s\"", __func__, list->el_name); while (!TAILQ_EMPTY(&list->el_entries)) { ep = TAILQ_FIRST(&list->el_entries); TAILQ_REMOVE(&list->el_entries, ep, ee_link); free(ep, M_EVENTHANDLER); } } else { CTR2(KTR_EVH, "%s: marking all items from \"%s\" as dead", __func__, list->el_name); TAILQ_FOREACH(ep, &list->el_entries, ee_link) ep->ee_priority = EHE_DEAD_PRIORITY; } } while (wait && list->el_runcount > 0) mtx_sleep(list, &list->el_lock, 0, "evhrm", 0); EHL_UNLOCK(list); } void eventhandler_deregister(struct eventhandler_list *list, eventhandler_tag tag) { _eventhandler_deregister(list, tag, true); } void eventhandler_deregister_nowait(struct eventhandler_list *list, eventhandler_tag tag) { _eventhandler_deregister(list, tag, false); } /* * Internal version for use when eventhandler list is already locked. */ static struct eventhandler_list * _eventhandler_find_list(const char *name) { struct eventhandler_list *list; mtx_assert(&eventhandler_mutex, MA_OWNED); TAILQ_FOREACH(list, &eventhandler_lists, el_link) { if (!strcmp(name, list->el_name)) break; } return (list); } /* * Lookup a "slow" list by name. Returns with the list locked. */ struct eventhandler_list * eventhandler_find_list(const char *name) { struct eventhandler_list *list; if (!eventhandler_lists_initted) return(NULL); /* scan looking for the requested list */ mtx_lock(&eventhandler_mutex); list = _eventhandler_find_list(name); if (list != NULL) EHL_LOCK(list); mtx_unlock(&eventhandler_mutex); return(list); } /* * Prune "dead" entries from an eventhandler list. */ void eventhandler_prune_list(struct eventhandler_list *list) { struct eventhandler_entry *ep, *en; int pruned = 0; CTR2(KTR_EVH, "%s: pruning list \"%s\"", __func__, list->el_name); EHL_LOCK_ASSERT(list, MA_OWNED); TAILQ_FOREACH_SAFE(ep, &list->el_entries, ee_link, en) { if (ep->ee_priority == EHE_DEAD_PRIORITY) { TAILQ_REMOVE(&list->el_entries, ep, ee_link); free(ep, M_EVENTHANDLER); pruned++; } } if (pruned > 0) wakeup(list); } /* * Create (or get the existing) list so the pointer can be stored by * EVENTHANDLER_LIST_DEFINE. 
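 * (Illustrative consumer, not from this file: handlers are normally
 * registered through the wrapper macros rather than by calling the
 * functions above directly, e.g. for the standard shutdown_post_sync
 * event:
 *
 *	static void
 *	my_shutdown(void *arg, int howto)
 *	{
 *		... flush driver state ...
 *	}
 *
 *	EVENTHANDLER_REGISTER(shutdown_post_sync, my_shutdown, NULL,
 *	    SHUTDOWN_PRI_DEFAULT);
 * )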
struct eventhandler_list * eventhandler_create_list(const char *name) { struct eventhandler_list *list; KASSERT(eventhandler_lists_initted, ("eventhandler list created too early")); mtx_lock(&eventhandler_mutex); list = eventhandler_find_or_create_list(name); mtx_unlock(&eventhandler_mutex); return (list); } Index: head/sys/kern/subr_fattime.c =================================================================== --- head/sys/kern/subr_fattime.c (revision 326270) +++ head/sys/kern/subr_fattime.c (revision 326271) @@ -1,307 +1,309 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2006 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * * Convert MS-DOS FAT format timestamps to and from unix timespecs * * FAT filestamps originally consisted of two 16 bit integers, encoded like * this: * * yyyyyyymmmmddddd (year - 1980, month, day) * * hhhhhmmmmmmsssss (hour, minutes, seconds divided by two) * * Subsequently even Microsoft realized that files could be accessed in less * than two seconds and a byte was added containing: * * sfffffff (second mod two, 100ths of second) * * FAT timestamps are in the local timezone, with no indication of which * timezone, much less whether daylight saving time applies. * * Later on again, in Windows NT, timestamps were defined relative to GMT. * * Purists will point out that UTC replaced GMT for such uses around * half a century ago, already then. Ironically "NT" was an abbreviation of * "New Technology". Anyway... * * The 'utc' argument determines whether the resulting FATTIME timestamp * should be on the UTC or local timezone calendar. * * The conversion functions below cut time into four-year leap-year * cycles rather than single years and use table lookups inside those * cycles to get the months and years sorted out. * * Obviously we cannot calculate the correct table index going from * a posix seconds count to Y/M/D, but we can get pretty close by * dividing the daycount by 32 (giving a too low index), and then * adjusting upwards a couple of steps if necessary. * * FAT timestamps have 7 bits for the year and start at 1980, so * they can represent up to 2107 which means that the non-leap-year * 2100 must be handled. * * XXX: As long as time_t is 32 bits this is not relevant or easily * XXX: testable.
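 * (Worked example, added for illustration: 2017-11-27 12:34:56 packs
 * into the two 16-bit fields described above as
 *
 *	date = (2017 - 1980) << 9 | 11 << 5 | 27 = 0x4b7b
 *	time = 12 << 11 | 34 << 5 | (56 / 2) = 0x645c
 *
 * with the odd second and the 100ths of a second carried in the extra
 * byte.)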
Revisit when time_t grows bigger. * XXX: grepfodder: 64 bit time_t, y2100, y2.1k, 2100, leap year * */ #include #include #include #include #define DAY (24 * 60 * 60) /* Length of day in seconds */ #define YEAR 365 /* Length of normal year */ #define LYC (4 * YEAR + 1) /* Length of 4 year leap-year cycle */ #define T1980 (10 * 365 + 2) /* Days from 1970 to 1980 */ /* End of month is N days from start of (normal) year */ #define JAN 31 #define FEB (JAN + 28) #define MAR (FEB + 31) #define APR (MAR + 30) #define MAY (APR + 31) #define JUN (MAY + 30) #define JUL (JUN + 31) #define AUG (JUL + 31) #define SEP (AUG + 30) #define OCT (SEP + 31) #define NOV (OCT + 30) #define DEC (NOV + 31) /* Table of months in a 4 year leap-year cycle */ #define ENC(y,m) (((y) << 9) | ((m) << 5)) static const struct { uint16_t days; /* month start in days relative to cycle */ uint16_t coded; /* encoded year + month information */ } mtab[48] = { { 0 + 0 * YEAR, ENC(0, 1) }, { JAN + 0 * YEAR, ENC(0, 2) }, { FEB + 0 * YEAR + 1, ENC(0, 3) }, { MAR + 0 * YEAR + 1, ENC(0, 4) }, { APR + 0 * YEAR + 1, ENC(0, 5) }, { MAY + 0 * YEAR + 1, ENC(0, 6) }, { JUN + 0 * YEAR + 1, ENC(0, 7) }, { JUL + 0 * YEAR + 1, ENC(0, 8) }, { AUG + 0 * YEAR + 1, ENC(0, 9) }, { SEP + 0 * YEAR + 1, ENC(0, 10) }, { OCT + 0 * YEAR + 1, ENC(0, 11) }, { NOV + 0 * YEAR + 1, ENC(0, 12) }, { DEC + 0 * YEAR + 1, ENC(1, 1) }, { JAN + 1 * YEAR + 1, ENC(1, 2) }, { FEB + 1 * YEAR + 1, ENC(1, 3) }, { MAR + 1 * YEAR + 1, ENC(1, 4) }, { APR + 1 * YEAR + 1, ENC(1, 5) }, { MAY + 1 * YEAR + 1, ENC(1, 6) }, { JUN + 1 * YEAR + 1, ENC(1, 7) }, { JUL + 1 * YEAR + 1, ENC(1, 8) }, { AUG + 1 * YEAR + 1, ENC(1, 9) }, { SEP + 1 * YEAR + 1, ENC(1, 10) }, { OCT + 1 * YEAR + 1, ENC(1, 11) }, { NOV + 1 * YEAR + 1, ENC(1, 12) }, { DEC + 1 * YEAR + 1, ENC(2, 1) }, { JAN + 2 * YEAR + 1, ENC(2, 2) }, { FEB + 2 * YEAR + 1, ENC(2, 3) }, { MAR + 2 * YEAR + 1, ENC(2, 4) }, { APR + 2 * YEAR + 1, ENC(2, 5) }, { MAY + 2 * YEAR + 1, ENC(2, 6) }, { JUN + 2 * YEAR + 1, ENC(2, 7) }, { JUL + 2 * YEAR + 1, ENC(2, 8) }, { AUG + 2 * YEAR + 1, ENC(2, 9) }, { SEP + 2 * YEAR + 1, ENC(2, 10) }, { OCT + 2 * YEAR + 1, ENC(2, 11) }, { NOV + 2 * YEAR + 1, ENC(2, 12) }, { DEC + 2 * YEAR + 1, ENC(3, 1) }, { JAN + 3 * YEAR + 1, ENC(3, 2) }, { FEB + 3 * YEAR + 1, ENC(3, 3) }, { MAR + 3 * YEAR + 1, ENC(3, 4) }, { APR + 3 * YEAR + 1, ENC(3, 5) }, { MAY + 3 * YEAR + 1, ENC(3, 6) }, { JUN + 3 * YEAR + 1, ENC(3, 7) }, { JUL + 3 * YEAR + 1, ENC(3, 8) }, { AUG + 3 * YEAR + 1, ENC(3, 9) }, { SEP + 3 * YEAR + 1, ENC(3, 10) }, { OCT + 3 * YEAR + 1, ENC(3, 11) }, { NOV + 3 * YEAR + 1, ENC(3, 12) } }; void timespec2fattime(struct timespec *tsp, int utc, uint16_t *ddp, uint16_t *dtp, uint8_t *dhp) { time_t t1; unsigned t2, l, m; t1 = tsp->tv_sec; if (!utc) t1 -= utc_offset(); if (dhp != NULL) *dhp = (tsp->tv_sec & 1) * 100 + tsp->tv_nsec / 10000000; if (dtp != NULL) { *dtp = (t1 / 2) % 30; *dtp |= ((t1 / 60) % 60) << 5; *dtp |= ((t1 / 3600) % 24) << 11; } if (ddp != NULL) { t2 = t1 / DAY; if (t2 < T1980) { /* Impossible date, truncate to 1980-01-01 */ *ddp = 0x0021; } else { t2 -= T1980; /* * 2100 is not a leap year. * XXX: a 32 bit time_t can not get us here. 
*/ if (t2 >= ((2100 - 1980) / 4 * LYC + FEB)) t2++; /* Account for full leapyear cycles */ l = t2 / LYC; *ddp = (l * 4) << 9; t2 -= l * LYC; /* Find approximate table entry */ m = t2 / 32; /* Find correct table entry */ while (m < 47 && mtab[m + 1].days <= t2) m++; /* Get year + month from the table */ *ddp += mtab[m].coded; /* And apply the day in the month */ t2 -= mtab[m].days - 1; *ddp |= t2; } } } /* * Table indexed by the bottom two bits of year + four bits of the month * from the FAT timestamp, returning number of days into 4 year long * leap-year cycle */ #define DCOD(m, y, l) ((m) + YEAR * (y) + (l)) static const uint16_t daytab[64] = { 0, DCOD( 0, 0, 0), DCOD(JAN, 0, 0), DCOD(FEB, 0, 1), DCOD(MAR, 0, 1), DCOD(APR, 0, 1), DCOD(MAY, 0, 1), DCOD(JUN, 0, 1), DCOD(JUL, 0, 1), DCOD(AUG, 0, 1), DCOD(SEP, 0, 1), DCOD(OCT, 0, 1), DCOD(NOV, 0, 1), DCOD(DEC, 0, 1), 0, 0, 0, DCOD( 0, 1, 1), DCOD(JAN, 1, 1), DCOD(FEB, 1, 1), DCOD(MAR, 1, 1), DCOD(APR, 1, 1), DCOD(MAY, 1, 1), DCOD(JUN, 1, 1), DCOD(JUL, 1, 1), DCOD(AUG, 1, 1), DCOD(SEP, 1, 1), DCOD(OCT, 1, 1), DCOD(NOV, 1, 1), DCOD(DEC, 1, 1), 0, 0, 0, DCOD( 0, 2, 1), DCOD(JAN, 2, 1), DCOD(FEB, 2, 1), DCOD(MAR, 2, 1), DCOD(APR, 2, 1), DCOD(MAY, 2, 1), DCOD(JUN, 2, 1), DCOD(JUL, 2, 1), DCOD(AUG, 2, 1), DCOD(SEP, 2, 1), DCOD(OCT, 2, 1), DCOD(NOV, 2, 1), DCOD(DEC, 2, 1), 0, 0, 0, DCOD( 0, 3, 1), DCOD(JAN, 3, 1), DCOD(FEB, 3, 1), DCOD(MAR, 3, 1), DCOD(APR, 3, 1), DCOD(MAY, 3, 1), DCOD(JUN, 3, 1), DCOD(JUL, 3, 1), DCOD(AUG, 3, 1), DCOD(SEP, 3, 1), DCOD(OCT, 3, 1), DCOD(NOV, 3, 1), DCOD(DEC, 3, 1), 0, 0 }; void fattime2timespec(unsigned dd, unsigned dt, unsigned dh, int utc, struct timespec *tsp) { unsigned day; /* Unpack time fields */ tsp->tv_sec = (dt & 0x1f) << 1; tsp->tv_sec += ((dt & 0x7e0) >> 5) * 60; tsp->tv_sec += ((dt & 0xf800) >> 11) * 3600; tsp->tv_sec += dh / 100; tsp->tv_nsec = (dh % 100) * 10000000; /* Day of month */ day = (dd & 0x1f) - 1; /* Full leap-year cycles */ day += LYC * ((dd >> 11) & 0x1f); /* Month offset from leap-year cycle */ day += daytab[(dd >> 5) & 0x3f]; /* * 2100 is not a leap year. * XXX: a 32 bit time_t can not get us here. 
*/ if (day >= ((2100 - 1980) / 4 * LYC + FEB)) day--; /* Align with time_t epoch */ day += T1980; tsp->tv_sec += DAY * day; if (!utc) tsp->tv_sec += utc_offset(); } #ifdef TEST_DRIVER #include #include #include int main(int argc __unused, char **argv __unused) { int i; struct timespec ts; struct tm tm; double a; uint16_t d, t; uint8_t p; char buf[100]; for (i = 0; i < 10000; i++) { do { ts.tv_sec = random(); } while (ts.tv_sec < T1980 * 86400); ts.tv_nsec = random() % 1000000000; printf("%10jd.%03ld -- ", (intmax_t)ts.tv_sec, ts.tv_nsec / 1000000); gmtime_r(&ts.tv_sec, &tm); strftime(buf, sizeof buf, "%Y %m %d %H %M %S", &tm); printf("%s -- ", buf); a = ts.tv_sec + ts.tv_nsec * 1e-9; d = t = p = 0; timespec2fattime(&ts, 1, &d, &t, &p); printf("%04x %04x %02x -- ", d, t, p); printf("%3d %02d %02d %02d %02d %02d -- ", ((d >> 9) & 0x7f) + 1980, (d >> 5) & 0x0f, (d >> 0) & 0x1f, (t >> 11) & 0x1f, (t >> 5) & 0x3f, ((t >> 0) & 0x1f) * 2); ts.tv_sec = ts.tv_nsec = 0; fattime2timespec(d, t, p, 1, &ts); printf("%10jd.%03ld == ", (intmax_t)ts.tv_sec, ts.tv_nsec / 1000000); gmtime_r(&ts.tv_sec, &tm); strftime(buf, sizeof buf, "%Y %m %d %H %M %S", &tm); printf("%s -- ", buf); a -= ts.tv_sec + ts.tv_nsec * 1e-9; printf("%.3f", a); printf("\n"); } return (0); } #endif /* TEST_DRIVER */ Index: head/sys/kern/subr_firmware.c =================================================================== --- head/sys/kern/subr_firmware.c (revision 326270) +++ head/sys/kern/subr_firmware.c (revision 326271) @@ -1,526 +1,528 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005-2008, Sam Leffler * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Loadable firmware support. See sys/sys/firmware.h and firmware(9) * for more details on the subsystem. * * 'struct firmware' is the user-visible part of the firmware table. * Additional internal information is stored in a 'struct priv_fw' * (currently a static array). A slot is in use if FW_INUSE is true: */ #define FW_INUSE(p) ((p)->file != NULL || (p)->fw.name != NULL) /* * fw.name != NULL when an image is registered; file != NULL for * autoloaded images whose handling has not been completed.
* * The state of a slot evolves as follows: * firmware_register --> fw.name = image_name * (autoloaded image) --> file = module reference * firmware_unregister --> fw.name = NULL * (unloadentry complete) --> file = NULL * * In order for the above to work, the 'file' field must remain * unchanged in firmware_unregister(). * * Images residing in the same module are linked to each other * through the 'parent' argument of firmware_register(). * One image (typically, one with the same name as the module to let * the autoloading mechanism work) is considered the parent image for * all other images in the same module. Children affect the refcount * on the parent image preventing improper unloading of the image itself. */ struct priv_fw { int refcnt; /* reference count */ /* * parent entry, see above. Set on firmware_register(), * cleared on firmware_unregister(). */ struct priv_fw *parent; int flags; /* record FIRMWARE_UNLOAD requests */ #define FW_UNLOAD 0x100 /* * 'file' is private info managed by the autoload/unload code. * Set at the end of firmware_get(), cleared only in the * firmware_unload_task, so the latter can depend on its value even * while the lock is not held. */ linker_file_t file; /* module file, if autoloaded */ /* * 'fw' is the externally visible image information. * We do not make it the first field in priv_fw, to avoid the * temptation of casting pointers to each other. * Use PRIV_FW(fw) to get a pointer to the container of fw. * Beware, PRIV_FW does not work for a NULL pointer. */ struct firmware fw; /* externally visible information */ }; /* * PRIV_FW returns the pointer to the container of struct firmware *x. * Cast to intptr_t to override the 'const' attribute of x */ #define PRIV_FW(x) ((struct priv_fw *) \ ((intptr_t)(x) - offsetof(struct priv_fw, fw)) ) /* * At the moment we use a static array as backing store for the registry. * Should we move to a dynamic structure, keep in mind that we cannot * reallocate the array because pointers are held externally. * A list may work, though. */ #define FIRMWARE_MAX 50 static struct priv_fw firmware_table[FIRMWARE_MAX]; /* * Firmware module operations are handled in a separate task as they * might sleep and they require directory context to do i/o. */ static struct taskqueue *firmware_tq; static struct task firmware_unload_task; /* * This mutex protects accesses to the firmware table. */ static struct mtx firmware_mtx; MTX_SYSINIT(firmware, &firmware_mtx, "firmware table", MTX_DEF); /* * Helper function to look up a name. * As a side effect, it sets the pointer to a free slot, if any. * This way we can concentrate most of the registry scanning in * this function, which makes it easier to replace the registry * with some other data structure. */ static struct priv_fw * lookup(const char *name, struct priv_fw **empty_slot) { struct priv_fw *fp = NULL; struct priv_fw *dummy; int i; if (empty_slot == NULL) empty_slot = &dummy; *empty_slot = NULL; for (i = 0; i < FIRMWARE_MAX; i++) { fp = &firmware_table[i]; if (fp->fw.name != NULL && strcasecmp(name, fp->fw.name) == 0) break; else if (!FW_INUSE(fp)) *empty_slot = fp; } return (i < FIRMWARE_MAX) ? fp : NULL; } /* * Register a firmware image with the specified name. The * image name must not already be registered. If this is a * subimage then parent refers to a previously registered * image that this should be associated with.
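 * (Illustrative, with made-up image names: a module "wifw" could
 * register a parent image and a subimage as
 *
 *	parent = firmware_register("wifw", data0, size0, 1, NULL);
 *	(void)firmware_register("wifw_init", data1, size1, 1, parent);
 *
 * and a driver would later consume and release an image with
 *
 *	fw = firmware_get("wifw");
 *	...
 *	firmware_put(fw, FIRMWARE_UNLOAD);
 * )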
*/ const struct firmware * firmware_register(const char *imagename, const void *data, size_t datasize, unsigned int version, const struct firmware *parent) { struct priv_fw *match, *frp; char *str; str = strdup(imagename, M_TEMP); mtx_lock(&firmware_mtx); /* * Do a lookup to make sure the name is unique or find a free slot. */ match = lookup(imagename, &frp); if (match != NULL) { mtx_unlock(&firmware_mtx); printf("%s: image %s already registered!\n", __func__, imagename); free(str, M_TEMP); return NULL; } if (frp == NULL) { mtx_unlock(&firmware_mtx); printf("%s: cannot register image %s, firmware table full!\n", __func__, imagename); free(str, M_TEMP); return NULL; } bzero(frp, sizeof(*frp)); /* start from a clean record */ frp->fw.name = str; frp->fw.data = data; frp->fw.datasize = datasize; frp->fw.version = version; if (parent != NULL) frp->parent = PRIV_FW(parent); mtx_unlock(&firmware_mtx); if (bootverbose) printf("firmware: '%s' version %u: %zu bytes loaded at %p\n", imagename, version, datasize, data); return &frp->fw; } /* * Unregister/remove a firmware image. If there are outstanding * references an error is returned and the image is not removed * from the registry. */ int firmware_unregister(const char *imagename) { struct priv_fw *fp; int err; mtx_lock(&firmware_mtx); fp = lookup(imagename, NULL); if (fp == NULL) { /* * It is ok for the lookup to fail; this can happen * when a module is unloaded on last reference and the * module unload handler unregisters each of its * firmware images. */ err = 0; } else if (fp->refcnt != 0) { /* cannot unregister */ err = EBUSY; } else { linker_file_t x = fp->file; /* save value */ /* * Clear the whole entry with bzero to make sure we * do not forget anything. Then restore 'file' which is * non-null for autoloaded images. */ free((void *) (uintptr_t) fp->fw.name, M_TEMP); bzero(fp, sizeof(struct priv_fw)); fp->file = x; err = 0; } mtx_unlock(&firmware_mtx); return err; } static void loadimage(void *arg, int npending) { struct thread *td = curthread; char *imagename = arg; struct priv_fw *fp; linker_file_t result; int error; /* synchronize with the thread that dispatched us */ mtx_lock(&firmware_mtx); mtx_unlock(&firmware_mtx); if (td->td_proc->p_fd->fd_rdir == NULL) { printf("%s: root not mounted yet, no way to load image\n", imagename); goto done; } error = linker_reference_module(imagename, NULL, &result); if (error != 0) { printf("%s: could not load firmware image, error %d\n", imagename, error); goto done; } mtx_lock(&firmware_mtx); fp = lookup(imagename, NULL); if (fp == NULL || fp->file != NULL) { mtx_unlock(&firmware_mtx); if (fp == NULL) printf("%s: firmware image loaded, " "but did not register\n", imagename); (void) linker_release_module(imagename, NULL, NULL); goto done; } fp->file = result; /* record the module identity */ mtx_unlock(&firmware_mtx); done: wakeup_one(imagename); /* we're done */ } /* * Look up and potentially load the specified firmware image. * If the firmware is not found in the registry, try to load a kernel * module named as the image name. * If the firmware is located, a reference is returned. The caller must * release this reference for the image to be eligible for removal/unload. */ const struct firmware * firmware_get(const char *imagename) { struct task fwload_task; struct thread *td; struct priv_fw *fp; mtx_lock(&firmware_mtx); fp = lookup(imagename, NULL); if (fp != NULL) goto found; /* * Image not present, try to load the module holding it.
*/ td = curthread; if (priv_check(td, PRIV_FIRMWARE_LOAD) != 0 || securelevel_gt(td->td_ucred, 0) != 0) { mtx_unlock(&firmware_mtx); printf("%s: insufficient privileges to " "load firmware image %s\n", __func__, imagename); return NULL; } /* * Defer load to a thread with known context. linker_reference_module * may do filesystem i/o which requires root & current dirs, etc. * Also, we must not hold any mutexes across this call, which is problematic. */ if (!cold) { TASK_INIT(&fwload_task, 0, loadimage, __DECONST(void *, imagename)); taskqueue_enqueue(firmware_tq, &fwload_task); msleep(__DECONST(void *, imagename), &firmware_mtx, 0, "fwload", 0); } /* * After attempting to load the module, see if the image is registered. */ fp = lookup(imagename, NULL); if (fp == NULL) { mtx_unlock(&firmware_mtx); return NULL; } found: /* common exit point on success */ if (fp->refcnt == 0 && fp->parent != NULL) fp->parent->refcnt++; fp->refcnt++; mtx_unlock(&firmware_mtx); return &fp->fw; } /* * Release a reference to a firmware image returned by firmware_get. * The caller may specify, with the FIRMWARE_UNLOAD flag, its desire * to release the resource, but the flag is only advisory. * * If this is the last reference to the firmware image, and this is an * autoloaded module, wake up the firmware_unload_task to figure out * what to do with the associated module. */ void firmware_put(const struct firmware *p, int flags) { struct priv_fw *fp = PRIV_FW(p); mtx_lock(&firmware_mtx); fp->refcnt--; if (fp->refcnt == 0) { if (fp->parent != NULL) fp->parent->refcnt--; if (flags & FIRMWARE_UNLOAD) fp->flags |= FW_UNLOAD; if (fp->file) taskqueue_enqueue(firmware_tq, &firmware_unload_task); } mtx_unlock(&firmware_mtx); } /* * Set up directory state for the firmware_tq thread so we can do i/o. */ static void set_rootvnode(void *arg, int npending) { pwd_ensure_dirs(); free(arg, M_TEMP); } /* * Event handler called on mounting of /; bounce a task * into the task queue thread to set up its directories. */ static void firmware_mountroot(void *arg) { struct task *setroot_task; setroot_task = malloc(sizeof(struct task), M_TEMP, M_NOWAIT); if (setroot_task != NULL) { TASK_INIT(setroot_task, 0, set_rootvnode, setroot_task); taskqueue_enqueue(firmware_tq, setroot_task); } else printf("%s: no memory for task!\n", __func__); } EVENTHANDLER_DEFINE(mountroot, firmware_mountroot, NULL, 0); /* * The body of the task in charge of unloading autoloaded modules * that are not needed anymore. * Images can be cross-linked so we may need to make multiple passes, * but the time we spend in the loop is bounded because we clear entries * as we touch them. */ static void unloadentry(void *unused1, int unused2) { int limit = FIRMWARE_MAX; int i; /* current cycle */ mtx_lock(&firmware_mtx); /* * Scan the table. limit is set to make sure we make another * full sweep after matching an entry that requires unloading. */ for (i = 0; i < limit; i++) { struct priv_fw *fp; int err; fp = &firmware_table[i % FIRMWARE_MAX]; if (fp->fw.name == NULL || fp->file == NULL || fp->refcnt != 0 || (fp->flags & FW_UNLOAD) == 0) continue; /* * Found an entry. Now: * 1. bump up limit to make sure we make another full round; * 2. clear FW_UNLOAD so we don't try this entry again; * 3. release the lock while trying to unload the module. * 'file' remains set so that the entry cannot be reused * in the meantime (it also means that fp->file will * not change while we release the lock).
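Stepping back to the public API for a moment: firmware_get() and firmware_put() above pair up in a consumer roughly as in this hypothetical driver sketch (names are made up, the device-specific upload is elided):

/* Hypothetical consumer, e.g. a driver's attach/detach paths. */
static const struct firmware *myfw_ref;

static int
mydrv_load_fw(void)
{
	/* May sleep while myfw.ko is autoloaded via the taskqueue. */
	myfw_ref = firmware_get("myfw");
	if (myfw_ref == NULL)
		return (ENOENT);
	/* ... copy myfw_ref->data (myfw_ref->datasize bytes) to the device ... */
	return (0);
}

static void
mydrv_unload_fw(void)
{
	/* Advisory flag: also try to unload the backing module. */
	firmware_put(myfw_ref, FIRMWARE_UNLOAD);
}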
*/ limit = i + FIRMWARE_MAX; /* make another full round */ fp->flags &= ~FW_UNLOAD; /* do not try again */ mtx_unlock(&firmware_mtx); err = linker_release_module(NULL, NULL, fp->file); mtx_lock(&firmware_mtx); /* * We rely on the module to call firmware_unregister() * on unload to actually release the entry. * If err = 0 we can drop our reference as the system * accepted it. Otherwise unloading failed (e.g. the * module itself gave an error) so our reference is * still valid. */ if (err == 0) fp->file = NULL; } mtx_unlock(&firmware_mtx); } /* * Module glue. */ static int firmware_modevent(module_t mod, int type, void *unused) { struct priv_fw *fp; int i, err; switch (type) { case MOD_LOAD: TASK_INIT(&firmware_unload_task, 0, unloadentry, NULL); firmware_tq = taskqueue_create("taskqueue_firmware", M_WAITOK, taskqueue_thread_enqueue, &firmware_tq); /* NB: use our own loop routine that sets up context */ (void) taskqueue_start_threads(&firmware_tq, 1, PWAIT, "firmware taskq"); if (rootvnode != NULL) { /* * Root is already mounted so we won't get an event; * simulate one here. */ firmware_mountroot(NULL); } return 0; case MOD_UNLOAD: /* request all autoloaded modules to be released */ mtx_lock(&firmware_mtx); for (i = 0; i < FIRMWARE_MAX; i++) { fp = &firmware_table[i]; fp->flags |= FW_UNLOAD; } mtx_unlock(&firmware_mtx); taskqueue_enqueue(firmware_tq, &firmware_unload_task); taskqueue_drain(firmware_tq, &firmware_unload_task); err = 0; for (i = 0; i < FIRMWARE_MAX; i++) { fp = &firmware_table[i]; if (fp->fw.name != NULL) { printf("%s: image %p ref %d still active slot %d\n", __func__, fp->fw.name, fp->refcnt, i); err = EINVAL; } } if (err == 0) taskqueue_free(firmware_tq); return err; } return EINVAL; } static moduledata_t firmware_mod = { "firmware", firmware_modevent, NULL }; DECLARE_MODULE(firmware, firmware_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST); MODULE_VERSION(firmware, 1); Index: head/sys/kern/subr_hints.c =================================================================== --- head/sys/kern/subr_hints.c (revision 326270) +++ head/sys/kern/subr_hints.c (revision 326271) @@ -1,491 +1,493 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2000,2001 Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include /* * Access functions for device resources. */ static int checkmethod = 1; static int use_kenv; static char *hintp; /* * Define the kern.hintmode sysctl, which only accepts the value 2, causing a * switch from static KENV mode to dynamic KENV. So systems that have hints * compiled into the kernel will be able to see/modify the KENV (and hints too). */ static int sysctl_hintmode(SYSCTL_HANDLER_ARGS) { const char *cp; char *line, *eq; int eqidx, error, from_kenv, i, value; from_kenv = 0; cp = kern_envp; value = hintmode; /* Fetch candidate for new hintmode value */ error = sysctl_handle_int(oidp, &value, 0, req); if (error || req->newptr == NULL) return (error); if (value != 2) /* Only accept switching to hintmode 2 */ return (EINVAL); /* Migrate from static to dynamic hints */ switch (hintmode) { case 0: if (dynamic_kenv) { /* * Already dynamic. Assign hintmode 2 so we do not * check again in the future. */ hintmode = 2; return (0); } from_kenv = 1; cp = kern_envp; break; case 1: cp = static_hints; break; case 2: /* Nothing to do, hintmode already 2 */ return (0); } while (cp) { i = strlen(cp); if (i == 0) break; eq = strchr(cp, '='); if ((from_kenv && strncmp(cp, "hint.", 5) != 0) || eq == NULL) { /* Skip non-hint kenv entries and malformed lines; always advance or we would loop forever. */ cp += i + 1; continue; } eqidx = eq - cp; line = malloc(i + 1, M_TEMP, M_WAITOK); strcpy(line, cp); line[eqidx] = '\0'; kern_setenv(line, line + eqidx + 1); free(line, M_TEMP); cp += i + 1; } hintmode = value; use_kenv = 1; return (0); } SYSCTL_PROC(_kern, OID_AUTO, hintmode, CTLTYPE_INT|CTLFLAG_RW, &hintmode, 0, sysctl_hintmode, "I", "Get/set current hintmode"); /* * Evil wildcarding resource string lookup. * This walks the supplied env string table and returns a match. * The start point can be remembered for incremental searches.
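The migration loop above just splits each name=value hint string at the first '=' and re-enters it into the dynamic kenv. A standalone model of that split; the example hint is made up:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	const char *cp = "hint.uart.0.irq=4";	/* example hint, made up */
	const char *eq;
	char *line;

	eq = strchr(cp, '=');
	if (eq == NULL)
		return (1);			/* malformed: skip it */
	line = strdup(cp);
	line[eq - cp] = '\0';			/* split name from value */
	printf("kern_setenv(\"%s\", \"%s\")\n", line, line + (eq - cp) + 1);
	free(line);
	return (0);
}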
*/ static int res_find(int *line, int *startln, const char *name, int *unit, const char *resname, const char *value, const char **ret_name, int *ret_namelen, int *ret_unit, const char **ret_resname, int *ret_resnamelen, const char **ret_value) { int n = 0, hit, i = 0; char r_name[32]; int r_unit; char r_resname[32]; char r_value[128]; const char *s, *cp; char *p; if (checkmethod) { hintp = NULL; switch (hintmode) { case 0: /* loader hints in environment only */ break; case 1: /* static hints only */ hintp = static_hints; checkmethod = 0; break; case 2: /* fallback mode */ if (dynamic_kenv) { mtx_lock(&kenv_lock); cp = kenvp[0]; for (i = 0; cp != NULL; cp = kenvp[++i]) { if (!strncmp(cp, "hint.", 5)) { use_kenv = 1; checkmethod = 0; break; } } mtx_unlock(&kenv_lock); } else { cp = kern_envp; while (cp) { if (strncmp(cp, "hint.", 5) == 0) { cp = NULL; hintp = kern_envp; break; } while (*cp != '\0') cp++; cp++; if (*cp == '\0') { cp = NULL; hintp = static_hints; break; } } } break; default: break; } if (hintp == NULL) { if (dynamic_kenv) { use_kenv = 1; checkmethod = 0; } else hintp = kern_envp; } } if (use_kenv) { mtx_lock(&kenv_lock); i = 0; cp = kenvp[0]; if (cp == NULL) { mtx_unlock(&kenv_lock); return (ENOENT); } } else cp = hintp; while (cp) { hit = 1; (*line)++; if (strncmp(cp, "hint.", 5) != 0) hit = 0; else n = sscanf(cp, "hint.%32[^.].%d.%32[^=]=%127s", r_name, &r_unit, r_resname, r_value); if (hit && n != 4) { printf("CONFIG: invalid hint '%s'\n", cp); p = strchr(cp, 'h'); *p = 'H'; hit = 0; } if (hit && startln && *startln >= 0 && *line < *startln) hit = 0; if (hit && name && strcmp(name, r_name) != 0) hit = 0; if (hit && unit && *unit != r_unit) hit = 0; if (hit && resname && strcmp(resname, r_resname) != 0) hit = 0; if (hit && value && strcmp(value, r_value) != 0) hit = 0; if (hit) break; if (use_kenv) { cp = kenvp[++i]; if (cp == NULL) break; } else { while (*cp != '\0') cp++; cp++; if (*cp == '\0') { cp = NULL; break; } } } if (use_kenv) mtx_unlock(&kenv_lock); if (cp == NULL) return ENOENT; s = cp; /* This is a bit of a hack, but at least is reentrant */ /* Note that it returns some !unterminated! strings. */ s = strchr(s, '.') + 1; /* start of device */ if (ret_name) *ret_name = s; s = strchr(s, '.') + 1; /* start of unit */ if (ret_namelen && ret_name) *ret_namelen = s - *ret_name - 1; /* device length */ if (ret_unit) *ret_unit = r_unit; s = strchr(s, '.') + 1; /* start of resname */ if (ret_resname) *ret_resname = s; s = strchr(s, '=') + 1; /* start of value */ if (ret_resnamelen && ret_resname) *ret_resnamelen = s - *ret_resname - 1; /* value len */ if (ret_value) *ret_value = s; if (startln) /* line number for anchor */ *startln = *line + 1; return 0; } /* * Search all the data sources for matches to our query. We look for * dynamic hints first as overrides for static or fallback hints. 
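The sscanf() pattern inside res_find() is effectively the whole hint grammar: driver name, unit number, resource name, value. A userland check of what it accepts; note the destination buffers here reserve an extra byte, since a %32[...] conversion can store 32 characters plus the terminating NUL:

#include <stdio.h>

int main(void)
{
	char name[33], resname[33], value[128];
	int n, unit;

	n = sscanf("hint.uart.0.flags=0x10", "hint.%32[^.].%d.%32[^=]=%127s",
	    name, &unit, resname, value);
	if (n == 4)	/* all four fields parsed: a well-formed hint */
		printf("%s unit %d: %s = %s\n", name, unit, resname, value);
	return (0);
}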
*/ static int resource_find(int *line, int *startln, const char *name, int *unit, const char *resname, const char *value, const char **ret_name, int *ret_namelen, int *ret_unit, const char **ret_resname, int *ret_resnamelen, const char **ret_value) { int i; int un; *line = 0; /* Search for exact unit matches first */ i = res_find(line, startln, name, unit, resname, value, ret_name, ret_namelen, ret_unit, ret_resname, ret_resnamelen, ret_value); if (i == 0) return 0; if (unit == NULL) return ENOENT; /* If we are still here, search for wildcard matches */ un = -1; i = res_find(line, startln, name, &un, resname, value, ret_name, ret_namelen, ret_unit, ret_resname, ret_resnamelen, ret_value); if (i == 0) return 0; return ENOENT; } int resource_int_value(const char *name, int unit, const char *resname, int *result) { int error; const char *str; char *op; unsigned long val; int line; line = 0; error = resource_find(&line, NULL, name, &unit, resname, NULL, NULL, NULL, NULL, NULL, NULL, &str); if (error) return error; if (*str == '\0') return EFTYPE; val = strtoul(str, &op, 0); if (*op != '\0') return EFTYPE; *result = val; return 0; } int resource_long_value(const char *name, int unit, const char *resname, long *result) { int error; const char *str; char *op; unsigned long val; int line; line = 0; error = resource_find(&line, NULL, name, &unit, resname, NULL, NULL, NULL, NULL, NULL, NULL, &str); if (error) return error; if (*str == '\0') return EFTYPE; val = strtoul(str, &op, 0); if (*op != '\0') return EFTYPE; *result = val; return 0; } int resource_string_value(const char *name, int unit, const char *resname, const char **result) { int error; const char *str; int line; line = 0; error = resource_find(&line, NULL, name, &unit, resname, NULL, NULL, NULL, NULL, NULL, NULL, &str); if (error) return error; *result = str; return 0; } /* * This is a bit nasty, but allows us to not modify the env strings. */ static const char * resource_string_copy(const char *s, int len) { static char stringbuf[256]; static int offset = 0; const char *ret; if (len == 0) len = strlen(s); if (len > 255) return NULL; if ((offset + len + 1) > 255) offset = 0; bcopy(s, &stringbuf[offset], len); stringbuf[offset + len] = '\0'; ret = &stringbuf[offset]; offset += len + 1; return ret; } /* * err = resource_find_match(&anchor, &name, &unit, resname, value) * Iteratively fetch a list of devices wired "at" something * res and value are restrictions. eg: "at", "scbus0". * For practical purposes, res = required, value = optional. * *name and *unit are set. * set *anchor to zero before starting. */ int resource_find_match(int *anchor, const char **name, int *unit, const char *resname, const char *value) { const char *found_name; int found_namelen; int found_unit; int ret; int newln; newln = *anchor; ret = resource_find(anchor, &newln, NULL, NULL, resname, value, &found_name, &found_namelen, &found_unit, NULL, NULL, NULL); if (ret == 0) { *name = resource_string_copy(found_name, found_namelen); *unit = found_unit; } *anchor = newln; return ret; } /* * err = resource_find_dev(&anchor, name, &unit, res, value); * Iterate through a list of devices, returning their unit numbers. * res and value are optional restrictions. eg: "at", "scbus0". * *unit is set to the value. * set *anchor to zero before starting. 
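Typical driver-side use of the accessors above, sketched with a hypothetical device name:

/* Hypothetical probe path consulting hints for "mydev". */
static void
mydev_report_hints(int unit)
{
	const char *at;
	int irq;

	if (resource_int_value("mydev", unit, "irq", &irq) == 0)
		printf("mydev%d: wired to irq %d\n", unit, irq);
	if (resource_string_value("mydev", unit, "at", &at) == 0)
		printf("mydev%d: hinted at %s\n", unit, at);
}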
*/ int resource_find_dev(int *anchor, const char *name, int *unit, const char *resname, const char *value) { int found_unit; int newln; int ret; newln = *anchor; ret = resource_find(anchor, &newln, name, NULL, resname, value, NULL, NULL, &found_unit, NULL, NULL, NULL); if (ret == 0) { *unit = found_unit; } *anchor = newln; return ret; } /* * Check to see if a device is disabled via a disabled hint. */ int resource_disabled(const char *name, int unit) { int error, value; error = resource_int_value(name, unit, "disabled", &value); if (error) return (0); return (value); } /* * Clear a value associated with a device by removing it from * the kernel environment. This only removes a hint for an * exact unit. */ int resource_unset_value(const char *name, int unit, const char *resname) { char varname[128]; const char *retname, *retvalue; int error, line; size_t len; line = 0; error = resource_find(&line, NULL, name, &unit, resname, NULL, &retname, NULL, NULL, NULL, NULL, &retvalue); if (error) return (error); retname -= strlen("hint."); len = retvalue - retname - 1; if (len > sizeof(varname) - 1) return (ENAMETOOLONG); memcpy(varname, retname, len); varname[len] = '\0'; return (kern_unsetenv(varname)); } Index: head/sys/kern/subr_kdb.c =================================================================== --- head/sys/kern/subr_kdb.c (revision 326270) +++ head/sys/kern/subr_kdb.c (revision 326271) @@ -1,675 +1,677 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004 The FreeBSD Project * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_kdb.h" #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif u_char __read_frequently kdb_active = 0; static void *kdb_jmpbufp = NULL; struct kdb_dbbe *kdb_dbbe = NULL; static struct pcb kdb_pcb; struct pcb *kdb_thrctx = NULL; struct thread *kdb_thread = NULL; struct trapframe *kdb_frame = NULL; #ifdef BREAK_TO_DEBUGGER #define KDB_BREAK_TO_DEBUGGER 1 #else #define KDB_BREAK_TO_DEBUGGER 0 #endif #ifdef ALT_BREAK_TO_DEBUGGER #define KDB_ALT_BREAK_TO_DEBUGGER 1 #else #define KDB_ALT_BREAK_TO_DEBUGGER 0 #endif static int kdb_break_to_debugger = KDB_BREAK_TO_DEBUGGER; static int kdb_alt_break_to_debugger = KDB_ALT_BREAK_TO_DEBUGGER; KDB_BACKEND(null, NULL, NULL, NULL, NULL); SET_DECLARE(kdb_dbbe_set, struct kdb_dbbe); static int kdb_sysctl_available(SYSCTL_HANDLER_ARGS); static int kdb_sysctl_current(SYSCTL_HANDLER_ARGS); static int kdb_sysctl_enter(SYSCTL_HANDLER_ARGS); static int kdb_sysctl_panic(SYSCTL_HANDLER_ARGS); static int kdb_sysctl_trap(SYSCTL_HANDLER_ARGS); static int kdb_sysctl_trap_code(SYSCTL_HANDLER_ARGS); static SYSCTL_NODE(_debug, OID_AUTO, kdb, CTLFLAG_RW, NULL, "KDB nodes"); SYSCTL_PROC(_debug_kdb, OID_AUTO, available, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, kdb_sysctl_available, "A", "list of available KDB backends"); SYSCTL_PROC(_debug_kdb, OID_AUTO, current, CTLTYPE_STRING | CTLFLAG_RW, NULL, 0, kdb_sysctl_current, "A", "currently selected KDB backend"); SYSCTL_PROC(_debug_kdb, OID_AUTO, enter, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0, kdb_sysctl_enter, "I", "set to enter the debugger"); SYSCTL_PROC(_debug_kdb, OID_AUTO, panic, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0, kdb_sysctl_panic, "I", "set to panic the kernel"); SYSCTL_PROC(_debug_kdb, OID_AUTO, trap, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0, kdb_sysctl_trap, "I", "set to cause a page fault via data access"); SYSCTL_PROC(_debug_kdb, OID_AUTO, trap_code, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0, kdb_sysctl_trap_code, "I", "set to cause a page fault via code access"); SYSCTL_INT(_debug_kdb, OID_AUTO, break_to_debugger, CTLFLAG_RWTUN | CTLFLAG_SECURE, &kdb_break_to_debugger, 0, "Enable break to debugger"); SYSCTL_INT(_debug_kdb, OID_AUTO, alt_break_to_debugger, CTLFLAG_RWTUN | CTLFLAG_SECURE, &kdb_alt_break_to_debugger, 0, "Enable alternative break to debugger"); /* * Flag to indicate to debuggers why the debugger was entered. 
*/ const char * volatile kdb_why = KDB_WHY_UNSET; static int kdb_sysctl_available(SYSCTL_HANDLER_ARGS) { struct kdb_dbbe **iter; struct sbuf sbuf; int error; sbuf_new_for_sysctl(&sbuf, NULL, 64, req); SET_FOREACH(iter, kdb_dbbe_set) { if ((*iter)->dbbe_active == 0) sbuf_printf(&sbuf, "%s ", (*iter)->dbbe_name); } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } static int kdb_sysctl_current(SYSCTL_HANDLER_ARGS) { char buf[16]; int error; if (kdb_dbbe != NULL) strlcpy(buf, kdb_dbbe->dbbe_name, sizeof(buf)); else *buf = '\0'; error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); if (kdb_active) return (EBUSY); return (kdb_dbbe_select(buf)); } static int kdb_sysctl_enter(SYSCTL_HANDLER_ARGS) { int error, i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error == 0) { i = 0; error = sysctl_handle_int(oidp, &i, 0, req); } if (error != 0 || req->newptr == NULL) return (error); if (kdb_active) return (EBUSY); kdb_enter(KDB_WHY_SYSCTL, "sysctl debug.kdb.enter"); return (0); } static int kdb_sysctl_panic(SYSCTL_HANDLER_ARGS) { int error, i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error == 0) { i = 0; error = sysctl_handle_int(oidp, &i, 0, req); } if (error != 0 || req->newptr == NULL) return (error); panic("kdb_sysctl_panic"); return (0); } static int kdb_sysctl_trap(SYSCTL_HANDLER_ARGS) { int error, i; int *addr = (int *)0x10; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error == 0) { i = 0; error = sysctl_handle_int(oidp, &i, 0, req); } if (error != 0 || req->newptr == NULL) return (error); return (*addr); } static int kdb_sysctl_trap_code(SYSCTL_HANDLER_ARGS) { int error, i; void (*fp)(u_int, u_int, u_int) = (void *)0xdeadc0de; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error == 0) { i = 0; error = sysctl_handle_int(oidp, &i, 0, req); } if (error != 0 || req->newptr == NULL) return (error); (*fp)(0x11111111, 0x22222222, 0x33333333); return (0); } void kdb_panic(const char *msg) { printf("KDB: panic\n"); panic("%s", msg); } void kdb_reboot(void) { printf("KDB: reboot requested\n"); shutdown_nice(0); } /* * Solaris implements a new BREAK that is initiated by the character sequence * CR ~ ^B, similar to the familiar pattern used on Sun servers by the * Remote Console. * * Note that this function may be called from almost anywhere, with interrupts * disabled and with unknown locks held, so it must not access data other than * its arguments. It's up to the caller to ensure that the state variable is * consistent. */ #define KEY_CR 13 /* CR '\r' */ #define KEY_TILDE 126 /* ~ */ #define KEY_CRTLB 2 /* ^B */ #define KEY_CRTLP 16 /* ^P */ #define KEY_CRTLR 18 /* ^R */ /* States of the KDB "alternate break sequence" detecting state machine. */ enum { KDB_ALT_BREAK_SEEN_NONE, KDB_ALT_BREAK_SEEN_CR, KDB_ALT_BREAK_SEEN_CR_TILDE, }; int kdb_break(void) { if (!kdb_break_to_debugger) return (0); kdb_enter(KDB_WHY_BREAK, "Break to debugger"); return (KDB_REQ_DEBUGGER); } static int kdb_alt_break_state(int key, int *state) { int brk; /* All states transition to KDB_ALT_BREAK_SEEN_CR on a CR.
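The recognizer consumes one key at a time and only fires on the exact CR ~ ^B (or ^P / ^R) tail; any other key drops back to the start state. A compressed userland model of the same state machine, covering the debugger case only:

#include <stdio.h>

enum { SEEN_NONE, SEEN_CR, SEEN_CR_TILDE };

/* Model of kdb_alt_break_state(): returns nonzero on a full match. */
static int
alt_break(int key, int *state)
{
	if (key == '\r') {
		*state = SEEN_CR;		/* CR always (re)arms */
		return (0);
	}
	if (*state == SEEN_CR && key == '~') {
		*state = SEEN_CR_TILDE;
		return (0);
	}
	if (*state == SEEN_CR_TILDE && key == 0x02) {	/* ^B */
		*state = SEEN_NONE;
		return (1);			/* break to debugger */
	}
	*state = SEEN_NONE;			/* anything else resets */
	return (0);
}

int main(void)
{
	const int keys[] = { 'x', '\r', '~', 0x02 };
	int i, st = SEEN_NONE;

	for (i = 0; i < 4; i++)
		if (alt_break(keys[i], &st))
			printf("sequence matched at key %d\n", i);
	return (0);
}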
*/ if (key == KEY_CR) { *state = KDB_ALT_BREAK_SEEN_CR; return (0); } brk = 0; switch (*state) { case KDB_ALT_BREAK_SEEN_CR: *state = KDB_ALT_BREAK_SEEN_NONE; if (key == KEY_TILDE) *state = KDB_ALT_BREAK_SEEN_CR_TILDE; break; case KDB_ALT_BREAK_SEEN_CR_TILDE: *state = KDB_ALT_BREAK_SEEN_NONE; if (key == KEY_CRTLB) brk = KDB_REQ_DEBUGGER; else if (key == KEY_CRTLP) brk = KDB_REQ_PANIC; else if (key == KEY_CRTLR) brk = KDB_REQ_REBOOT; break; case KDB_ALT_BREAK_SEEN_NONE: default: *state = KDB_ALT_BREAK_SEEN_NONE; break; } return (brk); } static int kdb_alt_break_internal(int key, int *state, int force_gdb) { int brk; if (!kdb_alt_break_to_debugger) return (0); brk = kdb_alt_break_state(key, state); switch (brk) { case KDB_REQ_DEBUGGER: if (force_gdb) kdb_dbbe_select("gdb"); kdb_enter(KDB_WHY_BREAK, "Break to debugger"); break; case KDB_REQ_PANIC: if (force_gdb) kdb_dbbe_select("gdb"); kdb_panic("Panic sequence on console"); break; case KDB_REQ_REBOOT: kdb_reboot(); break; } return (0); } int kdb_alt_break(int key, int *state) { return (kdb_alt_break_internal(key, state, 0)); } /* * This variation on kdb_alt_break() is used only by dcons, which has its own * configuration flag to force GDB use regardless of the global KDB * configuration. */ int kdb_alt_break_gdb(int key, int *state) { return (kdb_alt_break_internal(key, state, 1)); } /* * Print a backtrace of the calling thread. The backtrace is generated by * the selected debugger, provided it supports backtraces. If no debugger * is selected or the current debugger does not support backtraces, this * function silently returns. */ void kdb_backtrace(void) { if (kdb_dbbe != NULL && kdb_dbbe->dbbe_trace != NULL) { printf("KDB: stack backtrace:\n"); kdb_dbbe->dbbe_trace(); } #ifdef STACK else { struct stack st; printf("KDB: stack backtrace:\n"); stack_zero(&st); stack_save(&st); stack_print_ddb(&st); } #endif } /* * Similar to kdb_backtrace() except that it prints a backtrace of an * arbitrary thread rather than the calling thread. */ void kdb_backtrace_thread(struct thread *td) { if (kdb_dbbe != NULL && kdb_dbbe->dbbe_trace_thread != NULL) { printf("KDB: stack backtrace of thread %d:\n", td->td_tid); kdb_dbbe->dbbe_trace_thread(td); } #ifdef STACK else { struct stack st; printf("KDB: stack backtrace of thread %d:\n", td->td_tid); stack_zero(&st); stack_save_td(&st, td); stack_print_ddb(&st); } #endif } /* * Set/change the current backend. */ int kdb_dbbe_select(const char *name) { struct kdb_dbbe *be, **iter; SET_FOREACH(iter, kdb_dbbe_set) { be = *iter; if (be->dbbe_active == 0 && strcmp(be->dbbe_name, name) == 0) { kdb_dbbe = be; return (0); } } return (EINVAL); } /* * Enter the currently selected debugger. If a message has been provided, * it is printed first. If the debugger does not support the enter method, * it is entered by using breakpoint(), which enters the debugger through * kdb_trap(). The 'why' argument will contain a more mechanically usable * string than 'msg', and is relied upon by DDB scripting to identify the * reason for entering the debugger so that the right script can be run. */ void kdb_enter(const char *why, const char *msg) { if (kdb_dbbe != NULL && kdb_active == 0) { if (msg != NULL) printf("KDB: enter: %s\n", msg); kdb_why = why; breakpoint(); kdb_why = KDB_WHY_UNSET; } } /* * Initialize the kernel debugger interface. */ void kdb_init(void) { struct kdb_dbbe *be, **iter; int cur_pri, pri; kdb_active = 0; kdb_dbbe = NULL; cur_pri = -1; SET_FOREACH(iter, kdb_dbbe_set) { be = *iter; pri = (be->dbbe_init != NULL) ? 
be->dbbe_init() : -1; be->dbbe_active = (pri >= 0) ? 0 : -1; if (pri > cur_pri) { cur_pri = pri; kdb_dbbe = be; } } if (kdb_dbbe != NULL) { printf("KDB: debugger backends:"); SET_FOREACH(iter, kdb_dbbe_set) { be = *iter; if (be->dbbe_active == 0) printf(" %s", be->dbbe_name); } printf("\n"); printf("KDB: current backend: %s\n", kdb_dbbe->dbbe_name); } } /* * Handle contexts. */ void * kdb_jmpbuf(jmp_buf new) { void *old; old = kdb_jmpbufp; kdb_jmpbufp = new; return (old); } void kdb_reenter(void) { if (!kdb_active || kdb_jmpbufp == NULL) return; printf("KDB: reentering\n"); kdb_backtrace(); longjmp(kdb_jmpbufp, 1); /* NOTREACHED */ } /* * Thread related support functions. */ struct pcb * kdb_thr_ctx(struct thread *thr) { #if defined(SMP) && defined(KDB_STOPPEDPCB) struct pcpu *pc; #endif if (thr == curthread) return (&kdb_pcb); #if defined(SMP) && defined(KDB_STOPPEDPCB) STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (pc->pc_curthread == thr && CPU_ISSET(pc->pc_cpuid, &stopped_cpus)) return (KDB_STOPPEDPCB(pc)); } #endif return (thr->td_pcb); } struct thread * kdb_thr_first(void) { struct proc *p; struct thread *thr; p = LIST_FIRST(&allproc); while (p != NULL) { if (p->p_flag & P_INMEM) { thr = FIRST_THREAD_IN_PROC(p); if (thr != NULL) return (thr); } p = LIST_NEXT(p, p_list); } return (NULL); } struct thread * kdb_thr_from_pid(pid_t pid) { struct proc *p; p = LIST_FIRST(&allproc); while (p != NULL) { if (p->p_flag & P_INMEM && p->p_pid == pid) return (FIRST_THREAD_IN_PROC(p)); p = LIST_NEXT(p, p_list); } return (NULL); } struct thread * kdb_thr_lookup(lwpid_t tid) { struct thread *thr; thr = kdb_thr_first(); while (thr != NULL && thr->td_tid != tid) thr = kdb_thr_next(thr); return (thr); } struct thread * kdb_thr_next(struct thread *thr) { struct proc *p; p = thr->td_proc; thr = TAILQ_NEXT(thr, td_plist); do { if (thr != NULL) return (thr); p = LIST_NEXT(p, p_list); if (p != NULL && (p->p_flag & P_INMEM)) thr = FIRST_THREAD_IN_PROC(p); } while (p != NULL); return (NULL); } int kdb_thr_select(struct thread *thr) { if (thr == NULL) return (EINVAL); kdb_thread = thr; kdb_thrctx = kdb_thr_ctx(thr); return (0); } /* * Enter the debugger due to a trap. */ int kdb_trap(int type, int code, struct trapframe *tf) { #ifdef SMP cpuset_t other_cpus; #endif struct kdb_dbbe *be; register_t intr; int handled; #ifdef SMP int did_stop_cpus; #endif be = kdb_dbbe; if (be == NULL || be->dbbe_trap == NULL) return (0); /* We reenter the debugger through kdb_reenter(). */ if (kdb_active) return (0); intr = intr_disable(); #ifdef SMP if (!SCHEDULER_STOPPED()) { other_cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &other_cpus); stop_cpus_hard(other_cpus); did_stop_cpus = 1; } else did_stop_cpus = 0; #endif kdb_active++; kdb_frame = tf; /* Let MD code do its thing first... */ kdb_cpu_trap(type, code); makectx(tf, &kdb_pcb); kdb_thr_select(curthread); cngrab(); for (;;) { handled = be->dbbe_trap(type, code); if (be == kdb_dbbe) break; be = kdb_dbbe; if (be == NULL || be->dbbe_trap == NULL) break; printf("Switching to %s back-end\n", be->dbbe_name); } cnungrab(); kdb_active--; #ifdef SMP if (did_stop_cpus) restart_cpus(stopped_cpus); #endif intr_restore(intr); return (handled); } Index: head/sys/kern/subr_kobj.c =================================================================== --- head/sys/kern/subr_kobj.c (revision 326270) +++ head/sys/kern/subr_kobj.c (revision 326271) @@ -1,340 +1,342 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2000,2003 Doug Rabson * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #ifndef TEST #include #endif #ifdef TEST #include "usertest.h" #endif static MALLOC_DEFINE(M_KOBJ, "kobj", "Kernel object structures"); #ifdef KOBJ_STATS u_int kobj_lookup_hits; u_int kobj_lookup_misses; SYSCTL_UINT(_kern, OID_AUTO, kobj_hits, CTLFLAG_RD, &kobj_lookup_hits, 0, ""); SYSCTL_UINT(_kern, OID_AUTO, kobj_misses, CTLFLAG_RD, &kobj_lookup_misses, 0, ""); #endif static struct mtx kobj_mtx; static int kobj_mutex_inited; static int kobj_next_id = 1; #define KOBJ_LOCK() mtx_lock(&kobj_mtx) #define KOBJ_UNLOCK() mtx_unlock(&kobj_mtx) #define KOBJ_ASSERT(what) mtx_assert(&kobj_mtx, what); SYSCTL_INT(_kern, OID_AUTO, kobj_methodcount, CTLFLAG_RD, &kobj_next_id, 0, ""); static void kobj_init_mutex(void *arg) { if (!kobj_mutex_inited) { mtx_init(&kobj_mtx, "kobj", NULL, MTX_DEF); kobj_mutex_inited = 1; } } SYSINIT(kobj, SI_SUB_LOCK, SI_ORDER_ANY, kobj_init_mutex, NULL); /* * This method structure is used to initialise new caches. Since the * desc pointer is NULL, it is guaranteed never to match any real * descriptors. */ static const struct kobj_method null_method = { 0, 0, }; int kobj_error_method(void) { return ENXIO; } static void kobj_class_compile_common(kobj_class_t cls, kobj_ops_t ops) { kobj_method_t *m; int i; /* * Don't do anything if we are already compiled. */ if (cls->ops) return; /* * First register any methods which need it. */ for (i = 0, m = cls->methods; m->desc; i++, m++) { if (m->desc->id == 0) m->desc->id = kobj_next_id++; } /* * Then initialise the ops table. */ for (i = 0; i < KOBJ_CACHE_SIZE; i++) ops->cache[i] = &null_method; ops->cls = cls; cls->ops = ops; } void kobj_class_compile(kobj_class_t cls) { kobj_ops_t ops; KOBJ_ASSERT(MA_NOTOWNED); /* * Allocate space for the compiled ops table. */ ops = malloc(sizeof(struct kobj_ops), M_KOBJ, M_NOWAIT); if (!ops) panic("%s: out of memory", __func__); KOBJ_LOCK(); /* * We may have lost a race for kobj_class_compile here - check * to make sure someone else hasn't already compiled this * class.
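The slow path behind the per-class cache (kobj_lookup_method() below) is just a linear scan of the class's method table, falling back to the descriptor's default method on a miss. A userland model of that lookup, with invented table entries rather than the kernel's generated descriptors:

#include <stdio.h>
#include <stddef.h>

struct desc { int id; int (*deflt)(void); };
struct method { const struct desc *d; int (*func)(void); };

static int err_method(void) { return (-1); }	/* like kobj_error_method */
static int hello(void) { printf("hello\n"); return (0); }

static const struct desc hello_desc = { 1, err_method };
static const struct desc bye_desc = { 2, err_method };

/* Scan the table; fall back to the descriptor's default on a miss. */
static int (*
lookup(const struct method *m, const struct desc *d))(void)
{
	for (; m->d != NULL; m++)
		if (m->d == d)
			return (m->func);
	return (d->deflt);
}

int main(void)
{
	const struct method cls[] = {
		{ &hello_desc, hello },
		{ NULL, NULL },			/* table terminator */
	};

	(void)lookup(cls, &hello_desc)();	/* implemented: prints hello */
	printf("%d\n", lookup(cls, &bye_desc)());	/* default: -1 */
	return (0);
}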
*/ if (cls->ops) { KOBJ_UNLOCK(); free(ops, M_KOBJ); return; } kobj_class_compile_common(cls, ops); KOBJ_UNLOCK(); } void kobj_class_compile_static(kobj_class_t cls, kobj_ops_t ops) { KASSERT(kobj_mutex_inited == 0, ("%s: only supported during early cycles", __func__)); /* * Increment refs to make sure that the ops table is not freed. */ cls->refs++; kobj_class_compile_common(cls, ops); } static kobj_method_t* kobj_lookup_method_class(kobj_class_t cls, kobjop_desc_t desc) { kobj_method_t *methods = cls->methods; kobj_method_t *ce; for (ce = methods; ce && ce->desc; ce++) { if (ce->desc == desc) { return ce; } } return NULL; } static kobj_method_t* kobj_lookup_method_mi(kobj_class_t cls, kobjop_desc_t desc) { kobj_method_t *ce; kobj_class_t *basep; ce = kobj_lookup_method_class(cls, desc); if (ce) return ce; basep = cls->baseclasses; if (basep) { for (; *basep; basep++) { ce = kobj_lookup_method_mi(*basep, desc); if (ce) return ce; } } return NULL; } kobj_method_t* kobj_lookup_method(kobj_class_t cls, kobj_method_t **cep, kobjop_desc_t desc) { kobj_method_t *ce; ce = kobj_lookup_method_mi(cls, desc); if (!ce) ce = &desc->deflt; if (cep) *cep = ce; return ce; } void kobj_class_free(kobj_class_t cls) { void* ops = NULL; KOBJ_ASSERT(MA_NOTOWNED); KOBJ_LOCK(); /* * Protect against a race between kobj_create and * kobj_delete. */ if (cls->refs == 0) { /* * For now we don't do anything to unregister any methods * which are no longer used. */ /* * Free memory and clean up. */ ops = cls->ops; cls->ops = NULL; } KOBJ_UNLOCK(); if (ops) free(ops, M_KOBJ); } kobj_t kobj_create(kobj_class_t cls, struct malloc_type *mtype, int mflags) { kobj_t obj; /* * Allocate and initialise the new object. */ obj = malloc(cls->size, mtype, mflags | M_ZERO); if (!obj) return NULL; kobj_init(obj, cls); return obj; } static void kobj_init_common(kobj_t obj, kobj_class_t cls) { obj->ops = cls->ops; cls->refs++; } void kobj_init(kobj_t obj, kobj_class_t cls) { KOBJ_ASSERT(MA_NOTOWNED); retry: KOBJ_LOCK(); /* * Consider compiling the class' method table. */ if (!cls->ops) { /* * kobj_class_compile doesn't want the lock held * because of the call to malloc - we drop the lock * and re-try. */ KOBJ_UNLOCK(); kobj_class_compile(cls); goto retry; } kobj_init_common(obj, cls); KOBJ_UNLOCK(); } void kobj_init_static(kobj_t obj, kobj_class_t cls) { KASSERT(kobj_mutex_inited == 0, ("%s: only supported during early cycles", __func__)); kobj_init_common(obj, cls); } void kobj_delete(kobj_t obj, struct malloc_type *mtype) { kobj_class_t cls = obj->ops->cls; int refs; /* * Consider freeing the compiled method table for the class * after its last instance is deleted. As an optimisation, we * should defer this for a short while to avoid thrashing. */ KOBJ_ASSERT(MA_NOTOWNED); KOBJ_LOCK(); cls->refs--; refs = cls->refs; KOBJ_UNLOCK(); if (!refs) kobj_class_free(cls); obj->ops = NULL; if (mtype) free(obj, mtype); } Index: head/sys/kern/subr_lock.c =================================================================== --- head/sys/kern/subr_lock.c (revision 326270) +++ head/sys/kern/subr_lock.c (revision 326271) @@ -1,700 +1,702 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2006 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This module holds the global variables and functions used to maintain * lock_object structures. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_mprof.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include SDT_PROVIDER_DEFINE(lock); SDT_PROBE_DEFINE1(lock, , , starvation, "u_int"); CTASSERT(LOCK_CLASS_MAX == 15); struct lock_class *lock_classes[LOCK_CLASS_MAX + 1] = { &lock_class_mtx_spin, &lock_class_mtx_sleep, &lock_class_sx, &lock_class_rm, &lock_class_rm_sleepable, &lock_class_rw, &lock_class_lockmgr, }; void lock_init(struct lock_object *lock, struct lock_class *class, const char *name, const char *type, int flags) { int i; /* Check for double-init and zero object. */ KASSERT(flags & LO_NEW || !lock_initialized(lock), ("lock \"%s\" %p already initialized", name, lock)); /* Look up lock class to find its index. */ for (i = 0; i < LOCK_CLASS_MAX; i++) if (lock_classes[i] == class) { lock->lo_flags = i << LO_CLASSSHIFT; break; } KASSERT(i < LOCK_CLASS_MAX, ("unknown lock class %p", class)); /* Initialize the lock object. */ lock->lo_name = name; lock->lo_flags |= flags | LO_INITIALIZED; LOCK_LOG_INIT(lock, 0); WITNESS_INIT(lock, (type != NULL) ? 
type : name); } void lock_destroy(struct lock_object *lock) { KASSERT(lock_initialized(lock), ("lock %p is not initialized", lock)); WITNESS_DESTROY(lock); LOCK_LOG_DESTROY(lock, 0); lock->lo_flags &= ~LO_INITIALIZED; } static SYSCTL_NODE(_debug, OID_AUTO, lock, CTLFLAG_RD, NULL, "lock debugging"); static SYSCTL_NODE(_debug_lock, OID_AUTO, delay, CTLFLAG_RD, NULL, "lock delay"); static u_int __read_mostly starvation_limit = 131072; SYSCTL_INT(_debug_lock_delay, OID_AUTO, starvation_limit, CTLFLAG_RW, &starvation_limit, 0, ""); static u_int __read_mostly restrict_starvation = 0; SYSCTL_INT(_debug_lock_delay, OID_AUTO, restrict_starvation, CTLFLAG_RW, &restrict_starvation, 0, ""); void lock_delay(struct lock_delay_arg *la) { struct lock_delay_config *lc = la->config; u_int i; la->delay <<= 1; if (__predict_false(la->delay > lc->max)) la->delay = lc->max; for (i = la->delay; i > 0; i--) cpu_spinwait(); la->spin_cnt += la->delay; if (__predict_false(la->spin_cnt > starvation_limit)) { SDT_PROBE1(lock, , , starvation, la->delay); if (restrict_starvation) la->delay = lc->base; } } static u_int lock_roundup_2(u_int val) { u_int res; for (res = 1; res <= val; res <<= 1) continue; return (res); } void lock_delay_default_init(struct lock_delay_config *lc) { lc->base = lock_roundup_2(mp_ncpus) / 4; lc->max = lc->base * 1024; } #ifdef DDB DB_SHOW_COMMAND(lock, db_show_lock) { struct lock_object *lock; struct lock_class *class; if (!have_addr) return; lock = (struct lock_object *)addr; if (LO_CLASSINDEX(lock) > LOCK_CLASS_MAX) { db_printf("Unknown lock class: %d\n", LO_CLASSINDEX(lock)); return; } class = LOCK_CLASS(lock); db_printf(" class: %s\n", class->lc_name); db_printf(" name: %s\n", lock->lo_name); class->lc_ddb_show(lock); } #endif #ifdef LOCK_PROFILING /* * One object per-thread for each lock the thread owns. Tracks individual * lock instances. */ struct lock_profile_object { LIST_ENTRY(lock_profile_object) lpo_link; struct lock_object *lpo_obj; const char *lpo_file; int lpo_line; uint16_t lpo_ref; uint16_t lpo_cnt; uint64_t lpo_acqtime; uint64_t lpo_waittime; u_int lpo_contest_locking; }; /* * One lock_prof for each (file, line, lock object) triple. */ struct lock_prof { SLIST_ENTRY(lock_prof) link; struct lock_class *class; const char *file; const char *name; int line; int ticks; uintmax_t cnt_wait_max; uintmax_t cnt_max; uintmax_t cnt_tot; uintmax_t cnt_wait; uintmax_t cnt_cur; uintmax_t cnt_contest_locking; }; SLIST_HEAD(lphead, lock_prof); #define LPROF_HASH_SIZE 4096 #define LPROF_HASH_MASK (LPROF_HASH_SIZE - 1) #define LPROF_CACHE_SIZE 4096 /* * Array of objects and profs for each type of object for each cpu. Spinlocks * are handled separately because a thread may be preempted and acquire a * spinlock while in the lock profiling code of a non-spinlock. In this way * we only need a critical section to protect the per-cpu lists. */ struct lock_prof_type { struct lphead lpt_lpalloc; struct lpohead lpt_lpoalloc; struct lphead lpt_hash[LPROF_HASH_SIZE]; struct lock_prof lpt_prof[LPROF_CACHE_SIZE]; struct lock_profile_object lpt_objs[LPROF_CACHE_SIZE]; }; struct lock_prof_cpu { struct lock_prof_type lpc_types[2]; /* One for spin one for other. 
*/ }; struct lock_prof_cpu *lp_cpu[MAXCPU]; volatile int __read_mostly lock_prof_enable; static volatile int lock_prof_resetting; #define LPROF_SBUF_SIZE 256 static int lock_prof_rejected; static int lock_prof_skipspin; static int lock_prof_skipcount; #ifndef USE_CPU_NANOSECONDS uint64_t nanoseconds(void) { struct bintime bt; uint64_t ns; binuptime(&bt); /* From bintime2timespec */ ns = bt.sec * (uint64_t)1000000000; ns += ((uint64_t)1000000000 * (uint32_t)(bt.frac >> 32)) >> 32; return (ns); } #endif static void lock_prof_init_type(struct lock_prof_type *type) { int i; SLIST_INIT(&type->lpt_lpalloc); LIST_INIT(&type->lpt_lpoalloc); for (i = 0; i < LPROF_CACHE_SIZE; i++) { SLIST_INSERT_HEAD(&type->lpt_lpalloc, &type->lpt_prof[i], link); LIST_INSERT_HEAD(&type->lpt_lpoalloc, &type->lpt_objs[i], lpo_link); } } static void lock_prof_init(void *arg) { int cpu; for (cpu = 0; cpu <= mp_maxid; cpu++) { lp_cpu[cpu] = malloc(sizeof(*lp_cpu[cpu]), M_DEVBUF, M_WAITOK | M_ZERO); lock_prof_init_type(&lp_cpu[cpu]->lpc_types[0]); lock_prof_init_type(&lp_cpu[cpu]->lpc_types[1]); } } SYSINIT(lockprof, SI_SUB_SMP, SI_ORDER_ANY, lock_prof_init, NULL); static void lock_prof_reset_wait(void) { /* * Spin relinquishing our cpu so that quiesce_all_cpus may * complete. */ while (lock_prof_resetting) sched_relinquish(curthread); } static void lock_prof_reset(void) { struct lock_prof_cpu *lpc; int enabled, i, cpu; /* * We not only race with acquiring and releasing locks but also * thread exit. To be certain that threads exit without valid head * pointers they must see resetting set before enabled is cleared. * Otherwise a lock may not be removed from a per-thread list due * to disabled being set but not wait for reset() to remove it below. */ atomic_store_rel_int(&lock_prof_resetting, 1); enabled = lock_prof_enable; lock_prof_enable = 0; quiesce_all_cpus("profreset", 0); /* * Some objects may have migrated between CPUs. Clear all links * before we zero the structures. Some items may still be linked * into per-thread lists as well. */ for (cpu = 0; cpu <= mp_maxid; cpu++) { lpc = lp_cpu[cpu]; for (i = 0; i < LPROF_CACHE_SIZE; i++) { LIST_REMOVE(&lpc->lpc_types[0].lpt_objs[i], lpo_link); LIST_REMOVE(&lpc->lpc_types[1].lpt_objs[i], lpo_link); } } for (cpu = 0; cpu <= mp_maxid; cpu++) { lpc = lp_cpu[cpu]; bzero(lpc, sizeof(*lpc)); lock_prof_init_type(&lpc->lpc_types[0]); lock_prof_init_type(&lpc->lpc_types[1]); } atomic_store_rel_int(&lock_prof_resetting, 0); lock_prof_enable = enabled; } static void lock_prof_output(struct lock_prof *lp, struct sbuf *sb) { const char *p; for (p = lp->file; p != NULL && strncmp(p, "../", 3) == 0; p += 3); sbuf_printf(sb, "%8ju %9ju %11ju %11ju %11ju %6ju %6ju %2ju %6ju %s:%d (%s:%s)\n", lp->cnt_max / 1000, lp->cnt_wait_max / 1000, lp->cnt_tot / 1000, lp->cnt_wait / 1000, lp->cnt_cur, lp->cnt_cur == 0 ? (uintmax_t)0 : lp->cnt_tot / (lp->cnt_cur * 1000), lp->cnt_cur == 0 ? 
(uintmax_t)0 : lp->cnt_wait / (lp->cnt_cur * 1000), (uintmax_t)0, lp->cnt_contest_locking, p, lp->line, lp->class->lc_name, lp->name); } static void lock_prof_sum(struct lock_prof *match, struct lock_prof *dst, int hash, int spin, int t) { struct lock_prof_type *type; struct lock_prof *l; int cpu; dst->file = match->file; dst->line = match->line; dst->class = match->class; dst->name = match->name; for (cpu = 0; cpu <= mp_maxid; cpu++) { if (lp_cpu[cpu] == NULL) continue; type = &lp_cpu[cpu]->lpc_types[spin]; SLIST_FOREACH(l, &type->lpt_hash[hash], link) { if (l->ticks == t) continue; if (l->file != match->file || l->line != match->line || l->name != match->name) continue; l->ticks = t; if (l->cnt_max > dst->cnt_max) dst->cnt_max = l->cnt_max; if (l->cnt_wait_max > dst->cnt_wait_max) dst->cnt_wait_max = l->cnt_wait_max; dst->cnt_tot += l->cnt_tot; dst->cnt_wait += l->cnt_wait; dst->cnt_cur += l->cnt_cur; dst->cnt_contest_locking += l->cnt_contest_locking; } } } static void lock_prof_type_stats(struct lock_prof_type *type, struct sbuf *sb, int spin, int t) { struct lock_prof *l; int i; for (i = 0; i < LPROF_HASH_SIZE; ++i) { SLIST_FOREACH(l, &type->lpt_hash[i], link) { struct lock_prof lp = {}; if (l->ticks == t) continue; lock_prof_sum(l, &lp, i, spin, t); lock_prof_output(&lp, sb); } } } static int dump_lock_prof_stats(SYSCTL_HANDLER_ARGS) { struct sbuf *sb; int error, cpu, t; int enabled; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, LPROF_SBUF_SIZE, req); sbuf_printf(sb, "\n%8s %9s %11s %11s %11s %6s %6s %2s %6s %s\n", "max", "wait_max", "total", "wait_total", "count", "avg", "wait_avg", "cnt_hold", "cnt_lock", "name"); enabled = lock_prof_enable; lock_prof_enable = 0; quiesce_all_cpus("profstat", 0); t = ticks; for (cpu = 0; cpu <= mp_maxid; cpu++) { if (lp_cpu[cpu] == NULL) continue; lock_prof_type_stats(&lp_cpu[cpu]->lpc_types[0], sb, 0, t); lock_prof_type_stats(&lp_cpu[cpu]->lpc_types[1], sb, 1, t); } lock_prof_enable = enabled; error = sbuf_finish(sb); /* Output a trailing NUL. 
*/ if (error == 0) error = SYSCTL_OUT(req, "", 1); sbuf_delete(sb); return (error); } static int enable_lock_prof(SYSCTL_HANDLER_ARGS) { int error, v; v = lock_prof_enable; error = sysctl_handle_int(oidp, &v, v, req); if (error) return (error); if (req->newptr == NULL) return (error); if (v == lock_prof_enable) return (0); if (v == 1) lock_prof_reset(); lock_prof_enable = !!v; return (0); } static int reset_lock_prof_stats(SYSCTL_HANDLER_ARGS) { int error, v; v = 0; error = sysctl_handle_int(oidp, &v, 0, req); if (error) return (error); if (req->newptr == NULL) return (error); if (v == 0) return (0); lock_prof_reset(); return (0); } static struct lock_prof * lock_profile_lookup(struct lock_object *lo, int spin, const char *file, int line) { const char *unknown = "(unknown)"; struct lock_prof_type *type; struct lock_prof *lp; struct lphead *head; const char *p; u_int hash; p = file; if (p == NULL || *p == '\0') p = unknown; hash = (uintptr_t)lo->lo_name * 31 + (uintptr_t)p * 31 + line; hash &= LPROF_HASH_MASK; type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin]; head = &type->lpt_hash[hash]; SLIST_FOREACH(lp, head, link) { if (lp->line == line && lp->file == p && lp->name == lo->lo_name) return (lp); } lp = SLIST_FIRST(&type->lpt_lpalloc); if (lp == NULL) { lock_prof_rejected++; return (lp); } SLIST_REMOVE_HEAD(&type->lpt_lpalloc, link); lp->file = p; lp->line = line; lp->class = LOCK_CLASS(lo); lp->name = lo->lo_name; SLIST_INSERT_HEAD(&type->lpt_hash[hash], lp, link); return (lp); } static struct lock_profile_object * lock_profile_object_lookup(struct lock_object *lo, int spin, const char *file, int line) { struct lock_profile_object *l; struct lock_prof_type *type; struct lpohead *head; head = &curthread->td_lprof[spin]; LIST_FOREACH(l, head, lpo_link) if (l->lpo_obj == lo && l->lpo_file == file && l->lpo_line == line) return (l); type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin]; l = LIST_FIRST(&type->lpt_lpoalloc); if (l == NULL) { lock_prof_rejected++; return (NULL); } LIST_REMOVE(l, lpo_link); l->lpo_obj = lo; l->lpo_file = file; l->lpo_line = line; l->lpo_cnt = 0; LIST_INSERT_HEAD(head, l, lpo_link); return (l); } void lock_profile_obtain_lock_success(struct lock_object *lo, int contested, uint64_t waittime, const char *file, int line) { static int lock_prof_count; struct lock_profile_object *l; int spin; if (SCHEDULER_STOPPED()) return; /* don't reset the timer when/if recursing */ if (!lock_prof_enable || (lo->lo_flags & LO_NOPROFILE)) return; if (lock_prof_skipcount && (++lock_prof_count % lock_prof_skipcount) != 0) return; spin = (LOCK_CLASS(lo)->lc_flags & LC_SPINLOCK) ? 1 : 0; if (spin && lock_prof_skipspin == 1) return; critical_enter(); /* Recheck enabled now that we're in a critical section. */ if (lock_prof_enable == 0) goto out; l = lock_profile_object_lookup(lo, spin, file, line); if (l == NULL) goto out; l->lpo_cnt++; if (++l->lpo_ref > 1) goto out; l->lpo_contest_locking = contested; l->lpo_acqtime = nanoseconds(); if (waittime && (l->lpo_acqtime > waittime)) l->lpo_waittime = l->lpo_acqtime - waittime; else l->lpo_waittime = 0; out: critical_exit(); } void lock_profile_thread_exit(struct thread *td) { #ifdef INVARIANTS struct lock_profile_object *l; MPASS(curthread->td_critnest == 0); #endif /* * If lock profiling was disabled we have to wait for reset to * clear our pointers before we can exit safely. 
*/ lock_prof_reset_wait(); #ifdef INVARIANTS LIST_FOREACH(l, &td->td_lprof[0], lpo_link) printf("thread still holds lock acquired at %s:%d\n", l->lpo_file, l->lpo_line); LIST_FOREACH(l, &td->td_lprof[1], lpo_link) printf("thread still holds lock acquired at %s:%d\n", l->lpo_file, l->lpo_line); #endif MPASS(LIST_FIRST(&td->td_lprof[0]) == NULL); MPASS(LIST_FIRST(&td->td_lprof[1]) == NULL); } void lock_profile_release_lock(struct lock_object *lo) { struct lock_profile_object *l; struct lock_prof_type *type; struct lock_prof *lp; uint64_t curtime, holdtime; struct lpohead *head; int spin; if (SCHEDULER_STOPPED()) return; if (lo->lo_flags & LO_NOPROFILE) return; spin = (LOCK_CLASS(lo)->lc_flags & LC_SPINLOCK) ? 1 : 0; head = &curthread->td_lprof[spin]; if (LIST_FIRST(head) == NULL) return; critical_enter(); /* Recheck enabled now that we're in a critical section. */ if (lock_prof_enable == 0 && lock_prof_resetting == 1) goto out; /* * If lock profiling is not enabled we still want to remove the * lpo from our queue. */ LIST_FOREACH(l, head, lpo_link) if (l->lpo_obj == lo) break; if (l == NULL) goto out; if (--l->lpo_ref > 0) goto out; lp = lock_profile_lookup(lo, spin, l->lpo_file, l->lpo_line); if (lp == NULL) goto release; curtime = nanoseconds(); if (curtime < l->lpo_acqtime) goto release; holdtime = curtime - l->lpo_acqtime; /* * Record if the lock has been held longer now than ever * before. */ if (holdtime > lp->cnt_max) lp->cnt_max = holdtime; if (l->lpo_waittime > lp->cnt_wait_max) lp->cnt_wait_max = l->lpo_waittime; lp->cnt_tot += holdtime; lp->cnt_wait += l->lpo_waittime; lp->cnt_contest_locking += l->lpo_contest_locking; lp->cnt_cur += l->lpo_cnt; release: LIST_REMOVE(l, lpo_link); type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin]; LIST_INSERT_HEAD(&type->lpt_lpoalloc, l, lpo_link); out: critical_exit(); } static SYSCTL_NODE(_debug_lock, OID_AUTO, prof, CTLFLAG_RD, NULL, "lock profiling"); SYSCTL_INT(_debug_lock_prof, OID_AUTO, skipspin, CTLFLAG_RW, &lock_prof_skipspin, 0, "Skip profiling on spinlocks."); SYSCTL_INT(_debug_lock_prof, OID_AUTO, skipcount, CTLFLAG_RW, &lock_prof_skipcount, 0, "Sample approximately every N lock acquisitions."); SYSCTL_INT(_debug_lock_prof, OID_AUTO, rejected, CTLFLAG_RD, &lock_prof_rejected, 0, "Number of rejected profiling records"); SYSCTL_PROC(_debug_lock_prof, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, dump_lock_prof_stats, "A", "Lock profiling statistics"); SYSCTL_PROC(_debug_lock_prof, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, reset_lock_prof_stats, "I", "Reset lock profiling statistics"); SYSCTL_PROC(_debug_lock_prof, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, enable_lock_prof, "I", "Enable lock profiling"); #endif Index: head/sys/kern/subr_module.c =================================================================== --- head/sys/kern/subr_module.c (revision 326270) +++ head/sys/kern/subr_module.c (revision 326271) @@ -1,293 +1,295 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1998 Michael Smith * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include /* * Preloaded module support */ vm_offset_t preload_addr_relocate = 0; caddr_t preload_metadata; /* * Search for the preloaded module (name) */ caddr_t preload_search_by_name(const char *name) { caddr_t curp; uint32_t *hdr; int next; if (preload_metadata != NULL) { curp = preload_metadata; for (;;) { hdr = (uint32_t *)curp; if (hdr[0] == 0 && hdr[1] == 0) break; /* Search for a MODINFO_NAME field */ if ((hdr[0] == MODINFO_NAME) && !strcmp(name, curp + sizeof(uint32_t) * 2)) return(curp); /* skip to next field */ next = sizeof(uint32_t) * 2 + hdr[1]; next = roundup(next, sizeof(u_long)); curp += next; } } return(NULL); } /* * Search for the first preloaded module of (type) */ caddr_t preload_search_by_type(const char *type) { caddr_t curp, lname; uint32_t *hdr; int next; if (preload_metadata != NULL) { curp = preload_metadata; lname = NULL; for (;;) { hdr = (uint32_t *)curp; if (hdr[0] == 0 && hdr[1] == 0) break; /* remember the start of each record */ if (hdr[0] == MODINFO_NAME) lname = curp; /* Search for a MODINFO_TYPE field */ if ((hdr[0] == MODINFO_TYPE) && !strcmp(type, curp + sizeof(uint32_t) * 2)) return(lname); /* skip to next field */ next = sizeof(uint32_t) * 2 + hdr[1]; next = roundup(next, sizeof(u_long)); curp += next; } } return(NULL); } /* * Walk through the preloaded module list */ caddr_t preload_search_next_name(caddr_t base) { caddr_t curp; uint32_t *hdr; int next; if (preload_metadata != NULL) { /* Pick up where we left off last time */ if (base) { /* skip to next field */ curp = base; hdr = (uint32_t *)curp; next = sizeof(uint32_t) * 2 + hdr[1]; next = roundup(next, sizeof(u_long)); curp += next; } else curp = preload_metadata; for (;;) { hdr = (uint32_t *)curp; if (hdr[0] == 0 && hdr[1] == 0) break; /* Found a new record? */ if (hdr[0] == MODINFO_NAME) return curp; /* skip to next field */ next = sizeof(uint32_t) * 2 + hdr[1]; next = roundup(next, sizeof(u_long)); curp += next; } } return(NULL); } /* * Given a preloaded module handle (mod), return a pointer * to the data for the attribute (inf). */ caddr_t preload_search_info(caddr_t mod, int inf) { caddr_t curp; uint32_t *hdr; uint32_t type = 0; int next; if (mod == NULL) return (NULL); curp = mod; for (;;) { hdr = (uint32_t *)curp; /* end of module data? */ if (hdr[0] == 0 && hdr[1] == 0) break; /* * We give up once we've looped back to what we were looking at * first - this should normally be a MODINFO_NAME field. 
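All of these walkers share one record layout: a pair of uint32 words (type, payload length) followed by the payload, with the record size rounded up to sizeof(u_long), and the whole list terminated by a record whose two header words are both zero. A self-contained model of the traversal; the record contents and the type value standing in for MODINFO_NAME are made up:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define ROUNDUP(x, y)	(((x) + (y) - 1) / (y) * (y))

int main(void)
{
	unsigned long store[16];		/* aligned backing store */
	unsigned char *base = (unsigned char *)store, *curp = base;
	uint32_t *hdr;

	memset(store, 0, sizeof(store));
	hdr = (uint32_t *)base;
	hdr[0] = 1;				/* stands in for MODINFO_NAME */
	hdr[1] = 6;				/* strlen("mymod") + 1 */
	memcpy(base + 2 * sizeof(uint32_t), "mymod", 6);

	for (;;) {
		hdr = (uint32_t *)curp;
		if (hdr[0] == 0 && hdr[1] == 0)
			break;			/* double zero terminator */
		if (hdr[0] == 1)
			printf("name: %s\n",
			    (char *)curp + 2 * sizeof(uint32_t));
		/* Next record: header plus payload, rounded to u_long. */
		curp += ROUNDUP(2 * sizeof(uint32_t) + hdr[1],
		    sizeof(unsigned long));
	}
	return (0);
}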
*/ if (type == 0) { type = hdr[0]; } else { if (hdr[0] == type) break; } /* * Attribute match? Return pointer to data. * Consumer may safely assume that size value precedes * data. */ if (hdr[0] == inf) return(curp + (sizeof(uint32_t) * 2)); /* skip to next field */ next = sizeof(uint32_t) * 2 + hdr[1]; next = roundup(next, sizeof(u_long)); curp += next; } return(NULL); } /* * Delete a preload record by name. */ void preload_delete_name(const char *name) { caddr_t curp; uint32_t *hdr; int next; int clearing; if (preload_metadata != NULL) { clearing = 0; curp = preload_metadata; for (;;) { hdr = (uint32_t *)curp; if (hdr[0] == 0 && hdr[1] == 0) break; /* Search for a MODINFO_NAME field */ if (hdr[0] == MODINFO_NAME) { if (!strcmp(name, curp + sizeof(uint32_t) * 2)) clearing = 1; /* got it, start clearing */ else if (clearing) clearing = 0; /* at next one now.. better stop */ } if (clearing) hdr[0] = MODINFO_EMPTY; /* skip to next field */ next = sizeof(uint32_t) * 2 + hdr[1]; next = roundup(next, sizeof(u_long)); curp += next; } } } void * preload_fetch_addr(caddr_t mod) { caddr_t *mdp; mdp = (caddr_t *)preload_search_info(mod, MODINFO_ADDR); if (mdp == NULL) return (NULL); return (*mdp + preload_addr_relocate); } size_t preload_fetch_size(caddr_t mod) { size_t *mdp; mdp = (size_t *)preload_search_info(mod, MODINFO_SIZE); if (mdp == NULL) return (0); return (*mdp); } /* Called from locore. Convert physical pointers to kvm. Sigh. */ void preload_bootstrap_relocate(vm_offset_t offset) { caddr_t curp; uint32_t *hdr; vm_offset_t *ptr; int next; if (preload_metadata != NULL) { curp = preload_metadata; for (;;) { hdr = (uint32_t *)curp; if (hdr[0] == 0 && hdr[1] == 0) break; /* Deal with the ones that we know we have to fix */ switch (hdr[0]) { case MODINFO_ADDR: case MODINFO_METADATA|MODINFOMD_SSYM: case MODINFO_METADATA|MODINFOMD_ESYM: ptr = (vm_offset_t *)(curp + (sizeof(uint32_t) * 2)); *ptr += offset; break; } /* The rest is beyond us for now */ /* skip to next field */ next = sizeof(uint32_t) * 2 + hdr[1]; next = roundup(next, sizeof(u_long)); curp += next; } } } Index: head/sys/kern/subr_msgbuf.c =================================================================== --- head/sys/kern/subr_msgbuf.c (revision 326270) +++ head/sys/kern/subr_msgbuf.c (revision 326271) @@ -1,417 +1,419 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2003 Ian Dowse. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Generic message buffer support routines. */ #include #include #include #include #include #include #include /* * Maximum number conversion buffer length: uintmax_t in base 2, plus <> * around the priority, and a terminating NUL. */ #define MAXPRIBUF (sizeof(intmax_t) * NBBY + 3) /* Read/write sequence numbers are modulo a multiple of the buffer size. */ #define SEQMOD(size) ((size) * 16) static u_int msgbuf_cksum(struct msgbuf *mbp); /* * Timestamps in msgbuf are useful when trying to diagnose when core dumps * or other actions occurred. */ static int msgbuf_show_timestamp = 0; SYSCTL_INT(_kern, OID_AUTO, msgbuf_show_timestamp, CTLFLAG_RWTUN, &msgbuf_show_timestamp, 0, "Show timestamp in msgbuf"); /* * Initialize a message buffer of the specified size at the specified * location. This also zeros the buffer area. */ void msgbuf_init(struct msgbuf *mbp, void *ptr, int size) { mbp->msg_ptr = ptr; mbp->msg_size = size; mbp->msg_seqmod = SEQMOD(size); msgbuf_clear(mbp); mbp->msg_magic = MSG_MAGIC; mbp->msg_lastpri = -1; mbp->msg_flags = 0; bzero(&mbp->msg_lock, sizeof(mbp->msg_lock)); mtx_init(&mbp->msg_lock, "msgbuf", NULL, MTX_SPIN); } /* * Reinitialize a message buffer, retaining its previous contents if * the size and checksum are correct. If the old contents cannot be * recovered, the message buffer is cleared. */ void msgbuf_reinit(struct msgbuf *mbp, void *ptr, int size) { u_int cksum; if (mbp->msg_magic != MSG_MAGIC || mbp->msg_size != size) { msgbuf_init(mbp, ptr, size); return; } mbp->msg_seqmod = SEQMOD(size); mbp->msg_wseq = MSGBUF_SEQNORM(mbp, mbp->msg_wseq); mbp->msg_rseq = MSGBUF_SEQNORM(mbp, mbp->msg_rseq); mbp->msg_ptr = ptr; cksum = msgbuf_cksum(mbp); if (cksum != mbp->msg_cksum) { if (bootverbose) { printf("msgbuf cksum mismatch (read %x, calc %x)\n", mbp->msg_cksum, cksum); printf("Old msgbuf not recovered\n"); } msgbuf_clear(mbp); } mbp->msg_lastpri = -1; /* Assume that the old message buffer didn't end in a newline. */ mbp->msg_flags |= MSGBUF_NEEDNL; bzero(&mbp->msg_lock, sizeof(mbp->msg_lock)); mtx_init(&mbp->msg_lock, "msgbuf", NULL, MTX_SPIN); } /* * Clear the message buffer. */ void msgbuf_clear(struct msgbuf *mbp) { bzero(mbp->msg_ptr, mbp->msg_size); mbp->msg_wseq = 0; mbp->msg_rseq = 0; mbp->msg_cksum = 0; } /* * Get a count of the number of unread characters in the message buffer. */ int msgbuf_getcount(struct msgbuf *mbp) { u_int len; len = MSGBUF_SEQSUB(mbp, mbp->msg_wseq, mbp->msg_rseq); if (len > mbp->msg_size) len = mbp->msg_size; return (len); } /* * Add a character into the message buffer, and update the checksum and * sequence number. * * The caller should hold the message buffer spinlock. */ static void msgbuf_do_addchar(struct msgbuf * const mbp, u_int * const seq, const int c) { u_int pos; /* Make sure we properly wrap the sequence number. 
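The sequence macros this file leans on (MSGBUF_SEQNORM, MSGBUF_SEQ_TO_POS, MSGBUF_SEQSUB, from sys/msgbuf.h) reduce to modular arithmetic along these lines: sequence numbers live modulo msg_seqmod, a multiple of the buffer size, so that both a buffer position and the distance between two sequences stay well defined across wraps. A self-contained model with a hypothetical 64-byte buffer (a power of two here, so plain unsigned wraparound agrees with the modulo):

    #include <stdio.h>

    #define SIZE    64u             /* hypothetical msg_size */
    #define SEQMOD  (SIZE * 16u)    /* sequences kept modulo SIZE * 16 */

    static unsigned seqnorm(unsigned seq) { return (seq % SEQMOD); }
    static unsigned seq_to_pos(unsigned seq) { return (seq % SIZE); }
    static unsigned seqsub(unsigned s1, unsigned s2) { return (seqnorm(s1 - s2)); }

    int
    main(void)
    {
            unsigned wseq, rseq;

            wseq = SEQMOD - 3;              /* writer just short of a wrap */
            rseq = seqnorm(wseq - 10);      /* reader ten characters behind */
            wseq = seqnorm(wseq + 6);       /* writer wraps past SEQMOD */

            /* Distance survives the wrap: 10 behind + 6 written = 16. */
            printf("unread = %u\n", seqsub(wseq, rseq));
            printf("write position = %u\n", seq_to_pos(wseq));
            return (0);
    }

Keeping the modulus a multiple of the size is the whole point of SEQMOD: a normalized sequence always maps to the same buffer position regardless of how many times it has wrapped.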
*/ pos = MSGBUF_SEQ_TO_POS(mbp, *seq); mbp->msg_cksum += (u_int)(u_char)c - (u_int)(u_char)mbp->msg_ptr[pos]; mbp->msg_ptr[pos] = c; *seq = MSGBUF_SEQNORM(mbp, *seq + 1); } /* * Append a character to a message buffer. */ void msgbuf_addchar(struct msgbuf *mbp, int c) { mtx_lock_spin(&mbp->msg_lock); msgbuf_do_addchar(mbp, &mbp->msg_wseq, c); mtx_unlock_spin(&mbp->msg_lock); } /* * Append a NUL-terminated string with a priority to a message buffer. * Filter carriage returns if the caller requests it. * * XXX The carriage return filtering behavior is present in the * msglogchar() API, however testing has shown that we don't seem to send * carriage returns down this path. So do we still need it? */ void msgbuf_addstr(struct msgbuf *mbp, int pri, char *str, int filter_cr) { u_int seq; size_t len, prefix_len; char prefix[MAXPRIBUF]; char buf[32]; int nl, i, j, needtime; len = strlen(str); prefix_len = 0; nl = 0; /* If we have a zero-length string, no need to do anything. */ if (len == 0) return; mtx_lock_spin(&mbp->msg_lock); /* * If this is true, we may need to insert a new priority sequence, * so prepare the prefix. */ if (pri != -1) prefix_len = sprintf(prefix, "<%d>", pri); /* * Starting write sequence number. */ seq = mbp->msg_wseq; /* * Whenever there is a change in priority, we have to insert a * newline, and a priority prefix if the priority is not -1. Here * we detect whether there was a priority change, and whether we * did not end with a newline. If that is the case, we need to * insert a newline before this string. */ if (mbp->msg_lastpri != pri && (mbp->msg_flags & MSGBUF_NEEDNL) != 0) { msgbuf_do_addchar(mbp, &seq, '\n'); mbp->msg_flags &= ~MSGBUF_NEEDNL; } needtime = 1; for (i = 0; i < len; i++) { /* * If we just had a newline, and the priority is not -1 * (and therefore prefix_len != 0), then we need a priority * prefix for this line. */ if ((mbp->msg_flags & MSGBUF_NEEDNL) == 0 && prefix_len != 0) { int j; for (j = 0; j < prefix_len; j++) msgbuf_do_addchar(mbp, &seq, prefix[j]); } if (msgbuf_show_timestamp && needtime == 1 && (mbp->msg_flags & MSGBUF_NEEDNL) == 0) { snprintf(buf, sizeof(buf), "[%jd] ", (intmax_t)time_uptime); for (j = 0; buf[j] != '\0'; j++) msgbuf_do_addchar(mbp, &seq, buf[j]); needtime = 0; } /* * Don't copy carriage returns if the caller requested * filtering. * * XXX This matches the behavior of msglogchar(), but is it * necessary? Testing has shown that we don't seem to get * carriage returns here. */ if ((filter_cr != 0) && (str[i] == '\r')) continue; /* * Clear this flag if we see a newline. This affects whether * we need to insert a new prefix or insert a newline later. */ if (str[i] == '\n') mbp->msg_flags &= ~MSGBUF_NEEDNL; else mbp->msg_flags |= MSGBUF_NEEDNL; msgbuf_do_addchar(mbp, &seq, str[i]); } /* * Update the write sequence number for the actual number of * characters we put in the message buffer. (Depends on whether * carriage returns are filtered.) */ mbp->msg_wseq = seq; /* * Set the last priority. */ mbp->msg_lastpri = pri; mtx_unlock_spin(&mbp->msg_lock); } /* * Read and mark as read a character from a message buffer. * Returns the character, or -1 if no characters are available. 
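One rule shows up in both msgbuf_getchar() and msgbuf_getbytes() below: if the computed unread length exceeds msg_size, the writer has lapped the reader and only the most recent msg_size characters still exist, so the read sequence is snapped forward before anything is copied out. The rule in isolation (a sketch; modular normalization omitted for brevity):

    #include <stdio.h>

    #define SIZE    64u     /* hypothetical msg_size */

    /* Clamp the unread length and resynchronize a lapped reader. */
    static unsigned
    catch_up(unsigned wseq, unsigned *rseq)
    {
            unsigned len = wseq - *rseq;

            if (len > SIZE) {
                    *rseq = wseq - SIZE;    /* oldest surviving character */
                    len = SIZE;
            }
            return (len);
    }

    int
    main(void)
    {
            unsigned rseq = 0, wseq = 200;  /* reader lapped long ago */

            printf("readable = %u\n", catch_up(wseq, &rseq));
            printf("rseq resynced to %u\n", rseq);
            return (0);
    }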
*/ int msgbuf_getchar(struct msgbuf *mbp) { u_int len, wseq; int c; mtx_lock_spin(&mbp->msg_lock); wseq = mbp->msg_wseq; len = MSGBUF_SEQSUB(mbp, wseq, mbp->msg_rseq); if (len == 0) { mtx_unlock_spin(&mbp->msg_lock); return (-1); } if (len > mbp->msg_size) mbp->msg_rseq = MSGBUF_SEQNORM(mbp, wseq - mbp->msg_size); c = (u_char)mbp->msg_ptr[MSGBUF_SEQ_TO_POS(mbp, mbp->msg_rseq)]; mbp->msg_rseq = MSGBUF_SEQNORM(mbp, mbp->msg_rseq + 1); mtx_unlock_spin(&mbp->msg_lock); return (c); } /* * Read and mark as read a number of characters from a message buffer. * Returns the number of characters that were placed in `buf'. */ int msgbuf_getbytes(struct msgbuf *mbp, char *buf, int buflen) { u_int len, pos, wseq; mtx_lock_spin(&mbp->msg_lock); wseq = mbp->msg_wseq; len = MSGBUF_SEQSUB(mbp, wseq, mbp->msg_rseq); if (len == 0) { mtx_unlock_spin(&mbp->msg_lock); return (0); } if (len > mbp->msg_size) { mbp->msg_rseq = MSGBUF_SEQNORM(mbp, wseq - mbp->msg_size); len = mbp->msg_size; } pos = MSGBUF_SEQ_TO_POS(mbp, mbp->msg_rseq); len = min(len, mbp->msg_size - pos); len = min(len, (u_int)buflen); bcopy(&mbp->msg_ptr[pos], buf, len); mbp->msg_rseq = MSGBUF_SEQNORM(mbp, mbp->msg_rseq + len); mtx_unlock_spin(&mbp->msg_lock); return (len); } /* * Peek at the full contents of a message buffer without marking any * data as read. `seqp' should point to an unsigned integer that * msgbuf_peekbytes() can use to retain state between calls so that * the whole message buffer can be read in multiple short reads. * To initialise this variable to the start of the message buffer, * call msgbuf_peekbytes() with a NULL `buf' parameter. * * Returns the number of characters that were placed in `buf'. */ int msgbuf_peekbytes(struct msgbuf *mbp, char *buf, int buflen, u_int *seqp) { u_int len, pos, wseq; mtx_lock_spin(&mbp->msg_lock); if (buf == NULL) { /* Just initialise *seqp. */ *seqp = MSGBUF_SEQNORM(mbp, mbp->msg_wseq - mbp->msg_size); mtx_unlock_spin(&mbp->msg_lock); return (0); } wseq = mbp->msg_wseq; len = MSGBUF_SEQSUB(mbp, wseq, *seqp); if (len == 0) { mtx_unlock_spin(&mbp->msg_lock); return (0); } if (len > mbp->msg_size) { *seqp = MSGBUF_SEQNORM(mbp, wseq - mbp->msg_size); len = mbp->msg_size; } pos = MSGBUF_SEQ_TO_POS(mbp, *seqp); len = min(len, mbp->msg_size - pos); len = min(len, (u_int)buflen); bcopy(&mbp->msg_ptr[MSGBUF_SEQ_TO_POS(mbp, *seqp)], buf, len); *seqp = MSGBUF_SEQNORM(mbp, *seqp + len); mtx_unlock_spin(&mbp->msg_lock); return (len); } /* * Compute the checksum for the complete message buffer contents. */ static u_int msgbuf_cksum(struct msgbuf *mbp) { u_int i, sum; sum = 0; for (i = 0; i < mbp->msg_size; i++) sum += (u_char)mbp->msg_ptr[i]; return (sum); } /* * Copy from one message buffer to another. */ void msgbuf_copy(struct msgbuf *src, struct msgbuf *dst) { int c; while ((c = msgbuf_getchar(src)) >= 0) msgbuf_addchar(dst, c); } Index: head/sys/kern/subr_pctrie.c =================================================================== --- head/sys/kern/subr_pctrie.c (revision 326270) +++ head/sys/kern/subr_pctrie.c (revision 326271) @@ -1,693 +1,695 @@ -/* +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 EMC Corp. * Copyright (c) 2011 Jeffrey Roberson * Copyright (c) 2008 Mayur Shardul * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * Path-compressed radix trie implementation. * * The implementation takes into account the following rationale: * - Size of the nodes should be as small as possible but still big enough * to avoid a large maximum depth for the trie. This is a balance * between the necessity to not wire too much physical memory for the nodes * and the necessity to avoid too much cache pollution during the trie * operations. * - There is not a huge bias toward the number of lookup operations over * the number of insert and remove operations. This basically implies * that optimizations supposedly helping one operation but hurting the * other might be carefully evaluated. * - On average not many nodes are expected to be fully populated, hence * level compression may just complicate things. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #ifdef DDB #include #endif #define PCTRIE_MASK (PCTRIE_COUNT - 1) #define PCTRIE_LIMIT (howmany(sizeof(uint64_t) * NBBY, PCTRIE_WIDTH) - 1) /* Flag bits stored in node pointers. */ #define PCTRIE_ISLEAF 0x1 #define PCTRIE_FLAGS 0x1 #define PCTRIE_PAD PCTRIE_FLAGS /* Returns one unit associated with specified level. */ #define PCTRIE_UNITLEVEL(lev) \ ((uint64_t)1 << ((lev) * PCTRIE_WIDTH)) struct pctrie_node { uint64_t pn_owner; /* Owner of record. */ uint16_t pn_count; /* Valid children. */ uint16_t pn_clev; /* Current level. */ void *pn_child[PCTRIE_COUNT]; /* Child nodes. */ }; /* * Allocate a node. Pre-allocation should ensure that the request * will always be satisfied. */ static __inline struct pctrie_node * pctrie_node_get(struct pctrie *ptree, pctrie_alloc_t allocfn, uint64_t owner, uint16_t count, uint16_t clevel) { struct pctrie_node *node; node = allocfn(ptree); if (node == NULL) return (NULL); node->pn_owner = owner; node->pn_count = count; node->pn_clev = clevel; return (node); } /* * Free radix node. */ static __inline void pctrie_node_put(struct pctrie *ptree, struct pctrie_node *node, pctrie_free_t freefn) { #ifdef INVARIANTS int slot; KASSERT(node->pn_count == 0, ("pctrie_node_put: node %p has %d children", node, node->pn_count)); for (slot = 0; slot < PCTRIE_COUNT; slot++) KASSERT(node->pn_child[slot] == NULL, ("pctrie_node_put: node %p has a child", node)); #endif freefn(ptree, node); } /* * Return the position in the array for a given level. 
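The two helpers that follow are pure bit arithmetic: a key is treated as a string of PCTRIE_WIDTH-bit digits, pctrie_slot() extracts the digit for a given level, and pctrie_trimkey() clears every digit below a level. A standalone sketch, assuming 4-bit digits (so the PCTRIE_COUNT analogue would be 16):

    #include <stdint.h>
    #include <stdio.h>

    #define WIDTH   4               /* hypothetical bits per digit/level */
    #define COUNT   (1 << WIDTH)    /* children per node: 16 */
    #define MASK    (COUNT - 1)

    /* Child slot of `index' at `level', as in pctrie_slot(). */
    static int
    slot(uint64_t index, uint16_t level)
    {
            return ((index >> (level * WIDTH)) & MASK);
    }

    /* Clear the low `level' digits of the key, as in pctrie_trimkey(). */
    static uint64_t
    trimkey(uint64_t index, uint16_t level)
    {
            if (level > 0) {
                    index >>= level * WIDTH;
                    index <<= level * WIDTH;
            }
            return (index);
    }

    int
    main(void)
    {
            uint64_t key = 0x12345;

            /* Digits of the key, least-significant level first: 5,4,3,2,1. */
            for (uint16_t lev = 0; lev < 5; lev++)
                    printf("level %u digit: %d\n", (unsigned)lev, slot(key, lev));
            printf("trimkey(key, 2) = %#jx\n", (uintmax_t)trimkey(key, 2));
            return (0);
    }

A trimmed key is exactly what a node stores in pn_owner: the common prefix shared by every key in its subtree.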
*/ static __inline int pctrie_slot(uint64_t index, uint16_t level) { return ((index >> (level * PCTRIE_WIDTH)) & PCTRIE_MASK); } /* Trims the key after the specified level. */ static __inline uint64_t pctrie_trimkey(uint64_t index, uint16_t level) { uint64_t ret; ret = index; if (level > 0) { ret >>= level * PCTRIE_WIDTH; ret <<= level * PCTRIE_WIDTH; } return (ret); } /* * Get the root node for a tree. */ static __inline struct pctrie_node * pctrie_getroot(struct pctrie *ptree) { return ((struct pctrie_node *)ptree->pt_root); } /* * Set the root node for a tree. */ static __inline void pctrie_setroot(struct pctrie *ptree, struct pctrie_node *node) { ptree->pt_root = (uintptr_t)node; } /* * Returns TRUE if the specified node is a leaf and FALSE otherwise. */ static __inline boolean_t pctrie_isleaf(struct pctrie_node *node) { return (((uintptr_t)node & PCTRIE_ISLEAF) != 0); } /* * Returns the associated val extracted from node. */ static __inline uint64_t * pctrie_toval(struct pctrie_node *node) { return ((uint64_t *)((uintptr_t)node & ~PCTRIE_FLAGS)); } /* * Adds the val as a child of the provided node. */ static __inline void pctrie_addval(struct pctrie_node *node, uint64_t index, uint16_t clev, uint64_t *val) { int slot; slot = pctrie_slot(index, clev); node->pn_child[slot] = (void *)((uintptr_t)val | PCTRIE_ISLEAF); } /* * Returns the slot where two keys differ. * It cannot accept 2 equal keys. */ static __inline uint16_t pctrie_keydiff(uint64_t index1, uint64_t index2) { uint16_t clev; KASSERT(index1 != index2, ("%s: passing the same key value %jx", __func__, (uintmax_t)index1)); index1 ^= index2; for (clev = PCTRIE_LIMIT;; clev--) if (pctrie_slot(index1, clev) != 0) return (clev); } /* * Returns TRUE if it can be determined that key does not belong to the * specified node. Otherwise, returns FALSE. */ static __inline boolean_t pctrie_keybarr(struct pctrie_node *node, uint64_t idx) { if (node->pn_clev < PCTRIE_LIMIT) { idx = pctrie_trimkey(idx, node->pn_clev + 1); return (idx != node->pn_owner); } return (FALSE); } /* * Internal helper for pctrie_reclaim_allnodes(). * This function is recursive. */ static void pctrie_reclaim_allnodes_int(struct pctrie *ptree, struct pctrie_node *node, pctrie_free_t freefn) { int slot; KASSERT(node->pn_count <= PCTRIE_COUNT, ("pctrie_reclaim_allnodes_int: bad count in node %p", node)); for (slot = 0; node->pn_count != 0; slot++) { if (node->pn_child[slot] == NULL) continue; if (!pctrie_isleaf(node->pn_child[slot])) pctrie_reclaim_allnodes_int(ptree, node->pn_child[slot], freefn); node->pn_child[slot] = NULL; node->pn_count--; } pctrie_node_put(ptree, node, freefn); } /* * pctrie node zone initializer. */ int pctrie_zone_init(void *mem, int size __unused, int flags __unused) { struct pctrie_node *node; node = mem; memset(node->pn_child, 0, sizeof(node->pn_child)); return (0); } size_t pctrie_node_size(void) { return (sizeof(struct pctrie_node)); } /* * Inserts the key-value pair into the trie. * Panics if the key already exists. */ int pctrie_insert(struct pctrie *ptree, uint64_t *val, pctrie_alloc_t allocfn) { uint64_t index, newind; void **parentp; struct pctrie_node *node, *tmp; uint64_t *m; int slot; uint16_t clev; index = *val; /* * The owner of record for root is not really important because it * will never be used. 
*/ node = pctrie_getroot(ptree); if (node == NULL) { ptree->pt_root = (uintptr_t)val | PCTRIE_ISLEAF; return (0); } parentp = (void **)&ptree->pt_root; for (;;) { if (pctrie_isleaf(node)) { m = pctrie_toval(node); if (*m == index) panic("%s: key %jx is already present", __func__, (uintmax_t)index); clev = pctrie_keydiff(*m, index); tmp = pctrie_node_get(ptree, allocfn, pctrie_trimkey(index, clev + 1), 2, clev); if (tmp == NULL) return (ENOMEM); *parentp = tmp; pctrie_addval(tmp, index, clev, val); pctrie_addval(tmp, *m, clev, m); return (0); } else if (pctrie_keybarr(node, index)) break; slot = pctrie_slot(index, node->pn_clev); if (node->pn_child[slot] == NULL) { node->pn_count++; pctrie_addval(node, index, node->pn_clev, val); return (0); } parentp = &node->pn_child[slot]; node = node->pn_child[slot]; } /* * A new node is needed because the right insertion level is reached. * Setup the new intermediate node and add the 2 children: the * new object and the older edge. */ newind = node->pn_owner; clev = pctrie_keydiff(newind, index); tmp = pctrie_node_get(ptree, allocfn, pctrie_trimkey(index, clev + 1), 2, clev); if (tmp == NULL) return (ENOMEM); *parentp = tmp; pctrie_addval(tmp, index, clev, val); slot = pctrie_slot(newind, clev); tmp->pn_child[slot] = node; return (0); } /* * Returns the value stored at the index. If the index is not present, * NULL is returned. */ uint64_t * pctrie_lookup(struct pctrie *ptree, uint64_t index) { struct pctrie_node *node; uint64_t *m; int slot; node = pctrie_getroot(ptree); while (node != NULL) { if (pctrie_isleaf(node)) { m = pctrie_toval(node); if (*m == index) return (m); else break; } else if (pctrie_keybarr(node, index)) break; slot = pctrie_slot(index, node->pn_clev); node = node->pn_child[slot]; } return (NULL); } /* * Look up the nearest entry at a position bigger than or equal to index. */ uint64_t * pctrie_lookup_ge(struct pctrie *ptree, uint64_t index) { struct pctrie_node *stack[PCTRIE_LIMIT]; uint64_t inc; uint64_t *m; struct pctrie_node *child, *node; #ifdef INVARIANTS int loops = 0; #endif int slot, tos; node = pctrie_getroot(ptree); if (node == NULL) return (NULL); else if (pctrie_isleaf(node)) { m = pctrie_toval(node); if (*m >= index) return (m); else return (NULL); } tos = 0; for (;;) { /* * If the keys differ before the current bisection node, * then the search key might rollback to the earliest * available bisection node or to the smallest key * in the current node (if the owner is bigger than the * search key). */ if (pctrie_keybarr(node, index)) { if (index > node->pn_owner) { ascend: KASSERT(++loops < 1000, ("pctrie_lookup_ge: too many loops")); /* * Pop nodes from the stack until either the * stack is empty or a node that could have a * matching descendant is found. */ do { if (tos == 0) return (NULL); node = stack[--tos]; } while (pctrie_slot(index, node->pn_clev) == (PCTRIE_COUNT - 1)); /* * The following computation cannot overflow * because index's slot at the current level * is less than PCTRIE_COUNT - 1. */ index = pctrie_trimkey(index, node->pn_clev); index += PCTRIE_UNITLEVEL(node->pn_clev); } else index = node->pn_owner; KASSERT(!pctrie_keybarr(node, index), ("pctrie_lookup_ge: keybarr failed")); } slot = pctrie_slot(index, node->pn_clev); child = node->pn_child[slot]; if (pctrie_isleaf(child)) { m = pctrie_toval(child); if (*m >= index) return (m); } else if (child != NULL) goto descend; /* * Look for an available edge or val within the current * bisection node. 
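pctrie_insert() above splits a leaf by asking pctrie_keydiff() for the highest level at which two keys disagree; the new intermediate node is created at exactly that level, which is what makes the trie path-compressed. The search is an XOR followed by a scan from the top digit down (same hypothetical 4-bit digits as before):

    #include <stdint.h>
    #include <stdio.h>

    #define WIDTH   4                       /* hypothetical bits per level */
    #define MASK    ((1 << WIDTH) - 1)
    #define LIMIT   (64 / WIDTH - 1)        /* top level of a 64-bit key */

    static int
    slot(uint64_t index, uint16_t level)
    {
            return ((index >> (level * WIDTH)) & MASK);
    }

    /* Highest level whose digit differs; undefined for equal keys. */
    static uint16_t
    keydiff(uint64_t i1, uint64_t i2)
    {
            uint16_t clev;

            i1 ^= i2;       /* differing digits become nonzero */
            for (clev = LIMIT;; clev--)
                    if (slot(i1, clev) != 0)
                            return (clev);
    }

    int
    main(void)
    {
            /* 0x12345 and 0x12f45 first disagree in their level-2 digit. */
            printf("first differing level = %u\n",
                (unsigned)keydiff(0x12345, 0x12f45));
            return (0);
    }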
*/ if (slot < (PCTRIE_COUNT - 1)) { inc = PCTRIE_UNITLEVEL(node->pn_clev); index = pctrie_trimkey(index, node->pn_clev); do { index += inc; slot++; child = node->pn_child[slot]; if (pctrie_isleaf(child)) { m = pctrie_toval(child); if (*m >= index) return (m); } else if (child != NULL) goto descend; } while (slot < (PCTRIE_COUNT - 1)); } KASSERT(child == NULL || pctrie_isleaf(child), ("pctrie_lookup_ge: child is radix node")); /* * If a value or edge bigger than the search slot is not found * in the current node, ascend to the next higher-level node. */ goto ascend; descend: KASSERT(node->pn_clev > 0, ("pctrie_lookup_ge: pushing leaf's parent")); KASSERT(tos < PCTRIE_LIMIT, ("pctrie_lookup_ge: stack overflow")); stack[tos++] = node; node = child; } } /* * Look up the nearest entry at a position less than or equal to index. */ uint64_t * pctrie_lookup_le(struct pctrie *ptree, uint64_t index) { struct pctrie_node *stack[PCTRIE_LIMIT]; uint64_t inc; uint64_t *m; struct pctrie_node *child, *node; #ifdef INVARIANTS int loops = 0; #endif int slot, tos; node = pctrie_getroot(ptree); if (node == NULL) return (NULL); else if (pctrie_isleaf(node)) { m = pctrie_toval(node); if (*m <= index) return (m); else return (NULL); } tos = 0; for (;;) { /* * If the keys differ before the current bisection node, * then the search key might rollback to the earliest * available bisection node or to the largest key * in the current node (if the owner is smaller than the * search key). */ if (pctrie_keybarr(node, index)) { if (index > node->pn_owner) { index = node->pn_owner + PCTRIE_COUNT * PCTRIE_UNITLEVEL(node->pn_clev); } else { ascend: KASSERT(++loops < 1000, ("pctrie_lookup_le: too many loops")); /* * Pop nodes from the stack until either the * stack is empty or a node that could have a * matching descendant is found. */ do { if (tos == 0) return (NULL); node = stack[--tos]; } while (pctrie_slot(index, node->pn_clev) == 0); /* * The following computation cannot overflow * because index's slot at the current level * is greater than 0. */ index = pctrie_trimkey(index, node->pn_clev); } index--; KASSERT(!pctrie_keybarr(node, index), ("pctrie_lookup_le: keybarr failed")); } slot = pctrie_slot(index, node->pn_clev); child = node->pn_child[slot]; if (pctrie_isleaf(child)) { m = pctrie_toval(child); if (*m <= index) return (m); } else if (child != NULL) goto descend; /* * Look for an available edge or value within the current * bisection node. */ if (slot > 0) { inc = PCTRIE_UNITLEVEL(node->pn_clev); index |= inc - 1; do { index -= inc; slot--; child = node->pn_child[slot]; if (pctrie_isleaf(child)) { m = pctrie_toval(child); if (*m <= index) return (m); } else if (child != NULL) goto descend; } while (slot > 0); } KASSERT(child == NULL || pctrie_isleaf(child), ("pctrie_lookup_le: child is radix node")); /* * If a value or edge smaller than the search slot is not found * in the current node, ascend to the next higher-level node. */ goto ascend; descend: KASSERT(node->pn_clev > 0, ("pctrie_lookup_le: pushing leaf's parent")); KASSERT(tos < PCTRIE_LIMIT, ("pctrie_lookup_le: stack overflow")); stack[tos++] = node; node = child; } } /* * Remove the specified index from the tree. * Panics if the key is not present. 
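pctrie_lookup_ge() and pctrie_lookup_le() steer their searches with the same digit arithmetic: when a subtree is exhausted, the _ge variant resumes at the smallest key of the next subtree at that level (the trimmed key plus one level unit), and _le mirrors the step downward by trimming and subtracting one. The _ge step in isolation, under the same hypothetical 4-bit digits:

    #include <stdint.h>
    #include <stdio.h>

    #define WIDTH           4       /* hypothetical bits per level */
    #define UNITLEVEL(lev)  ((uint64_t)1 << ((lev) * WIDTH))

    static uint64_t
    trimkey(uint64_t index, uint16_t level)
    {
            if (level > 0) {
                    index >>= level * WIDTH;
                    index <<= level * WIDTH;
            }
            return (index);
    }

    /* Smallest key of the subtree following `index' at `level'. */
    static uint64_t
    next_subtree(uint64_t index, uint16_t level)
    {
            return (trimkey(index, level) + UNITLEVEL(level));
    }

    int
    main(void)
    {
            /*
             * Searching for >= 0x12f45: once the level-2 subtree rooted
             * at 0x12f00 is exhausted, the scan resumes at 0x13000.
             */
            printf("%#jx\n", (uintmax_t)next_subtree(0x12f45, 2));
            return (0);
    }

This is also why the code can assert the addition never overflows: it is only taken when the current slot is below the last one at that level.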
*/ void pctrie_remove(struct pctrie *ptree, uint64_t index, pctrie_free_t freefn) { struct pctrie_node *node, *parent; uint64_t *m; int i, slot; node = pctrie_getroot(ptree); if (pctrie_isleaf(node)) { m = pctrie_toval(node); if (*m != index) panic("%s: invalid key found", __func__); pctrie_setroot(ptree, NULL); return; } parent = NULL; for (;;) { if (node == NULL) panic("pctrie_remove: impossible to locate the key"); slot = pctrie_slot(index, node->pn_clev); if (pctrie_isleaf(node->pn_child[slot])) { m = pctrie_toval(node->pn_child[slot]); if (*m != index) panic("%s: invalid key found", __func__); node->pn_child[slot] = NULL; node->pn_count--; if (node->pn_count > 1) break; for (i = 0; i < PCTRIE_COUNT; i++) if (node->pn_child[i] != NULL) break; KASSERT(i != PCTRIE_COUNT, ("%s: invalid node configuration", __func__)); if (parent == NULL) pctrie_setroot(ptree, node->pn_child[i]); else { slot = pctrie_slot(index, parent->pn_clev); KASSERT(parent->pn_child[slot] == node, ("%s: invalid child value", __func__)); parent->pn_child[slot] = node->pn_child[i]; } node->pn_count--; node->pn_child[i] = NULL; pctrie_node_put(ptree, node, freefn); break; } parent = node; node = node->pn_child[slot]; } } /* * Remove and free all the nodes from the tree. * This function is recursive but there is a tight control on it as the * maximum depth of the tree is fixed. */ void pctrie_reclaim_allnodes(struct pctrie *ptree, pctrie_free_t freefn) { struct pctrie_node *root; root = pctrie_getroot(ptree); if (root == NULL) return; pctrie_setroot(ptree, NULL); if (!pctrie_isleaf(root)) pctrie_reclaim_allnodes_int(ptree, root, freefn); } #ifdef DDB /* * Show details about the given node. */ DB_SHOW_COMMAND(pctrienode, db_show_pctrienode) { struct pctrie_node *node; int i; if (!have_addr) return; node = (struct pctrie_node *)addr; db_printf("node %p, owner %jx, children count %u, level %u:\n", (void *)node, (uintmax_t)node->pn_owner, node->pn_count, node->pn_clev); for (i = 0; i < PCTRIE_COUNT; i++) if (node->pn_child[i] != NULL) db_printf("slot: %d, val: %p, value: %p, clev: %d\n", i, (void *)node->pn_child[i], pctrie_isleaf(node->pn_child[i]) ? pctrie_toval(node->pn_child[i]) : NULL, node->pn_clev); } #endif /* DDB */ Index: head/sys/kern/subr_power.c =================================================================== --- head/sys/kern/subr_power.c (revision 326270) +++ head/sys/kern/subr_power.c (revision 326271) @@ -1,122 +1,124 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2001 Mitsuru IWASAKI * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include static u_int power_pm_type = POWER_PM_TYPE_NONE; static power_pm_fn_t power_pm_fn = NULL; static void *power_pm_arg = NULL; static struct task power_pm_task; static void power_pm_deferred_fn(void *arg, int pending) { int state = (intptr_t)arg; power_pm_fn(POWER_CMD_SUSPEND, power_pm_arg, state); } int power_pm_register(u_int pm_type, power_pm_fn_t pm_fn, void *pm_arg) { int error; if (power_pm_type == POWER_PM_TYPE_NONE || power_pm_type == pm_type) { power_pm_type = pm_type; power_pm_fn = pm_fn; power_pm_arg = pm_arg; error = 0; TASK_INIT(&power_pm_task, 0, power_pm_deferred_fn, NULL); } else { error = ENXIO; } return (error); } u_int power_pm_get_type(void) { return (power_pm_type); } void power_pm_suspend(int state) { if (power_pm_fn == NULL) return; if (state != POWER_SLEEP_STATE_STANDBY && state != POWER_SLEEP_STATE_SUSPEND && state != POWER_SLEEP_STATE_HIBERNATE) return; power_pm_task.ta_context = (void *)(intptr_t)state; taskqueue_enqueue(taskqueue_thread, &power_pm_task); } /* * Power profile. */ static int power_profile_state = POWER_PROFILE_PERFORMANCE; int power_profile_get_state(void) { return (power_profile_state); } void power_profile_set_state(int state) { int changed; if (state != power_profile_state) { power_profile_state = state; changed = 1; if (bootverbose) { printf("system power profile changed to '%s'\n", (state == POWER_PROFILE_PERFORMANCE) ? "performance" : "economy"); } } else { changed = 0; } if (changed) EVENTHANDLER_INVOKE(power_profile_change, 0); } Index: head/sys/kern/subr_sbuf.c =================================================================== --- head/sys/kern/subr_sbuf.c (revision 326270) +++ head/sys/kern/subr_sbuf.c (revision 326271) @@ -1,882 +1,884 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2000-2008 Poul-Henning Kamp * Copyright (c) 2000-2008 Dag-Erling Coïdan Smørgrav * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #ifdef _KERNEL #include #include #include #include #include #include #include #include #else /* _KERNEL */ #include #include #include #include #include #include #include #endif /* _KERNEL */ #include #ifdef _KERNEL static MALLOC_DEFINE(M_SBUF, "sbuf", "string buffers"); #define SBMALLOC(size) malloc(size, M_SBUF, M_WAITOK|M_ZERO) #define SBFREE(buf) free(buf, M_SBUF) #else /* _KERNEL */ #define KASSERT(e, m) #define SBMALLOC(size) calloc(1, size) #define SBFREE(buf) free(buf) #endif /* _KERNEL */ /* * Predicates */ #define SBUF_ISDYNAMIC(s) ((s)->s_flags & SBUF_DYNAMIC) #define SBUF_ISDYNSTRUCT(s) ((s)->s_flags & SBUF_DYNSTRUCT) #define SBUF_ISFINISHED(s) ((s)->s_flags & SBUF_FINISHED) #define SBUF_HASROOM(s) ((s)->s_len < (s)->s_size - 1) #define SBUF_FREESPACE(s) ((s)->s_size - ((s)->s_len + 1)) #define SBUF_CANEXTEND(s) ((s)->s_flags & SBUF_AUTOEXTEND) #define SBUF_ISSECTION(s) ((s)->s_flags & SBUF_INSECTION) #define SBUF_NULINCLUDED(s) ((s)->s_flags & SBUF_INCLUDENUL) #define SBUF_ISDRAINTOEOR(s) ((s)->s_flags & SBUF_DRAINTOEOR) #define SBUF_DODRAINTOEOR(s) (SBUF_ISSECTION(s) && SBUF_ISDRAINTOEOR(s)) /* * Set / clear flags */ #define SBUF_SETFLAG(s, f) do { (s)->s_flags |= (f); } while (0) #define SBUF_CLEARFLAG(s, f) do { (s)->s_flags &= ~(f); } while (0) #define SBUF_MINSIZE 2 /* Min is 1 byte + nulterm. */ #define SBUF_MINEXTENDSIZE 16 /* Should be power of 2. */ #ifdef PAGE_SIZE #define SBUF_MAXEXTENDSIZE PAGE_SIZE #define SBUF_MAXEXTENDINCR PAGE_SIZE #else #define SBUF_MAXEXTENDSIZE 4096 #define SBUF_MAXEXTENDINCR 4096 #endif /* * Debugging support */ #if defined(_KERNEL) && defined(INVARIANTS) static void _assert_sbuf_integrity(const char *fun, struct sbuf *s) { KASSERT(s != NULL, ("%s called with a NULL sbuf pointer", fun)); KASSERT(s->s_buf != NULL, ("%s called with uninitialized or corrupt sbuf", fun)); if (SBUF_ISFINISHED(s) && SBUF_NULINCLUDED(s)) { KASSERT(s->s_len <= s->s_size, ("wrote past end of sbuf (%jd >= %jd)", (intmax_t)s->s_len, (intmax_t)s->s_size)); } else { KASSERT(s->s_len < s->s_size, ("wrote past end of sbuf (%jd >= %jd)", (intmax_t)s->s_len, (intmax_t)s->s_size)); } } static void _assert_sbuf_state(const char *fun, struct sbuf *s, int state) { KASSERT((s->s_flags & SBUF_FINISHED) == state, ("%s called with %sfinished or corrupt sbuf", fun, (state ? 
"un" : ""))); } #define assert_sbuf_integrity(s) _assert_sbuf_integrity(__func__, (s)) #define assert_sbuf_state(s, i) _assert_sbuf_state(__func__, (s), (i)) #else /* _KERNEL && INVARIANTS */ #define assert_sbuf_integrity(s) do { } while (0) #define assert_sbuf_state(s, i) do { } while (0) #endif /* _KERNEL && INVARIANTS */ #ifdef CTASSERT CTASSERT(powerof2(SBUF_MAXEXTENDSIZE)); CTASSERT(powerof2(SBUF_MAXEXTENDINCR)); #endif static int sbuf_extendsize(int size) { int newsize; if (size < (int)SBUF_MAXEXTENDSIZE) { newsize = SBUF_MINEXTENDSIZE; while (newsize < size) newsize *= 2; } else { newsize = roundup2(size, SBUF_MAXEXTENDINCR); } KASSERT(newsize >= size, ("%s: %d < %d\n", __func__, newsize, size)); return (newsize); } /* * Extend an sbuf. */ static int sbuf_extend(struct sbuf *s, int addlen) { char *newbuf; int newsize; if (!SBUF_CANEXTEND(s)) return (-1); newsize = sbuf_extendsize(s->s_size + addlen); newbuf = SBMALLOC(newsize); if (newbuf == NULL) return (-1); memcpy(newbuf, s->s_buf, s->s_size); if (SBUF_ISDYNAMIC(s)) SBFREE(s->s_buf); else SBUF_SETFLAG(s, SBUF_DYNAMIC); s->s_buf = newbuf; s->s_size = newsize; return (0); } /* * Initialize the internals of an sbuf. * If buf is non-NULL, it points to a static or already-allocated string * big enough to hold at least length characters. */ static struct sbuf * sbuf_newbuf(struct sbuf *s, char *buf, int length, int flags) { memset(s, 0, sizeof(*s)); s->s_flags = flags; s->s_size = length; s->s_buf = buf; if ((s->s_flags & SBUF_AUTOEXTEND) == 0) { KASSERT(s->s_size >= SBUF_MINSIZE, ("attempt to create an sbuf smaller than %d bytes", SBUF_MINSIZE)); } if (s->s_buf != NULL) return (s); if ((flags & SBUF_AUTOEXTEND) != 0) s->s_size = sbuf_extendsize(s->s_size); s->s_buf = SBMALLOC(s->s_size); if (s->s_buf == NULL) return (NULL); SBUF_SETFLAG(s, SBUF_DYNAMIC); return (s); } /* * Initialize an sbuf. * If buf is non-NULL, it points to a static or already-allocated string * big enough to hold at least length characters. */ struct sbuf * sbuf_new(struct sbuf *s, char *buf, int length, int flags) { KASSERT(length >= 0, ("attempt to create an sbuf of negative length (%d)", length)); KASSERT((flags & ~SBUF_USRFLAGMSK) == 0, ("%s called with invalid flags", __func__)); flags &= SBUF_USRFLAGMSK; if (s != NULL) return (sbuf_newbuf(s, buf, length, flags)); s = SBMALLOC(sizeof(*s)); if (s == NULL) return (NULL); if (sbuf_newbuf(s, buf, length, flags) == NULL) { SBFREE(s); return (NULL); } SBUF_SETFLAG(s, SBUF_DYNSTRUCT); return (s); } #ifdef _KERNEL /* * Create an sbuf with uio data */ struct sbuf * sbuf_uionew(struct sbuf *s, struct uio *uio, int *error) { KASSERT(uio != NULL, ("%s called with NULL uio pointer", __func__)); KASSERT(error != NULL, ("%s called with NULL error pointer", __func__)); s = sbuf_new(s, NULL, uio->uio_resid + 1, 0); if (s == NULL) { *error = ENOMEM; return (NULL); } *error = uiomove(s->s_buf, uio->uio_resid, uio); if (*error != 0) { sbuf_delete(s); return (NULL); } s->s_len = s->s_size - 1; if (SBUF_ISSECTION(s)) s->s_sect_len = s->s_size - 1; *error = 0; return (s); } #endif int sbuf_get_flags(struct sbuf *s) { return (s->s_flags & SBUF_USRFLAGMSK); } void sbuf_clear_flags(struct sbuf *s, int flags) { s->s_flags &= ~(flags & SBUF_USRFLAGMSK); } void sbuf_set_flags(struct sbuf *s, int flags) { s->s_flags |= (flags & SBUF_USRFLAGMSK); } /* * Clear an sbuf and reset its position. 
*/ void sbuf_clear(struct sbuf *s) { assert_sbuf_integrity(s); /* don't care if it's finished or not */ SBUF_CLEARFLAG(s, SBUF_FINISHED); s->s_error = 0; s->s_len = 0; s->s_rec_off = 0; s->s_sect_len = 0; } /* * Set the sbuf's end position to an arbitrary value. * Effectively truncates the sbuf at the new position. */ int sbuf_setpos(struct sbuf *s, ssize_t pos) { assert_sbuf_integrity(s); assert_sbuf_state(s, 0); KASSERT(pos >= 0, ("attempt to seek to a negative position (%jd)", (intmax_t)pos)); KASSERT(pos < s->s_size, ("attempt to seek past end of sbuf (%jd >= %jd)", (intmax_t)pos, (intmax_t)s->s_size)); KASSERT(!SBUF_ISSECTION(s), ("attempt to seek when in a section")); if (pos < 0 || pos > s->s_len) return (-1); s->s_len = pos; return (0); } /* * Set up a drain function and argument on an sbuf to flush data to * when the sbuf buffer overflows. */ void sbuf_set_drain(struct sbuf *s, sbuf_drain_func *func, void *ctx) { assert_sbuf_state(s, 0); assert_sbuf_integrity(s); KASSERT(func == s->s_drain_func || s->s_len == 0, ("Cannot change drain to %p on non-empty sbuf %p", func, s)); s->s_drain_func = func; s->s_drain_arg = ctx; } /* * Call the drain and process the return. */ static int sbuf_drain(struct sbuf *s) { int len; KASSERT(s->s_len > 0, ("Shouldn't drain empty sbuf %p", s)); KASSERT(s->s_error == 0, ("Called %s with error on %p", __func__, s)); if (SBUF_DODRAINTOEOR(s) && s->s_rec_off == 0) return (s->s_error = EDEADLK); len = s->s_drain_func(s->s_drain_arg, s->s_buf, SBUF_DODRAINTOEOR(s) ? s->s_rec_off : s->s_len); if (len <= 0) { s->s_error = len ? -len : EDEADLK; return (s->s_error); } KASSERT(len > 0 && len <= s->s_len, ("Bad drain amount %d for sbuf %p", len, s)); s->s_len -= len; s->s_rec_off -= len; /* * Fast path for the expected case where all the data was * drained. */ if (s->s_len == 0) return (0); /* * Move the remaining characters to the beginning of the * string. */ memmove(s->s_buf, s->s_buf + len, s->s_len); return (0); } /* * Append bytes to an sbuf. This is the core function for appending * to an sbuf and is the main place that deals with extending the * buffer and marking overflow. */ static void sbuf_put_bytes(struct sbuf *s, const char *buf, size_t len) { size_t n; assert_sbuf_integrity(s); assert_sbuf_state(s, 0); if (s->s_error != 0) return; while (len > 0) { if (SBUF_FREESPACE(s) <= 0) { /* * If there is a drain, use it, otherwise extend the * buffer. */ if (s->s_drain_func != NULL) (void)sbuf_drain(s); else if (sbuf_extend(s, len > INT_MAX ? INT_MAX : len) < 0) s->s_error = ENOMEM; if (s->s_error != 0) return; } n = SBUF_FREESPACE(s); if (len < n) n = len; memcpy(&s->s_buf[s->s_len], buf, n); s->s_len += n; if (SBUF_ISSECTION(s)) s->s_sect_len += n; len -= n; buf += n; } } static void sbuf_put_byte(struct sbuf *s, char c) { sbuf_put_bytes(s, &c, 1); } /* * Append a byte string to an sbuf. */ int sbuf_bcat(struct sbuf *s, const void *buf, size_t len) { sbuf_put_bytes(s, buf, len); if (s->s_error != 0) return (-1); return (0); } #ifdef _KERNEL /* * Copy a byte string from userland into an sbuf. 
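sbuf_extendsize(), defined earlier in this file, implements a two-phase growth policy: double from SBUF_MINEXTENDSIZE while the request is below SBUF_MAXEXTENDSIZE, then round up to whole SBUF_MAXEXTENDINCR units. That bounds the number of reallocations for small buffers and the waste for large ones. The policy in isolation, using the 16/4096 fallback constants from above:

    #include <stdio.h>

    #define MINEXTENDSIZE   16
    #define MAXEXTENDSIZE   4096
    #define MAXEXTENDINCR   4096
    /* Round up to a power-of-two multiple, as roundup2() does. */
    #define ROUNDUP2(x, y)  (((x) + ((y) - 1)) & ~((y) - 1))

    static int
    extendsize(int size)
    {
            int newsize;

            if (size < MAXEXTENDSIZE) {
                    newsize = MINEXTENDSIZE;
                    while (newsize < size)
                            newsize *= 2;   /* geometric growth when small */
            } else {
                    /* Linear growth in page-sized steps when large. */
                    newsize = ROUNDUP2(size, MAXEXTENDINCR);
            }
            return (newsize);
    }

    int
    main(void)
    {
            int sizes[] = { 1, 100, 4096, 5000 };

            for (int i = 0; i < 4; i++)
                    printf("%d -> %d\n", sizes[i], extendsize(sizes[i]));
            return (0);
    }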
*/ int sbuf_bcopyin(struct sbuf *s, const void *uaddr, size_t len) { assert_sbuf_integrity(s); assert_sbuf_state(s, 0); KASSERT(s->s_drain_func == NULL, ("Nonsensical copyin to sbuf %p with a drain", s)); if (s->s_error != 0) return (-1); if (len == 0) return (0); if (len > SBUF_FREESPACE(s)) { sbuf_extend(s, len - SBUF_FREESPACE(s)); if (SBUF_FREESPACE(s) < len) len = SBUF_FREESPACE(s); } if (copyin(uaddr, s->s_buf + s->s_len, len) != 0) return (-1); s->s_len += len; return (0); } #endif /* * Copy a byte string into an sbuf. */ int sbuf_bcpy(struct sbuf *s, const void *buf, size_t len) { assert_sbuf_integrity(s); assert_sbuf_state(s, 0); sbuf_clear(s); return (sbuf_bcat(s, buf, len)); } /* * Append a string to an sbuf. */ int sbuf_cat(struct sbuf *s, const char *str) { size_t n; n = strlen(str); sbuf_put_bytes(s, str, n); if (s->s_error != 0) return (-1); return (0); } #ifdef _KERNEL /* * Append a string from userland to an sbuf. */ int sbuf_copyin(struct sbuf *s, const void *uaddr, size_t len) { size_t done; assert_sbuf_integrity(s); assert_sbuf_state(s, 0); KASSERT(s->s_drain_func == NULL, ("Nonsensical copyin to sbuf %p with a drain", s)); if (s->s_error != 0) return (-1); if (len == 0) len = SBUF_FREESPACE(s); /* XXX return 0? */ if (len > SBUF_FREESPACE(s)) { sbuf_extend(s, len); if (SBUF_FREESPACE(s) < len) len = SBUF_FREESPACE(s); } switch (copyinstr(uaddr, s->s_buf + s->s_len, len + 1, &done)) { case ENAMETOOLONG: s->s_error = ENOMEM; /* fall through */ case 0: s->s_len += done - 1; if (SBUF_ISSECTION(s)) s->s_sect_len += done - 1; break; default: return (-1); /* XXX */ } return (done); } #endif /* * Copy a string into an sbuf. */ int sbuf_cpy(struct sbuf *s, const char *str) { assert_sbuf_integrity(s); assert_sbuf_state(s, 0); sbuf_clear(s); return (sbuf_cat(s, str)); } /* * Format the given argument list and append the resulting string to an sbuf. */ #ifdef _KERNEL /* * Append a non-NUL character to an sbuf. This prototype signature is * suitable for use with kvprintf(9). */ static void sbuf_putc_func(int c, void *arg) { if (c != '\0') sbuf_put_byte(arg, c); } int sbuf_vprintf(struct sbuf *s, const char *fmt, va_list ap) { assert_sbuf_integrity(s); assert_sbuf_state(s, 0); KASSERT(fmt != NULL, ("%s called with a NULL format string", __func__)); (void)kvprintf(fmt, sbuf_putc_func, s, 10, ap); if (s->s_error != 0) return (-1); return (0); } #else /* !_KERNEL */ int sbuf_vprintf(struct sbuf *s, const char *fmt, va_list ap) { va_list ap_copy; int error, len; assert_sbuf_integrity(s); assert_sbuf_state(s, 0); KASSERT(fmt != NULL, ("%s called with a NULL format string", __func__)); if (s->s_error != 0) return (-1); /* * For the moment, there is no way to get vsnprintf(3) to hand * back a character at a time, to push everything into * sbuf_putc_func() as was done for the kernel. * * In userspace, while drains are useful, there's generally * not a problem attempting to malloc(3) on out of space. So * expand a userland sbuf if there is not enough room for the * data produced by sbuf_[v]printf(3). */ error = 0; do { va_copy(ap_copy, ap); len = vsnprintf(&s->s_buf[s->s_len], SBUF_FREESPACE(s) + 1, fmt, ap_copy); if (len < 0) { s->s_error = errno; return (-1); } va_end(ap_copy); if (SBUF_FREESPACE(s) >= len) break; /* Cannot print with the current available space. */ if (s->s_drain_func != NULL && s->s_len > 0) error = sbuf_drain(s); /* sbuf_drain() sets s_error. 
*/ else if (sbuf_extend(s, len - SBUF_FREESPACE(s)) != 0) s->s_error = error = ENOMEM; } while (error == 0); /* * s->s_len is the length of the string, without the terminating nul. * When updating s->s_len, we must subtract 1 from the length that * we passed into vsnprintf() because that length includes the * terminating nul. * * vsnprintf() returns the amount that would have been copied, * given sufficient space, so don't over-increment s_len. */ if (SBUF_FREESPACE(s) < len) len = SBUF_FREESPACE(s); s->s_len += len; if (SBUF_ISSECTION(s)) s->s_sect_len += len; KASSERT(s->s_len < s->s_size, ("wrote past end of sbuf (%d >= %d)", s->s_len, s->s_size)); if (s->s_error != 0) return (-1); return (0); } #endif /* _KERNEL */ /* * Format the given arguments and append the resulting string to an sbuf. */ int sbuf_printf(struct sbuf *s, const char *fmt, ...) { va_list ap; int result; va_start(ap, fmt); result = sbuf_vprintf(s, fmt, ap); va_end(ap); return (result); } /* * Append a character to an sbuf. */ int sbuf_putc(struct sbuf *s, int c) { sbuf_put_byte(s, c); if (s->s_error != 0) return (-1); return (0); } /* * Trim whitespace characters from end of an sbuf. */ int sbuf_trim(struct sbuf *s) { assert_sbuf_integrity(s); assert_sbuf_state(s, 0); KASSERT(s->s_drain_func == NULL, ("%s makes no sense on sbuf %p with drain", __func__, s)); if (s->s_error != 0) return (-1); while (s->s_len > 0 && isspace(s->s_buf[s->s_len-1])) { --s->s_len; if (SBUF_ISSECTION(s)) s->s_sect_len--; } return (0); } /* * Check if an sbuf has an error. */ int sbuf_error(const struct sbuf *s) { return (s->s_error); } /* * Finish off an sbuf. */ int sbuf_finish(struct sbuf *s) { assert_sbuf_integrity(s); assert_sbuf_state(s, 0); s->s_buf[s->s_len] = '\0'; if (SBUF_NULINCLUDED(s)) s->s_len++; if (s->s_drain_func != NULL) { while (s->s_len > 0 && s->s_error == 0) s->s_error = sbuf_drain(s); } SBUF_SETFLAG(s, SBUF_FINISHED); #ifdef _KERNEL return (s->s_error); #else if (s->s_error != 0) { errno = s->s_error; return (-1); } return (0); #endif } /* * Return a pointer to the sbuf data. */ char * sbuf_data(struct sbuf *s) { assert_sbuf_integrity(s); assert_sbuf_state(s, SBUF_FINISHED); KASSERT(s->s_drain_func == NULL, ("%s makes no sense on sbuf %p with drain", __func__, s)); return (s->s_buf); } /* * Return the length of the sbuf data. */ ssize_t sbuf_len(struct sbuf *s) { assert_sbuf_integrity(s); /* don't care if it's finished or not */ KASSERT(s->s_drain_func == NULL, ("%s makes no sense on sbuf %p with drain", __func__, s)); if (s->s_error != 0) return (-1); /* If finished, nulterm is already in len, else add one. */ if (SBUF_NULINCLUDED(s) && !SBUF_ISFINISHED(s)) return (s->s_len + 1); return (s->s_len); } /* * Clear an sbuf, free its buffer if necessary. */ void sbuf_delete(struct sbuf *s) { int isdyn; assert_sbuf_integrity(s); /* don't care if it's finished or not */ if (SBUF_ISDYNAMIC(s)) SBFREE(s->s_buf); isdyn = SBUF_ISDYNSTRUCT(s); memset(s, 0, sizeof(*s)); if (isdyn) SBFREE(s); } /* * Check if an sbuf has been finished. */ int sbuf_done(const struct sbuf *s) { return (SBUF_ISFINISHED(s)); } /* * Start a section. 
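The routines above complete the usual sbuf life cycle: create, append, finish, read, delete. On FreeBSD the same API is also available to userland programs via libsbuf (link with -lsbuf); a minimal consumer might look like the following sketch:

    #include <sys/sbuf.h>   /* userland: link with -lsbuf on FreeBSD */
    #include <stdio.h>

    int
    main(void)
    {
            struct sbuf *s;

            /* Auto-extending sbuf: storage grows per sbuf_extendsize(). */
            s = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND);
            if (s == NULL)
                    return (1);
            sbuf_cat(s, "hello");
            sbuf_printf(s, ", %s #%d", "sbuf", 1);
            if (sbuf_finish(s) != 0) {      /* NUL-terminate; check errors */
                    sbuf_delete(s);
                    return (1);
            }
            printf("%s (%jd bytes)\n", sbuf_data(s),
                (intmax_t)sbuf_len(s));
            sbuf_delete(s);
            return (0);
    }

Note that sbuf_data() is only legal after sbuf_finish(), which the assertions in this file enforce when INVARIANTS is enabled.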
*/ void sbuf_start_section(struct sbuf *s, ssize_t *old_lenp) { assert_sbuf_integrity(s); assert_sbuf_state(s, 0); if (!SBUF_ISSECTION(s)) { KASSERT(s->s_sect_len == 0, ("s_sect_len != 0 when starting a section")); if (old_lenp != NULL) *old_lenp = -1; s->s_rec_off = s->s_len; SBUF_SETFLAG(s, SBUF_INSECTION); } else { KASSERT(old_lenp != NULL, ("s_sect_len should be saved when starting a subsection")); *old_lenp = s->s_sect_len; s->s_sect_len = 0; } } /* * End the section padding to the specified length with the specified * character. */ ssize_t sbuf_end_section(struct sbuf *s, ssize_t old_len, size_t pad, int c) { ssize_t len; assert_sbuf_integrity(s); assert_sbuf_state(s, 0); KASSERT(SBUF_ISSECTION(s), ("attempt to end a section when not in a section")); if (pad > 1) { len = roundup(s->s_sect_len, pad) - s->s_sect_len; for (; s->s_error == 0 && len > 0; len--) sbuf_put_byte(s, c); } len = s->s_sect_len; if (old_len == -1) { s->s_rec_off = s->s_sect_len = 0; SBUF_CLEARFLAG(s, SBUF_INSECTION); } else { s->s_sect_len += old_len; } if (s->s_error != 0) return (-1); return (len); } Index: head/sys/kern/subr_sleepqueue.c =================================================================== --- head/sys/kern/subr_sleepqueue.c (revision 326270) +++ head/sys/kern/subr_sleepqueue.c (revision 326271) @@ -1,1453 +1,1455 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Implementation of sleep queues used to hold queue of threads blocked on * a wait channel. Sleep queues are different from turnstiles in that wait * channels are not owned by anyone, so there is no priority propagation. * Sleep queues can also provide a timeout and can also be interrupted by * signals. That said, there are several similarities between the turnstile * and sleep queue implementations. (Note: turnstiles were implemented * first.) For example, both use a hash table of the same size where each * bucket is referred to as a "chain" that contains both a spin lock and * a linked list of queues. An individual queue is located by using a hash * to pick a chain, locking the chain, and then walking the chain searching * for the queue. 
This means that a wait channel object does not need to * embed its queue head just as locks do not embed their turnstile queue * head. Threads also carry around a sleep queue that they lend to the * wait channel when blocking. Just as in turnstiles, the queue includes * a free list of the sleep queues of other threads blocked on the same * wait channel in the case of multiple waiters. * * Some additional functionality provided by sleep queues include the * ability to set a timeout. The timeout is managed using a per-thread * callout that resumes a thread if it is asleep. A thread may also * catch signals while it is asleep (aka an interruptible sleep). The * signal code uses sleepq_abort() to interrupt a sleeping thread. Finally, * sleep queues also provide some extra assertions. One is not allowed to * mix the sleep/wakeup and cv APIs for a given wait channel. Also, one * must consistently use the same lock to synchronize with a wait channel, * though this check is currently only a warning for sleep/wakeup due to * pre-existing abuse of that API. The same lock must also be held when * awakening threads, though that is currently only enforced for condition * variables. */ #include __FBSDID("$FreeBSD$"); #include "opt_sleepqueue_profiling.h" #include "opt_ddb.h" #include "opt_sched.h" #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif /* * Constants for the hash table of sleep queue chains. * SC_TABLESIZE must be a power of two for SC_MASK to work properly. */ #ifndef SC_TABLESIZE #define SC_TABLESIZE 256 #endif CTASSERT(powerof2(SC_TABLESIZE)); #define SC_MASK (SC_TABLESIZE - 1) #define SC_SHIFT 8 #define SC_HASH(wc) ((((uintptr_t)(wc) >> SC_SHIFT) ^ (uintptr_t)(wc)) & \ SC_MASK) #define SC_LOOKUP(wc) &sleepq_chains[SC_HASH(wc)] #define NR_SLEEPQS 2 /* * There are two different lists of sleep queues. Both lists are connected * via the sq_hash entries. The first list is the sleep queue chain list * that a sleep queue is on when it is attached to a wait channel. The * second list is the free list hung off of a sleep queue that is attached * to a wait channel. * * Each sleep queue also contains the wait channel it is attached to, the * list of threads blocked on that wait channel, flags specific to the * wait channel, and the lock used to synchronize with a wait channel. * The flags are used to catch mismatches between the various consumers * of the sleep queue API (e.g. sleep/wakeup and condition variables). * The lock pointer is only used when invariants are enabled for various * debugging checks. * * Locking key: * c - sleep queue chain lock */ struct sleepqueue { TAILQ_HEAD(, thread) sq_blocked[NR_SLEEPQS]; /* (c) Blocked threads. */ u_int sq_blockedcnt[NR_SLEEPQS]; /* (c) N. of blocked threads. */ LIST_ENTRY(sleepqueue) sq_hash; /* (c) Chain and free list. */ LIST_HEAD(, sleepqueue) sq_free; /* (c) Free queues. */ void *sq_wchan; /* (c) Wait channel. */ int sq_type; /* (c) Queue type. */ #ifdef INVARIANTS struct lock_object *sq_lock; /* (c) Associated lock. */ #endif }; struct sleepqueue_chain { LIST_HEAD(, sleepqueue) sc_queues; /* List of sleep queues. */ struct mtx sc_lock; /* Spin lock for this chain. */ #ifdef SLEEPQUEUE_PROFILING u_int sc_depth; /* Length of sc_queues. */ u_int sc_max_depth; /* Max length of sc_queues. 
*/ #endif } __aligned(CACHE_LINE_SIZE); #ifdef SLEEPQUEUE_PROFILING u_int sleepq_max_depth; static SYSCTL_NODE(_debug, OID_AUTO, sleepq, CTLFLAG_RD, 0, "sleepq profiling"); static SYSCTL_NODE(_debug_sleepq, OID_AUTO, chains, CTLFLAG_RD, 0, "sleepq chain stats"); SYSCTL_UINT(_debug_sleepq, OID_AUTO, max_depth, CTLFLAG_RD, &sleepq_max_depth, 0, "maximum depth achieved of a single chain"); static void sleepq_profile(const char *wmesg); static int prof_enabled; #endif static struct sleepqueue_chain sleepq_chains[SC_TABLESIZE]; static uma_zone_t sleepq_zone; /* * Prototypes for non-exported routines. */ static int sleepq_catch_signals(void *wchan, int pri); static int sleepq_check_signals(void); static int sleepq_check_timeout(void); #ifdef INVARIANTS static void sleepq_dtor(void *mem, int size, void *arg); #endif static int sleepq_init(void *mem, int size, int flags); static int sleepq_resume_thread(struct sleepqueue *sq, struct thread *td, int pri); static void sleepq_switch(void *wchan, int pri); static void sleepq_timeout(void *arg); SDT_PROBE_DECLARE(sched, , , sleep); SDT_PROBE_DECLARE(sched, , , wakeup); /* * Initialize SLEEPQUEUE_PROFILING specific sysctl nodes. * Note that it must happen after sleepinit() has been fully executed, so * it must happen after SI_SUB_KMEM SYSINIT() subsystem setup. */ #ifdef SLEEPQUEUE_PROFILING static void init_sleepqueue_profiling(void) { char chain_name[10]; struct sysctl_oid *chain_oid; u_int i; for (i = 0; i < SC_TABLESIZE; i++) { snprintf(chain_name, sizeof(chain_name), "%u", i); chain_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_debug_sleepq_chains), OID_AUTO, chain_name, CTLFLAG_RD, NULL, "sleepq chain stats"); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "depth", CTLFLAG_RD, &sleepq_chains[i].sc_depth, 0, NULL); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "max_depth", CTLFLAG_RD, &sleepq_chains[i].sc_max_depth, 0, NULL); } } SYSINIT(sleepqueue_profiling, SI_SUB_LOCK, SI_ORDER_ANY, init_sleepqueue_profiling, NULL); #endif /* * Early initialization of sleep queues that is called from the sleepinit() * SYSINIT. */ void init_sleepqueues(void) { int i; for (i = 0; i < SC_TABLESIZE; i++) { LIST_INIT(&sleepq_chains[i].sc_queues); mtx_init(&sleepq_chains[i].sc_lock, "sleepq chain", NULL, MTX_SPIN | MTX_RECURSE); } sleepq_zone = uma_zcreate("SLEEPQUEUE", sizeof(struct sleepqueue), #ifdef INVARIANTS NULL, sleepq_dtor, sleepq_init, NULL, UMA_ALIGN_CACHE, 0); #else NULL, NULL, sleepq_init, NULL, UMA_ALIGN_CACHE, 0); #endif thread0.td_sleepqueue = sleepq_alloc(); } /* * Get a sleep queue for a new thread. */ struct sleepqueue * sleepq_alloc(void) { return (uma_zalloc(sleepq_zone, M_WAITOK)); } /* * Free a sleep queue when a thread is destroyed. */ void sleepq_free(struct sleepqueue *sq) { uma_zfree(sleepq_zone, sq); } /* * Lock the sleep queue chain associated with the specified wait channel. */ void sleepq_lock(void *wchan) { struct sleepqueue_chain *sc; sc = SC_LOOKUP(wchan); mtx_lock_spin(&sc->sc_lock); } /* * Look up the sleep queue associated with a given wait channel in the hash * table locking the associated sleep queue chain. If no queue is found in * the table, NULL is returned.
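SC_HASH(), defined above, folds a wait channel pointer into a chain index by XORing the pointer with itself shifted right by SC_SHIFT bits before masking. The shift mixes in bits above the low byte, so kernel objects allocated close together do not all collide in one bucket. As a standalone function:

    #include <stdint.h>
    #include <stdio.h>

    #define TABLESIZE   256u            /* power of two, as CTASSERTed */
    #define MASK        (TABLESIZE - 1)
    #define SHIFT       8

    /* Chain index for a wait channel pointer, as in SC_HASH(). */
    static unsigned
    sc_hash(const void *wchan)
    {
            uintptr_t wc = (uintptr_t)wchan;

            return (((wc >> SHIFT) ^ wc) & MASK);
    }

    int
    main(void)
    {
            int a, b;   /* two stack variables as stand-in wait channels */

            printf("&a -> chain %u\n", sc_hash(&a));
            printf("&b -> chain %u\n", sc_hash(&b));
            return (0);
    }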
*/ struct sleepqueue * sleepq_lookup(void *wchan) { struct sleepqueue_chain *sc; struct sleepqueue *sq; KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); LIST_FOREACH(sq, &sc->sc_queues, sq_hash) if (sq->sq_wchan == wchan) return (sq); return (NULL); } /* * Unlock the sleep queue chain associated with a given wait channel. */ void sleepq_release(void *wchan) { struct sleepqueue_chain *sc; sc = SC_LOOKUP(wchan); mtx_unlock_spin(&sc->sc_lock); } /* * Places the current thread on the sleep queue for the specified wait * channel. If INVARIANTS is enabled, then it associates the passed in * lock with the sleepq to make sure it is held when that sleep queue is * woken up. */ void sleepq_add(void *wchan, struct lock_object *lock, const char *wmesg, int flags, int queue) { struct sleepqueue_chain *sc; struct sleepqueue *sq; struct thread *td; td = curthread; sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); MPASS(td->td_sleepqueue != NULL); MPASS(wchan != NULL); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); /* If this thread is not allowed to sleep, die a horrible death. */ KASSERT(td->td_no_sleeping == 0, ("%s: td %p to sleep on wchan %p with sleeping prohibited", __func__, td, wchan)); /* Look up the sleep queue associated with the wait channel 'wchan'. */ sq = sleepq_lookup(wchan); /* * If the wait channel does not already have a sleep queue, use * this thread's sleep queue. Otherwise, insert the current thread * into the sleep queue already in use by this wait channel. */ if (sq == NULL) { #ifdef INVARIANTS int i; sq = td->td_sleepqueue; for (i = 0; i < NR_SLEEPQS; i++) { KASSERT(TAILQ_EMPTY(&sq->sq_blocked[i]), ("thread's sleep queue %d is not empty", i)); KASSERT(sq->sq_blockedcnt[i] == 0, ("thread's sleep queue %d count mismatches", i)); } KASSERT(LIST_EMPTY(&sq->sq_free), ("thread's sleep queue has a non-empty free list")); KASSERT(sq->sq_wchan == NULL, ("stale sq_wchan pointer")); sq->sq_lock = lock; #endif #ifdef SLEEPQUEUE_PROFILING sc->sc_depth++; if (sc->sc_depth > sc->sc_max_depth) { sc->sc_max_depth = sc->sc_depth; if (sc->sc_max_depth > sleepq_max_depth) sleepq_max_depth = sc->sc_max_depth; } #endif sq = td->td_sleepqueue; LIST_INSERT_HEAD(&sc->sc_queues, sq, sq_hash); sq->sq_wchan = wchan; sq->sq_type = flags & SLEEPQ_TYPE; } else { MPASS(wchan == sq->sq_wchan); MPASS(lock == sq->sq_lock); MPASS((flags & SLEEPQ_TYPE) == sq->sq_type); LIST_INSERT_HEAD(&sq->sq_free, td->td_sleepqueue, sq_hash); } thread_lock(td); TAILQ_INSERT_TAIL(&sq->sq_blocked[queue], td, td_slpq); sq->sq_blockedcnt[queue]++; td->td_sleepqueue = NULL; td->td_sqqueue = queue; td->td_wchan = wchan; td->td_wmesg = wmesg; if (flags & SLEEPQ_INTERRUPTIBLE) { td->td_flags |= TDF_SINTR; td->td_flags &= ~TDF_SLEEPABORT; } thread_unlock(td); } /* * Sets a timeout that will remove the current thread from the specified * sleep queue after timo ticks if the thread has not already been awakened. 
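 *
 * To show where this call sits in the KPI, a typical timed-sleep consumer
 * drives the functions in this file roughly as below.  This is an
 * illustrative sketch only (the "demo" wmesg, the lock and the sbt value
 * are hypothetical), not a copy of any particular caller:
 */
#if 0
        sleepq_lock(wchan);
        sleepq_add(wchan, &lock->lock_object, "demo", SLEEPQ_SLEEP, 0);
        sleepq_set_timeout_sbt(wchan, sbt, 0, C_HARDCLOCK);
        error = sleepq_timedwait(wchan, 0);     /* 0 or EWOULDBLOCK */
#endif
/*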
*/ void sleepq_set_timeout_sbt(void *wchan, sbintime_t sbt, sbintime_t pr, int flags) { struct sleepqueue_chain *sc; struct thread *td; sbintime_t pr1; td = curthread; sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); MPASS(TD_ON_SLEEPQ(td)); MPASS(td->td_sleepqueue == NULL); MPASS(wchan != NULL); if (cold && td == &thread0) panic("timed sleep before timers are working"); KASSERT(td->td_sleeptimo == 0, ("td %d %p td_sleeptimo %jx", td->td_tid, td, (uintmax_t)td->td_sleeptimo)); thread_lock(td); callout_when(sbt, pr, flags, &td->td_sleeptimo, &pr1); thread_unlock(td); callout_reset_sbt_on(&td->td_slpcallout, td->td_sleeptimo, pr1, sleepq_timeout, td, PCPU_GET(cpuid), flags | C_PRECALC | C_DIRECT_EXEC); } /* * Return the number of actual sleepers for the specified queue. */ u_int sleepq_sleepcnt(void *wchan, int queue) { struct sleepqueue *sq; KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); sq = sleepq_lookup(wchan); if (sq == NULL) return (0); return (sq->sq_blockedcnt[queue]); } /* * Marks the pending sleep of the current thread as interruptible and * makes an initial check for pending signals before putting a thread * to sleep. Enters and exits with the thread lock held. Thread lock * may have transitioned from the sleepq lock to a run lock. */ static int sleepq_catch_signals(void *wchan, int pri) { struct sleepqueue_chain *sc; struct sleepqueue *sq; struct thread *td; struct proc *p; struct sigacts *ps; int sig, ret; ret = 0; td = curthread; p = curproc; sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); MPASS(wchan != NULL); if ((td->td_pflags & TDP_WAKEUP) != 0) { td->td_pflags &= ~TDP_WAKEUP; ret = EINTR; thread_lock(td); goto out; } /* * See if there are any pending signals or suspension requests for this * thread. If not, we can switch immediately. */ thread_lock(td); if ((td->td_flags & (TDF_NEEDSIGCHK | TDF_NEEDSUSPCHK)) != 0) { thread_unlock(td); mtx_unlock_spin(&sc->sc_lock); CTR3(KTR_PROC, "sleepq catching signals: thread %p (pid %ld, %s)", (void *)td, (long)p->p_pid, td->td_name); PROC_LOCK(p); /* * Check for suspension first. Checking for signals and then * suspending could result in a missed signal, since a signal * can be delivered while this thread is suspended. */ if ((td->td_flags & TDF_NEEDSUSPCHK) != 0) { ret = thread_suspend_check(1); MPASS(ret == 0 || ret == EINTR || ret == ERESTART); if (ret != 0) { PROC_UNLOCK(p); mtx_lock_spin(&sc->sc_lock); thread_lock(td); goto out; } } if ((td->td_flags & TDF_NEEDSIGCHK) != 0) { ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); sig = cursig(td); if (sig == -1) { mtx_unlock(&ps->ps_mtx); KASSERT((td->td_flags & TDF_SBDRY) != 0, ("lost TDF_SBDRY")); KASSERT(TD_SBDRY_INTR(td), ("lost TDF_SERESTART or TDF_SEINTR")); KASSERT((td->td_flags & (TDF_SEINTR | TDF_SERESTART)) != (TDF_SEINTR | TDF_SERESTART), ("both TDF_SEINTR and TDF_SERESTART")); ret = TD_SBDRY_ERRNO(td); } else if (sig != 0) { ret = SIGISMEMBER(ps->ps_sigintr, sig) ? EINTR : ERESTART; mtx_unlock(&ps->ps_mtx); } else { mtx_unlock(&ps->ps_mtx); } } /* * Lock the per-process spinlock prior to dropping the PROC_LOCK * to avoid a signal delivery race. PROC_LOCK, PROC_SLOCK, and * thread_lock() are currently held in tdsendsignal(). */ PROC_SLOCK(p); mtx_lock_spin(&sc->sc_lock); PROC_UNLOCK(p); thread_lock(td); PROC_SUNLOCK(p); } if (ret == 0) { sleepq_switch(wchan, pri); return (0); } out: /* * There were pending signals and this thread is still * on the sleep queue; remove it from the sleep queue.
*/ if (TD_ON_SLEEPQ(td)) { sq = sleepq_lookup(wchan); if (sleepq_resume_thread(sq, td, 0)) { #ifdef INVARIANTS /* * This thread hasn't gone to sleep yet, so it * should not be swapped out. */ panic("not waking up swapper"); #endif } } mtx_unlock_spin(&sc->sc_lock); MPASS(td->td_lock != &sc->sc_lock); return (ret); } /* * Switches to another thread if we are still asleep on a sleep queue. * Returns with thread lock. */ static void sleepq_switch(void *wchan, int pri) { struct sleepqueue_chain *sc; struct sleepqueue *sq; struct thread *td; bool rtc_changed; td = curthread; sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); /* * If we have a sleep queue, then we've already been woken up, so * just return. */ if (td->td_sleepqueue != NULL) { mtx_unlock_spin(&sc->sc_lock); return; } /* * If TDF_TIMEOUT is set, then our sleep has been timed out * already but we are still on the sleep queue, so dequeue the * thread and return. * * Do the same if the real-time clock has been adjusted since this * thread calculated its timeout based on that clock. This handles * the following race: * - The Ts thread needs to sleep until an absolute real-clock time. * It copies the global rtc_generation into curthread->td_rtcgen, * reads the RTC, and calculates a sleep duration based on that time. * See umtxq_sleep() for an example. * - The Tc thread adjusts the RTC, bumps rtc_generation, and wakes * threads that are sleeping until an absolute real-clock time. * See tc_setclock() and the POSIX specification of clock_settime(). * - Ts reaches the code below. It holds the sleepqueue chain lock, * so Tc has finished waking, so this thread must test td_rtcgen. * (The declaration of td_rtcgen refers to this comment.) */ rtc_changed = td->td_rtcgen != 0 && td->td_rtcgen != rtc_generation; if ((td->td_flags & TDF_TIMEOUT) || rtc_changed) { if (rtc_changed) { td->td_rtcgen = 0; } MPASS(TD_ON_SLEEPQ(td)); sq = sleepq_lookup(wchan); if (sleepq_resume_thread(sq, td, 0)) { #ifdef INVARIANTS /* * This thread hasn't gone to sleep yet, so it * should not be swapped out. */ panic("not waking up swapper"); #endif } mtx_unlock_spin(&sc->sc_lock); return; } #ifdef SLEEPQUEUE_PROFILING if (prof_enabled) sleepq_profile(td->td_wmesg); #endif MPASS(td->td_sleepqueue == NULL); sched_sleep(td, pri); thread_lock_set(td, &sc->sc_lock); SDT_PROBE0(sched, , , sleep); TD_SET_SLEEPING(td); mi_switch(SW_VOL | SWT_SLEEPQ, NULL); KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING")); CTR3(KTR_PROC, "sleepq resume: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name); } /* * Check to see if we timed out. */ static int sleepq_check_timeout(void) { struct thread *td; int res; td = curthread; THREAD_LOCK_ASSERT(td, MA_OWNED); /* * If TDF_TIMEOUT is set, we timed out. But recheck * td_sleeptimo anyway. */ res = 0; if (td->td_sleeptimo != 0) { if (td->td_sleeptimo <= sbinuptime()) res = EWOULDBLOCK; td->td_sleeptimo = 0; } if (td->td_flags & TDF_TIMEOUT) td->td_flags &= ~TDF_TIMEOUT; else /* * We ignore the situation where timeout subsystem was * unable to stop our callout. The struct thread is * type-stable, the callout will use the correct * memory when running. The checks of the * td_sleeptimo value in this function and in * sleepq_timeout() ensure that the thread does not * get spurious wakeups, even if the callout was reset * or thread reused. */ callout_stop(&td->td_slpcallout); return (res); } /* * Check to see if we were awoken by a signal. 
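 *
 * (This is the consumer half of the sleepq_abort() handshake below:
 * the aborting thread stores an errno value in td_intrval and sets
 * TDF_SLEEPABORT, and the awakened thread reports that value from
 * here once it is running again.)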
*/ static int sleepq_check_signals(void) { struct thread *td; td = curthread; THREAD_LOCK_ASSERT(td, MA_OWNED); /* We are no longer in an interruptible sleep. */ if (td->td_flags & TDF_SINTR) td->td_flags &= ~TDF_SINTR; if (td->td_flags & TDF_SLEEPABORT) { td->td_flags &= ~TDF_SLEEPABORT; return (td->td_intrval); } return (0); } /* * Block the current thread until it is awakened from its sleep queue. */ void sleepq_wait(void *wchan, int pri) { struct thread *td; td = curthread; MPASS(!(td->td_flags & TDF_SINTR)); thread_lock(td); sleepq_switch(wchan, pri); thread_unlock(td); } /* * Block the current thread until it is awakened from its sleep queue * or it is interrupted by a signal. */ int sleepq_wait_sig(void *wchan, int pri) { int rcatch; int rval; rcatch = sleepq_catch_signals(wchan, pri); rval = sleepq_check_signals(); thread_unlock(curthread); if (rcatch) return (rcatch); return (rval); } /* * Block the current thread until it is awakened from its sleep queue * or it times out while waiting. */ int sleepq_timedwait(void *wchan, int pri) { struct thread *td; int rval; td = curthread; MPASS(!(td->td_flags & TDF_SINTR)); thread_lock(td); sleepq_switch(wchan, pri); rval = sleepq_check_timeout(); thread_unlock(td); return (rval); } /* * Block the current thread until it is awakened from its sleep queue, * it is interrupted by a signal, or it times out waiting to be awakened. */ int sleepq_timedwait_sig(void *wchan, int pri) { int rcatch, rvalt, rvals; rcatch = sleepq_catch_signals(wchan, pri); rvalt = sleepq_check_timeout(); rvals = sleepq_check_signals(); thread_unlock(curthread); if (rcatch) return (rcatch); if (rvals) return (rvals); return (rvalt); } /* * Returns the type of sleepqueue given a waitchannel. */ int sleepq_type(void *wchan) { struct sleepqueue *sq; int type; MPASS(wchan != NULL); sleepq_lock(wchan); sq = sleepq_lookup(wchan); if (sq == NULL) { sleepq_release(wchan); return (-1); } type = sq->sq_type; sleepq_release(wchan); return (type); } /* * Removes a thread from a sleep queue and makes it * runnable. */ static int sleepq_resume_thread(struct sleepqueue *sq, struct thread *td, int pri) { struct sleepqueue_chain *sc; MPASS(td != NULL); MPASS(sq->sq_wchan != NULL); MPASS(td->td_wchan == sq->sq_wchan); MPASS(td->td_sqqueue < NR_SLEEPQS && td->td_sqqueue >= 0); THREAD_LOCK_ASSERT(td, MA_OWNED); sc = SC_LOOKUP(sq->sq_wchan); mtx_assert(&sc->sc_lock, MA_OWNED); SDT_PROBE2(sched, , , wakeup, td, td->td_proc); /* Remove the thread from the queue. */ sq->sq_blockedcnt[td->td_sqqueue]--; TAILQ_REMOVE(&sq->sq_blocked[td->td_sqqueue], td, td_slpq); /* * Get a sleep queue for this thread. If this is the last waiter, * use the queue itself and take it out of the chain, otherwise, * remove a queue from the free list. */ if (LIST_EMPTY(&sq->sq_free)) { td->td_sleepqueue = sq; #ifdef INVARIANTS sq->sq_wchan = NULL; #endif #ifdef SLEEPQUEUE_PROFILING sc->sc_depth--; #endif } else td->td_sleepqueue = LIST_FIRST(&sq->sq_free); LIST_REMOVE(td->td_sleepqueue, sq_hash); td->td_wmesg = NULL; td->td_wchan = NULL; td->td_flags &= ~TDF_SINTR; CTR3(KTR_PROC, "sleepq_wakeup: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, td->td_name); /* Adjust priority if requested. */ MPASS(pri == 0 || (pri >= PRI_MIN && pri <= PRI_MAX)); if (pri != 0 && td->td_priority > pri && PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) sched_prio(td, pri); /* * Note that thread td might not be sleeping if it is running * sleepq_catch_signals() on another CPU or is blocked on its * proc lock to check signals. 
There's no need to mark the * thread runnable in that case. */ if (TD_IS_SLEEPING(td)) { TD_CLR_SLEEPING(td); return (setrunnable(td)); } return (0); } #ifdef INVARIANTS /* * UMA zone item deallocator. */ static void sleepq_dtor(void *mem, int size, void *arg) { struct sleepqueue *sq; int i; sq = mem; for (i = 0; i < NR_SLEEPQS; i++) { MPASS(TAILQ_EMPTY(&sq->sq_blocked[i])); MPASS(sq->sq_blockedcnt[i] == 0); } } #endif /* * UMA zone item initializer. */ static int sleepq_init(void *mem, int size, int flags) { struct sleepqueue *sq; int i; bzero(mem, size); sq = mem; for (i = 0; i < NR_SLEEPQS; i++) { TAILQ_INIT(&sq->sq_blocked[i]); sq->sq_blockedcnt[i] = 0; } LIST_INIT(&sq->sq_free); return (0); } /* * Find the highest priority thread sleeping on a wait channel and resume it. */ int sleepq_signal(void *wchan, int flags, int pri, int queue) { struct sleepqueue *sq; struct thread *td, *besttd; int wakeup_swapper; CTR2(KTR_PROC, "sleepq_signal(%p, %d)", wchan, flags); KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); sq = sleepq_lookup(wchan); if (sq == NULL) return (0); KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE), ("%s: mismatch between sleep/wakeup and cv_*", __func__)); /* * Find the highest priority thread on the queue. If there is a * tie, use the thread that first appears in the queue as it has * been sleeping the longest since threads are always added to * the tail of sleep queues. */ besttd = TAILQ_FIRST(&sq->sq_blocked[queue]); TAILQ_FOREACH(td, &sq->sq_blocked[queue], td_slpq) { if (td->td_priority < besttd->td_priority) besttd = td; } MPASS(besttd != NULL); thread_lock(besttd); wakeup_swapper = sleepq_resume_thread(sq, besttd, pri); thread_unlock(besttd); return (wakeup_swapper); } static bool match_any(struct thread *td __unused) { return (true); } /* * Resume all threads sleeping on a specified wait channel. */ int sleepq_broadcast(void *wchan, int flags, int pri, int queue) { struct sleepqueue *sq; CTR2(KTR_PROC, "sleepq_broadcast(%p, %d)", wchan, flags); KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); sq = sleepq_lookup(wchan); if (sq == NULL) return (0); KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE), ("%s: mismatch between sleep/wakeup and cv_*", __func__)); return (sleepq_remove_matching(sq, queue, match_any, pri)); } /* * Resume threads on the sleep queue that match the given predicate. */ int sleepq_remove_matching(struct sleepqueue *sq, int queue, bool (*matches)(struct thread *), int pri) { struct thread *td, *tdn; int wakeup_swapper; /* * The last thread will be given ownership of sq and may * re-enqueue itself before sleepq_resume_thread() returns, * so we must cache the "next" queue item at the beginning * of the final iteration. */ wakeup_swapper = 0; TAILQ_FOREACH_SAFE(td, &sq->sq_blocked[queue], td_slpq, tdn) { thread_lock(td); if (matches(td)) wakeup_swapper |= sleepq_resume_thread(sq, td, pri); thread_unlock(td); } return (wakeup_swapper); } /* * Time sleeping threads out. When the timeout expires, the thread is * removed from the sleep queue and made runnable if it is still asleep. 
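 *
 * (An aside on sleepq_signal() above: it picks its wakeup target with a
 * single head-to-tail scan of the queue.  The fragment below is a
 * userspace sketch of that selection loop, for illustration only; the
 * demo_* names are hypothetical.)
 */
#if 0
#include <stddef.h>

struct demo_waiter {
        int                      prio;  /* numerically lower = more urgent */
        struct demo_waiter      *next;  /* queue order == arrival order */
};

/*
 * Keep the first entry seen at the best priority; because waiters are
 * appended at the tail, a tie goes to the longest sleeper.
 */
static struct demo_waiter *
demo_pick(struct demo_waiter *head)
{
        struct demo_waiter *w, *best;

        best = head;
        for (w = head; w != NULL; w = w->next)
                if (w->prio < best->prio)
                        best = w;
        return (best);
}
#endif
/*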
*/ static void sleepq_timeout(void *arg) { struct sleepqueue_chain *sc; struct sleepqueue *sq; struct thread *td; void *wchan; int wakeup_swapper; td = arg; wakeup_swapper = 0; CTR3(KTR_PROC, "sleepq_timeout: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name); thread_lock(td); if (td->td_sleeptimo > sbinuptime() || td->td_sleeptimo == 0) { /* * The thread does not want a timeout (yet). */ } else if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) { /* * See if the thread is asleep and get the wait * channel if it is. */ wchan = td->td_wchan; sc = SC_LOOKUP(wchan); THREAD_LOCKPTR_ASSERT(td, &sc->sc_lock); sq = sleepq_lookup(wchan); MPASS(sq != NULL); td->td_flags |= TDF_TIMEOUT; wakeup_swapper = sleepq_resume_thread(sq, td, 0); } else if (TD_ON_SLEEPQ(td)) { /* * If the thread is on the SLEEPQ but isn't sleeping * yet, it can either be on another CPU in between * sleepq_add() and one of the sleepq_*wait*() * routines or it can be in sleepq_catch_signals(). */ td->td_flags |= TDF_TIMEOUT; } thread_unlock(td); if (wakeup_swapper) kick_proc0(); } /* * Resumes a specific thread from the sleep queue associated with a specific * wait channel if it is on that queue. */ void sleepq_remove(struct thread *td, void *wchan) { struct sleepqueue *sq; int wakeup_swapper; /* * Look up the sleep queue for this wait channel, then re-check * that the thread is asleep on that channel, if it is not, then * bail. */ MPASS(wchan != NULL); sleepq_lock(wchan); sq = sleepq_lookup(wchan); /* * We can not lock the thread here as it may be sleeping on a * different sleepq. However, holding the sleepq lock for this * wchan can guarantee that we do not miss a wakeup for this * channel. The asserts below will catch any false positives. */ if (!TD_ON_SLEEPQ(td) || td->td_wchan != wchan) { sleepq_release(wchan); return; } /* Thread is asleep on sleep queue sq, so wake it up. */ thread_lock(td); MPASS(sq != NULL); MPASS(td->td_wchan == wchan); wakeup_swapper = sleepq_resume_thread(sq, td, 0); thread_unlock(td); sleepq_release(wchan); if (wakeup_swapper) kick_proc0(); } /* * Abort a thread as if an interrupt had occurred. Only abort * interruptible waits (unfortunately it isn't safe to abort others). */ int sleepq_abort(struct thread *td, int intrval) { struct sleepqueue *sq; void *wchan; THREAD_LOCK_ASSERT(td, MA_OWNED); MPASS(TD_ON_SLEEPQ(td)); MPASS(td->td_flags & TDF_SINTR); MPASS(intrval == EINTR || intrval == ERESTART); /* * If the TDF_TIMEOUT flag is set, just leave. A * timeout is scheduled anyhow. */ if (td->td_flags & TDF_TIMEOUT) return (0); CTR3(KTR_PROC, "sleepq_abort: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name); td->td_intrval = intrval; td->td_flags |= TDF_SLEEPABORT; /* * If the thread has not slept yet it will find the signal in * sleepq_catch_signals() and call sleepq_resume_thread. Otherwise * we have to do it here. */ if (!TD_IS_SLEEPING(td)) return (0); wchan = td->td_wchan; MPASS(wchan != NULL); sq = sleepq_lookup(wchan); MPASS(sq != NULL); /* Thread is asleep on sleep queue sq, so wake it up. 
*/ return (sleepq_resume_thread(sq, td, 0)); } void sleepq_chains_remove_matching(bool (*matches)(struct thread *)) { struct sleepqueue_chain *sc; struct sleepqueue *sq; int i, wakeup_swapper; wakeup_swapper = 0; for (sc = &sleepq_chains[0]; sc < sleepq_chains + SC_TABLESIZE; ++sc) { if (LIST_EMPTY(&sc->sc_queues)) { continue; } mtx_lock_spin(&sc->sc_lock); LIST_FOREACH(sq, &sc->sc_queues, sq_hash) { for (i = 0; i < NR_SLEEPQS; ++i) { wakeup_swapper |= sleepq_remove_matching(sq, i, matches, 0); } } mtx_unlock_spin(&sc->sc_lock); } if (wakeup_swapper) { kick_proc0(); } } /* * Prints the stacks of all threads presently sleeping on wchan/queue to * the sbuf sb. Sets count_stacks_printed to the number of stacks actually * printed. Typically, this will equal the number of threads sleeping on the * queue, but may be less if sb overflowed before all stacks were printed. */ #ifdef STACK int sleepq_sbuf_print_stacks(struct sbuf *sb, void *wchan, int queue, int *count_stacks_printed) { struct thread *td, *td_next; struct sleepqueue *sq; struct stack **st; struct sbuf **td_infos; int i, stack_idx, error, stacks_to_allocate; bool finished, partial_print; error = 0; finished = false; partial_print = false; KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); stacks_to_allocate = 10; for (i = 0; i < 3 && !finished ; i++) { /* We cannot malloc while holding the queue's spinlock, so * we do our mallocs now, and hope it is enough. If it * isn't, we will free these, drop the lock, malloc more, * and try again, up to a point. After that point we will * give up and report ENOMEM. We also cannot write to sb * during this time since the client may have set the * SBUF_AUTOEXTEND flag on their sbuf, which could cause a * malloc as we print to it. So we defer actually printing * to sb until after we drop the spinlock. */ /* Where we will store the stacks. */ st = malloc(sizeof(struct stack *) * stacks_to_allocate, M_TEMP, M_WAITOK); for (stack_idx = 0; stack_idx < stacks_to_allocate; stack_idx++) st[stack_idx] = stack_create(M_WAITOK); /* Where we will store the td name, tid, etc. */ td_infos = malloc(sizeof(struct sbuf *) * stacks_to_allocate, M_TEMP, M_WAITOK); for (stack_idx = 0; stack_idx < stacks_to_allocate; stack_idx++) td_infos[stack_idx] = sbuf_new(NULL, NULL, MAXCOMLEN + sizeof(struct thread *) * 2 + 40, SBUF_FIXEDLEN); sleepq_lock(wchan); sq = sleepq_lookup(wchan); if (sq == NULL) { /* This sleepq does not exist; exit and return ENOENT. */ error = ENOENT; finished = true; sleepq_release(wchan); goto loop_end; } stack_idx = 0; /* Save thread info */ TAILQ_FOREACH_SAFE(td, &sq->sq_blocked[queue], td_slpq, td_next) { if (stack_idx >= stacks_to_allocate) goto loop_end; /* Note the td_lock is equal to the sleepq_lock here. 
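 * Holding that single chain lock therefore pins every thread on this
 * queue: none of them can be scheduled back in while we walk their
 * stacks, which is what makes the stack_save_td() call below safe.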
*/ stack_save_td(st[stack_idx], td); sbuf_printf(td_infos[stack_idx], "%d: %s %p", td->td_tid, td->td_name, td); ++stack_idx; } finished = true; sleepq_release(wchan); /* Print the stacks */ for (i = 0; i < stack_idx; i++) { sbuf_finish(td_infos[i]); sbuf_printf(sb, "--- thread %s: ---\n", sbuf_data(td_infos[i])); stack_sbuf_print(sb, st[i]); sbuf_printf(sb, "\n"); error = sbuf_error(sb); if (error == 0) *count_stacks_printed = stack_idx; } loop_end: if (!finished) sleepq_release(wchan); for (stack_idx = 0; stack_idx < stacks_to_allocate; stack_idx++) stack_destroy(st[stack_idx]); for (stack_idx = 0; stack_idx < stacks_to_allocate; stack_idx++) sbuf_delete(td_infos[stack_idx]); free(st, M_TEMP); free(td_infos, M_TEMP); stacks_to_allocate *= 10; } if (!finished && error == 0) error = ENOMEM; return (error); } #endif #ifdef SLEEPQUEUE_PROFILING #define SLEEPQ_PROF_LOCATIONS 1024 #define SLEEPQ_SBUFSIZE 512 struct sleepq_prof { LIST_ENTRY(sleepq_prof) sp_link; const char *sp_wmesg; long sp_count; }; LIST_HEAD(sqphead, sleepq_prof); struct sqphead sleepq_prof_free; struct sqphead sleepq_hash[SC_TABLESIZE]; static struct sleepq_prof sleepq_profent[SLEEPQ_PROF_LOCATIONS]; static struct mtx sleepq_prof_lock; MTX_SYSINIT(sleepq_prof_lock, &sleepq_prof_lock, "sleepq_prof", MTX_SPIN); static void sleepq_profile(const char *wmesg) { struct sleepq_prof *sp; mtx_lock_spin(&sleepq_prof_lock); if (prof_enabled == 0) goto unlock; LIST_FOREACH(sp, &sleepq_hash[SC_HASH(wmesg)], sp_link) if (sp->sp_wmesg == wmesg) goto done; sp = LIST_FIRST(&sleepq_prof_free); if (sp == NULL) goto unlock; sp->sp_wmesg = wmesg; LIST_REMOVE(sp, sp_link); LIST_INSERT_HEAD(&sleepq_hash[SC_HASH(wmesg)], sp, sp_link); done: sp->sp_count++; unlock: mtx_unlock_spin(&sleepq_prof_lock); return; } static void sleepq_prof_reset(void) { struct sleepq_prof *sp; int enabled; int i; mtx_lock_spin(&sleepq_prof_lock); enabled = prof_enabled; prof_enabled = 0; for (i = 0; i < SC_TABLESIZE; i++) LIST_INIT(&sleepq_hash[i]); LIST_INIT(&sleepq_prof_free); for (i = 0; i < SLEEPQ_PROF_LOCATIONS; i++) { sp = &sleepq_profent[i]; sp->sp_wmesg = NULL; sp->sp_count = 0; LIST_INSERT_HEAD(&sleepq_prof_free, sp, sp_link); } prof_enabled = enabled; mtx_unlock_spin(&sleepq_prof_lock); } static int enable_sleepq_prof(SYSCTL_HANDLER_ARGS) { int error, v; v = prof_enabled; error = sysctl_handle_int(oidp, &v, v, req); if (error) return (error); if (req->newptr == NULL) return (error); if (v == prof_enabled) return (0); if (v == 1) sleepq_prof_reset(); mtx_lock_spin(&sleepq_prof_lock); prof_enabled = !!v; mtx_unlock_spin(&sleepq_prof_lock); return (0); } static int reset_sleepq_prof_stats(SYSCTL_HANDLER_ARGS) { int error, v; v = 0; error = sysctl_handle_int(oidp, &v, 0, req); if (error) return (error); if (req->newptr == NULL) return (error); if (v == 0) return (0); sleepq_prof_reset(); return (0); } static int dump_sleepq_prof_stats(SYSCTL_HANDLER_ARGS) { struct sleepq_prof *sp; struct sbuf *sb; int enabled; int error; int i; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, SLEEPQ_SBUFSIZE, req); sbuf_printf(sb, "\nwmesg\tcount\n"); enabled = prof_enabled; mtx_lock_spin(&sleepq_prof_lock); prof_enabled = 0; mtx_unlock_spin(&sleepq_prof_lock); for (i = 0; i < SC_TABLESIZE; i++) { LIST_FOREACH(sp, &sleepq_hash[i], sp_link) { sbuf_printf(sb, "%s\t%ld\n", sp->sp_wmesg, sp->sp_count); } } mtx_lock_spin(&sleepq_prof_lock); prof_enabled = enabled; mtx_unlock_spin(&sleepq_prof_lock); error = sbuf_finish(sb); 
sbuf_delete(sb); return (error); } SYSCTL_PROC(_debug_sleepq, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, dump_sleepq_prof_stats, "A", "Sleepqueue profiling statistics"); SYSCTL_PROC(_debug_sleepq, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, reset_sleepq_prof_stats, "I", "Reset sleepqueue profiling statistics"); SYSCTL_PROC(_debug_sleepq, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, enable_sleepq_prof, "I", "Enable sleepqueue profiling"); #endif #ifdef DDB DB_SHOW_COMMAND(sleepq, db_show_sleepqueue) { struct sleepqueue_chain *sc; struct sleepqueue *sq; #ifdef INVARIANTS struct lock_object *lock; #endif struct thread *td; void *wchan; int i; if (!have_addr) return; /* * First, see if there is an active sleep queue for the wait channel * indicated by the address. */ wchan = (void *)addr; sc = SC_LOOKUP(wchan); LIST_FOREACH(sq, &sc->sc_queues, sq_hash) if (sq->sq_wchan == wchan) goto found; /* * Second, see if there is an active sleep queue at the address * indicated. */ for (i = 0; i < SC_TABLESIZE; i++) LIST_FOREACH(sq, &sleepq_chains[i].sc_queues, sq_hash) { if (sq == (struct sleepqueue *)addr) goto found; } db_printf("Unable to locate a sleep queue via %p\n", (void *)addr); return; found: db_printf("Wait channel: %p\n", sq->sq_wchan); db_printf("Queue type: %d\n", sq->sq_type); #ifdef INVARIANTS if (sq->sq_lock) { lock = sq->sq_lock; db_printf("Associated Interlock: %p - (%s) %s\n", lock, LOCK_CLASS(lock)->lc_name, lock->lo_name); } #endif db_printf("Blocked threads:\n"); for (i = 0; i < NR_SLEEPQS; i++) { db_printf("\nQueue[%d]:\n", i); if (TAILQ_EMPTY(&sq->sq_blocked[i])) db_printf("\tempty\n"); else TAILQ_FOREACH(td, &sq->sq_blocked[i], td_slpq) { db_printf("\t%p (tid %d, pid %d, \"%s\")\n", td, td->td_tid, td->td_proc->p_pid, td->td_name); } db_printf("(expected: %u)\n", sq->sq_blockedcnt[i]); } } /* Alias 'show sleepqueue' to 'show sleepq'. */ DB_SHOW_ALIAS(sleepqueue, db_show_sleepqueue); #endif Index: head/sys/kern/subr_smp.c =================================================================== --- head/sys/kern/subr_smp.c (revision 326270) +++ head/sys/kern/subr_smp.c (revision 326271) @@ -1,1152 +1,1154 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2001, John Baldwin . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ /* * This module holds the global variables and machine independent functions * used for the kernel SMP support. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_sched.h" #ifdef SMP MALLOC_DEFINE(M_TOPO, "toponodes", "SMP topology data"); volatile cpuset_t stopped_cpus; volatile cpuset_t started_cpus; volatile cpuset_t suspended_cpus; cpuset_t hlt_cpus_mask; cpuset_t logical_cpus_mask; void (*cpustop_restartfunc)(void); #endif static int sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS); /* This is used in modules that need to work in both SMP and UP. */ cpuset_t all_cpus; int mp_ncpus; /* export this for libkvm consumers. */ int mp_maxcpus = MAXCPU; volatile int smp_started; u_int mp_maxid; static SYSCTL_NODE(_kern, OID_AUTO, smp, CTLFLAG_RD|CTLFLAG_CAPRD, NULL, "Kernel SMP"); SYSCTL_INT(_kern_smp, OID_AUTO, maxid, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxid, 0, "Max CPU ID."); SYSCTL_INT(_kern_smp, OID_AUTO, maxcpus, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxcpus, 0, "Max number of CPUs that the system was compiled for."); SYSCTL_PROC(_kern_smp, OID_AUTO, active, CTLFLAG_RD|CTLTYPE_INT|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_smp_active, "I", "Indicates system is running in SMP mode"); int smp_disabled = 0; /* has smp been disabled? */ SYSCTL_INT(_kern_smp, OID_AUTO, disabled, CTLFLAG_RDTUN|CTLFLAG_CAPRD, &smp_disabled, 0, "SMP has been disabled from the loader"); int smp_cpus = 1; /* how many cpu's running */ SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD|CTLFLAG_CAPRD, &smp_cpus, 0, "Number of CPUs online"); int smp_topology = 0; /* Which topology we're using. */ SYSCTL_INT(_kern_smp, OID_AUTO, topology, CTLFLAG_RDTUN, &smp_topology, 0, "Topology override setting; 0 is default provided by hardware."); #ifdef SMP /* Enable forwarding of a signal to a process running on a different CPU */ static int forward_signal_enabled = 1; SYSCTL_INT(_kern_smp, OID_AUTO, forward_signal_enabled, CTLFLAG_RW, &forward_signal_enabled, 0, "Forwarding of a signal to a process on a different CPU"); /* Variables needed for SMP rendezvous. */ static volatile int smp_rv_ncpus; static void (*volatile smp_rv_setup_func)(void *arg); static void (*volatile smp_rv_action_func)(void *arg); static void (*volatile smp_rv_teardown_func)(void *arg); static void *volatile smp_rv_func_arg; static volatile int smp_rv_waiters[4]; /* * Shared mutex to restrict busywaits between smp_rendezvous() and * smp(_targeted)_tlb_shootdown(). A deadlock occurs if both of these * functions trigger at once and cause multiple CPUs to busywait with * interrupts disabled. */ struct mtx smp_ipi_mtx; /* * Let the MD SMP code initialize mp_maxid very early if it can. */ static void mp_setmaxid(void *dummy) { cpu_mp_setmaxid(); KASSERT(mp_ncpus >= 1, ("%s: CPU count < 1", __func__)); KASSERT(mp_ncpus > 1 || mp_maxid == 0, ("%s: one CPU but mp_maxid is not zero", __func__)); KASSERT(mp_maxid >= mp_ncpus - 1, ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, mp_ncpus)); } SYSINIT(cpu_mp_setmaxid, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setmaxid, NULL); /* * Call the MD SMP initialization code. */ static void mp_start(void *dummy) { mtx_init(&smp_ipi_mtx, "smp rendezvous", NULL, MTX_SPIN); /* Probe for MP hardware. 
*/ if (smp_disabled != 0 || cpu_mp_probe() == 0) { mp_ncpus = 1; CPU_SETOF(PCPU_GET(cpuid), &all_cpus); return; } cpu_mp_start(); printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n", mp_ncpus); cpu_mp_announce(); } SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_THIRD, mp_start, NULL); void forward_signal(struct thread *td) { int id; /* * signotify() has already set TDF_ASTPENDING and TDF_NEEDSIGCHK on * this thread, so all we need to do is poke it if it is currently * executing so that it executes ast(). */ THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_RUNNING(td), ("forward_signal: thread is not TDS_RUNNING")); CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc); if (!smp_started || cold || panicstr) return; if (!forward_signal_enabled) return; /* No need to IPI ourself. */ if (td == curthread) return; id = td->td_oncpu; if (id == NOCPU) return; ipi_cpu(id, IPI_AST); } /* * When called, the executing CPU will send an IPI to all other CPUs * requesting that they halt execution. * * Usually (but not necessarily) called with 'other_cpus' as its arg. * * - Signals all CPUs in map to stop. * - Waits for each to stop. * * Returns: * -1: error * 0: NA * 1: ok * */ #if defined(__amd64__) || defined(__i386__) #define X86 1 #else #define X86 0 #endif static int generic_stop_cpus(cpuset_t map, u_int type) { #ifdef KTR char cpusetbuf[CPUSETBUFSIZ]; #endif static volatile u_int stopping_cpu = NOCPU; int i; volatile cpuset_t *cpus; KASSERT( type == IPI_STOP || type == IPI_STOP_HARD #if X86 || type == IPI_SUSPEND #endif , ("%s: invalid stop type", __func__)); if (!smp_started) return (0); CTR2(KTR_SMP, "stop_cpus(%s) with %u type", cpusetobj_strprint(cpusetbuf, &map), type); #if X86 /* * When suspending, ensure there are no IPIs in progress. * IPIs that have been issued, but not yet delivered (e.g. * not pending on a vCPU when running under virtualization) * will be lost, violating FreeBSD's assumption of reliable * IPI delivery. */ if (type == IPI_SUSPEND) mtx_lock_spin(&smp_ipi_mtx); #endif #if X86 if (!nmi_is_broadcast || nmi_kdb_lock == 0) { #endif if (stopping_cpu != PCPU_GET(cpuid)) while (atomic_cmpset_int(&stopping_cpu, NOCPU, PCPU_GET(cpuid)) == 0) while (stopping_cpu != NOCPU) cpu_spinwait(); /* spin */ /* send the stop IPI to all CPUs in map */ ipi_selected(map, type); #if X86 } #endif #if X86 if (type == IPI_SUSPEND) cpus = &suspended_cpus; else #endif cpus = &stopped_cpus; i = 0; while (!CPU_SUBSET(cpus, &map)) { /* spin */ cpu_spinwait(); i++; if (i == 100000000) { printf("timeout stopping cpus\n"); break; } } #if X86 if (type == IPI_SUSPEND) mtx_unlock_spin(&smp_ipi_mtx); #endif stopping_cpu = NOCPU; return (1); } int stop_cpus(cpuset_t map) { return (generic_stop_cpus(map, IPI_STOP)); } int stop_cpus_hard(cpuset_t map) { return (generic_stop_cpus(map, IPI_STOP_HARD)); } #if X86 int suspend_cpus(cpuset_t map) { return (generic_stop_cpus(map, IPI_SUSPEND)); } #endif /* * Called by a CPU to restart stopped CPUs. * * Usually (but not necessarily) called with 'stopped_cpus' as its arg. * * - Signals all CPUs in map to restart. * - Waits for each to restart.
* * Returns: * -1: error * 0: NA * 1: ok */ static int generic_restart_cpus(cpuset_t map, u_int type) { #ifdef KTR char cpusetbuf[CPUSETBUFSIZ]; #endif volatile cpuset_t *cpus; KASSERT(type == IPI_STOP || type == IPI_STOP_HARD #if X86 || type == IPI_SUSPEND #endif , ("%s: invalid stop type", __func__)); if (!smp_started) return (0); CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map)); #if X86 if (type == IPI_SUSPEND) cpus = &suspended_cpus; else #endif cpus = &stopped_cpus; /* signal other cpus to restart */ CPU_COPY_STORE_REL(&map, &started_cpus); #if X86 if (!nmi_is_broadcast || nmi_kdb_lock == 0) { #endif /* wait for each to clear its bit */ while (CPU_OVERLAP(cpus, &map)) cpu_spinwait(); #if X86 } #endif return (1); } int restart_cpus(cpuset_t map) { return (generic_restart_cpus(map, IPI_STOP)); } #if X86 int resume_cpus(cpuset_t map) { return (generic_restart_cpus(map, IPI_SUSPEND)); } #endif #undef X86 /* * All-CPU rendezvous. CPUs are signalled, all execute the setup function * (if specified), rendezvous, execute the action function (if specified), * rendezvous again, execute the teardown function (if specified), and then * resume. * * Note that the supplied external functions _must_ be reentrant and aware * that they are running in parallel and in an unknown lock context. */ void smp_rendezvous_action(void) { struct thread *td; void *local_func_arg; void (*local_setup_func)(void*); void (*local_action_func)(void*); void (*local_teardown_func)(void*); #ifdef INVARIANTS int owepreempt; #endif /* Ensure we have up-to-date values. */ atomic_add_acq_int(&smp_rv_waiters[0], 1); while (smp_rv_waiters[0] < smp_rv_ncpus) cpu_spinwait(); /* Fetch rendezvous parameters after acquire barrier. */ local_func_arg = smp_rv_func_arg; local_setup_func = smp_rv_setup_func; local_action_func = smp_rv_action_func; local_teardown_func = smp_rv_teardown_func; /* * Use a nested critical section to prevent any preemptions * from occurring during a rendezvous action routine. * Specifically, if a rendezvous handler is invoked via an IPI * and the interrupted thread was in the critical_exit() * function after setting td_critnest to 0 but before * performing a deferred preemption, this routine can be * invoked with td_critnest set to 0 and td_owepreempt true. * In that case, a critical_exit() during the rendezvous * action would trigger a preemption which is not permitted in * a rendezvous action. To fix this, wrap all of the * rendezvous action handlers in a critical section. We * cannot use a regular critical section however as having * critical_exit() preempt from this routine would also be * problematic (the preemption must not occur before the IPI * has been acknowledged via an EOI). Instead, we * intentionally ignore td_owepreempt when leaving the * critical section. This should be harmless because we do * not permit rendezvous action routines to schedule threads, * and thus td_owepreempt should never transition from 0 to 1 * during this routine. */ td = curthread; td->td_critnest++; #ifdef INVARIANTS owepreempt = td->td_owepreempt; #endif /* * If requested, run a setup function before the main action * function. Ensure all CPUs have completed the setup * function before moving on to the action function. 
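 *
 * (Each barrier in this function has the same shape: every
 * participating CPU atomically increments smp_rv_waiters[n] and then
 * spins until the counter reaches smp_rv_ncpus, so no CPU can begin
 * stage n+1 until all CPUs have finished stage n.)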
*/ if (local_setup_func != smp_no_rendezvous_barrier) { if (smp_rv_setup_func != NULL) smp_rv_setup_func(smp_rv_func_arg); atomic_add_int(&smp_rv_waiters[1], 1); while (smp_rv_waiters[1] < smp_rv_ncpus) cpu_spinwait(); } if (local_action_func != NULL) local_action_func(local_func_arg); if (local_teardown_func != smp_no_rendezvous_barrier) { /* * Signal that the main action has been completed. If a * full exit rendezvous is requested, then all CPUs will * wait here until all CPUs have finished the main action. */ atomic_add_int(&smp_rv_waiters[2], 1); while (smp_rv_waiters[2] < smp_rv_ncpus) cpu_spinwait(); if (local_teardown_func != NULL) local_teardown_func(local_func_arg); } /* * Signal that the rendezvous is fully completed by this CPU. * This means that no member of smp_rv_* pseudo-structure will be * accessed by this target CPU after this point; in particular, * memory pointed to by smp_rv_func_arg. * * The release semantic ensures that all accesses performed by * the current CPU are visible when smp_rendezvous_cpus() * returns, by synchronizing with the * atomic_load_acq_int(&smp_rv_waiters[3]). */ atomic_add_rel_int(&smp_rv_waiters[3], 1); td->td_critnest--; KASSERT(owepreempt == td->td_owepreempt, ("rendezvous action changed td_owepreempt")); } void smp_rendezvous_cpus(cpuset_t map, void (* setup_func)(void *), void (* action_func)(void *), void (* teardown_func)(void *), void *arg) { int curcpumap, i, ncpus = 0; /* See the comments in the !SMP case. */ if (!smp_started) { spinlock_enter(); if (setup_func != NULL) setup_func(arg); if (action_func != NULL) action_func(arg); if (teardown_func != NULL) teardown_func(arg); spinlock_exit(); return; } CPU_FOREACH(i) { if (CPU_ISSET(i, &map)) ncpus++; } if (ncpus == 0) panic("ncpus is 0 with non-zero map"); mtx_lock_spin(&smp_ipi_mtx); /* Pass rendezvous parameters via global variables. */ smp_rv_ncpus = ncpus; smp_rv_setup_func = setup_func; smp_rv_action_func = action_func; smp_rv_teardown_func = teardown_func; smp_rv_func_arg = arg; smp_rv_waiters[1] = 0; smp_rv_waiters[2] = 0; smp_rv_waiters[3] = 0; atomic_store_rel_int(&smp_rv_waiters[0], 0); /* * Signal other processors, which will enter the IPI with * interrupts off. */ curcpumap = CPU_ISSET(curcpu, &map); CPU_CLR(curcpu, &map); ipi_selected(map, IPI_RENDEZVOUS); /* Check if the current CPU is in the map */ if (curcpumap != 0) smp_rendezvous_action(); /* * Ensure that the master CPU waits for all the other * CPUs to finish the rendezvous, so that smp_rv_* * pseudo-structure and the arg are guaranteed to not * be in use. * * Load acquire synchronizes with the release add in * smp_rendezvous_action(), which ensures that our caller sees * all memory actions done by the called functions on other * CPUs. */ while (atomic_load_acq_int(&smp_rv_waiters[3]) < ncpus) cpu_spinwait(); mtx_unlock_spin(&smp_ipi_mtx); } void smp_rendezvous(void (* setup_func)(void *), void (* action_func)(void *), void (* teardown_func)(void *), void *arg) { smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg); } static struct cpu_group group[MAXCPU * MAX_CACHE_LEVELS + 1]; struct cpu_group * smp_topo(void) { char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ]; struct cpu_group *top; /* * Check for a fake topology request for debugging purposes. */ switch (smp_topology) { case 1: /* Dual core with no sharing. */ top = smp_topo_1level(CG_SHARE_NONE, 2, 0); break; case 2: /* No topology, all cpus are equal. */ top = smp_topo_none(); break; case 3: /* Dual core with shared L2.
*/ top = smp_topo_1level(CG_SHARE_L2, 2, 0); break; case 4: /* quad core, shared l3 among each package, private l2. */ top = smp_topo_1level(CG_SHARE_L3, 4, 0); break; case 5: /* quad core, 2 dualcore parts on each package share l2. */ top = smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L2, 2, 0); break; case 6: /* Single-core 2xHTT */ top = smp_topo_1level(CG_SHARE_L1, 2, CG_FLAG_HTT); break; case 7: /* quad core with a shared l3, 8 threads sharing L2. */ top = smp_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L2, 8, CG_FLAG_SMT); break; default: /* Default, ask the system what it wants. */ top = cpu_topo(); break; } /* * Verify the returned topology. */ if (top->cg_count != mp_ncpus) panic("Built bad topology at %p. CPU count %d != %d", top, top->cg_count, mp_ncpus); if (CPU_CMP(&top->cg_mask, &all_cpus)) panic("Built bad topology at %p. CPU mask (%s) != (%s)", top, cpusetobj_strprint(cpusetbuf, &top->cg_mask), cpusetobj_strprint(cpusetbuf2, &all_cpus)); /* * Collapse nonsense levels that may be created out of convenience by * the MD layers. They cause extra work in the search functions. */ while (top->cg_children == 1) { top = &top->cg_child[0]; top->cg_parent = NULL; } return (top); } struct cpu_group * smp_topo_alloc(u_int count) { static u_int index; u_int curr; curr = index; index += count; return (&group[curr]); } struct cpu_group * smp_topo_none(void) { struct cpu_group *top; top = &group[0]; top->cg_parent = NULL; top->cg_child = NULL; top->cg_mask = all_cpus; top->cg_count = mp_ncpus; top->cg_children = 0; top->cg_level = CG_SHARE_NONE; top->cg_flags = 0; return (top); } static int smp_topo_addleaf(struct cpu_group *parent, struct cpu_group *child, int share, int count, int flags, int start) { char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ]; cpuset_t mask; int i; CPU_ZERO(&mask); for (i = 0; i < count; i++, start++) CPU_SET(start, &mask); child->cg_parent = parent; child->cg_child = NULL; child->cg_children = 0; child->cg_level = share; child->cg_count = count; child->cg_flags = flags; child->cg_mask = mask; parent->cg_children++; for (; parent != NULL; parent = parent->cg_parent) { if (CPU_OVERLAP(&parent->cg_mask, &child->cg_mask)) panic("Duplicate children in %p. 
mask (%s) child (%s)", parent, cpusetobj_strprint(cpusetbuf, &parent->cg_mask), cpusetobj_strprint(cpusetbuf2, &child->cg_mask)); CPU_OR(&parent->cg_mask, &child->cg_mask); parent->cg_count += child->cg_count; } return (start); } struct cpu_group * smp_topo_1level(int share, int count, int flags) { struct cpu_group *child; struct cpu_group *top; int packages; int cpu; int i; cpu = 0; top = &group[0]; packages = mp_ncpus / count; top->cg_child = child = &group[1]; top->cg_level = CG_SHARE_NONE; for (i = 0; i < packages; i++, child++) cpu = smp_topo_addleaf(top, child, share, count, flags, cpu); return (top); } struct cpu_group * smp_topo_2level(int l2share, int l2count, int l1share, int l1count, int l1flags) { struct cpu_group *top; struct cpu_group *l1g; struct cpu_group *l2g; int cpu; int i; int j; cpu = 0; top = &group[0]; l2g = &group[1]; top->cg_child = l2g; top->cg_level = CG_SHARE_NONE; top->cg_children = mp_ncpus / (l2count * l1count); l1g = l2g + top->cg_children; for (i = 0; i < top->cg_children; i++, l2g++) { l2g->cg_parent = top; l2g->cg_child = l1g; l2g->cg_level = l2share; for (j = 0; j < l2count; j++, l1g++) cpu = smp_topo_addleaf(l2g, l1g, l1share, l1count, l1flags, cpu); } return (top); } struct cpu_group * smp_topo_find(struct cpu_group *top, int cpu) { struct cpu_group *cg; cpuset_t mask; int children; int i; CPU_SETOF(cpu, &mask); cg = top; for (;;) { if (!CPU_OVERLAP(&cg->cg_mask, &mask)) return (NULL); if (cg->cg_children == 0) return (cg); children = cg->cg_children; for (i = 0, cg = cg->cg_child; i < children; cg++, i++) if (CPU_OVERLAP(&cg->cg_mask, &mask)) break; } return (NULL); } #else /* !SMP */ void smp_rendezvous_cpus(cpuset_t map, void (*setup_func)(void *), void (*action_func)(void *), void (*teardown_func)(void *), void *arg) { /* * In the !SMP case we just need to ensure the same initial conditions * as the SMP case. */ spinlock_enter(); if (setup_func != NULL) setup_func(arg); if (action_func != NULL) action_func(arg); if (teardown_func != NULL) teardown_func(arg); spinlock_exit(); } void smp_rendezvous(void (*setup_func)(void *), void (*action_func)(void *), void (*teardown_func)(void *), void *arg) { smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg); } /* * Provide dummy SMP support for UP kernels. Modules that need to use SMP * APIs will still work using this dummy support. */ static void mp_setvariables_for_up(void *dummy) { mp_ncpus = 1; mp_maxid = PCPU_GET(cpuid); CPU_SETOF(mp_maxid, &all_cpus); KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero")); } SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setvariables_for_up, NULL); #endif /* SMP */ void smp_no_rendezvous_barrier(void *dummy) { #ifdef SMP KASSERT((!smp_started),("smp_no_rendezvous called and smp is started")); #endif } /* * Wait for specified idle threads to switch once. This ensures that even * preempted threads have cycled through the switch function once, * exiting their codepaths. This allows us to change global pointers * with no other synchronization. 
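 *
 * The mechanism is a generation count: the td_generation of every idle
 * thread in the map is snapshotted first, and the caller then binds
 * itself to each CPU in turn until the observed generation advances.
 * A minimal userspace sketch of the same snapshot-then-wait idea, for
 * illustration only (NWORKERS, gen[] and observer_wait() are
 * hypothetical):
 */
#if 0
#define NWORKERS        4

/* Each worker bumps its slot every time it passes its "switch" point. */
static volatile unsigned int gen[NWORKERS];

static void
observer_wait(void)
{
        unsigned int snap[NWORKERS];
        int i;

        for (i = 0; i < NWORKERS; i++)
                snap[i] = gen[i];
        for (i = 0; i < NWORKERS; i++)
                while (gen[i] == snap[i])
                        ;       /* spin; quiesce_cpus() tsleep()s instead */
}
#endif
/*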
*/ int quiesce_cpus(cpuset_t map, const char *wmesg, int prio) { struct pcpu *pcpu; u_int gen[MAXCPU]; int error; int cpu; error = 0; for (cpu = 0; cpu <= mp_maxid; cpu++) { if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu)) continue; pcpu = pcpu_find(cpu); gen[cpu] = pcpu->pc_idlethread->td_generation; } for (cpu = 0; cpu <= mp_maxid; cpu++) { if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu)) continue; pcpu = pcpu_find(cpu); thread_lock(curthread); sched_bind(curthread, cpu); thread_unlock(curthread); while (gen[cpu] == pcpu->pc_idlethread->td_generation) { error = tsleep(quiesce_cpus, prio, wmesg, 1); if (error != EWOULDBLOCK) goto out; error = 0; } } out: thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); return (error); } int quiesce_all_cpus(const char *wmesg, int prio) { return quiesce_cpus(all_cpus, wmesg, prio); } /* Extra care is taken with this sysctl because the data type is volatile */ static int sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS) { int error, active; active = smp_started; error = SYSCTL_OUT(req, &active, sizeof(active)); return (error); } #ifdef SMP void topo_init_node(struct topo_node *node) { bzero(node, sizeof(*node)); TAILQ_INIT(&node->children); } void topo_init_root(struct topo_node *root) { topo_init_node(root); root->type = TOPO_TYPE_SYSTEM; } /* * Add a child node with the given ID under the given parent. * If there is already a matching child, return it instead of * creating a new one. */ struct topo_node * topo_add_node_by_hwid(struct topo_node *parent, int hwid, topo_node_type type, uintptr_t subtype) { struct topo_node *node; TAILQ_FOREACH_REVERSE(node, &parent->children, topo_children, siblings) { if (node->hwid == hwid && node->type == type && node->subtype == subtype) { return (node); } } node = malloc(sizeof(*node), M_TOPO, M_WAITOK); topo_init_node(node); node->parent = parent; node->hwid = hwid; node->type = type; node->subtype = subtype; TAILQ_INSERT_TAIL(&parent->children, node, siblings); parent->nchildren++; return (node); } /* * Find a child node with the given ID under the given parent. */ struct topo_node * topo_find_node_by_hwid(struct topo_node *parent, int hwid, topo_node_type type, uintptr_t subtype) { struct topo_node *node; TAILQ_FOREACH(node, &parent->children, siblings) { if (node->hwid == hwid && node->type == type && node->subtype == subtype) { return (node); } } return (NULL); } /* * Given a node, change the order of its parent's child nodes such * that the node becomes the first child while preserving the cyclic * order of the children. In other words, the given node is promoted * by rotation. */ void topo_promote_child(struct topo_node *child) { struct topo_node *next; struct topo_node *node; struct topo_node *parent; parent = child->parent; next = TAILQ_NEXT(child, siblings); TAILQ_REMOVE(&parent->children, child, siblings); TAILQ_INSERT_HEAD(&parent->children, child, siblings); while (next != NULL) { node = next; next = TAILQ_NEXT(node, siblings); TAILQ_REMOVE(&parent->children, node, siblings); TAILQ_INSERT_AFTER(&parent->children, child, node, siblings); child = node; } } /* * Iterate to the next node in the depth-first search (traversal) of * the topology tree.
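 *
 * For example, given the tree
 *
 *      root
 *       +- a
 *       |   +- a1
 *       |   +- a2
 *       +- b
 *
 * successive calls starting from topo_next_node(root, root) visit
 * a, a1, a2 and b, and then return NULL.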
*/ struct topo_node * topo_next_node(struct topo_node *top, struct topo_node *node) { struct topo_node *next; if ((next = TAILQ_FIRST(&node->children)) != NULL) return (next); if ((next = TAILQ_NEXT(node, siblings)) != NULL) return (next); while (node != top && (node = node->parent) != top) if ((next = TAILQ_NEXT(node, siblings)) != NULL) return (next); return (NULL); } /* * Iterate to the next node in the depth-first search of the topology tree, * but without descending below the current node. */ struct topo_node * topo_next_nonchild_node(struct topo_node *top, struct topo_node *node) { struct topo_node *next; if ((next = TAILQ_NEXT(node, siblings)) != NULL) return (next); while (node != top && (node = node->parent) != top) if ((next = TAILQ_NEXT(node, siblings)) != NULL) return (next); return (NULL); } /* * Assign the given ID to the given topology node that represents a logical * processor. */ void topo_set_pu_id(struct topo_node *node, cpuid_t id) { KASSERT(node->type == TOPO_TYPE_PU, ("topo_set_pu_id: wrong node type: %u", node->type)); KASSERT(CPU_EMPTY(&node->cpuset) && node->cpu_count == 0, ("topo_set_pu_id: cpuset already not empty")); node->id = id; CPU_SET(id, &node->cpuset); node->cpu_count = 1; node->subtype = 1; while ((node = node->parent) != NULL) { KASSERT(!CPU_ISSET(id, &node->cpuset), ("logical ID %u is already set in node %p", id, node)); CPU_SET(id, &node->cpuset); node->cpu_count++; } } static struct topology_spec { topo_node_type type; bool match_subtype; uintptr_t subtype; } topology_level_table[TOPO_LEVEL_COUNT] = { [TOPO_LEVEL_PKG] = { .type = TOPO_TYPE_PKG, }, [TOPO_LEVEL_GROUP] = { .type = TOPO_TYPE_GROUP, }, [TOPO_LEVEL_CACHEGROUP] = { .type = TOPO_TYPE_CACHE, .match_subtype = true, .subtype = CG_SHARE_L3, }, [TOPO_LEVEL_CORE] = { .type = TOPO_TYPE_CORE, }, [TOPO_LEVEL_THREAD] = { .type = TOPO_TYPE_PU, }, }; static bool topo_analyze_table(struct topo_node *root, int all, enum topo_level level, struct topo_analysis *results) { struct topology_spec *spec; struct topo_node *node; int count; if (level >= TOPO_LEVEL_COUNT) return (true); spec = &topology_level_table[level]; count = 0; node = topo_next_node(root, root); while (node != NULL) { if (node->type != spec->type || (spec->match_subtype && node->subtype != spec->subtype)) { node = topo_next_node(root, node); continue; } if (!all && CPU_EMPTY(&node->cpuset)) { node = topo_next_nonchild_node(root, node); continue; } count++; if (!topo_analyze_table(node, all, level + 1, results)) return (false); node = topo_next_nonchild_node(root, node); } /* No explicit subgroups is essentially one subgroup. */ if (count == 0) { count = 1; if (!topo_analyze_table(root, all, level + 1, results)) return (false); } if (results->entities[level] == -1) results->entities[level] = count; else if (results->entities[level] != count) return (false); return (true); } /* * Check if the topology is uniform, that is, each package has the same number * of cores in it and each core has the same number of threads (logical * processors) in it. If so, calculate the number of packages, the number of * groups per package, the number of cachegroups per group, and the number of * logical processors per cachegroup. 'all' parameter tells whether to include * administratively disabled logical processors into the analysis. 
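 *
 * For instance, a uniform machine with two packages, eight cores per
 * package and two SMT threads per core yields
 * entities[TOPO_LEVEL_PKG] = 2, entities[TOPO_LEVEL_CORE] = 8 and
 * entities[TOPO_LEVEL_THREAD] = 2 (levels with no explicit nodes, such
 * as GROUP, count as one subgroup), while packages with differing core
 * counts make the analysis fail.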
*/ int topo_analyze(struct topo_node *topo_root, int all, struct topo_analysis *results) { results->entities[TOPO_LEVEL_PKG] = -1; results->entities[TOPO_LEVEL_CORE] = -1; results->entities[TOPO_LEVEL_THREAD] = -1; results->entities[TOPO_LEVEL_GROUP] = -1; results->entities[TOPO_LEVEL_CACHEGROUP] = -1; if (!topo_analyze_table(topo_root, all, TOPO_LEVEL_PKG, results)) return (0); KASSERT(results->entities[TOPO_LEVEL_PKG] > 0, ("bug in topology or analysis")); return (1); } #endif /* SMP */ Index: head/sys/kern/subr_stack.c =================================================================== --- head/sys/kern/subr_stack.c (revision 326270) +++ head/sys/kern/subr_stack.c (revision 326271) @@ -1,277 +1,279 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005 Antoine Brodin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include "opt_ddb.h" #include __FBSDID("$FreeBSD$"); #include #include #ifdef KTR #include #endif #include #include #include #include #include #include FEATURE(stack, "Support for capturing kernel stack"); static MALLOC_DEFINE(M_STACK, "stack", "Stack Traces"); static int stack_symbol(vm_offset_t pc, char *namebuf, u_int buflen, long *offset); static int stack_symbol_ddb(vm_offset_t pc, const char **name, long *offset); struct stack * stack_create(int flags) { struct stack *st; st = malloc(sizeof(*st), M_STACK, flags | M_ZERO); return (st); } void stack_destroy(struct stack *st) { free(st, M_STACK); } int stack_put(struct stack *st, vm_offset_t pc) { if (st->depth < STACK_MAX) { st->pcs[st->depth++] = pc; return (0); } else return (-1); } void stack_copy(const struct stack *src, struct stack *dst) { *dst = *src; } void stack_zero(struct stack *st) { bzero(st, sizeof *st); } void stack_print(const struct stack *st) { char namebuf[64]; long offset; int i; KASSERT(st->depth <= STACK_MAX, ("bogus stack")); for (i = 0; i < st->depth; i++) { (void)stack_symbol(st->pcs[i], namebuf, sizeof(namebuf), &offset); printf("#%d %p at %s+%#lx\n", i, (void *)st->pcs[i], namebuf, offset); } } void stack_print_short(const struct stack *st) { char namebuf[64]; long offset; int i; KASSERT(st->depth <= STACK_MAX, ("bogus stack")); for (i = 0; i < st->depth; i++) { if (i > 0) printf(" "); if (stack_symbol(st->pcs[i], namebuf, sizeof(namebuf), &offset) == 0) printf("%s+%#lx", namebuf, offset); else printf("%p", (void *)st->pcs[i]); } printf("\n"); } void stack_print_ddb(const struct stack *st) { const char *name; long offset; int i; KASSERT(st->depth <= STACK_MAX, ("bogus stack")); for (i = 0; i < st->depth; i++) { stack_symbol_ddb(st->pcs[i], &name, &offset); printf("#%d %p at %s+%#lx\n", i, (void *)st->pcs[i], name, offset); } } #if defined(DDB) || defined(WITNESS) void stack_print_short_ddb(const struct stack *st) { const char *name; long offset; int i; KASSERT(st->depth <= STACK_MAX, ("bogus stack")); for (i = 0; i < st->depth; i++) { if (i > 0) printf(" "); if (stack_symbol_ddb(st->pcs[i], &name, &offset) == 0) printf("%s+%#lx", name, offset); else printf("%p", (void *)st->pcs[i]); } printf("\n"); } #endif /* * Two print routines -- one for use from DDB and DDB-like contexts, the * other for use in the live kernel. 
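stack_put() is the whole growth policy of this facility: PCs are appended to a fixed array and overflow is reported with -1 rather than reallocating, which keeps capture safe in any context. A self-contained userland sketch of that contract; the struct is a stand-in, and STACK_MAX here matches the 18 slots the KTR path below records, not necessarily the kernel header's value:

#include <stdio.h>

#define STACK_MAX 18    /* assumed bound; mirrors the 3 x 6 KTR groups */

struct stack {
    int depth;
    unsigned long pcs[STACK_MAX];
};

/* Append one PC; refuse, and report, once the buffer is full. */
static int
stack_put(struct stack *st, unsigned long pc)
{
    if (st->depth < STACK_MAX) {
        st->pcs[st->depth++] = pc;
        return (0);
    }
    return (-1);
}

int
main(void)
{
    struct stack st = { 0 };

    for (unsigned long pc = 0x1000; pc < 0x1000 + 20 * 8; pc += 8)
        if (stack_put(&st, pc) != 0)
            printf("dropped %#lx\n", pc);   /* the last two PCs */
    for (int i = 0; i < st.depth; i++)
        printf("#%d %#lx\n", i, st.pcs[i]);
    return (0);
}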
*/ void stack_sbuf_print(struct sbuf *sb, const struct stack *st) { char namebuf[64]; long offset; int i; KASSERT(st->depth <= STACK_MAX, ("bogus stack")); for (i = 0; i < st->depth; i++) { (void)stack_symbol(st->pcs[i], namebuf, sizeof(namebuf), &offset); sbuf_printf(sb, "#%d %p at %s+%#lx\n", i, (void *)st->pcs[i], namebuf, offset); } } #if defined(DDB) || defined(WITNESS) void stack_sbuf_print_ddb(struct sbuf *sb, const struct stack *st) { const char *name; long offset; int i; KASSERT(st->depth <= STACK_MAX, ("bogus stack")); for (i = 0; i < st->depth; i++) { (void)stack_symbol_ddb(st->pcs[i], &name, &offset); sbuf_printf(sb, "#%d %p at %s+%#lx\n", i, (void *)st->pcs[i], name, offset); } } #endif #ifdef KTR void stack_ktr(u_int mask, const char *file, int line, const struct stack *st, u_int depth, int cheap) { #ifdef DDB const char *name; long offset; int i; #endif KASSERT(st->depth <= STACK_MAX, ("bogus stack")); if (cheap) { ktr_tracepoint(mask, file, line, "#0 %p %p %p %p %p %p", st->pcs[0], st->pcs[1], st->pcs[2], st->pcs[3], st->pcs[4], st->pcs[5]); if (st->depth <= 6) return; ktr_tracepoint(mask, file, line, "#1 %p %p %p %p %p %p", st->pcs[6], st->pcs[7], st->pcs[8], st->pcs[9], st->pcs[10], st->pcs[11]); if (st->depth <= 12) return; ktr_tracepoint(mask, file, line, "#2 %p %p %p %p %p %p", st->pcs[12], st->pcs[13], st->pcs[14], st->pcs[15], st->pcs[16], st->pcs[17]); #ifdef DDB } else { if (depth == 0 || st->depth < depth) depth = st->depth; for (i = 0; i < depth; i++) { (void)stack_symbol_ddb(st->pcs[i], &name, &offset); ktr_tracepoint(mask, file, line, "#%d %p at %s+%#lx", i, st->pcs[i], (u_long)name, offset, 0, 0); } #endif } } #endif /* * Two variants of stack symbol lookup -- one that uses the DDB interfaces * and bypasses linker locking, and the other that doesn't. */ static int stack_symbol(vm_offset_t pc, char *namebuf, u_int buflen, long *offset) { if (linker_search_symbol_name((caddr_t)pc, namebuf, buflen, offset) != 0) { *offset = 0; strlcpy(namebuf, "??", buflen); return (ENOENT); } else return (0); } static int stack_symbol_ddb(vm_offset_t pc, const char **name, long *offset) { linker_symval_t symval; c_linker_sym_t sym; if (linker_ddb_search_symbol((caddr_t)pc, &sym, offset) != 0) goto out; if (linker_ddb_symbol_values(sym, &symval) != 0) goto out; if (symval.name != NULL) { *name = symval.name; return (0); } out: *offset = 0; *name = "??"; return (ENOENT); } Index: head/sys/kern/subr_taskqueue.c =================================================================== --- head/sys/kern/subr_taskqueue.c (revision 326270) +++ head/sys/kern/subr_taskqueue.c (revision 326271) @@ -1,844 +1,846 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2000 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_TASKQUEUE, "taskqueue", "Task Queues"); static void *taskqueue_giant_ih; static void *taskqueue_ih; static void taskqueue_fast_enqueue(void *); static void taskqueue_swi_enqueue(void *); static void taskqueue_swi_giant_enqueue(void *); struct taskqueue_busy { struct task *tb_running; TAILQ_ENTRY(taskqueue_busy) tb_link; }; struct task * const TB_DRAIN_WAITER = (struct task *)0x1; struct taskqueue { STAILQ_HEAD(, task) tq_queue; taskqueue_enqueue_fn tq_enqueue; void *tq_context; char *tq_name; TAILQ_HEAD(, taskqueue_busy) tq_active; struct mtx tq_mutex; struct thread **tq_threads; int tq_tcount; int tq_spin; int tq_flags; int tq_callouts; taskqueue_callback_fn tq_callbacks[TASKQUEUE_NUM_CALLBACKS]; void *tq_cb_contexts[TASKQUEUE_NUM_CALLBACKS]; }; #define TQ_FLAGS_ACTIVE (1 << 0) #define TQ_FLAGS_BLOCKED (1 << 1) #define TQ_FLAGS_UNLOCKED_ENQUEUE (1 << 2) #define DT_CALLOUT_ARMED (1 << 0) #define DT_DRAIN_IN_PROGRESS (1 << 1) #define TQ_LOCK(tq) \ do { \ if ((tq)->tq_spin) \ mtx_lock_spin(&(tq)->tq_mutex); \ else \ mtx_lock(&(tq)->tq_mutex); \ } while (0) #define TQ_ASSERT_LOCKED(tq) mtx_assert(&(tq)->tq_mutex, MA_OWNED) #define TQ_UNLOCK(tq) \ do { \ if ((tq)->tq_spin) \ mtx_unlock_spin(&(tq)->tq_mutex); \ else \ mtx_unlock(&(tq)->tq_mutex); \ } while (0) #define TQ_ASSERT_UNLOCKED(tq) mtx_assert(&(tq)->tq_mutex, MA_NOTOWNED) void _timeout_task_init(struct taskqueue *queue, struct timeout_task *timeout_task, int priority, task_fn_t func, void *context) { TASK_INIT(&timeout_task->t, priority, func, context); callout_init_mtx(&timeout_task->c, &queue->tq_mutex, CALLOUT_RETURNUNLOCKED); timeout_task->q = queue; timeout_task->f = 0; } static __inline int TQ_SLEEP(struct taskqueue *tq, void *p, struct mtx *m, int pri, const char *wm, int t) { if (tq->tq_spin) return (msleep_spin(p, m, wm, t)); return (msleep(p, m, pri, wm, t)); } static struct taskqueue * _taskqueue_create(const char *name, int mflags, taskqueue_enqueue_fn enqueue, void *context, int mtxflags, const char *mtxname __unused) { struct taskqueue *queue; char *tq_name; tq_name = malloc(TASKQUEUE_NAMELEN, M_TASKQUEUE, mflags | M_ZERO); if (tq_name == NULL) return (NULL); queue = malloc(sizeof(struct taskqueue), M_TASKQUEUE, mflags | M_ZERO); if (queue == NULL) { free(tq_name, M_TASKQUEUE); return (NULL); } snprintf(tq_name, TASKQUEUE_NAMELEN, "%s", (name) ? 
name : "taskqueue"); STAILQ_INIT(&queue->tq_queue); TAILQ_INIT(&queue->tq_active); queue->tq_enqueue = enqueue; queue->tq_context = context; queue->tq_name = tq_name; queue->tq_spin = (mtxflags & MTX_SPIN) != 0; queue->tq_flags |= TQ_FLAGS_ACTIVE; if (enqueue == taskqueue_fast_enqueue || enqueue == taskqueue_swi_enqueue || enqueue == taskqueue_swi_giant_enqueue || enqueue == taskqueue_thread_enqueue) queue->tq_flags |= TQ_FLAGS_UNLOCKED_ENQUEUE; mtx_init(&queue->tq_mutex, tq_name, NULL, mtxflags); return (queue); } struct taskqueue * taskqueue_create(const char *name, int mflags, taskqueue_enqueue_fn enqueue, void *context) { return _taskqueue_create(name, mflags, enqueue, context, MTX_DEF, name); } void taskqueue_set_callback(struct taskqueue *queue, enum taskqueue_callback_type cb_type, taskqueue_callback_fn callback, void *context) { KASSERT(((cb_type >= TASKQUEUE_CALLBACK_TYPE_MIN) && (cb_type <= TASKQUEUE_CALLBACK_TYPE_MAX)), ("Callback type %d not valid, must be %d-%d", cb_type, TASKQUEUE_CALLBACK_TYPE_MIN, TASKQUEUE_CALLBACK_TYPE_MAX)); KASSERT((queue->tq_callbacks[cb_type] == NULL), ("Re-initialization of taskqueue callback?")); queue->tq_callbacks[cb_type] = callback; queue->tq_cb_contexts[cb_type] = context; } /* * Signal a taskqueue thread to terminate. */ static void taskqueue_terminate(struct thread **pp, struct taskqueue *tq) { while (tq->tq_tcount > 0 || tq->tq_callouts > 0) { wakeup(tq); TQ_SLEEP(tq, pp, &tq->tq_mutex, PWAIT, "taskqueue_destroy", 0); } } void taskqueue_free(struct taskqueue *queue) { TQ_LOCK(queue); queue->tq_flags &= ~TQ_FLAGS_ACTIVE; taskqueue_terminate(queue->tq_threads, queue); KASSERT(TAILQ_EMPTY(&queue->tq_active), ("Tasks still running?")); KASSERT(queue->tq_callouts == 0, ("Armed timeout tasks")); mtx_destroy(&queue->tq_mutex); free(queue->tq_threads, M_TASKQUEUE); free(queue->tq_name, M_TASKQUEUE); free(queue, M_TASKQUEUE); } static int taskqueue_enqueue_locked(struct taskqueue *queue, struct task *task) { struct task *ins; struct task *prev; KASSERT(task->ta_func != NULL, ("enqueueing task with NULL func")); /* * Count multiple enqueues. */ if (task->ta_pending) { if (task->ta_pending < USHRT_MAX) task->ta_pending++; TQ_UNLOCK(queue); return (0); } /* * Optimise the case when all tasks have the same priority. */ prev = STAILQ_LAST(&queue->tq_queue, task, ta_link); if (!prev || prev->ta_priority >= task->ta_priority) { STAILQ_INSERT_TAIL(&queue->tq_queue, task, ta_link); } else { prev = NULL; for (ins = STAILQ_FIRST(&queue->tq_queue); ins; prev = ins, ins = STAILQ_NEXT(ins, ta_link)) if (ins->ta_priority < task->ta_priority) break; if (prev) STAILQ_INSERT_AFTER(&queue->tq_queue, prev, task, ta_link); else STAILQ_INSERT_HEAD(&queue->tq_queue, task, ta_link); } task->ta_pending = 1; if ((queue->tq_flags & TQ_FLAGS_UNLOCKED_ENQUEUE) != 0) TQ_UNLOCK(queue); if ((queue->tq_flags & TQ_FLAGS_BLOCKED) == 0) queue->tq_enqueue(queue->tq_context); if ((queue->tq_flags & TQ_FLAGS_UNLOCKED_ENQUEUE) == 0) TQ_UNLOCK(queue); /* Return with lock released. */ return (0); } int taskqueue_enqueue(struct taskqueue *queue, struct task *task) { int res; TQ_LOCK(queue); res = taskqueue_enqueue_locked(queue, task); /* The lock is released inside. 
*/ return (res); } static void taskqueue_timeout_func(void *arg) { struct taskqueue *queue; struct timeout_task *timeout_task; timeout_task = arg; queue = timeout_task->q; KASSERT((timeout_task->f & DT_CALLOUT_ARMED) != 0, ("Stray timeout")); timeout_task->f &= ~DT_CALLOUT_ARMED; queue->tq_callouts--; taskqueue_enqueue_locked(timeout_task->q, &timeout_task->t); /* The lock is released inside. */ } int taskqueue_enqueue_timeout_sbt(struct taskqueue *queue, struct timeout_task *timeout_task, sbintime_t sbt, sbintime_t pr, int flags) { int res; TQ_LOCK(queue); KASSERT(timeout_task->q == NULL || timeout_task->q == queue, ("Migrated queue")); KASSERT(!queue->tq_spin, ("Timeout for spin-queue")); timeout_task->q = queue; res = timeout_task->t.ta_pending; if (timeout_task->f & DT_DRAIN_IN_PROGRESS) { /* Do nothing */ TQ_UNLOCK(queue); res = -1; } else if (sbt == 0) { taskqueue_enqueue_locked(queue, &timeout_task->t); /* The lock is released inside. */ } else { if ((timeout_task->f & DT_CALLOUT_ARMED) != 0) { res++; } else { queue->tq_callouts++; timeout_task->f |= DT_CALLOUT_ARMED; if (sbt < 0) sbt = -sbt; /* Ignore overflow. */ } if (sbt > 0) { callout_reset_sbt(&timeout_task->c, sbt, pr, taskqueue_timeout_func, timeout_task, flags); } TQ_UNLOCK(queue); } return (res); } int taskqueue_enqueue_timeout(struct taskqueue *queue, struct timeout_task *ttask, int ticks) { return (taskqueue_enqueue_timeout_sbt(queue, ttask, ticks * tick_sbt, 0, 0)); } static void taskqueue_task_nop_fn(void *context, int pending) { } /* * Block until all currently queued tasks in this taskqueue * have begun execution. Tasks queued during execution of * this function are ignored. */ static void taskqueue_drain_tq_queue(struct taskqueue *queue) { struct task t_barrier; if (STAILQ_EMPTY(&queue->tq_queue)) return; /* * Enqueue our barrier after all current tasks, but with * the highest priority so that newly queued tasks cannot * pass it. Because of the high priority, we can not use * taskqueue_enqueue_locked directly (which drops the lock * anyway) so just insert it at tail while we have the * queue lock. */ TASK_INIT(&t_barrier, USHRT_MAX, taskqueue_task_nop_fn, &t_barrier); STAILQ_INSERT_TAIL(&queue->tq_queue, &t_barrier, ta_link); t_barrier.ta_pending = 1; /* * Once the barrier has executed, all previously queued tasks * have completed or are currently executing. */ while (t_barrier.ta_pending != 0) TQ_SLEEP(queue, &t_barrier, &queue->tq_mutex, PWAIT, "-", 0); } /* * Block until all currently executing tasks for this taskqueue * complete. Tasks that begin execution during the execution * of this function are ignored. */ static void taskqueue_drain_tq_active(struct taskqueue *queue) { struct taskqueue_busy tb_marker, *tb_first; if (TAILQ_EMPTY(&queue->tq_active)) return; /* Block taskq_terminate().*/ queue->tq_callouts++; /* * Wait for all currently executing taskqueue threads * to go idle. */ tb_marker.tb_running = TB_DRAIN_WAITER; TAILQ_INSERT_TAIL(&queue->tq_active, &tb_marker, tb_link); while (TAILQ_FIRST(&queue->tq_active) != &tb_marker) TQ_SLEEP(queue, &tb_marker, &queue->tq_mutex, PWAIT, "-", 0); TAILQ_REMOVE(&queue->tq_active, &tb_marker, tb_link); /* * Wakeup any other drain waiter that happened to queue up * without any intervening active thread. */ tb_first = TAILQ_FIRST(&queue->tq_active); if (tb_first != NULL && tb_first->tb_running == TB_DRAIN_WAITER) wakeup(tb_first); /* Release taskqueue_terminate(). 
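taskqueue_drain_tq_queue() waits for all currently queued tasks by appending a nop barrier task at USHRT_MAX priority: nothing enqueued later can sort ahead of it, so once the barrier has run, everything queued before the drain has run too. A single-threaded toy of that ordering argument; the mini queue below is hypothetical and plain FIFO, with the priority sorting and the sleep/wakeup handshake elided:

#include <stdbool.h>
#include <stdio.h>

#define QLEN 8

struct task {
    void (*fn)(void *);
    void *arg;
};

static struct task queue[QLEN];
static int qhead, qtail;

static void
enqueue(void (*fn)(void *), void *arg)
{
    queue[qtail++] = (struct task){ fn, arg };
}

static void work(void *arg)    { printf("task %s\n", (const char *)arg); }
static void barrier(void *arg) { *(bool *)arg = true; }  /* nop plus flag */

int
main(void)
{
    bool drained = false;

    enqueue(work, "a");
    enqueue(work, "b");
    /*
     * The drain places its barrier after all current tasks; in the
     * real queue its USHRT_MAX priority keeps later work behind it.
     */
    enqueue(barrier, &drained);
    enqueue(work, "late");      /* queued after the drain started */

    /* Run until the barrier fires: "a" and "b" are guaranteed done. */
    while (!drained) {
        queue[qhead].fn(queue[qhead].arg);
        qhead++;
    }
    printf("drain complete before 'late' ran\n");
    return (0);
}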
*/ queue->tq_callouts--; if ((queue->tq_flags & TQ_FLAGS_ACTIVE) == 0) wakeup_one(queue->tq_threads); } void taskqueue_block(struct taskqueue *queue) { TQ_LOCK(queue); queue->tq_flags |= TQ_FLAGS_BLOCKED; TQ_UNLOCK(queue); } void taskqueue_unblock(struct taskqueue *queue) { TQ_LOCK(queue); queue->tq_flags &= ~TQ_FLAGS_BLOCKED; if (!STAILQ_EMPTY(&queue->tq_queue)) queue->tq_enqueue(queue->tq_context); TQ_UNLOCK(queue); } static void taskqueue_run_locked(struct taskqueue *queue) { struct taskqueue_busy tb; struct taskqueue_busy *tb_first; struct task *task; int pending; KASSERT(queue != NULL, ("tq is NULL")); TQ_ASSERT_LOCKED(queue); tb.tb_running = NULL; while (STAILQ_FIRST(&queue->tq_queue)) { TAILQ_INSERT_TAIL(&queue->tq_active, &tb, tb_link); /* * Carefully remove the first task from the queue and * zero its pending count. */ task = STAILQ_FIRST(&queue->tq_queue); KASSERT(task != NULL, ("task is NULL")); STAILQ_REMOVE_HEAD(&queue->tq_queue, ta_link); pending = task->ta_pending; task->ta_pending = 0; tb.tb_running = task; TQ_UNLOCK(queue); KASSERT(task->ta_func != NULL, ("task->ta_func is NULL")); task->ta_func(task->ta_context, pending); TQ_LOCK(queue); tb.tb_running = NULL; wakeup(task); TAILQ_REMOVE(&queue->tq_active, &tb, tb_link); tb_first = TAILQ_FIRST(&queue->tq_active); if (tb_first != NULL && tb_first->tb_running == TB_DRAIN_WAITER) wakeup(tb_first); } } void taskqueue_run(struct taskqueue *queue) { TQ_LOCK(queue); taskqueue_run_locked(queue); TQ_UNLOCK(queue); } static int task_is_running(struct taskqueue *queue, struct task *task) { struct taskqueue_busy *tb; TQ_ASSERT_LOCKED(queue); TAILQ_FOREACH(tb, &queue->tq_active, tb_link) { if (tb->tb_running == task) return (1); } return (0); } /* * Only use this function in single threaded contexts. It returns * non-zero if the given task is either pending or running. Else the * task is idle and can be queued again or freed. */ int taskqueue_poll_is_busy(struct taskqueue *queue, struct task *task) { int retval; TQ_LOCK(queue); retval = task->ta_pending > 0 || task_is_running(queue, task); TQ_UNLOCK(queue); return (retval); } static int taskqueue_cancel_locked(struct taskqueue *queue, struct task *task, u_int *pendp) { if (task->ta_pending > 0) STAILQ_REMOVE(&queue->tq_queue, task, task, ta_link); if (pendp != NULL) *pendp = task->ta_pending; task->ta_pending = 0; return (task_is_running(queue, task) ? 
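taskqueue_block() does not stop execution of already-queued work; it only suppresses the tq_enqueue callback, the "kick" that schedules a worker, so submissions accumulate silently until taskqueue_unblock() issues a single kick if anything is pending. A small sketch of that deferred-kick flag; all names here are stand-ins:

#include <stdbool.h>
#include <stdio.h>

struct miniq {
    int pending;            /* tasks waiting to run */
    bool blocked;           /* TQ_FLAGS_BLOCKED analogue */
    void (*kick)(void);     /* tq_enqueue analogue: wake a worker */
};

static void kick(void) { printf("kick: worker scheduled\n"); }

static void
enqueue(struct miniq *q)
{
    q->pending++;
    if (!q->blocked)        /* blocked queues accept work silently */
        q->kick();
}

static void
unblock(struct miniq *q)
{
    q->blocked = false;
    if (q->pending > 0)     /* one kick covers the whole backlog */
        q->kick();
}

int
main(void)
{
    struct miniq q = { .kick = kick };

    enqueue(&q);            /* kicks immediately */
    q.blocked = true;       /* taskqueue_block() analogue */
    enqueue(&q);            /* no kick */
    enqueue(&q);            /* no kick */
    unblock(&q);            /* single kick for the backlog */
    return (0);
}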
EBUSY : 0); } int taskqueue_cancel(struct taskqueue *queue, struct task *task, u_int *pendp) { int error; TQ_LOCK(queue); error = taskqueue_cancel_locked(queue, task, pendp); TQ_UNLOCK(queue); return (error); } int taskqueue_cancel_timeout(struct taskqueue *queue, struct timeout_task *timeout_task, u_int *pendp) { u_int pending, pending1; int error; TQ_LOCK(queue); pending = !!(callout_stop(&timeout_task->c) > 0); error = taskqueue_cancel_locked(queue, &timeout_task->t, &pending1); if ((timeout_task->f & DT_CALLOUT_ARMED) != 0) { timeout_task->f &= ~DT_CALLOUT_ARMED; queue->tq_callouts--; } TQ_UNLOCK(queue); if (pendp != NULL) *pendp = pending + pending1; return (error); } void taskqueue_drain(struct taskqueue *queue, struct task *task) { if (!queue->tq_spin) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__); TQ_LOCK(queue); while (task->ta_pending != 0 || task_is_running(queue, task)) TQ_SLEEP(queue, task, &queue->tq_mutex, PWAIT, "-", 0); TQ_UNLOCK(queue); } void taskqueue_drain_all(struct taskqueue *queue) { if (!queue->tq_spin) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__); TQ_LOCK(queue); taskqueue_drain_tq_queue(queue); taskqueue_drain_tq_active(queue); TQ_UNLOCK(queue); } void taskqueue_drain_timeout(struct taskqueue *queue, struct timeout_task *timeout_task) { /* * Set flag to prevent timer from re-starting during drain: */ TQ_LOCK(queue); KASSERT((timeout_task->f & DT_DRAIN_IN_PROGRESS) == 0, ("Drain already in progress")); timeout_task->f |= DT_DRAIN_IN_PROGRESS; TQ_UNLOCK(queue); callout_drain(&timeout_task->c); taskqueue_drain(queue, &timeout_task->t); /* * Clear flag to allow timer to re-start: */ TQ_LOCK(queue); timeout_task->f &= ~DT_DRAIN_IN_PROGRESS; TQ_UNLOCK(queue); } static void taskqueue_swi_enqueue(void *context) { swi_sched(taskqueue_ih, 0); } static void taskqueue_swi_run(void *dummy) { taskqueue_run(taskqueue_swi); } static void taskqueue_swi_giant_enqueue(void *context) { swi_sched(taskqueue_giant_ih, 0); } static void taskqueue_swi_giant_run(void *dummy) { taskqueue_run(taskqueue_swi_giant); } static int _taskqueue_start_threads(struct taskqueue **tqp, int count, int pri, cpuset_t *mask, const char *name, va_list ap) { char ktname[MAXCOMLEN + 1]; struct thread *td; struct taskqueue *tq; int i, error; if (count <= 0) return (EINVAL); vsnprintf(ktname, sizeof(ktname), name, ap); tq = *tqp; tq->tq_threads = malloc(sizeof(struct thread *) * count, M_TASKQUEUE, M_NOWAIT | M_ZERO); if (tq->tq_threads == NULL) { printf("%s: no memory for %s threads\n", __func__, ktname); return (ENOMEM); } for (i = 0; i < count; i++) { if (count == 1) error = kthread_add(taskqueue_thread_loop, tqp, NULL, &tq->tq_threads[i], RFSTOPPED, 0, "%s", ktname); else error = kthread_add(taskqueue_thread_loop, tqp, NULL, &tq->tq_threads[i], RFSTOPPED, 0, "%s_%d", ktname, i); if (error) { /* should be ok to continue, taskqueue_free will dtrt */ printf("%s: kthread_add(%s): error %d", __func__, ktname, error); tq->tq_threads[i] = NULL; /* paranoid */ } else tq->tq_tcount++; } if (tq->tq_tcount == 0) { free(tq->tq_threads, M_TASKQUEUE); tq->tq_threads = NULL; return (ENOMEM); } for (i = 0; i < count; i++) { if (tq->tq_threads[i] == NULL) continue; td = tq->tq_threads[i]; if (mask) { error = cpuset_setthread(td->td_tid, mask); /* * Failing to pin is rarely an actual fatal error; * it'll just affect performance. 
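Put together, the KPI in this file is consumed in a stock pattern: create a queue whose enqueue hook wakes a private worker thread, start that thread, then enqueue, drain, and free. A kernel-side sketch using only functions defined above; the my_* names are hypothetical and error handling is trimmed:

static struct taskqueue *my_tq;
static struct task my_task;

static void
my_task_fn(void *arg, int pending)
{
    /* 'pending' counts how many enqueues were coalesced into this run. */
    printf("my_task ran, pending=%d\n", pending);
}

static void
my_tq_setup(void)
{
    /*
     * With taskqueue_thread_enqueue the context must be the address of
     * the queue pointer: the hook dereferences it to find the queue
     * whose worker it should wake.
     */
    my_tq = taskqueue_create("my_tq", M_WAITOK,
        taskqueue_thread_enqueue, &my_tq);
    taskqueue_start_threads(&my_tq, 1, PWAIT, "my_tq");
    TASK_INIT(&my_task, 0, my_task_fn, NULL);
    taskqueue_enqueue(my_tq, &my_task);
}

static void
my_tq_teardown(void)
{
    /* Drain waits out a pending or running instance before freeing. */
    taskqueue_drain(my_tq, &my_task);
    taskqueue_free(my_tq);
}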
*/ if (error) printf("%s: curthread=%llu: can't pin; " "error=%d\n", __func__, (unsigned long long) td->td_tid, error); } thread_lock(td); sched_prio(td, pri); sched_add(td, SRQ_BORING); thread_unlock(td); } return (0); } int taskqueue_start_threads(struct taskqueue **tqp, int count, int pri, const char *name, ...) { va_list ap; int error; va_start(ap, name); error = _taskqueue_start_threads(tqp, count, pri, NULL, name, ap); va_end(ap); return (error); } int taskqueue_start_threads_cpuset(struct taskqueue **tqp, int count, int pri, cpuset_t *mask, const char *name, ...) { va_list ap; int error; va_start(ap, name); error = _taskqueue_start_threads(tqp, count, pri, mask, name, ap); va_end(ap); return (error); } static inline void taskqueue_run_callback(struct taskqueue *tq, enum taskqueue_callback_type cb_type) { taskqueue_callback_fn tq_callback; TQ_ASSERT_UNLOCKED(tq); tq_callback = tq->tq_callbacks[cb_type]; if (tq_callback != NULL) tq_callback(tq->tq_cb_contexts[cb_type]); } void taskqueue_thread_loop(void *arg) { struct taskqueue **tqp, *tq; tqp = arg; tq = *tqp; taskqueue_run_callback(tq, TASKQUEUE_CALLBACK_TYPE_INIT); TQ_LOCK(tq); while ((tq->tq_flags & TQ_FLAGS_ACTIVE) != 0) { /* XXX ? */ taskqueue_run_locked(tq); /* * Because taskqueue_run() can drop tq_mutex, we need to * check if the TQ_FLAGS_ACTIVE flag wasn't removed in the * meantime, which means we missed a wakeup. */ if ((tq->tq_flags & TQ_FLAGS_ACTIVE) == 0) break; TQ_SLEEP(tq, tq, &tq->tq_mutex, 0, "-", 0); } taskqueue_run_locked(tq); /* * This thread is on its way out, so just drop the lock temporarily * in order to call the shutdown callback. This allows the callback * to look at the taskqueue, even just before it dies. */ TQ_UNLOCK(tq); taskqueue_run_callback(tq, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN); TQ_LOCK(tq); /* rendezvous with thread that asked us to terminate */ tq->tq_tcount--; wakeup_one(tq->tq_threads); TQ_UNLOCK(tq); kthread_exit(); } void taskqueue_thread_enqueue(void *context) { struct taskqueue **tqp, *tq; tqp = context; tq = *tqp; wakeup_one(tq); } TASKQUEUE_DEFINE(swi, taskqueue_swi_enqueue, NULL, swi_add(NULL, "task queue", taskqueue_swi_run, NULL, SWI_TQ, INTR_MPSAFE, &taskqueue_ih)); TASKQUEUE_DEFINE(swi_giant, taskqueue_swi_giant_enqueue, NULL, swi_add(NULL, "Giant taskq", taskqueue_swi_giant_run, NULL, SWI_TQ_GIANT, 0, &taskqueue_giant_ih)); TASKQUEUE_DEFINE_THREAD(thread); struct taskqueue * taskqueue_create_fast(const char *name, int mflags, taskqueue_enqueue_fn enqueue, void *context) { return _taskqueue_create(name, mflags, enqueue, context, MTX_SPIN, "fast_taskqueue"); } static void *taskqueue_fast_ih; static void taskqueue_fast_enqueue(void *context) { swi_sched(taskqueue_fast_ih, 0); } static void taskqueue_fast_run(void *dummy) { taskqueue_run(taskqueue_fast); } TASKQUEUE_FAST_DEFINE(fast, taskqueue_fast_enqueue, NULL, swi_add(NULL, "fast taskq", taskqueue_fast_run, NULL, SWI_TQ_FAST, INTR_MPSAFE, &taskqueue_fast_ih)); int taskqueue_member(struct taskqueue *queue, struct thread *td) { int i, j, ret = 0; for (i = 0, j = 0; ; i++) { if (queue->tq_threads[i] == NULL) continue; if (queue->tq_threads[i] == td) { ret = 1; break; } if (++j >= queue->tq_tcount) break; } return (ret); } Index: head/sys/kern/subr_terminal.c =================================================================== --- head/sys/kern/subr_terminal.c (revision 326270) +++ head/sys/kern/subr_terminal.c (revision 326271) @@ -1,658 +1,660 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2009 The FreeBSD 
Foundation * All rights reserved. * * This software was developed by Ed Schouten under sponsorship from the * FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_TERMINAL, "terminal", "terminal device"); /* * Locking. * * Normally we don't need to lock down the terminal emulator, because * the TTY lock is already held when calling teken_input(). * Unfortunately this is not the case when the terminal acts as a * console device, because cnputc() can be called at the same time. * This means terminals may need to be locked down using a spin lock. */ #define TERMINAL_LOCK(tm) do { \ if ((tm)->tm_flags & TF_CONS) \ mtx_lock_spin(&(tm)->tm_mtx); \ else if ((tm)->tm_tty != NULL) \ tty_lock((tm)->tm_tty); \ } while (0) #define TERMINAL_UNLOCK(tm) do { \ if ((tm)->tm_flags & TF_CONS) \ mtx_unlock_spin(&(tm)->tm_mtx); \ else if ((tm)->tm_tty != NULL) \ tty_unlock((tm)->tm_tty); \ } while (0) #define TERMINAL_LOCK_TTY(tm) do { \ if ((tm)->tm_flags & TF_CONS) \ mtx_lock_spin(&(tm)->tm_mtx); \ } while (0) #define TERMINAL_UNLOCK_TTY(tm) do { \ if ((tm)->tm_flags & TF_CONS) \ mtx_unlock_spin(&(tm)->tm_mtx); \ } while (0) #define TERMINAL_LOCK_CONS(tm) mtx_lock_spin(&(tm)->tm_mtx) #define TERMINAL_UNLOCK_CONS(tm) mtx_unlock_spin(&(tm)->tm_mtx) /* * TTY routines. */ static tsw_open_t termtty_open; static tsw_close_t termtty_close; static tsw_outwakeup_t termtty_outwakeup; static tsw_ioctl_t termtty_ioctl; static tsw_mmap_t termtty_mmap; static struct ttydevsw terminal_tty_class = { .tsw_open = termtty_open, .tsw_close = termtty_close, .tsw_outwakeup = termtty_outwakeup, .tsw_ioctl = termtty_ioctl, .tsw_mmap = termtty_mmap, }; /* * Terminal emulator routines. 
*/ static tf_bell_t termteken_bell; static tf_cursor_t termteken_cursor; static tf_putchar_t termteken_putchar; static tf_fill_t termteken_fill; static tf_copy_t termteken_copy; static tf_param_t termteken_param; static tf_respond_t termteken_respond; static teken_funcs_t terminal_drawmethods = { .tf_bell = termteken_bell, .tf_cursor = termteken_cursor, .tf_putchar = termteken_putchar, .tf_fill = termteken_fill, .tf_copy = termteken_copy, .tf_param = termteken_param, .tf_respond = termteken_respond, }; /* Kernel message formatting. */ static const teken_attr_t kernel_message = { .ta_fgcolor = TCHAR_FGCOLOR(TERMINAL_KERN_ATTR), .ta_bgcolor = TCHAR_BGCOLOR(TERMINAL_KERN_ATTR), .ta_format = TCHAR_FORMAT(TERMINAL_KERN_ATTR) }; static const teken_attr_t default_message = { .ta_fgcolor = TCHAR_FGCOLOR(TERMINAL_NORM_ATTR), .ta_bgcolor = TCHAR_BGCOLOR(TERMINAL_NORM_ATTR), .ta_format = TCHAR_FORMAT(TERMINAL_NORM_ATTR) }; /* Fudge fg brightness as TF_BOLD (shifted). */ #define TCOLOR_FG_FUDGED(color) __extension__ ({ \ teken_color_t _c; \ \ _c = (color); \ TCOLOR_FG(_c & 7) | ((_c & 8) << 18); \ }) /* Fudge bg brightness as TF_BLINK (shifted). */ #define TCOLOR_BG_FUDGED(color) __extension__ ({ \ teken_color_t _c; \ \ _c = (color); \ TCOLOR_BG(_c & 7) | ((_c & 8) << 20); \ }) #define TCOLOR_256TO16(color) __extension__ ({ \ teken_color_t _c; \ \ _c = (color); \ if (_c >= 16) \ _c = teken_256to16(_c); \ _c; \ }) #define TCHAR_CREATE(c, a) ((c) | TFORMAT((a)->ta_format) | \ TCOLOR_FG_FUDGED(TCOLOR_256TO16((a)->ta_fgcolor)) | \ TCOLOR_BG_FUDGED(TCOLOR_256TO16((a)->ta_bgcolor))) static void terminal_init(struct terminal *tm) { if (tm->tm_flags & TF_CONS) mtx_init(&tm->tm_mtx, "trmlck", NULL, MTX_SPIN); teken_init(&tm->tm_emulator, &terminal_drawmethods, tm); teken_set_defattr(&tm->tm_emulator, &default_message); } struct terminal * terminal_alloc(const struct terminal_class *tc, void *softc) { struct terminal *tm; tm = malloc(sizeof(struct terminal), M_TERMINAL, M_WAITOK|M_ZERO); terminal_init(tm); tm->tm_class = tc; tm->tm_softc = softc; return (tm); } static void terminal_sync_ttysize(struct terminal *tm) { struct tty *tp; tp = tm->tm_tty; if (tp == NULL) return; tty_lock(tp); tty_set_winsize(tp, &tm->tm_winsize); tty_unlock(tp); } void terminal_maketty(struct terminal *tm, const char *fmt, ...) { struct tty *tp; char name[8]; va_list ap; va_start(ap, fmt); vsnrprintf(name, sizeof name, 32, fmt, ap); va_end(ap); tp = tty_alloc(&terminal_tty_class, tm); tty_makedev(tp, NULL, "%s", name); tm->tm_tty = tp; terminal_sync_ttysize(tm); } void terminal_set_cursor(struct terminal *tm, const term_pos_t *pos) { teken_set_cursor(&tm->tm_emulator, pos); } void terminal_set_winsize_blank(struct terminal *tm, const struct winsize *size, int blank, const term_attr_t *attr) { term_rect_t r; tm->tm_winsize = *size; r.tr_begin.tp_row = r.tr_begin.tp_col = 0; r.tr_end.tp_row = size->ws_row; r.tr_end.tp_col = size->ws_col; TERMINAL_LOCK(tm); if (blank == 0) teken_set_winsize_noreset(&tm->tm_emulator, &r.tr_end); else teken_set_winsize(&tm->tm_emulator, &r.tr_end); TERMINAL_UNLOCK(tm); if ((blank != 0) && !(tm->tm_flags & TF_MUTE)) tm->tm_class->tc_fill(tm, &r, TCHAR_CREATE((teken_char_t)' ', attr)); terminal_sync_ttysize(tm); } void terminal_set_winsize(struct terminal *tm, const struct winsize *size) { terminal_set_winsize_blank(tm, size, 1, (const term_attr_t *)&default_message); } /* * XXX: This function is a kludge. Drivers like vt(4) need to * temporarily stop input when resizing, etc. 
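The fudge macros above squeeze a 16-color code into a cell that stores only 3 color bits per plane: the brightness bit (bit 3) is relocated into the format bits, reusing the TF_BOLD position for the foreground and TF_BLINK for the background. The same bit surgery as standalone functions; the shift counts 18 and 20 are copied from the macros, while the TCOLOR_FG()/TCOLOR_BG() field positioning is omitted for clarity:

#include <stdio.h>
#include <stdint.h>

/* Fold a 4-bit (0-15) fg color into 3 color bits plus a bold-like bit. */
static uint32_t
fg_fudged(uint32_t color)
{
    /* Low 3 bits select the base color; bit 3 becomes "bold". */
    return ((color & 7) | ((color & 8) << 18));
}

/* Same trick for the background, parking brightness in the blink bit. */
static uint32_t
bg_fudged(uint32_t color)
{
    return ((color & 7) | ((color & 8) << 20));
}

int
main(void)
{
    /* Bright red (color 12 = 8 + 4): base color 4, brightness flagged. */
    printf("fg: %#x\n", fg_fudged(12));     /* 0x200004 */
    printf("bg: %#x\n", bg_fudged(12));     /* 0x800004 */
    return (0);
}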
This should ideally be * handled within the driver. */ void terminal_mute(struct terminal *tm, int yes) { TERMINAL_LOCK(tm); if (yes) tm->tm_flags |= TF_MUTE; else tm->tm_flags &= ~TF_MUTE; TERMINAL_UNLOCK(tm); } void terminal_input_char(struct terminal *tm, term_char_t c) { struct tty *tp; tp = tm->tm_tty; if (tp == NULL) return; /* * Strip off any attributes. Also ignore input of second part of * CJK fullwidth characters, as we don't want to return these * characters twice. */ if (TCHAR_FORMAT(c) & TF_CJK_RIGHT) return; c = TCHAR_CHARACTER(c); tty_lock(tp); /* * Conversion to UTF-8. */ if (c < 0x80) { ttydisc_rint(tp, c, 0); } else if (c < 0x800) { char str[2] = { 0xc0 | (c >> 6), 0x80 | (c & 0x3f) }; ttydisc_rint_simple(tp, str, sizeof str); } else if (c < 0x10000) { char str[3] = { 0xe0 | (c >> 12), 0x80 | ((c >> 6) & 0x3f), 0x80 | (c & 0x3f) }; ttydisc_rint_simple(tp, str, sizeof str); } else { char str[4] = { 0xf0 | (c >> 18), 0x80 | ((c >> 12) & 0x3f), 0x80 | ((c >> 6) & 0x3f), 0x80 | (c & 0x3f) }; ttydisc_rint_simple(tp, str, sizeof str); } ttydisc_rint_done(tp); tty_unlock(tp); } void terminal_input_raw(struct terminal *tm, char c) { struct tty *tp; tp = tm->tm_tty; if (tp == NULL) return; tty_lock(tp); ttydisc_rint(tp, c, 0); ttydisc_rint_done(tp); tty_unlock(tp); } void terminal_input_special(struct terminal *tm, unsigned int k) { struct tty *tp; const char *str; tp = tm->tm_tty; if (tp == NULL) return; str = teken_get_sequence(&tm->tm_emulator, k); if (str == NULL) return; tty_lock(tp); ttydisc_rint_simple(tp, str, strlen(str)); ttydisc_rint_done(tp); tty_unlock(tp); } /* * Binding with the TTY layer. */ static int termtty_open(struct tty *tp) { struct terminal *tm = tty_softc(tp); tm->tm_class->tc_opened(tm, 1); return (0); } static void termtty_close(struct tty *tp) { struct terminal *tm = tty_softc(tp); tm->tm_class->tc_opened(tm, 0); } static void termtty_outwakeup(struct tty *tp) { struct terminal *tm = tty_softc(tp); char obuf[128]; size_t olen; unsigned int flags = 0; while ((olen = ttydisc_getc(tp, obuf, sizeof obuf)) > 0) { TERMINAL_LOCK_TTY(tm); if (!(tm->tm_flags & TF_MUTE)) { tm->tm_flags &= ~TF_BELL; teken_input(&tm->tm_emulator, obuf, olen); flags |= tm->tm_flags; } TERMINAL_UNLOCK_TTY(tm); } TERMINAL_LOCK_TTY(tm); if (!(tm->tm_flags & TF_MUTE)) tm->tm_class->tc_done(tm); TERMINAL_UNLOCK_TTY(tm); if (flags & TF_BELL) tm->tm_class->tc_bell(tm); } static int termtty_ioctl(struct tty *tp, u_long cmd, caddr_t data, struct thread *td) { struct terminal *tm = tty_softc(tp); int error; switch (cmd) { case CONS_GETINFO: { vid_info_t *vi = (vid_info_t *)data; const teken_pos_t *p; int fg, bg; if (vi->size != sizeof(vid_info_t)) return (EINVAL); /* Already help the console driver by filling in some data. */ p = teken_get_cursor(&tm->tm_emulator); vi->mv_row = p->tp_row; vi->mv_col = p->tp_col; p = teken_get_winsize(&tm->tm_emulator); vi->mv_rsz = p->tp_row; vi->mv_csz = p->tp_col; teken_get_defattr_cons25(&tm->tm_emulator, &fg, &bg); vi->mv_norm.fore = fg; vi->mv_norm.back = bg; /* XXX: keep vidcontrol happy; bold backgrounds. */ vi->mv_rev.fore = bg; vi->mv_rev.back = fg & 0x7; break; } } /* * Unlike various other drivers, this driver will never * deallocate TTYs. This means it's safe to temporarily unlock * the TTY when handling ioctls. 
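terminal_input_char() performs its UTF-8 conversion inline, choosing one to four bytes by code-point range. The identical encoder as a standalone function; like the original, it does not validate the code point (no surrogate or range rejection):

#include <stdio.h>
#include <stdint.h>

/* Encode 'c' as UTF-8 into buf (at least 4 bytes); returns byte count. */
static int
utf8_encode(uint32_t c, unsigned char *buf)
{
    if (c < 0x80) {
        buf[0] = c;
        return (1);
    } else if (c < 0x800) {
        buf[0] = 0xc0 | (c >> 6);
        buf[1] = 0x80 | (c & 0x3f);
        return (2);
    } else if (c < 0x10000) {
        buf[0] = 0xe0 | (c >> 12);
        buf[1] = 0x80 | ((c >> 6) & 0x3f);
        buf[2] = 0x80 | (c & 0x3f);
        return (3);
    }
    buf[0] = 0xf0 | (c >> 18);
    buf[1] = 0x80 | ((c >> 12) & 0x3f);
    buf[2] = 0x80 | ((c >> 6) & 0x3f);
    buf[3] = 0x80 | (c & 0x3f);
    return (4);
}

int
main(void)
{
    unsigned char buf[4];
    int n = utf8_encode(0x20ac, buf);   /* U+20AC EURO SIGN */

    for (int i = 0; i < n; i++)
        printf("%02x ", buf[i]);        /* e2 82 ac */
    printf("\n");
    return (0);
}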
*/ tty_unlock(tp); error = tm->tm_class->tc_ioctl(tm, cmd, data, td); tty_lock(tp); return (error); } static int termtty_mmap(struct tty *tp, vm_ooffset_t offset, vm_paddr_t * paddr, int nprot, vm_memattr_t *memattr) { struct terminal *tm = tty_softc(tp); return (tm->tm_class->tc_mmap(tm, offset, paddr, nprot, memattr)); } /* * Binding with the kernel and debug console. */ static cn_probe_t termcn_cnprobe; static cn_init_t termcn_cninit; static cn_term_t termcn_cnterm; static cn_getc_t termcn_cngetc; static cn_putc_t termcn_cnputc; static cn_grab_t termcn_cngrab; static cn_ungrab_t termcn_cnungrab; const struct consdev_ops termcn_cnops = { .cn_probe = termcn_cnprobe, .cn_init = termcn_cninit, .cn_term = termcn_cnterm, .cn_getc = termcn_cngetc, .cn_putc = termcn_cnputc, .cn_grab = termcn_cngrab, .cn_ungrab = termcn_cnungrab, }; void termcn_cnregister(struct terminal *tm) { struct consdev *cp; cp = tm->consdev; if (cp == NULL) { cp = malloc(sizeof(struct consdev), M_TERMINAL, M_WAITOK|M_ZERO); cp->cn_ops = &termcn_cnops; cp->cn_arg = tm; cp->cn_pri = CN_INTERNAL; sprintf(cp->cn_name, "ttyv0"); tm->tm_flags = TF_CONS; tm->consdev = cp; terminal_init(tm); } /* Attach terminal as console. */ cnadd(cp); } static void termcn_cngrab(struct consdev *cp) { struct terminal *tm = cp->cn_arg; tm->tm_class->tc_cngrab(tm); } static void termcn_cnungrab(struct consdev *cp) { struct terminal *tm = cp->cn_arg; tm->tm_class->tc_cnungrab(tm); } static void termcn_cnprobe(struct consdev *cp) { struct terminal *tm = cp->cn_arg; if (tm == NULL) { cp->cn_pri = CN_DEAD; return; } tm->consdev = cp; terminal_init(tm); tm->tm_class->tc_cnprobe(tm, cp); } static void termcn_cninit(struct consdev *cp) { } static void termcn_cnterm(struct consdev *cp) { } static int termcn_cngetc(struct consdev *cp) { struct terminal *tm = cp->cn_arg; return (tm->tm_class->tc_cngetc(tm)); } static void termcn_cnputc(struct consdev *cp, int c) { struct terminal *tm = cp->cn_arg; teken_attr_t backup; char cv = c; TERMINAL_LOCK_CONS(tm); if (!(tm->tm_flags & TF_MUTE)) { backup = *teken_get_curattr(&tm->tm_emulator); teken_set_curattr(&tm->tm_emulator, &kernel_message); teken_input(&tm->tm_emulator, &cv, 1); teken_set_curattr(&tm->tm_emulator, &backup); tm->tm_class->tc_done(tm); } TERMINAL_UNLOCK_CONS(tm); } /* * Binding with the terminal emulator. */ static void termteken_bell(void *softc) { struct terminal *tm = softc; tm->tm_flags |= TF_BELL; } static void termteken_cursor(void *softc, const teken_pos_t *p) { struct terminal *tm = softc; tm->tm_class->tc_cursor(tm, p); } static void termteken_putchar(void *softc, const teken_pos_t *p, teken_char_t c, const teken_attr_t *a) { struct terminal *tm = softc; tm->tm_class->tc_putchar(tm, p, TCHAR_CREATE(c, a)); } static void termteken_fill(void *softc, const teken_rect_t *r, teken_char_t c, const teken_attr_t *a) { struct terminal *tm = softc; tm->tm_class->tc_fill(tm, r, TCHAR_CREATE(c, a)); } static void termteken_copy(void *softc, const teken_rect_t *r, const teken_pos_t *p) { struct terminal *tm = softc; tm->tm_class->tc_copy(tm, r, p); } static void termteken_param(void *softc, int cmd, unsigned int arg) { struct terminal *tm = softc; tm->tm_class->tc_param(tm, cmd, arg); } static void termteken_respond(void *softc, const void *buf, size_t len) { #if 0 struct terminal *tm = softc; struct tty *tp; /* * Only inject a response into the TTY if the data actually * originated from the TTY. * * XXX: This cannot be done right now. The TTY could pick up * other locks. 
It could also in theory cause loops, when the * TTY performs echoing of a command that generates even more * input. */ tp = tm->tm_tty; if (tp == NULL) return; ttydisc_rint_simple(tp, buf, len); ttydisc_rint_done(tp); #endif } Index: head/sys/kern/subr_turnstile.c =================================================================== --- head/sys/kern/subr_turnstile.c (revision 326270) +++ head/sys/kern/subr_turnstile.c (revision 326271) @@ -1,1254 +1,1256 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ */ /* * Implementation of turnstiles used to hold queue of threads blocked on * non-sleepable locks. Sleepable locks use condition variables to * implement their queues. Turnstiles differ from a sleep queue in that * turnstile queue's are assigned to a lock held by an owning thread. Thus, * when one thread is enqueued onto a turnstile, it can lend its priority * to the owning thread. * * We wish to avoid bloating locks with an embedded turnstile and we do not * want to use back-pointers in the locks for the same reason. Thus, we * use a similar approach to that of Solaris 7 as described in Solaris * Internals by Jim Mauro and Richard McDougall. Turnstiles are looked up * in a hash table based on the address of the lock. Each entry in the * hash table is a linked-lists of turnstiles and is called a turnstile * chain. Each chain contains a spin mutex that protects all of the * turnstiles in the chain. * * Each time a thread is created, a turnstile is allocated from a UMA zone * and attached to that thread. When a thread blocks on a lock, if it is the * first thread to block, it lends its turnstile to the lock. If the lock * already has a turnstile, then it gives its turnstile to the lock's * turnstile's free list. When a thread is woken up, it takes a turnstile from * the free list if there are any other waiters. 
If it is the only thread * blocked on the lock, then it reclaims the turnstile associated with the lock * and removes it from the hash table. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_turnstile_profiling.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #include #endif /* * Constants for the hash table of turnstile chains. TC_SHIFT is a magic * number chosen because the sleep queue's use the same value for the * shift. Basically, we ignore the lower 8 bits of the address. * TC_TABLESIZE must be a power of two for TC_MASK to work properly. */ #define TC_TABLESIZE 128 /* Must be power of 2. */ #define TC_MASK (TC_TABLESIZE - 1) #define TC_SHIFT 8 #define TC_HASH(lock) (((uintptr_t)(lock) >> TC_SHIFT) & TC_MASK) #define TC_LOOKUP(lock) &turnstile_chains[TC_HASH(lock)] /* * There are three different lists of turnstiles as follows. The list * connected by ts_link entries is a per-thread list of all the turnstiles * attached to locks that we own. This is used to fixup our priority when * a lock is released. The other two lists use the ts_hash entries. The * first of these two is the turnstile chain list that a turnstile is on * when it is attached to a lock. The second list to use ts_hash is the * free list hung off of a turnstile that is attached to a lock. * * Each turnstile contains three lists of threads. The two ts_blocked lists * are linked list of threads blocked on the turnstile's lock. One list is * for exclusive waiters, and the other is for shared waiters. The * ts_pending list is a linked list of threads previously awakened by * turnstile_signal() or turnstile_wait() that are waiting to be put on * the run queue. * * Locking key: * c - turnstile chain lock * q - td_contested lock */ struct turnstile { struct mtx ts_lock; /* Spin lock for self. */ struct threadqueue ts_blocked[2]; /* (c + q) Blocked threads. */ struct threadqueue ts_pending; /* (c) Pending threads. */ LIST_ENTRY(turnstile) ts_hash; /* (c) Chain and free list. */ LIST_ENTRY(turnstile) ts_link; /* (q) Contested locks. */ LIST_HEAD(, turnstile) ts_free; /* (c) Free turnstiles. */ struct lock_object *ts_lockobj; /* (c) Lock we reference. */ struct thread *ts_owner; /* (c + q) Who owns the lock. */ }; struct turnstile_chain { LIST_HEAD(, turnstile) tc_turnstiles; /* List of turnstiles. */ struct mtx tc_lock; /* Spin lock for this chain. */ #ifdef TURNSTILE_PROFILING u_int tc_depth; /* Length of tc_queues. */ u_int tc_max_depth; /* Max length of tc_queues. */ #endif }; #ifdef TURNSTILE_PROFILING u_int turnstile_max_depth; static SYSCTL_NODE(_debug, OID_AUTO, turnstile, CTLFLAG_RD, 0, "turnstile profiling"); static SYSCTL_NODE(_debug_turnstile, OID_AUTO, chains, CTLFLAG_RD, 0, "turnstile chain stats"); SYSCTL_UINT(_debug_turnstile, OID_AUTO, max_depth, CTLFLAG_RD, &turnstile_max_depth, 0, "maximum depth achieved of a single chain"); #endif static struct mtx td_contested_lock; static struct turnstile_chain turnstile_chains[TC_TABLESIZE]; static uma_zone_t turnstile_zone; /* * Prototypes for non-exported routines. 
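TC_HASH() buckets a lock by its address: the low 8 bits are discarded and the remainder is masked into a power-of-two table. The same computation in userland, with the constants copied from above:

#include <stdio.h>
#include <stdint.h>

#define TC_TABLESIZE    128             /* must be a power of 2 */
#define TC_MASK         (TC_TABLESIZE - 1)
#define TC_SHIFT        8

static unsigned
tc_hash(const void *lock)
{
    return (((uintptr_t)lock >> TC_SHIFT) & TC_MASK);
}

int
main(void)
{
    int locks[4];

    /* Nearby addresses within 256 bytes tend to share a bucket... */
    printf("%u %u\n", tc_hash(&locks[0]), tc_hash(&locks[1]));
    /* ...while an address 256 bytes away lands in the next bucket. */
    printf("%u\n", tc_hash((char *)&locks[0] + 256));
    return (0);
}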
*/ static void init_turnstile0(void *dummy); #ifdef TURNSTILE_PROFILING static void init_turnstile_profiling(void *arg); #endif static void propagate_priority(struct thread *td); static int turnstile_adjust_thread(struct turnstile *ts, struct thread *td); static struct thread *turnstile_first_waiter(struct turnstile *ts); static void turnstile_setowner(struct turnstile *ts, struct thread *owner); #ifdef INVARIANTS static void turnstile_dtor(void *mem, int size, void *arg); #endif static int turnstile_init(void *mem, int size, int flags); static void turnstile_fini(void *mem, int size); SDT_PROVIDER_DECLARE(sched); SDT_PROBE_DEFINE(sched, , , sleep); SDT_PROBE_DEFINE2(sched, , , wakeup, "struct thread *", "struct proc *"); /* * Walks the chain of turnstiles and their owners to propagate the priority * of the thread being blocked to all the threads holding locks that have to * release their locks before this thread can run again. */ static void propagate_priority(struct thread *td) { struct turnstile *ts; int pri; THREAD_LOCK_ASSERT(td, MA_OWNED); pri = td->td_priority; ts = td->td_blocked; THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock); /* * Grab a recursive lock on this turnstile chain so it stays locked * for the whole operation. The caller expects us to return with * the original lock held. We only ever lock down the chain so * the lock order is constant. */ mtx_lock_spin(&ts->ts_lock); for (;;) { td = ts->ts_owner; if (td == NULL) { /* * This might be a read lock with no owner. There's * not much we can do, so just bail. */ mtx_unlock_spin(&ts->ts_lock); return; } thread_lock_flags(td, MTX_DUPOK); mtx_unlock_spin(&ts->ts_lock); MPASS(td->td_proc != NULL); MPASS(td->td_proc->p_magic == P_MAGIC); /* * If the thread is asleep, then we are probably about * to deadlock. To make debugging this easier, show * backtrace of misbehaving thread and panic to not * leave the kernel deadlocked. */ if (TD_IS_SLEEPING(td)) { printf( "Sleeping thread (tid %d, pid %d) owns a non-sleepable lock\n", td->td_tid, td->td_proc->p_pid); kdb_backtrace_thread(td); panic("sleeping thread"); } /* * If this thread already has higher priority than the * thread that is being blocked, we are finished. */ if (td->td_priority <= pri) { thread_unlock(td); return; } /* * Bump this thread's priority. */ sched_lend_prio(td, pri); /* * If lock holder is actually running or on the run queue * then we are done. */ if (TD_IS_RUNNING(td) || TD_ON_RUNQ(td)) { MPASS(td->td_blocked == NULL); thread_unlock(td); return; } #ifndef SMP /* * For UP, we check to see if td is curthread (this shouldn't * ever happen however as it would mean we are in a deadlock.) */ KASSERT(td != curthread, ("Deadlock detected")); #endif /* * If we aren't blocked on a lock, we should be. */ KASSERT(TD_ON_LOCK(td), ( "thread %d(%s):%d holds %s but isn't blocked on a lock\n", td->td_tid, td->td_name, td->td_state, ts->ts_lockobj->lo_name)); /* * Pick up the lock that td is blocked on. */ ts = td->td_blocked; MPASS(ts != NULL); THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock); /* Resort td on the list if needed. */ if (!turnstile_adjust_thread(ts, td)) { mtx_unlock_spin(&ts->ts_lock); return; } /* The thread lock is released as ts lock above. */ } } /* * Adjust the thread's position on a turnstile after its priority has been * changed. 
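propagate_priority() walks the ownership chain, waiter to lock owner to the lock that owner is itself blocked on, and so on, lending the waiter's priority to each owner until it reaches a runnable thread or one already at least as important. A single-threaded model of the walk; locking, the blocked queues, and the scheduler are all elided, and as in the kernel a lower number means higher priority:

#include <stdio.h>

struct thread {
    const char *name;
    int priority;                    /* lower is more important */
    struct thread *blocked_on_owner; /* owner of the lock we wait for */
};

static void
propagate(struct thread *waiter)
{
    int pri = waiter->priority;

    for (struct thread *td = waiter->blocked_on_owner; td != NULL;
        td = td->blocked_on_owner) {
        if (td->priority <= pri)
            return;                 /* already important enough */
        printf("lend pri %d to %s (was %d)\n", pri, td->name,
            td->priority);
        td->priority = pri;         /* sched_lend_prio() analogue */
    }
}

int
main(void)
{
    struct thread low = { "low", 200, NULL };
    struct thread mid = { "mid", 150, &low };    /* blocked on low */
    struct thread high = { "high", 100, &mid };  /* blocked on mid */

    propagate(&high);   /* lends priority 100 to both mid and low */
    return (0);
}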
*/ static int turnstile_adjust_thread(struct turnstile *ts, struct thread *td) { struct thread *td1, *td2; int queue; THREAD_LOCK_ASSERT(td, MA_OWNED); MPASS(TD_ON_LOCK(td)); /* * This thread may not be blocked on this turnstile anymore * but instead might already be woken up on another CPU * that is waiting on the thread lock in turnstile_unpend() to * finish waking this thread up. We can detect this case * by checking to see if this thread has been given a * turnstile by either turnstile_signal() or * turnstile_broadcast(). In this case, treat the thread as * if it was already running. */ if (td->td_turnstile != NULL) return (0); /* * Check if the thread needs to be moved on the blocked chain. * It needs to be moved if either its priority is lower than * the previous thread or higher than the next thread. */ THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock); td1 = TAILQ_PREV(td, threadqueue, td_lockq); td2 = TAILQ_NEXT(td, td_lockq); if ((td1 != NULL && td->td_priority < td1->td_priority) || (td2 != NULL && td->td_priority > td2->td_priority)) { /* * Remove thread from blocked chain and determine where * it should be moved to. */ queue = td->td_tsqueue; MPASS(queue == TS_EXCLUSIVE_QUEUE || queue == TS_SHARED_QUEUE); mtx_lock_spin(&td_contested_lock); TAILQ_REMOVE(&ts->ts_blocked[queue], td, td_lockq); TAILQ_FOREACH(td1, &ts->ts_blocked[queue], td_lockq) { MPASS(td1->td_proc->p_magic == P_MAGIC); if (td1->td_priority > td->td_priority) break; } if (td1 == NULL) TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq); else TAILQ_INSERT_BEFORE(td1, td, td_lockq); mtx_unlock_spin(&td_contested_lock); if (td1 == NULL) CTR3(KTR_LOCK, "turnstile_adjust_thread: td %d put at tail on [%p] %s", td->td_tid, ts->ts_lockobj, ts->ts_lockobj->lo_name); else CTR4(KTR_LOCK, "turnstile_adjust_thread: td %d moved before %d on [%p] %s", td->td_tid, td1->td_tid, ts->ts_lockobj, ts->ts_lockobj->lo_name); } return (1); } /* * Early initialization of turnstiles. This is not done via a SYSINIT() * since this needs to be initialized very early when mutexes are first * initialized. 
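turnstile_adjust_thread() re-links a thread only when its new priority violates ordering against its immediate neighbors, and the re-insert then places it before the first strictly lower-priority thread so that equal priorities keep FIFO order. A userland sketch of the re-insert over the BSD <sys/queue.h> TAILQ, with a stand-in waiter struct:

#include <stdio.h>
#include <sys/queue.h>

struct waiter {
    TAILQ_ENTRY(waiter) link;
    const char *name;
    int priority;               /* lower is more important */
};

TAILQ_HEAD(waitq, waiter);

/* Re-insert 'w' so the queue stays sorted; FIFO among equal priorities. */
static void
requeue(struct waitq *q, struct waiter *w)
{
    struct waiter *it;

    TAILQ_REMOVE(q, w, link);
    TAILQ_FOREACH(it, q, link)
        if (it->priority > w->priority)
            break;
    if (it == NULL)
        TAILQ_INSERT_TAIL(q, w, link);
    else
        TAILQ_INSERT_BEFORE(it, w, link);
}

int
main(void)
{
    struct waitq q = TAILQ_HEAD_INITIALIZER(q);
    struct waiter a = { .name = "a", .priority = 100 };
    struct waiter b = { .name = "b", .priority = 120 };
    struct waiter *it;

    TAILQ_INSERT_TAIL(&q, &a, link);
    TAILQ_INSERT_TAIL(&q, &b, link);
    a.priority = 140;           /* priority lowered: must move */
    requeue(&q, &a);
    TAILQ_FOREACH(it, &q, link)
        printf("%s(%d)\n", it->name, it->priority);  /* b, then a */
    return (0);
}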
*/ void init_turnstiles(void) { int i; for (i = 0; i < TC_TABLESIZE; i++) { LIST_INIT(&turnstile_chains[i].tc_turnstiles); mtx_init(&turnstile_chains[i].tc_lock, "turnstile chain", NULL, MTX_SPIN); } mtx_init(&td_contested_lock, "td_contested", NULL, MTX_SPIN); LIST_INIT(&thread0.td_contested); thread0.td_turnstile = NULL; } #ifdef TURNSTILE_PROFILING static void init_turnstile_profiling(void *arg) { struct sysctl_oid *chain_oid; char chain_name[10]; int i; for (i = 0; i < TC_TABLESIZE; i++) { snprintf(chain_name, sizeof(chain_name), "%d", i); chain_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_debug_turnstile_chains), OID_AUTO, chain_name, CTLFLAG_RD, NULL, "turnstile chain stats"); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "depth", CTLFLAG_RD, &turnstile_chains[i].tc_depth, 0, NULL); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "max_depth", CTLFLAG_RD, &turnstile_chains[i].tc_max_depth, 0, NULL); } } SYSINIT(turnstile_profiling, SI_SUB_LOCK, SI_ORDER_ANY, init_turnstile_profiling, NULL); #endif static void init_turnstile0(void *dummy) { turnstile_zone = uma_zcreate("TURNSTILE", sizeof(struct turnstile), NULL, #ifdef INVARIANTS turnstile_dtor, #else NULL, #endif turnstile_init, turnstile_fini, UMA_ALIGN_CACHE, UMA_ZONE_NOFREE); thread0.td_turnstile = turnstile_alloc(); } SYSINIT(turnstile0, SI_SUB_LOCK, SI_ORDER_ANY, init_turnstile0, NULL); /* * Update a thread on the turnstile list after its priority has been changed. * The old priority is passed in as an argument. */ void turnstile_adjust(struct thread *td, u_char oldpri) { struct turnstile *ts; MPASS(TD_ON_LOCK(td)); /* * Pick up the lock that td is blocked on. */ ts = td->td_blocked; MPASS(ts != NULL); THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock); mtx_assert(&ts->ts_lock, MA_OWNED); /* Resort the turnstile on the list. */ if (!turnstile_adjust_thread(ts, td)) return; /* * If our priority was lowered and we are at the head of the * turnstile, then propagate our new priority up the chain. * Note that we currently don't try to revoke lent priorities * when our priority goes up. */ MPASS(td->td_tsqueue == TS_EXCLUSIVE_QUEUE || td->td_tsqueue == TS_SHARED_QUEUE); if (td == TAILQ_FIRST(&ts->ts_blocked[td->td_tsqueue]) && td->td_priority < oldpri) { propagate_priority(td); } } /* * Set the owner of the lock this turnstile is attached to. */ static void turnstile_setowner(struct turnstile *ts, struct thread *owner) { mtx_assert(&td_contested_lock, MA_OWNED); MPASS(ts->ts_owner == NULL); /* A shared lock might not have an owner. */ if (owner == NULL) return; MPASS(owner->td_proc->p_magic == P_MAGIC); ts->ts_owner = owner; LIST_INSERT_HEAD(&owner->td_contested, ts, ts_link); } #ifdef INVARIANTS /* * UMA zone item deallocator. */ static void turnstile_dtor(void *mem, int size, void *arg) { struct turnstile *ts; ts = mem; MPASS(TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE])); MPASS(TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE])); MPASS(TAILQ_EMPTY(&ts->ts_pending)); } #endif /* * UMA zone item initializer. */ static int turnstile_init(void *mem, int size, int flags) { struct turnstile *ts; bzero(mem, size); ts = mem; TAILQ_INIT(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]); TAILQ_INIT(&ts->ts_blocked[TS_SHARED_QUEUE]); TAILQ_INIT(&ts->ts_pending); LIST_INIT(&ts->ts_free); mtx_init(&ts->ts_lock, "turnstile lock", NULL, MTX_SPIN | MTX_RECURSE); return (0); } static void turnstile_fini(void *mem, int size) { struct turnstile *ts; ts = mem; mtx_destroy(&ts->ts_lock); } /* * Get a turnstile for a new thread.
*/ struct turnstile * turnstile_alloc(void) { return (uma_zalloc(turnstile_zone, M_WAITOK)); } /* * Free a turnstile when a thread is destroyed. */ void turnstile_free(struct turnstile *ts) { uma_zfree(turnstile_zone, ts); } /* * Lock the turnstile chain associated with the specified lock. */ void turnstile_chain_lock(struct lock_object *lock) { struct turnstile_chain *tc; tc = TC_LOOKUP(lock); mtx_lock_spin(&tc->tc_lock); } struct turnstile * turnstile_trywait(struct lock_object *lock) { struct turnstile_chain *tc; struct turnstile *ts; tc = TC_LOOKUP(lock); mtx_lock_spin(&tc->tc_lock); LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash) if (ts->ts_lockobj == lock) { mtx_lock_spin(&ts->ts_lock); return (ts); } ts = curthread->td_turnstile; MPASS(ts != NULL); mtx_lock_spin(&ts->ts_lock); KASSERT(ts->ts_lockobj == NULL, ("stale ts_lockobj pointer")); ts->ts_lockobj = lock; return (ts); } void turnstile_cancel(struct turnstile *ts) { struct turnstile_chain *tc; struct lock_object *lock; mtx_assert(&ts->ts_lock, MA_OWNED); mtx_unlock_spin(&ts->ts_lock); lock = ts->ts_lockobj; if (ts == curthread->td_turnstile) ts->ts_lockobj = NULL; tc = TC_LOOKUP(lock); mtx_unlock_spin(&tc->tc_lock); } /* * Look up the turnstile for a lock in the hash table locking the associated * turnstile chain along the way. If no turnstile is found in the hash * table, NULL is returned. */ struct turnstile * turnstile_lookup(struct lock_object *lock) { struct turnstile_chain *tc; struct turnstile *ts; tc = TC_LOOKUP(lock); mtx_assert(&tc->tc_lock, MA_OWNED); LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash) if (ts->ts_lockobj == lock) { mtx_lock_spin(&ts->ts_lock); return (ts); } return (NULL); } /* * Unlock the turnstile chain associated with a given lock. */ void turnstile_chain_unlock(struct lock_object *lock) { struct turnstile_chain *tc; tc = TC_LOOKUP(lock); mtx_unlock_spin(&tc->tc_lock); } /* * Return a pointer to the thread waiting on this turnstile with the * most important priority or NULL if the turnstile has no waiters. */ static struct thread * turnstile_first_waiter(struct turnstile *ts) { struct thread *std, *xtd; std = TAILQ_FIRST(&ts->ts_blocked[TS_SHARED_QUEUE]); xtd = TAILQ_FIRST(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]); if (xtd == NULL || (std != NULL && std->td_priority < xtd->td_priority)) return (std); return (xtd); } /* * Take ownership of a turnstile and adjust the priority of the new * owner appropriately. */ void turnstile_claim(struct turnstile *ts) { struct thread *td, *owner; struct turnstile_chain *tc; mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(ts != curthread->td_turnstile); owner = curthread; mtx_lock_spin(&td_contested_lock); turnstile_setowner(ts, owner); mtx_unlock_spin(&td_contested_lock); td = turnstile_first_waiter(ts); MPASS(td != NULL); MPASS(td->td_proc->p_magic == P_MAGIC); THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock); /* * Update the priority of the new owner if needed. */ thread_lock(owner); if (td->td_priority < owner->td_priority) sched_lend_prio(owner, td->td_priority); thread_unlock(owner); tc = TC_LOOKUP(ts->ts_lockobj); mtx_unlock_spin(&ts->ts_lock); mtx_unlock_spin(&tc->tc_lock); } /* * Block the current thread on the turnstile associated with 'lock'. This * function will context switch and not return until this thread has been * woken back up. This function must be called with the appropriate * turnstile chain locked and will return with it unlocked.
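turnstile_first_waiter() needs to compare only the heads of the shared and exclusive queues, because each queue is already kept sorted, and it resolves priority ties in favor of exclusive waiters. In miniature, with stand-in types and lower numbers meaning higher priority:

#include <stdio.h>

struct waiter {
    int priority;           /* lower is more important */
};

/* Pick the more important of two queue heads; NULL means an empty queue. */
static struct waiter *
first_waiter(struct waiter *shared_head, struct waiter *excl_head)
{
    if (excl_head == NULL || (shared_head != NULL &&
        shared_head->priority < excl_head->priority))
        return (shared_head);
    return (excl_head);     /* exclusive waiters win ties */
}

int
main(void)
{
    struct waiter s = { 100 }, x = { 100 };

    printf("tie -> %s\n",
        first_waiter(&s, &x) == &x ? "exclusive" : "shared");
    printf("empty shared -> %s\n",
        first_waiter(NULL, &x) == &x ? "exclusive" : "none");
    return (0);
}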
*/ void turnstile_wait(struct turnstile *ts, struct thread *owner, int queue) { struct turnstile_chain *tc; struct thread *td, *td1; struct lock_object *lock; td = curthread; mtx_assert(&ts->ts_lock, MA_OWNED); if (owner) MPASS(owner->td_proc->p_magic == P_MAGIC); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); /* * If the lock does not already have a turnstile, use this thread's * turnstile. Otherwise insert the current thread into the * turnstile already in use by this lock. */ tc = TC_LOOKUP(ts->ts_lockobj); mtx_assert(&tc->tc_lock, MA_OWNED); if (ts == td->td_turnstile) { #ifdef TURNSTILE_PROFILING tc->tc_depth++; if (tc->tc_depth > tc->tc_max_depth) { tc->tc_max_depth = tc->tc_depth; if (tc->tc_max_depth > turnstile_max_depth) turnstile_max_depth = tc->tc_max_depth; } #endif LIST_INSERT_HEAD(&tc->tc_turnstiles, ts, ts_hash); KASSERT(TAILQ_EMPTY(&ts->ts_pending), ("thread's turnstile has pending threads")); KASSERT(TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]), ("thread's turnstile has exclusive waiters")); KASSERT(TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]), ("thread's turnstile has shared waiters")); KASSERT(LIST_EMPTY(&ts->ts_free), ("thread's turnstile has a non-empty free list")); MPASS(ts->ts_lockobj != NULL); mtx_lock_spin(&td_contested_lock); TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq); turnstile_setowner(ts, owner); mtx_unlock_spin(&td_contested_lock); } else { TAILQ_FOREACH(td1, &ts->ts_blocked[queue], td_lockq) if (td1->td_priority > td->td_priority) break; mtx_lock_spin(&td_contested_lock); if (td1 != NULL) TAILQ_INSERT_BEFORE(td1, td, td_lockq); else TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq); MPASS(owner == ts->ts_owner); mtx_unlock_spin(&td_contested_lock); MPASS(td->td_turnstile != NULL); LIST_INSERT_HEAD(&ts->ts_free, td->td_turnstile, ts_hash); } thread_lock(td); thread_lock_set(td, &ts->ts_lock); td->td_turnstile = NULL; /* Save who we are blocked on and switch. */ lock = ts->ts_lockobj; td->td_tsqueue = queue; td->td_blocked = ts; td->td_lockname = lock->lo_name; td->td_blktick = ticks; TD_SET_LOCK(td); mtx_unlock_spin(&tc->tc_lock); propagate_priority(td); if (LOCK_LOG_TEST(lock, 0)) CTR4(KTR_LOCK, "%s: td %d blocked on [%p] %s", __func__, td->td_tid, lock, lock->lo_name); SDT_PROBE0(sched, , , sleep); THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock); mi_switch(SW_VOL | SWT_TURNSTILE, NULL); if (LOCK_LOG_TEST(lock, 0)) CTR4(KTR_LOCK, "%s: td %d free from blocked on [%p] %s", __func__, td->td_tid, lock, lock->lo_name); thread_unlock(td); } /* * Pick the highest priority thread on this turnstile and put it on the * pending list. This must be called with the turnstile chain locked. */ int turnstile_signal(struct turnstile *ts, int queue) { struct turnstile_chain *tc; struct thread *td; int empty; MPASS(ts != NULL); mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(curthread->td_proc->p_magic == P_MAGIC); MPASS(ts->ts_owner == curthread || ts->ts_owner == NULL); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); /* * Pick the highest priority thread blocked on this lock and * move it to the pending list. */ td = TAILQ_FIRST(&ts->ts_blocked[queue]); MPASS(td->td_proc->p_magic == P_MAGIC); mtx_lock_spin(&td_contested_lock); TAILQ_REMOVE(&ts->ts_blocked[queue], td, td_lockq); mtx_unlock_spin(&td_contested_lock); TAILQ_INSERT_TAIL(&ts->ts_pending, td, td_lockq); /* * If the turnstile is now empty, remove it from its chain and * give it to the about-to-be-woken thread. 
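 * (Each thread that blocks on an already-claimed turnstile donates its own
 * spare turnstile to the ts_free list, so every thread removed from the
 * turnstile must be handed one back before it can block again.)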
Otherwise take a * turnstile from the free list and give it to the thread. */ empty = TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) && TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]); if (empty) { tc = TC_LOOKUP(ts->ts_lockobj); mtx_assert(&tc->tc_lock, MA_OWNED); MPASS(LIST_EMPTY(&ts->ts_free)); #ifdef TURNSTILE_PROFILING tc->tc_depth--; #endif } else ts = LIST_FIRST(&ts->ts_free); MPASS(ts != NULL); LIST_REMOVE(ts, ts_hash); td->td_turnstile = ts; return (empty); } /* * Put all blocked threads on the pending list. This must be called with * the turnstile chain locked. */ void turnstile_broadcast(struct turnstile *ts, int queue) { struct turnstile_chain *tc; struct turnstile *ts1; struct thread *td; MPASS(ts != NULL); mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(curthread->td_proc->p_magic == P_MAGIC); MPASS(ts->ts_owner == curthread || ts->ts_owner == NULL); /* * We must have the chain locked so that we can remove the empty * turnstile from the hash queue. */ tc = TC_LOOKUP(ts->ts_lockobj); mtx_assert(&tc->tc_lock, MA_OWNED); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); /* * Transfer the blocked list to the pending list. */ mtx_lock_spin(&td_contested_lock); TAILQ_CONCAT(&ts->ts_pending, &ts->ts_blocked[queue], td_lockq); mtx_unlock_spin(&td_contested_lock); /* * Give a turnstile to each thread. The last thread gets * this turnstile if the turnstile is empty. */ TAILQ_FOREACH(td, &ts->ts_pending, td_lockq) { if (LIST_EMPTY(&ts->ts_free)) { MPASS(TAILQ_NEXT(td, td_lockq) == NULL); ts1 = ts; #ifdef TURNSTILE_PROFILING tc->tc_depth--; #endif } else ts1 = LIST_FIRST(&ts->ts_free); MPASS(ts1 != NULL); LIST_REMOVE(ts1, ts_hash); td->td_turnstile = ts1; } } /* * Wakeup all threads on the pending list and adjust the priority of the * current thread appropriately. This must be called with the turnstile * chain locked. */ void turnstile_unpend(struct turnstile *ts, int owner_type) { TAILQ_HEAD( ,thread) pending_threads; struct turnstile *nts; struct thread *td; u_char cp, pri; MPASS(ts != NULL); mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(ts->ts_owner == curthread || ts->ts_owner == NULL); MPASS(!TAILQ_EMPTY(&ts->ts_pending)); /* * Move the list of pending threads out of the turnstile and * into a local variable. */ TAILQ_INIT(&pending_threads); TAILQ_CONCAT(&pending_threads, &ts->ts_pending, td_lockq); #ifdef INVARIANTS if (TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) && TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE])) ts->ts_lockobj = NULL; #endif /* * Adjust the priority of curthread based on other contested * locks it owns. Don't lower the priority below the base * priority however. */ td = curthread; pri = PRI_MAX; thread_lock(td); mtx_lock_spin(&td_contested_lock); /* * Remove the turnstile from this thread's list of contested locks * since this thread doesn't own it anymore. New threads will * not be blocking on the turnstile until it is claimed by a new * owner. There might not be a current owner if this is a shared * lock. */ if (ts->ts_owner != NULL) { ts->ts_owner = NULL; LIST_REMOVE(ts, ts_link); } LIST_FOREACH(nts, &td->td_contested, ts_link) { cp = turnstile_first_waiter(nts)->td_priority; if (cp < pri) pri = cp; } mtx_unlock_spin(&td_contested_lock); sched_unlend_prio(td, pri); thread_unlock(td); /* * Wake up all the pending threads. If a thread is not blocked * on a lock, then it is currently executing on another CPU in * turnstile_wait() or sitting on a run queue waiting to resume * in turnstile_wait(). 
Set a flag to force it to try to acquire * the lock again instead of blocking. */ while (!TAILQ_EMPTY(&pending_threads)) { td = TAILQ_FIRST(&pending_threads); TAILQ_REMOVE(&pending_threads, td, td_lockq); SDT_PROBE2(sched, , , wakeup, td, td->td_proc); thread_lock(td); THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock); MPASS(td->td_proc->p_magic == P_MAGIC); MPASS(TD_ON_LOCK(td)); TD_CLR_LOCK(td); MPASS(TD_CAN_RUN(td)); td->td_blocked = NULL; td->td_lockname = NULL; td->td_blktick = 0; #ifdef INVARIANTS td->td_tsqueue = 0xff; #endif sched_add(td, SRQ_BORING); thread_unlock(td); } mtx_unlock_spin(&ts->ts_lock); } /* * Give up ownership of a turnstile. This must be called with the * turnstile chain locked. */ void turnstile_disown(struct turnstile *ts) { struct thread *td; u_char cp, pri; MPASS(ts != NULL); mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(ts->ts_owner == curthread); MPASS(TAILQ_EMPTY(&ts->ts_pending)); MPASS(!TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) || !TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE])); /* * Remove the turnstile from this thread's list of contested locks * since this thread doesn't own it anymore. New threads will * not be blocking on the turnstile until it is claimed by a new * owner. */ mtx_lock_spin(&td_contested_lock); ts->ts_owner = NULL; LIST_REMOVE(ts, ts_link); mtx_unlock_spin(&td_contested_lock); /* * Adjust the priority of curthread based on other contested * locks it owns. Don't lower the priority below the base * priority however. */ td = curthread; pri = PRI_MAX; thread_lock(td); mtx_unlock_spin(&ts->ts_lock); mtx_lock_spin(&td_contested_lock); LIST_FOREACH(ts, &td->td_contested, ts_link) { cp = turnstile_first_waiter(ts)->td_priority; if (cp < pri) pri = cp; } mtx_unlock_spin(&td_contested_lock); sched_unlend_prio(td, pri); thread_unlock(td); } /* * Return the first thread in a turnstile. */ struct thread * turnstile_head(struct turnstile *ts, int queue) { #ifdef INVARIANTS MPASS(ts != NULL); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); mtx_assert(&ts->ts_lock, MA_OWNED); #endif return (TAILQ_FIRST(&ts->ts_blocked[queue])); } /* * Returns true if a sub-queue of a turnstile is empty. */ int turnstile_empty(struct turnstile *ts, int queue) { #ifdef INVARIANTS MPASS(ts != NULL); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); mtx_assert(&ts->ts_lock, MA_OWNED); #endif return (TAILQ_EMPTY(&ts->ts_blocked[queue])); } #ifdef DDB static void print_thread(struct thread *td, const char *prefix) { db_printf("%s%p (tid %d, pid %d, \"%s\")\n", prefix, td, td->td_tid, td->td_proc->p_pid, td->td_name); } static void print_queue(struct threadqueue *queue, const char *header, const char *prefix) { struct thread *td; db_printf("%s:\n", header); if (TAILQ_EMPTY(queue)) { db_printf("%sempty\n", prefix); return; } TAILQ_FOREACH(td, queue, td_lockq) { print_thread(td, prefix); } } DB_SHOW_COMMAND(turnstile, db_show_turnstile) { struct turnstile_chain *tc; struct turnstile *ts; struct lock_object *lock; int i; if (!have_addr) return; /* * First, see if there is an active turnstile for the lock indicated * by the address. */ lock = (struct lock_object *)addr; tc = TC_LOOKUP(lock); LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash) if (ts->ts_lockobj == lock) goto found; /* * Second, see if there is an active turnstile at the address * indicated. 
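 * (That is, treat the address as a pointer to the turnstile structure
 * itself rather than to a lock.)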
*/ for (i = 0; i < TC_TABLESIZE; i++) LIST_FOREACH(ts, &turnstile_chains[i].tc_turnstiles, ts_hash) { if (ts == (struct turnstile *)addr) goto found; } db_printf("Unable to locate a turnstile via %p\n", (void *)addr); return; found: lock = ts->ts_lockobj; db_printf("Lock: %p - (%s) %s\n", lock, LOCK_CLASS(lock)->lc_name, lock->lo_name); if (ts->ts_owner) print_thread(ts->ts_owner, "Lock Owner: "); else db_printf("Lock Owner: none\n"); print_queue(&ts->ts_blocked[TS_SHARED_QUEUE], "Shared Waiters", "\t"); print_queue(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE], "Exclusive Waiters", "\t"); print_queue(&ts->ts_pending, "Pending Threads", "\t"); } /* * Show all the threads a particular thread is waiting on based on * non-spin locks. */ static void print_lockchain(struct thread *td, const char *prefix) { struct lock_object *lock; struct lock_class *class; struct turnstile *ts; struct thread *owner; /* * Follow the chain. We keep walking as long as the thread is * blocked on a lock that has an owner. */ while (!db_pager_quit) { db_printf("%sthread %d (pid %d, %s) ", prefix, td->td_tid, td->td_proc->p_pid, td->td_name); switch (td->td_state) { case TDS_INACTIVE: db_printf("is inactive\n"); return; case TDS_CAN_RUN: db_printf("can run\n"); return; case TDS_RUNQ: db_printf("is on a run queue\n"); return; case TDS_RUNNING: db_printf("running on CPU %d\n", td->td_oncpu); return; case TDS_INHIBITED: if (TD_ON_LOCK(td)) { ts = td->td_blocked; lock = ts->ts_lockobj; class = LOCK_CLASS(lock); db_printf("blocked on lock %p (%s) \"%s\"\n", lock, class->lc_name, lock->lo_name); if (ts->ts_owner == NULL) return; td = ts->ts_owner; break; } else if (TD_ON_SLEEPQ(td)) { if (!lockmgr_chain(td, &owner) && !sx_chain(td, &owner)) { db_printf("sleeping on %p \"%s\"\n", td->td_wchan, td->td_wmesg); return; } if (owner == NULL) return; td = owner; break; } db_printf("inhibited\n"); return; default: db_printf("??? (%#x)\n", td->td_state); return; } } } DB_SHOW_COMMAND(lockchain, db_show_lockchain) { struct thread *td; /* Figure out which thread to start with. 
*/ if (have_addr) td = db_lookup_thread(addr, true); else td = kdb_thread; print_lockchain(td, ""); } DB_SHOW_ALIAS(sleepchain, db_show_lockchain); DB_SHOW_ALL_COMMAND(chains, db_show_allchains) { struct thread *td; struct proc *p; int i; i = 1; FOREACH_PROC_IN_SYSTEM(p) { FOREACH_THREAD_IN_PROC(p, td) { if ((TD_ON_LOCK(td) && LIST_EMPTY(&td->td_contested)) || (TD_IS_INHIBITED(td) && TD_ON_SLEEPQ(td))) { db_printf("chain %d:\n", i++); print_lockchain(td, " "); } if (db_pager_quit) return; } } } DB_SHOW_ALIAS(allchains, db_show_allchains) static void print_waiters(struct turnstile *ts, int indent); static void print_waiter(struct thread *td, int indent) { struct turnstile *ts; int i; if (db_pager_quit) return; for (i = 0; i < indent; i++) db_printf(" "); print_thread(td, "thread "); LIST_FOREACH(ts, &td->td_contested, ts_link) print_waiters(ts, indent + 1); } static void print_waiters(struct turnstile *ts, int indent) { struct lock_object *lock; struct lock_class *class; struct thread *td; int i; if (db_pager_quit) return; lock = ts->ts_lockobj; class = LOCK_CLASS(lock); for (i = 0; i < indent; i++) db_printf(" "); db_printf("lock %p (%s) \"%s\"\n", lock, class->lc_name, lock->lo_name); TAILQ_FOREACH(td, &ts->ts_blocked[TS_EXCLUSIVE_QUEUE], td_lockq) print_waiter(td, indent + 1); TAILQ_FOREACH(td, &ts->ts_blocked[TS_SHARED_QUEUE], td_lockq) print_waiter(td, indent + 1); TAILQ_FOREACH(td, &ts->ts_pending, td_lockq) print_waiter(td, indent + 1); } DB_SHOW_COMMAND(locktree, db_show_locktree) { struct lock_object *lock; struct lock_class *class; struct turnstile_chain *tc; struct turnstile *ts; if (!have_addr) return; lock = (struct lock_object *)addr; tc = TC_LOOKUP(lock); LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash) if (ts->ts_lockobj == lock) break; if (ts == NULL) { class = LOCK_CLASS(lock); db_printf("lock %p (%s) \"%s\"\n", lock, class->lc_name, lock->lo_name); } else print_waiters(ts, 0); } #endif Index: head/sys/kern/subr_unit.c =================================================================== --- head/sys/kern/subr_unit.c (revision 326270) +++ head/sys/kern/subr_unit.c (revision 326271) @@ -1,1079 +1,1081 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ * * * Unit number allocation functions. * * These functions implement a mixed run-length/bitmap management of unit * number spaces in a very memory efficient manner. * * Allocation policy is always lowest free number first. * * A return value of -1 signals that no more unit numbers are available. * * There is no cost associated with the range of unitnumbers, so unless * the resource really is finite, specify INT_MAX to new_unrhdr() and * forget about checking the return value. * * If a mutex is not provided when the unit number space is created, a * default global mutex is used. The advantage to passing a mutex in, is * that the alloc_unrl() function can be called with the mutex already * held (it will not be released by alloc_unrl()). * * The allocation function alloc_unr{l}() never sleeps (but it may block on * the mutex of course). * * Freeing a unit number may require allocating memory, and can therefore * sleep so the free_unr() function does not come in a pre-locked variant. * * A userland test program is included. * * Memory usage is a very complex function of the exact allocation * pattern, but always very compact: * * For the very typical case where a single unbroken run of unit * numbers are allocated 44 bytes are used on i386. * * For a unit number space of 1000 units and the random pattern * in the usermode test program included, the worst case usage * was 252 bytes on i386 for 500 allocated and 500 free units. * * For a unit number space of 10000 units and the random pattern * in the usermode test program included, the worst case usage * was 798 bytes on i386 for 5000 allocated and 5000 free units. * * The worst case is where every other unit number is allocated and * the rest are free. In that case 44 + N/4 bytes are used where * N is the number of the highest unit allocated. */ #include #include #include #ifdef _KERNEL #include #include #include #include #include #include #include /* * In theory it would be smarter to allocate the individual blocks * with the zone allocator, but at this time the expectation is that * there will typically not even be enough allocations to fill a single * page, so we stick with malloc for now. */ static MALLOC_DEFINE(M_UNIT, "Unitno", "Unit number allocation"); #define Malloc(foo) malloc(foo, M_UNIT, M_WAITOK | M_ZERO) #define Free(foo) free(foo, M_UNIT) static struct mtx unitmtx; MTX_SYSINIT(unit, &unitmtx, "unit# allocation", MTX_DEF); #else /* ...USERLAND */ #include #include #include #include #include #include #include #include #define KASSERT(cond, arg) \ do { \ if (!(cond)) { \ printf arg; \ abort(); \ } \ } while (0) static int no_alloc; #define Malloc(foo) _Malloc(foo, __LINE__) static void * _Malloc(size_t foo, int line) { KASSERT(no_alloc == 0, ("malloc in wrong place() line %d", line)); return (calloc(foo, 1)); } #define Free(foo) free(foo) struct unrhdr; struct mtx { int state; } unitmtx; static void mtx_lock(struct mtx *mp) { KASSERT(mp->state == 0, ("mutex already locked")); mp->state = 1; } static void mtx_unlock(struct mtx *mp) { KASSERT(mp->state == 1, ("mutex not locked")); mp->state = 0; } #define MA_OWNED 9 static void mtx_assert(struct mtx *mp, int flag) { if (flag == MA_OWNED) { KASSERT(mp->state == 1, ("mtx_assert(MA_OWNED) not true")); } } #define CTASSERT(foo) #define WITNESS_WARN(flags, lock, fmt, ...) (void)0 #endif /* USERLAND */ /* * This is our basic building block. 
* * It can be used in three different ways depending on the value of the ptr * element: * If ptr is NULL, it represents a run of free items. * If ptr points to the unrhdr it represents a run of allocated items. * Otherwise it points to a bitstring of allocated items. * * For runs the len field is the length of the run. * For bitmaps the len field represents the number of allocated items. * * The bitmap is the same size as struct unr to optimize memory management. */ struct unr { TAILQ_ENTRY(unr) list; u_int len; void *ptr; }; struct unrb { bitstr_t map[sizeof(struct unr) / sizeof(bitstr_t)]; }; CTASSERT((sizeof(struct unr) % sizeof(bitstr_t)) == 0); /* Number of bits we can store in the bitmap */ #define NBITS (8 * sizeof(((struct unrb*)NULL)->map)) /* Is the unrb empty in at least the first len bits? */ static inline bool ub_empty(struct unrb *ub, int len) { int first_set; bit_ffs(ub->map, len, &first_set); return (first_set == -1); } /* Is the unrb full? That is, is the number of set elements equal to len? */ static inline bool ub_full(struct unrb *ub, int len) { int first_clear; bit_ffc(ub->map, len, &first_clear); return (first_clear == -1); } #if defined(DIAGNOSTIC) || !defined(_KERNEL) /* * Consistency check function. * * Checks the internal consistency as well as we can. * * Called at all boundaries of this API. */ static void check_unrhdr(struct unrhdr *uh, int line) { struct unr *up; struct unrb *ub; int w; u_int y, z; y = uh->first; z = 0; TAILQ_FOREACH(up, &uh->head, list) { z++; if (up->ptr != uh && up->ptr != NULL) { ub = up->ptr; KASSERT (up->len <= NBITS, ("UNR inconsistency: len %u max %zd (line %d)\n", up->len, NBITS, line)); z++; w = 0; bit_count(ub->map, 0, up->len, &w); y += w; } else if (up->ptr != NULL) y += up->len; } KASSERT (y == uh->busy, ("UNR inconsistency: items %u found %u (line %d)\n", uh->busy, y, line)); KASSERT (z == uh->alloc, ("UNR inconsistency: chunks %u found %u (line %d)\n", uh->alloc, z, line)); } #else static __inline void check_unrhdr(struct unrhdr *uh __unused, int line __unused) { } #endif /* * Userland memory management. Just use calloc and keep track of how * many elements we have allocated for check_unrhdr(). */ static __inline void * new_unr(struct unrhdr *uh, void **p1, void **p2) { void *p; uh->alloc++; KASSERT(*p1 != NULL || *p2 != NULL, ("Out of cached memory")); if (*p1 != NULL) { p = *p1; *p1 = NULL; return (p); } else { p = *p2; *p2 = NULL; return (p); } } static __inline void delete_unr(struct unrhdr *uh, void *ptr) { struct unr *up; uh->alloc--; up = ptr; TAILQ_INSERT_TAIL(&uh->ppfree, up, list); } void clean_unrhdrl(struct unrhdr *uh) { struct unr *up; mtx_assert(uh->mtx, MA_OWNED); while ((up = TAILQ_FIRST(&uh->ppfree)) != NULL) { TAILQ_REMOVE(&uh->ppfree, up, list); mtx_unlock(uh->mtx); Free(up); mtx_lock(uh->mtx); } } void clean_unrhdr(struct unrhdr *uh) { mtx_lock(uh->mtx); clean_unrhdrl(uh); mtx_unlock(uh->mtx); } void init_unrhdr(struct unrhdr *uh, int low, int high, struct mtx *mutex) { KASSERT(low >= 0 && low <= high, ("UNR: use error: new_unrhdr(%d, %d)", low, high)); if (mutex != NULL) uh->mtx = mutex; else uh->mtx = &unitmtx; TAILQ_INIT(&uh->head); TAILQ_INIT(&uh->ppfree); uh->low = low; uh->high = high; uh->first = 0; uh->last = 1 + (high - low); check_unrhdr(uh, __LINE__); } /* * Allocate a new unrheader set. * * Highest and lowest valid values given as parameters. 
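 */

/*
 * (Hypothetical usage sketch of the API described at the top of this file;
 * "example" is an invented consumer, not real kernel code.)
 */
#if 0	/* illustrative sketch only */
static struct unrhdr *example_units;

static void
example_init(void)
{

	/* Effectively unbounded space; allocation is lowest-free-first. */
	example_units = new_unrhdr(0, INT_MAX, NULL);
}

static int
example_attach(void)
{
	int unit;

	unit = alloc_unr(example_units);	/* never sleeps */
	if (unit == -1)
		return (ENOSPC);		/* space exhausted */
	/* ... name a device or resource after 'unit' ... */
	return (0);
}

static void
example_detach(int unit)
{

	free_unr(example_units, unit);		/* may sleep to allocate */
}
#endif

/*
 * new_unrhdr(), described above, follows: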
*/ struct unrhdr * new_unrhdr(int low, int high, struct mtx *mutex) { struct unrhdr *uh; uh = Malloc(sizeof *uh); init_unrhdr(uh, low, high, mutex); return (uh); } void delete_unrhdr(struct unrhdr *uh) { check_unrhdr(uh, __LINE__); KASSERT(uh->busy == 0, ("unrhdr has %u allocations", uh->busy)); KASSERT(uh->alloc == 0, ("UNR memory leak in delete_unrhdr")); KASSERT(TAILQ_FIRST(&uh->ppfree) == NULL, ("unrhdr has postponed item for free")); Free(uh); } void clear_unrhdr(struct unrhdr *uh) { struct unr *up, *uq; KASSERT(TAILQ_EMPTY(&uh->ppfree), ("unrhdr has postponed item for free")); TAILQ_FOREACH_SAFE(up, &uh->head, list, uq) { if (up->ptr != uh) { Free(up->ptr); } Free(up); } uh->busy = 0; uh->alloc = 0; init_unrhdr(uh, uh->low, uh->high, uh->mtx); check_unrhdr(uh, __LINE__); } static __inline int is_bitmap(struct unrhdr *uh, struct unr *up) { return (up->ptr != uh && up->ptr != NULL); } /* * Look for sequence of items which can be combined into a bitmap, if * multiple are present, take the one which saves most memory. * * Return (1) if a sequence was found to indicate that another call * might be able to do more. Return (0) if we found no suitable sequence. * * NB: called from alloc_unr(), no new memory allocation allowed. */ static int optimize_unr(struct unrhdr *uh) { struct unr *up, *uf, *us; struct unrb *ub, *ubf; u_int a, l, ba; /* * Look for the run of items (if any) which when collapsed into * a bitmap would save most memory. */ us = NULL; ba = 0; TAILQ_FOREACH(uf, &uh->head, list) { if (uf->len >= NBITS) continue; a = 1; if (is_bitmap(uh, uf)) a++; l = uf->len; up = uf; while (1) { up = TAILQ_NEXT(up, list); if (up == NULL) break; if ((up->len + l) > NBITS) break; a++; if (is_bitmap(uh, up)) a++; l += up->len; } if (a > ba) { ba = a; us = uf; } } if (ba < 3) return (0); /* * If the first element is not a bitmap, make it one. * Trying to do so without allocating more memory complicates things * a bit */ if (!is_bitmap(uh, us)) { uf = TAILQ_NEXT(us, list); TAILQ_REMOVE(&uh->head, us, list); a = us->len; l = us->ptr == uh ? 1 : 0; ub = (void *)us; bit_nclear(ub->map, 0, NBITS - 1); if (l) bit_nset(ub->map, 0, a); if (!is_bitmap(uh, uf)) { if (uf->ptr == NULL) bit_nclear(ub->map, a, a + uf->len - 1); else bit_nset(ub->map, a, a + uf->len - 1); uf->ptr = ub; uf->len += a; us = uf; } else { ubf = uf->ptr; for (l = 0; l < uf->len; l++, a++) { if (bit_test(ubf->map, l)) bit_set(ub->map, a); else bit_clear(ub->map, a); } uf->len = a; delete_unr(uh, uf->ptr); uf->ptr = ub; us = uf; } } ub = us->ptr; while (1) { uf = TAILQ_NEXT(us, list); if (uf == NULL) return (1); if (uf->len + us->len > NBITS) return (1); if (uf->ptr == NULL) { bit_nclear(ub->map, us->len, us->len + uf->len - 1); us->len += uf->len; TAILQ_REMOVE(&uh->head, uf, list); delete_unr(uh, uf); } else if (uf->ptr == uh) { bit_nset(ub->map, us->len, us->len + uf->len - 1); us->len += uf->len; TAILQ_REMOVE(&uh->head, uf, list); delete_unr(uh, uf); } else { ubf = uf->ptr; for (l = 0; l < uf->len; l++, us->len++) { if (bit_test(ubf->map, l)) bit_set(ub->map, us->len); else bit_clear(ub->map, us->len); } TAILQ_REMOVE(&uh->head, uf, list); delete_unr(uh, ubf); delete_unr(uh, uf); } } } /* * See if a given unr should be collapsed with a neighbor. * * NB: called from alloc_unr(), no new memory allocation allowed. 
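 */

/*
 * (collapse_unr() below leans on the ub_full()/ub_empty() predicates defined
 * earlier.  A self-contained userland illustration of the same <bitstring.h>
 * primitives:)
 */
#if 0	/* illustrative sketch only */
#include <bitstring.h>
#include <stdio.h>

int
main(void)
{
	bitstr_t bit_decl(map, 64);
	int first_clear, first_set;

	bit_nclear(map, 0, 63);
	bit_nset(map, 0, 9);			/* units 0..9 allocated */
	bit_ffc(map, 10, &first_clear);		/* ub_full(): no clear bit? */
	bit_ffs(map, 10, &first_set);		/* ub_empty(): no set bit? */
	printf("full=%d empty=%d\n", first_clear == -1, first_set == -1);
	/* prints "full=1 empty=0" */
	return (0);
}
#endif

/*
 * collapse_unr(), described above, follows: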
*/ static void collapse_unr(struct unrhdr *uh, struct unr *up) { struct unr *upp; struct unrb *ub; /* If bitmap is all set or clear, change it to runlength */ if (is_bitmap(uh, up)) { ub = up->ptr; if (ub_full(ub, up->len)) { delete_unr(uh, up->ptr); up->ptr = uh; } else if (ub_empty(ub, up->len)) { delete_unr(uh, up->ptr); up->ptr = NULL; } } /* If nothing left in runlength, delete it */ if (up->len == 0) { upp = TAILQ_PREV(up, unrhd, list); if (upp == NULL) upp = TAILQ_NEXT(up, list); TAILQ_REMOVE(&uh->head, up, list); delete_unr(uh, up); up = upp; } /* If we have "hot-spot" still, merge with neighbor if possible */ if (up != NULL) { upp = TAILQ_PREV(up, unrhd, list); if (upp != NULL && up->ptr == upp->ptr) { up->len += upp->len; TAILQ_REMOVE(&uh->head, upp, list); delete_unr(uh, upp); } upp = TAILQ_NEXT(up, list); if (upp != NULL && up->ptr == upp->ptr) { up->len += upp->len; TAILQ_REMOVE(&uh->head, upp, list); delete_unr(uh, upp); } } /* Merge into ->first if possible */ upp = TAILQ_FIRST(&uh->head); if (upp != NULL && upp->ptr == uh) { uh->first += upp->len; TAILQ_REMOVE(&uh->head, upp, list); delete_unr(uh, upp); if (up == upp) up = NULL; } /* Merge into ->last if possible */ upp = TAILQ_LAST(&uh->head, unrhd); if (upp != NULL && upp->ptr == NULL) { uh->last += upp->len; TAILQ_REMOVE(&uh->head, upp, list); delete_unr(uh, upp); if (up == upp) up = NULL; } /* Try to make bitmaps */ while (optimize_unr(uh)) continue; } /* * Allocate a free unr. */ int alloc_unrl(struct unrhdr *uh) { struct unr *up; struct unrb *ub; u_int x; int y; mtx_assert(uh->mtx, MA_OWNED); check_unrhdr(uh, __LINE__); x = uh->low + uh->first; up = TAILQ_FIRST(&uh->head); /* * If we have an ideal split, just adjust the first+last */ if (up == NULL && uh->last > 0) { uh->first++; uh->last--; uh->busy++; return (x); } /* * We can always allocate from the first list element, so if we have * nothing on the list, we must have run out of unit numbers. */ if (up == NULL) return (-1); KASSERT(up->ptr != uh, ("UNR first element is allocated")); if (up->ptr == NULL) { /* free run */ uh->first++; up->len--; } else { /* bitmap */ ub = up->ptr; bit_ffc(ub->map, up->len, &y); KASSERT(y != -1, ("UNR corruption: No clear bit in bitmap.")); bit_set(ub->map, y); x += y; } uh->busy++; collapse_unr(uh, up); return (x); } int alloc_unr(struct unrhdr *uh) { int i; mtx_lock(uh->mtx); i = alloc_unrl(uh); clean_unrhdrl(uh); mtx_unlock(uh->mtx); return (i); } static int alloc_unr_specificl(struct unrhdr *uh, u_int item, void **p1, void **p2) { struct unr *up, *upn; struct unrb *ub; u_int i, last, tl; mtx_assert(uh->mtx, MA_OWNED); if (item < uh->low + uh->first || item > uh->high) return (-1); up = TAILQ_FIRST(&uh->head); /* Ideal split. */ if (up == NULL && item - uh->low == uh->first) { uh->first++; uh->last--; uh->busy++; check_unrhdr(uh, __LINE__); return (item); } i = item - uh->low - uh->first; if (up == NULL) { up = new_unr(uh, p1, p2); up->ptr = NULL; up->len = i; TAILQ_INSERT_TAIL(&uh->head, up, list); up = new_unr(uh, p1, p2); up->ptr = uh; up->len = 1; TAILQ_INSERT_TAIL(&uh->head, up, list); uh->last = uh->high - uh->low - i; uh->busy++; check_unrhdr(uh, __LINE__); return (item); } else { /* Find the item which contains the unit we want to allocate. 
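 * ('i' is the offset of the wanted unit past uh->first; each list entry
 * covers up->len units, so whole chunks are skipped until the offset
 * falls within one.)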
*/ TAILQ_FOREACH(up, &uh->head, list) { if (up->len > i) break; i -= up->len; } } if (up == NULL) { if (i > 0) { up = new_unr(uh, p1, p2); up->ptr = NULL; up->len = i; TAILQ_INSERT_TAIL(&uh->head, up, list); } up = new_unr(uh, p1, p2); up->ptr = uh; up->len = 1; TAILQ_INSERT_TAIL(&uh->head, up, list); goto done; } if (is_bitmap(uh, up)) { ub = up->ptr; if (bit_test(ub->map, i) == 0) { bit_set(ub->map, i); goto done; } else return (-1); } else if (up->ptr == uh) return (-1); KASSERT(up->ptr == NULL, ("alloc_unr_specificl: up->ptr != NULL (up=%p)", up)); /* Split off the tail end, if any. */ tl = up->len - (1 + i); if (tl > 0) { upn = new_unr(uh, p1, p2); upn->ptr = NULL; upn->len = tl; TAILQ_INSERT_AFTER(&uh->head, up, upn, list); } /* Split off head end, if any */ if (i > 0) { upn = new_unr(uh, p1, p2); upn->len = i; upn->ptr = NULL; TAILQ_INSERT_BEFORE(up, upn, list); } up->len = 1; up->ptr = uh; done: last = uh->high - uh->low - (item - uh->low); if (uh->last > last) uh->last = last; uh->busy++; collapse_unr(uh, up); check_unrhdr(uh, __LINE__); return (item); } int alloc_unr_specific(struct unrhdr *uh, u_int item) { void *p1, *p2; int i; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "alloc_unr_specific"); p1 = Malloc(sizeof(struct unr)); p2 = Malloc(sizeof(struct unr)); mtx_lock(uh->mtx); i = alloc_unr_specificl(uh, item, &p1, &p2); mtx_unlock(uh->mtx); if (p1 != NULL) Free(p1); if (p2 != NULL) Free(p2); return (i); } /* * Free a unr. * * If we can save unrs by using a bitmap, do so. */ static void free_unrl(struct unrhdr *uh, u_int item, void **p1, void **p2) { struct unr *up, *upp, *upn; struct unrb *ub; u_int pl; KASSERT(item >= uh->low && item <= uh->high, ("UNR: free_unr(%u) out of range [%u...%u]", item, uh->low, uh->high)); check_unrhdr(uh, __LINE__); item -= uh->low; upp = TAILQ_FIRST(&uh->head); /* * Freeing in the ideal split case */ if (item + 1 == uh->first && upp == NULL) { uh->last++; uh->first--; uh->busy--; check_unrhdr(uh, __LINE__); return; } /* * Freeing in the ->first section. Create a run starting at the * freed item. The code below will subdivide it. */ if (item < uh->first) { up = new_unr(uh, p1, p2); up->ptr = uh; up->len = uh->first - item; TAILQ_INSERT_HEAD(&uh->head, up, list); uh->first -= up->len; } item -= uh->first; /* Find the item which contains the unit we want to free */ TAILQ_FOREACH(up, &uh->head, list) { if (up->len > item) break; item -= up->len; } /* Handle bitmap items */ if (is_bitmap(uh, up)) { ub = up->ptr; KASSERT(bit_test(ub->map, item) != 0, ("UNR: Freeing free item %d (bitmap)\n", item)); bit_clear(ub->map, item); uh->busy--; collapse_unr(uh, up); return; } KASSERT(up->ptr == uh, ("UNR Freeing free item %d (run))\n", item)); /* Just this one left, reap it */ if (up->len == 1) { up->ptr = NULL; uh->busy--; collapse_unr(uh, up); return; } /* Check if we can shift the item into the previous 'free' run */ upp = TAILQ_PREV(up, unrhd, list); if (item == 0 && upp != NULL && upp->ptr == NULL) { upp->len++; up->len--; uh->busy--; collapse_unr(uh, up); return; } /* Check if we can shift the item to the next 'free' run */ upn = TAILQ_NEXT(up, list); if (item == up->len - 1 && upn != NULL && upn->ptr == NULL) { upn->len++; up->len--; uh->busy--; collapse_unr(uh, up); return; } /* Split off the tail end, if any. 
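 * (For example, freeing the unit at offset 2 of a 7-unit allocated run
 * splits off a 4-unit allocated tail here and a 2-unit allocated head
 * below, leaving 'up' as the single, now free, unit in between.)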
*/ pl = up->len - (1 + item); if (pl > 0) { upp = new_unr(uh, p1, p2); upp->ptr = uh; upp->len = pl; TAILQ_INSERT_AFTER(&uh->head, up, upp, list); } /* Split off head end, if any */ if (item > 0) { upp = new_unr(uh, p1, p2); upp->len = item; upp->ptr = uh; TAILQ_INSERT_BEFORE(up, upp, list); } up->len = 1; up->ptr = NULL; uh->busy--; collapse_unr(uh, up); } void free_unr(struct unrhdr *uh, u_int item) { void *p1, *p2; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "free_unr"); p1 = Malloc(sizeof(struct unr)); p2 = Malloc(sizeof(struct unr)); mtx_lock(uh->mtx); free_unrl(uh, item, &p1, &p2); clean_unrhdrl(uh); mtx_unlock(uh->mtx); if (p1 != NULL) Free(p1); if (p2 != NULL) Free(p2); } #ifndef _KERNEL /* USERLAND test driver */ /* * Simple stochastic test driver for the above functions. The code resides * here so that it can access static functions and structures. */ static bool verbose; #define VPRINTF(...) {if (verbose) printf(__VA_ARGS__);} static void print_unr(struct unrhdr *uh, struct unr *up) { u_int x; struct unrb *ub; printf(" %p len = %5u ", up, up->len); if (up->ptr == NULL) printf("free\n"); else if (up->ptr == uh) printf("alloc\n"); else { ub = up->ptr; printf("bitmap ["); for (x = 0; x < up->len; x++) { if (bit_test(ub->map, x)) printf("#"); else printf(" "); } printf("]\n"); } } static void print_unrhdr(struct unrhdr *uh) { struct unr *up; u_int x; printf( "%p low = %u high = %u first = %u last = %u busy %u chunks = %u\n", uh, uh->low, uh->high, uh->first, uh->last, uh->busy, uh->alloc); x = uh->low + uh->first; TAILQ_FOREACH(up, &uh->head, list) { printf(" from = %5u", x); print_unr(uh, up); if (up->ptr == NULL || up->ptr == uh) x += up->len; else x += NBITS; } } static void test_alloc_unr(struct unrhdr *uh, u_int i, char a[]) { int j; if (a[i]) { VPRINTF("F %u\n", i); free_unr(uh, i); a[i] = 0; } else { no_alloc = 1; j = alloc_unr(uh); if (j != -1) { a[j] = 1; VPRINTF("A %d\n", j); } no_alloc = 0; } } static void test_alloc_unr_specific(struct unrhdr *uh, u_int i, char a[]) { int j; j = alloc_unr_specific(uh, i); if (j == -1) { VPRINTF("F %u\n", i); a[i] = 0; free_unr(uh, i); } else { a[i] = 1; VPRINTF("A %d\n", j); } } static void usage(char** argv) { printf("%s [-h] [-r REPETITIONS] [-v]\n", argv[0]); } int main(int argc, char **argv) { struct unrhdr *uh; char *a; long count = 10000; /* Number of unrs to test */ long reps = 1, m; int ch; u_int i, j; verbose = false; while ((ch = getopt(argc, argv, "hr:v")) != -1) { switch (ch) { case 'r': errno = 0; reps = strtol(optarg, NULL, 0); if (errno == ERANGE || errno == EINVAL) { usage(argv); exit(2); } break; case 'v': verbose = true; break; case 'h': default: usage(argv); exit(2); } } setbuf(stdout, NULL); uh = new_unrhdr(0, count - 1, NULL); print_unrhdr(uh); a = calloc(count, sizeof(char)); if (a == NULL) err(1, "calloc failed"); srandomdev(); printf("sizeof(struct unr) %zu\n", sizeof(struct unr)); printf("sizeof(struct unrb) %zu\n", sizeof(struct unrb)); printf("sizeof(struct unrhdr) %zu\n", sizeof(struct unrhdr)); printf("NBITS %lu\n", (unsigned long)NBITS); for (m = 0; m < count * reps; m++) { j = random(); i = (j >> 1) % count; #if 0 if (a[i] && (j & 1)) continue; #endif if ((random() & 1) != 0) test_alloc_unr(uh, i, a); else test_alloc_unr_specific(uh, i, a); if (verbose) print_unrhdr(uh); check_unrhdr(uh, __LINE__); } for (i = 0; i < (u_int)count; i++) { if (a[i]) { if (verbose) { printf("C %u\n", i); print_unrhdr(uh); } free_unr(uh, i); } } print_unrhdr(uh); delete_unrhdr(uh); free(a); return (0); } #endif Index: 
head/sys/kern/subr_vmem.c =================================================================== --- head/sys/kern/subr_vmem.c (revision 326270) +++ head/sys/kern/subr_vmem.c (revision 326271) @@ -1,1586 +1,1588 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c)2006,2007,2008,2009 YAMAMOTO Takashi, * Copyright (c) 2013 EMC Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * From: * $NetBSD: vmem_impl.h,v 1.2 2013/01/29 21:26:24 para Exp $ * $NetBSD: subr_vmem.c,v 1.83 2013/03/06 11:20:10 yamt Exp $ */ /* * reference: * - Magazines and Vmem: Extending the Slab Allocator * to Many CPUs and Arbitrary Resources * http://www.usenix.org/event/usenix01/bonwick.html */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #define VMEM_OPTORDER 5 #define VMEM_OPTVALUE (1 << VMEM_OPTORDER) #define VMEM_MAXORDER \ (VMEM_OPTVALUE - 1 + sizeof(vmem_size_t) * NBBY - VMEM_OPTORDER) #define VMEM_HASHSIZE_MIN 16 #define VMEM_HASHSIZE_MAX 131072 #define VMEM_QCACHE_IDX_MAX 16 #define VMEM_FITMASK (M_BESTFIT | M_FIRSTFIT) #define VMEM_FLAGS \ (M_NOWAIT | M_WAITOK | M_USE_RESERVE | M_NOVM | M_BESTFIT | M_FIRSTFIT) #define BT_FLAGS (M_NOWAIT | M_WAITOK | M_USE_RESERVE | M_NOVM) #define QC_NAME_MAX 16 /* * Data structures private to vmem. 
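 */

/*
 * (Before the internals, a hypothetical usage-level sketch of the arena API
 * implemented later in this file; the "example" names are invented.)
 */
#if 0	/* illustrative sketch only */
static vmem_t *example_arena;

static void
example_arena_init(vmem_addr_t base, vmem_size_t size)
{

	/* quantum = PAGE_SIZE; 0 disables the per-CPU quantum cache. */
	example_arena = vmem_create("example", base, size, PAGE_SIZE, 0,
	    M_WAITOK);
}

static int
example_alloc(vmem_size_t size, vmem_addr_t *addrp)
{

	/* No alignment, phase, boundary or address-range constraints. */
	return (vmem_xalloc(example_arena, size, 0, 0, 0, VMEM_ADDR_MIN,
	    VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, addrp));
}

static void
example_free(vmem_addr_t addr, vmem_size_t size)
{

	vmem_xfree(example_arena, addr, size);
}
#endif

/*
 * Data structures private to vmem.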
*/ MALLOC_DEFINE(M_VMEM, "vmem", "vmem internal structures"); typedef struct vmem_btag bt_t; TAILQ_HEAD(vmem_seglist, vmem_btag); LIST_HEAD(vmem_freelist, vmem_btag); LIST_HEAD(vmem_hashlist, vmem_btag); struct qcache { uma_zone_t qc_cache; vmem_t *qc_vmem; vmem_size_t qc_size; char qc_name[QC_NAME_MAX]; }; typedef struct qcache qcache_t; #define QC_POOL_TO_QCACHE(pool) ((qcache_t *)(pool->pr_qcache)) #define VMEM_NAME_MAX 16 /* vmem arena */ struct vmem { struct mtx_padalign vm_lock; struct cv vm_cv; char vm_name[VMEM_NAME_MAX+1]; LIST_ENTRY(vmem) vm_alllist; struct vmem_hashlist vm_hash0[VMEM_HASHSIZE_MIN]; struct vmem_freelist vm_freelist[VMEM_MAXORDER]; struct vmem_seglist vm_seglist; struct vmem_hashlist *vm_hashlist; vmem_size_t vm_hashsize; /* Constant after init */ vmem_size_t vm_qcache_max; vmem_size_t vm_quantum_mask; vmem_size_t vm_import_quantum; int vm_quantum_shift; /* Written on alloc/free */ LIST_HEAD(, vmem_btag) vm_freetags; int vm_nfreetags; int vm_nbusytag; vmem_size_t vm_inuse; vmem_size_t vm_size; /* Used on import. */ vmem_import_t *vm_importfn; vmem_release_t *vm_releasefn; void *vm_arg; /* Space exhaustion callback. */ vmem_reclaim_t *vm_reclaimfn; /* quantum cache */ qcache_t vm_qcache[VMEM_QCACHE_IDX_MAX]; }; /* boundary tag */ struct vmem_btag { TAILQ_ENTRY(vmem_btag) bt_seglist; union { LIST_ENTRY(vmem_btag) u_freelist; /* BT_TYPE_FREE */ LIST_ENTRY(vmem_btag) u_hashlist; /* BT_TYPE_BUSY */ } bt_u; #define bt_hashlist bt_u.u_hashlist #define bt_freelist bt_u.u_freelist vmem_addr_t bt_start; vmem_size_t bt_size; int bt_type; }; #define BT_TYPE_SPAN 1 /* Allocated from importfn */ #define BT_TYPE_SPAN_STATIC 2 /* vmem_add() or create. */ #define BT_TYPE_FREE 3 /* Available space. */ #define BT_TYPE_BUSY 4 /* Used space. */ #define BT_ISSPAN_P(bt) ((bt)->bt_type <= BT_TYPE_SPAN_STATIC) #define BT_END(bt) ((bt)->bt_start + (bt)->bt_size - 1) #if defined(DIAGNOSTIC) static int enable_vmem_check = 1; SYSCTL_INT(_debug, OID_AUTO, vmem_check, CTLFLAG_RWTUN, &enable_vmem_check, 0, "Enable vmem check"); static void vmem_check(vmem_t *); #endif static struct callout vmem_periodic_ch; static int vmem_periodic_interval; static struct task vmem_periodic_wk; static struct mtx_padalign __exclusive_cache_line vmem_list_lock; static LIST_HEAD(, vmem) vmem_list = LIST_HEAD_INITIALIZER(vmem_list); /* ---- misc */ #define VMEM_CONDVAR_INIT(vm, wchan) cv_init(&vm->vm_cv, wchan) #define VMEM_CONDVAR_DESTROY(vm) cv_destroy(&vm->vm_cv) #define VMEM_CONDVAR_WAIT(vm) cv_wait(&vm->vm_cv, &vm->vm_lock) #define VMEM_CONDVAR_BROADCAST(vm) cv_broadcast(&vm->vm_cv) #define VMEM_LOCK(vm) mtx_lock(&vm->vm_lock) #define VMEM_TRYLOCK(vm) mtx_trylock(&vm->vm_lock) #define VMEM_UNLOCK(vm) mtx_unlock(&vm->vm_lock) #define VMEM_LOCK_INIT(vm, name) mtx_init(&vm->vm_lock, (name), NULL, MTX_DEF) #define VMEM_LOCK_DESTROY(vm) mtx_destroy(&vm->vm_lock) #define VMEM_ASSERT_LOCKED(vm) mtx_assert(&vm->vm_lock, MA_OWNED); #define VMEM_ALIGNUP(addr, align) (-(-(addr) & -(align))) #define VMEM_CROSS_P(addr1, addr2, boundary) \ ((((addr1) ^ (addr2)) & -(boundary)) != 0) #define ORDER2SIZE(order) ((order) < VMEM_OPTVALUE ? ((order) + 1) : \ (vmem_size_t)1 << ((order) - (VMEM_OPTVALUE - VMEM_OPTORDER - 1))) #define SIZE2ORDER(size) ((size) <= VMEM_OPTVALUE ? ((size) - 1) : \ (flsl(size) + (VMEM_OPTVALUE - VMEM_OPTORDER - 2))) /* * Maximum number of boundary tags that may be required to satisfy an * allocation. Two may be required to import. Another two may be * required to clip edges. 
*/ #define BT_MAXALLOC 4 /* * Max free limits the number of locally cached boundary tags. We * just want to avoid hitting the zone allocator for every call. */ #define BT_MAXFREE (BT_MAXALLOC * 8) /* Allocator for boundary tags. */ static uma_zone_t vmem_bt_zone; /* boot time arena storage. */ static struct vmem kernel_arena_storage; static struct vmem kmem_arena_storage; static struct vmem buffer_arena_storage; static struct vmem transient_arena_storage; vmem_t *kernel_arena = &kernel_arena_storage; vmem_t *kmem_arena = &kmem_arena_storage; vmem_t *buffer_arena = &buffer_arena_storage; vmem_t *transient_arena = &transient_arena_storage; #ifdef DEBUG_MEMGUARD static struct vmem memguard_arena_storage; vmem_t *memguard_arena = &memguard_arena_storage; #endif /* * Fill the vmem's boundary tag cache. We guarantee that boundary tag * allocation will not fail once bt_fill() passes. To do so we cache * at least the maximum possible tag allocations in the arena. */ static int bt_fill(vmem_t *vm, int flags) { bt_t *bt; VMEM_ASSERT_LOCKED(vm); /* * Only allow the kmem arena to dip into reserve tags. It is the * vmem where new tags come from. */ flags &= BT_FLAGS; if (vm != kmem_arena) flags &= ~M_USE_RESERVE; /* * Loop until we meet the reserve. To minimize the lock shuffle * and prevent simultaneous fills we first try a NOWAIT regardless * of the caller's flags. Specify M_NOVM so we don't recurse while * holding a vmem lock. */ while (vm->vm_nfreetags < BT_MAXALLOC) { bt = uma_zalloc(vmem_bt_zone, (flags & M_USE_RESERVE) | M_NOWAIT | M_NOVM); if (bt == NULL) { VMEM_UNLOCK(vm); bt = uma_zalloc(vmem_bt_zone, flags); VMEM_LOCK(vm); if (bt == NULL && (flags & M_NOWAIT) != 0) break; } LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist); vm->vm_nfreetags++; } if (vm->vm_nfreetags < BT_MAXALLOC) return ENOMEM; return 0; } /* * Pop a tag off of the freetag stack. */ static bt_t * bt_alloc(vmem_t *vm) { bt_t *bt; VMEM_ASSERT_LOCKED(vm); bt = LIST_FIRST(&vm->vm_freetags); MPASS(bt != NULL); LIST_REMOVE(bt, bt_freelist); vm->vm_nfreetags--; return bt; } /* * Trim the per-vmem free list. Returns with the lock released to * avoid allocator recursions. */ static void bt_freetrim(vmem_t *vm, int freelimit) { LIST_HEAD(, vmem_btag) freetags; bt_t *bt; LIST_INIT(&freetags); VMEM_ASSERT_LOCKED(vm); while (vm->vm_nfreetags > freelimit) { bt = LIST_FIRST(&vm->vm_freetags); LIST_REMOVE(bt, bt_freelist); vm->vm_nfreetags--; LIST_INSERT_HEAD(&freetags, bt, bt_freelist); } VMEM_UNLOCK(vm); while ((bt = LIST_FIRST(&freetags)) != NULL) { LIST_REMOVE(bt, bt_freelist); uma_zfree(vmem_bt_zone, bt); } } static inline void bt_free(vmem_t *vm, bt_t *bt) { VMEM_ASSERT_LOCKED(vm); MPASS(LIST_FIRST(&vm->vm_freetags) != bt); LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist); vm->vm_nfreetags++; } /* * freelist[0] ... [1, 1] * freelist[1] ... [2, 2] * : * freelist[29] ... [30, 30] * freelist[30] ... [31, 31] * freelist[31] ... [32, 63] * freelist[33] ... [64, 127] * : * freelist[n] ... [(1 << (n - 26)), (1 << (n - 25)) - 1] * : */ static struct vmem_freelist * bt_freehead_tofree(vmem_t *vm, vmem_size_t size) { const vmem_size_t qsize = size >> vm->vm_quantum_shift; const int idx = SIZE2ORDER(qsize); MPASS(size != 0 && qsize != 0); MPASS((size & vm->vm_quantum_mask) == 0); MPASS(idx >= 0); MPASS(idx < VMEM_MAXORDER); return &vm->vm_freelist[idx]; } /* * bt_freehead_toalloc: return the freelist for the given size and allocation * strategy. 
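 */

/*
 * (A userland model of the SIZE2ORDER() bucketing shown in the freelist
 * table above, using the same OPTORDER/OPTVALUE constants.  Note that the
 * computation places [64, 127] on freelist[32], which is what the general
 * freelist[n] formula gives.)
 */
#if 0	/* illustrative sketch only */
#include <stdio.h>
#include <strings.h>			/* flsl() */

#define	MODEL_OPTORDER	5
#define	MODEL_OPTVALUE	(1 << MODEL_OPTORDER)

/* Same shape as SIZE2ORDER(): one list per size up to 32, log2 beyond. */
static int
model_size2order(long qsize)
{

	return (qsize <= MODEL_OPTVALUE ? (int)(qsize - 1) :
	    flsl(qsize) + (MODEL_OPTVALUE - MODEL_OPTORDER - 2));
}

int
main(void)
{
	long sizes[] = { 1, 31, 32, 33, 63, 64, 127, 128 };
	unsigned i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("qsize %ld -> freelist[%d]\n", sizes[i],
		    model_size2order(sizes[i]));
	/* 1->0, 31->30, 32->31, 33->31, 63->31, 64->32, 127->32, 128->33 */
	return (0);
}
#endif

/*
 * bt_freehead_toalloc, continued: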
* * For M_FIRSTFIT, return the list in which any blocks are large enough * for the requested size. otherwise, return the list which can have blocks * large enough for the requested size. */ static struct vmem_freelist * bt_freehead_toalloc(vmem_t *vm, vmem_size_t size, int strat) { const vmem_size_t qsize = size >> vm->vm_quantum_shift; int idx = SIZE2ORDER(qsize); MPASS(size != 0 && qsize != 0); MPASS((size & vm->vm_quantum_mask) == 0); if (strat == M_FIRSTFIT && ORDER2SIZE(idx) != qsize) { idx++; /* check too large request? */ } MPASS(idx >= 0); MPASS(idx < VMEM_MAXORDER); return &vm->vm_freelist[idx]; } /* ---- boundary tag hash */ static struct vmem_hashlist * bt_hashhead(vmem_t *vm, vmem_addr_t addr) { struct vmem_hashlist *list; unsigned int hash; hash = hash32_buf(&addr, sizeof(addr), 0); list = &vm->vm_hashlist[hash % vm->vm_hashsize]; return list; } static bt_t * bt_lookupbusy(vmem_t *vm, vmem_addr_t addr) { struct vmem_hashlist *list; bt_t *bt; VMEM_ASSERT_LOCKED(vm); list = bt_hashhead(vm, addr); LIST_FOREACH(bt, list, bt_hashlist) { if (bt->bt_start == addr) { break; } } return bt; } static void bt_rembusy(vmem_t *vm, bt_t *bt) { VMEM_ASSERT_LOCKED(vm); MPASS(vm->vm_nbusytag > 0); vm->vm_inuse -= bt->bt_size; vm->vm_nbusytag--; LIST_REMOVE(bt, bt_hashlist); } static void bt_insbusy(vmem_t *vm, bt_t *bt) { struct vmem_hashlist *list; VMEM_ASSERT_LOCKED(vm); MPASS(bt->bt_type == BT_TYPE_BUSY); list = bt_hashhead(vm, bt->bt_start); LIST_INSERT_HEAD(list, bt, bt_hashlist); vm->vm_nbusytag++; vm->vm_inuse += bt->bt_size; } /* ---- boundary tag list */ static void bt_remseg(vmem_t *vm, bt_t *bt) { TAILQ_REMOVE(&vm->vm_seglist, bt, bt_seglist); bt_free(vm, bt); } static void bt_insseg(vmem_t *vm, bt_t *bt, bt_t *prev) { TAILQ_INSERT_AFTER(&vm->vm_seglist, prev, bt, bt_seglist); } static void bt_insseg_tail(vmem_t *vm, bt_t *bt) { TAILQ_INSERT_TAIL(&vm->vm_seglist, bt, bt_seglist); } static void bt_remfree(vmem_t *vm, bt_t *bt) { MPASS(bt->bt_type == BT_TYPE_FREE); LIST_REMOVE(bt, bt_freelist); } static void bt_insfree(vmem_t *vm, bt_t *bt) { struct vmem_freelist *list; list = bt_freehead_tofree(vm, bt->bt_size); LIST_INSERT_HEAD(list, bt, bt_freelist); } /* ---- vmem internal functions */ /* * Import from the arena into the quantum cache in UMA. */ static int qc_import(void *arg, void **store, int cnt, int flags) { qcache_t *qc; vmem_addr_t addr; int i; qc = arg; if ((flags & VMEM_FITMASK) == 0) flags |= M_BESTFIT; for (i = 0; i < cnt; i++) { if (vmem_xalloc(qc->qc_vmem, qc->qc_size, 0, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags, &addr) != 0) break; store[i] = (void *)addr; /* Only guarantee one allocation. */ flags &= ~M_WAITOK; flags |= M_NOWAIT; } return i; } /* * Release memory from the UMA cache to the arena. 
*/ static void qc_release(void *arg, void **store, int cnt) { qcache_t *qc; int i; qc = arg; for (i = 0; i < cnt; i++) vmem_xfree(qc->qc_vmem, (vmem_addr_t)store[i], qc->qc_size); } static void qc_init(vmem_t *vm, vmem_size_t qcache_max) { qcache_t *qc; vmem_size_t size; int qcache_idx_max; int i; MPASS((qcache_max & vm->vm_quantum_mask) == 0); qcache_idx_max = MIN(qcache_max >> vm->vm_quantum_shift, VMEM_QCACHE_IDX_MAX); vm->vm_qcache_max = qcache_idx_max << vm->vm_quantum_shift; for (i = 0; i < qcache_idx_max; i++) { qc = &vm->vm_qcache[i]; size = (i + 1) << vm->vm_quantum_shift; snprintf(qc->qc_name, sizeof(qc->qc_name), "%s-%zu", vm->vm_name, size); qc->qc_vmem = vm; qc->qc_size = size; qc->qc_cache = uma_zcache_create(qc->qc_name, size, NULL, NULL, NULL, NULL, qc_import, qc_release, qc, UMA_ZONE_VM); MPASS(qc->qc_cache); } } static void qc_destroy(vmem_t *vm) { int qcache_idx_max; int i; qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift; for (i = 0; i < qcache_idx_max; i++) uma_zdestroy(vm->vm_qcache[i].qc_cache); } static void qc_drain(vmem_t *vm) { int qcache_idx_max; int i; qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift; for (i = 0; i < qcache_idx_max; i++) zone_drain(vm->vm_qcache[i].qc_cache); } #ifndef UMA_MD_SMALL_ALLOC static struct mtx_padalign __exclusive_cache_line vmem_bt_lock; /* * vmem_bt_alloc: Allocate a new page of boundary tags. * * On architectures with uma_small_alloc there is no recursion; no address * space need be allocated to allocate boundary tags. For the others, we * must handle recursion. Boundary tags are necessary to allocate new * boundary tags. * * UMA guarantees that enough tags are held in reserve to allocate a new * page of kva. We dip into this reserve by specifying M_USE_RESERVE only * when allocating the page to hold new boundary tags. In this way the * reserve is automatically filled by the allocation that uses the reserve. * * We still have to guarantee that the new tags are allocated atomically since * many threads may try concurrently. The bt_lock provides this guarantee. * We convert WAITOK allocations to NOWAIT and then handle the blocking here * on failure. It's ok to return NULL for a WAITOK allocation as UMA will * loop again after checking to see if we lost the race to allocate. * * There is a small race between vmem_bt_alloc() returning the page and the * zone lock being acquired to add the page to the zone. For WAITOK * allocations we just pause briefly. NOWAIT may experience a transient * failure. To alleviate this we permit a small number of simultaneous * fills to proceed concurrently so NOWAIT is less likely to fail unless * we are really out of KVA. */ static void * vmem_bt_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait) { vmem_addr_t addr; *pflag = UMA_SLAB_KMEM; /* * Single thread boundary tag allocation so that the address space * and memory are added in one atomic operation. */ mtx_lock(&vmem_bt_lock); if (vmem_xalloc(kmem_arena, bytes, 0, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, M_NOWAIT | M_NOVM | M_USE_RESERVE | M_BESTFIT, &addr) == 0) { if (kmem_back(kmem_object, addr, bytes, M_NOWAIT | M_USE_RESERVE) == 0) { mtx_unlock(&vmem_bt_lock); return ((void *)addr); } vmem_xfree(kmem_arena, addr, bytes); mtx_unlock(&vmem_bt_lock); /* * Out of memory, not address space. This may not even be * possible due to M_USE_RESERVE page allocation. */ if (wait & M_WAITOK) VM_WAIT; return (NULL); } mtx_unlock(&vmem_bt_lock); /* * We're either out of address space or lost a fill race. 
*/ if (wait & M_WAITOK) pause("btalloc", 1); return (NULL); } #endif void vmem_startup(void) { mtx_init(&vmem_list_lock, "vmem list lock", NULL, MTX_DEF); vmem_bt_zone = uma_zcreate("vmem btag", sizeof(struct vmem_btag), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM); #ifndef UMA_MD_SMALL_ALLOC mtx_init(&vmem_bt_lock, "btag lock", NULL, MTX_DEF); uma_prealloc(vmem_bt_zone, BT_MAXALLOC); /* * Reserve enough tags to allocate new tags. We allow multiple * CPUs to attempt to allocate new tags concurrently to limit * false restarts in UMA. */ uma_zone_reserve(vmem_bt_zone, BT_MAXALLOC * (mp_ncpus + 1) / 2); uma_zone_set_allocf(vmem_bt_zone, vmem_bt_alloc); #endif } /* ---- rehash */ static int vmem_rehash(vmem_t *vm, vmem_size_t newhashsize) { bt_t *bt; int i; struct vmem_hashlist *newhashlist; struct vmem_hashlist *oldhashlist; vmem_size_t oldhashsize; MPASS(newhashsize > 0); newhashlist = malloc(sizeof(struct vmem_hashlist) * newhashsize, M_VMEM, M_NOWAIT); if (newhashlist == NULL) return ENOMEM; for (i = 0; i < newhashsize; i++) { LIST_INIT(&newhashlist[i]); } VMEM_LOCK(vm); oldhashlist = vm->vm_hashlist; oldhashsize = vm->vm_hashsize; vm->vm_hashlist = newhashlist; vm->vm_hashsize = newhashsize; if (oldhashlist == NULL) { VMEM_UNLOCK(vm); return 0; } for (i = 0; i < oldhashsize; i++) { while ((bt = LIST_FIRST(&oldhashlist[i])) != NULL) { bt_rembusy(vm, bt); bt_insbusy(vm, bt); } } VMEM_UNLOCK(vm); if (oldhashlist != vm->vm_hash0) { free(oldhashlist, M_VMEM); } return 0; } static void vmem_periodic_kick(void *dummy) { taskqueue_enqueue(taskqueue_thread, &vmem_periodic_wk); } static void vmem_periodic(void *unused, int pending) { vmem_t *vm; vmem_size_t desired; vmem_size_t current; mtx_lock(&vmem_list_lock); LIST_FOREACH(vm, &vmem_list, vm_alllist) { #ifdef DIAGNOSTIC /* Convenient time to verify vmem state. */ if (enable_vmem_check == 1) { VMEM_LOCK(vm); vmem_check(vm); VMEM_UNLOCK(vm); } #endif desired = 1 << flsl(vm->vm_nbusytag); desired = MIN(MAX(desired, VMEM_HASHSIZE_MIN), VMEM_HASHSIZE_MAX); current = vm->vm_hashsize; /* Grow in powers of two. Shrink less aggressively. */ if (desired >= current * 2 || desired * 4 <= current) vmem_rehash(vm, desired); /* * Periodically wake up threads waiting for resources, * so they could ask for reclamation again. */ VMEM_CONDVAR_BROADCAST(vm); } mtx_unlock(&vmem_list_lock); callout_reset(&vmem_periodic_ch, vmem_periodic_interval, vmem_periodic_kick, NULL); } static void vmem_start_callout(void *unused) { TASK_INIT(&vmem_periodic_wk, 0, vmem_periodic, NULL); vmem_periodic_interval = hz * 10; callout_init(&vmem_periodic_ch, 1); callout_reset(&vmem_periodic_ch, vmem_periodic_interval, vmem_periodic_kick, NULL); } SYSINIT(vfs, SI_SUB_CONFIGURE, SI_ORDER_ANY, vmem_start_callout, NULL); static void vmem_add1(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, int type) { bt_t *btspan; bt_t *btfree; MPASS(type == BT_TYPE_SPAN || type == BT_TYPE_SPAN_STATIC); MPASS((size & vm->vm_quantum_mask) == 0); btspan = bt_alloc(vm); btspan->bt_type = type; btspan->bt_start = addr; btspan->bt_size = size; bt_insseg_tail(vm, btspan); btfree = bt_alloc(vm); btfree->bt_type = BT_TYPE_FREE; btfree->bt_start = addr; btfree->bt_size = size; bt_insseg(vm, btfree, btspan); bt_insfree(vm, btfree); vm->vm_size += size; } static void vmem_destroy1(vmem_t *vm) { bt_t *bt; /* * Drain per-cpu quantum caches. */ qc_destroy(vm); /* * The vmem should now only contain empty segments. 
*/ VMEM_LOCK(vm); MPASS(vm->vm_nbusytag == 0); while ((bt = TAILQ_FIRST(&vm->vm_seglist)) != NULL) bt_remseg(vm, bt); if (vm->vm_hashlist != NULL && vm->vm_hashlist != vm->vm_hash0) free(vm->vm_hashlist, M_VMEM); bt_freetrim(vm, 0); VMEM_CONDVAR_DESTROY(vm); VMEM_LOCK_DESTROY(vm); free(vm, M_VMEM); } static int vmem_import(vmem_t *vm, vmem_size_t size, vmem_size_t align, int flags) { vmem_addr_t addr; int error; if (vm->vm_importfn == NULL) return EINVAL; /* * To make sure we get a span that meets the alignment we double it * and add the size to the tail. This slightly overestimates. */ if (align != vm->vm_quantum_mask + 1) size = (align * 2) + size; size = roundup(size, vm->vm_import_quantum); /* * Hide MAXALLOC tags so we're guaranteed to be able to add this * span and the tag we want to allocate from it. */ MPASS(vm->vm_nfreetags >= BT_MAXALLOC); vm->vm_nfreetags -= BT_MAXALLOC; VMEM_UNLOCK(vm); error = (vm->vm_importfn)(vm->vm_arg, size, flags, &addr); VMEM_LOCK(vm); vm->vm_nfreetags += BT_MAXALLOC; if (error) return ENOMEM; vmem_add1(vm, addr, size, BT_TYPE_SPAN); return 0; } /* * vmem_fit: check if a bt can satisfy the given restrictions. * * it's a caller's responsibility to ensure the region is big enough * before calling us. */ static int vmem_fit(const bt_t *bt, vmem_size_t size, vmem_size_t align, vmem_size_t phase, vmem_size_t nocross, vmem_addr_t minaddr, vmem_addr_t maxaddr, vmem_addr_t *addrp) { vmem_addr_t start; vmem_addr_t end; MPASS(size > 0); MPASS(bt->bt_size >= size); /* caller's responsibility */ /* * XXX assumption: vmem_addr_t and vmem_size_t are * unsigned integer of the same size. */ start = bt->bt_start; if (start < minaddr) { start = minaddr; } end = BT_END(bt); if (end > maxaddr) end = maxaddr; if (start > end) return (ENOMEM); start = VMEM_ALIGNUP(start - phase, align) + phase; if (start < bt->bt_start) start += align; if (VMEM_CROSS_P(start, start + size - 1, nocross)) { MPASS(align < nocross); start = VMEM_ALIGNUP(start - phase, nocross) + phase; } if (start <= end && end - start >= size - 1) { MPASS((start & (align - 1)) == phase); MPASS(!VMEM_CROSS_P(start, start + size - 1, nocross)); MPASS(minaddr <= start); MPASS(maxaddr == 0 || start + size - 1 <= maxaddr); MPASS(bt->bt_start <= start); MPASS(BT_END(bt) - start >= size - 1); *addrp = start; return (0); } return (ENOMEM); } /* * vmem_clip: Trim the boundary tag edges to the requested start and size. 
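 */

/*
 * (A userland model of the alignment arithmetic vmem_fit() uses above;
 * VMEM_ALIGNUP() rounds up and VMEM_CROSS_P() detects boundary crossings,
 * both assuming power-of-two align/boundary values.)
 */
#if 0	/* illustrative sketch only */
#include <stdio.h>

typedef unsigned long maddr_t;

#define	MODEL_ALIGNUP(addr, align)	(-(-(addr) & -(align)))
#define	MODEL_CROSS_P(a1, a2, b)	((((a1) ^ (a2)) & -(maddr_t)(b)) != 0)

int
main(void)
{
	maddr_t start = 0x1234, align = 0x100, phase = 0x10, a;

	/* Align with a phase offset, exactly as vmem_fit() does. */
	a = MODEL_ALIGNUP(start - phase, align) + phase;
	printf("start %#lx -> %#lx (offset within align = %#lx)\n",
	    start, a, a & (align - 1));		/* 0x1234 -> 0x1310, 0x10 */
	printf("[%#lx, %#lx] crosses 4K: %d\n", a, a + 0x200 - 1,
	    (int)MODEL_CROSS_P(a, a + 0x200 - 1, 0x1000));	/* 0 */
	printf("[%#lx, %#lx] crosses 4K: %d\n", a, a + 0x1000 - 1,
	    (int)MODEL_CROSS_P(a, a + 0x1000 - 1, 0x1000));	/* 1 */
	return (0);
}
#endif

/*
 * vmem_clip: Trim the boundary tag edges to the requested start and size.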
*/ static void vmem_clip(vmem_t *vm, bt_t *bt, vmem_addr_t start, vmem_size_t size) { bt_t *btnew; bt_t *btprev; VMEM_ASSERT_LOCKED(vm); MPASS(bt->bt_type == BT_TYPE_FREE); MPASS(bt->bt_size >= size); bt_remfree(vm, bt); if (bt->bt_start != start) { btprev = bt_alloc(vm); btprev->bt_type = BT_TYPE_FREE; btprev->bt_start = bt->bt_start; btprev->bt_size = start - bt->bt_start; bt->bt_start = start; bt->bt_size -= btprev->bt_size; bt_insfree(vm, btprev); bt_insseg(vm, btprev, TAILQ_PREV(bt, vmem_seglist, bt_seglist)); } MPASS(bt->bt_start == start); if (bt->bt_size != size && bt->bt_size - size > vm->vm_quantum_mask) { /* split */ btnew = bt_alloc(vm); btnew->bt_type = BT_TYPE_BUSY; btnew->bt_start = bt->bt_start; btnew->bt_size = size; bt->bt_start = bt->bt_start + size; bt->bt_size -= size; bt_insfree(vm, bt); bt_insseg(vm, btnew, TAILQ_PREV(bt, vmem_seglist, bt_seglist)); bt_insbusy(vm, btnew); bt = btnew; } else { bt->bt_type = BT_TYPE_BUSY; bt_insbusy(vm, bt); } MPASS(bt->bt_size >= size); bt->bt_type = BT_TYPE_BUSY; } /* ---- vmem API */ void vmem_set_import(vmem_t *vm, vmem_import_t *importfn, vmem_release_t *releasefn, void *arg, vmem_size_t import_quantum) { VMEM_LOCK(vm); vm->vm_importfn = importfn; vm->vm_releasefn = releasefn; vm->vm_arg = arg; vm->vm_import_quantum = import_quantum; VMEM_UNLOCK(vm); } void vmem_set_reclaim(vmem_t *vm, vmem_reclaim_t *reclaimfn) { VMEM_LOCK(vm); vm->vm_reclaimfn = reclaimfn; VMEM_UNLOCK(vm); } /* * vmem_init: Initializes vmem arena. */ vmem_t * vmem_init(vmem_t *vm, const char *name, vmem_addr_t base, vmem_size_t size, vmem_size_t quantum, vmem_size_t qcache_max, int flags) { int i; MPASS(quantum > 0); MPASS((quantum & (quantum - 1)) == 0); bzero(vm, sizeof(*vm)); VMEM_CONDVAR_INIT(vm, name); VMEM_LOCK_INIT(vm, name); vm->vm_nfreetags = 0; LIST_INIT(&vm->vm_freetags); strlcpy(vm->vm_name, name, sizeof(vm->vm_name)); vm->vm_quantum_mask = quantum - 1; vm->vm_quantum_shift = flsl(quantum) - 1; vm->vm_nbusytag = 0; vm->vm_size = 0; vm->vm_inuse = 0; qc_init(vm, qcache_max); TAILQ_INIT(&vm->vm_seglist); for (i = 0; i < VMEM_MAXORDER; i++) { LIST_INIT(&vm->vm_freelist[i]); } memset(&vm->vm_hash0, 0, sizeof(vm->vm_hash0)); vm->vm_hashsize = VMEM_HASHSIZE_MIN; vm->vm_hashlist = vm->vm_hash0; if (size != 0) { if (vmem_add(vm, base, size, flags) != 0) { vmem_destroy1(vm); return NULL; } } mtx_lock(&vmem_list_lock); LIST_INSERT_HEAD(&vmem_list, vm, vm_alllist); mtx_unlock(&vmem_list_lock); return vm; } /* * vmem_create: create an arena. */ vmem_t * vmem_create(const char *name, vmem_addr_t base, vmem_size_t size, vmem_size_t quantum, vmem_size_t qcache_max, int flags) { vmem_t *vm; vm = malloc(sizeof(*vm), M_VMEM, flags & (M_WAITOK|M_NOWAIT)); if (vm == NULL) return (NULL); if (vmem_init(vm, name, base, size, quantum, qcache_max, flags) == NULL) return (NULL); return (vm); } void vmem_destroy(vmem_t *vm) { mtx_lock(&vmem_list_lock); LIST_REMOVE(vm, vm_alllist); mtx_unlock(&vmem_list_lock); vmem_destroy1(vm); } vmem_size_t vmem_roundup_size(vmem_t *vm, vmem_size_t size) { return (size + vm->vm_quantum_mask) & ~vm->vm_quantum_mask; } /* * vmem_alloc: allocate resource from the arena. 
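* A minimal usage sketch (illustrative only; "foo_arena", the base address * and the sizes are hypothetical, not part of this file): * *	vmem_t *foo_arena; *	vmem_addr_t addr; * *	foo_arena = vmem_create("foo", 0x1000, 0x100000, PAGE_SIZE, *	    0, M_WAITOK); *	if (vmem_alloc(foo_arena, 2 * PAGE_SIZE, *	    M_BESTFIT | M_WAITOK, &addr) == 0) *		vmem_free(foo_arena, addr, 2 * PAGE_SIZE);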
*/ int vmem_alloc(vmem_t *vm, vmem_size_t size, int flags, vmem_addr_t *addrp) { const int strat __unused = flags & VMEM_FITMASK; qcache_t *qc; flags &= VMEM_FLAGS; MPASS(size > 0); MPASS(strat == M_BESTFIT || strat == M_FIRSTFIT); if ((flags & M_NOWAIT) == 0) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmem_alloc"); if (size <= vm->vm_qcache_max) { qc = &vm->vm_qcache[(size - 1) >> vm->vm_quantum_shift]; *addrp = (vmem_addr_t)uma_zalloc(qc->qc_cache, flags); if (*addrp == 0) return (ENOMEM); return (0); } return vmem_xalloc(vm, size, 0, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags, addrp); } int vmem_xalloc(vmem_t *vm, const vmem_size_t size0, vmem_size_t align, const vmem_size_t phase, const vmem_size_t nocross, const vmem_addr_t minaddr, const vmem_addr_t maxaddr, int flags, vmem_addr_t *addrp) { const vmem_size_t size = vmem_roundup_size(vm, size0); struct vmem_freelist *list; struct vmem_freelist *first; struct vmem_freelist *end; vmem_size_t avail; bt_t *bt; int error; int strat; flags &= VMEM_FLAGS; strat = flags & VMEM_FITMASK; MPASS(size0 > 0); MPASS(size > 0); MPASS(strat == M_BESTFIT || strat == M_FIRSTFIT); MPASS((flags & (M_NOWAIT|M_WAITOK)) != (M_NOWAIT|M_WAITOK)); if ((flags & M_NOWAIT) == 0) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmem_xalloc"); MPASS((align & vm->vm_quantum_mask) == 0); MPASS((align & (align - 1)) == 0); MPASS((phase & vm->vm_quantum_mask) == 0); MPASS((nocross & vm->vm_quantum_mask) == 0); MPASS((nocross & (nocross - 1)) == 0); MPASS((align == 0 && phase == 0) || phase < align); MPASS(nocross == 0 || nocross >= size); MPASS(minaddr <= maxaddr); MPASS(!VMEM_CROSS_P(phase, phase + size - 1, nocross)); if (align == 0) align = vm->vm_quantum_mask + 1; *addrp = 0; end = &vm->vm_freelist[VMEM_MAXORDER]; /* * choose a free block from which we allocate. */ first = bt_freehead_toalloc(vm, size, strat); VMEM_LOCK(vm); for (;;) { /* * Make sure we have enough tags to complete the * operation. */ if (vm->vm_nfreetags < BT_MAXALLOC && bt_fill(vm, flags) != 0) { error = ENOMEM; break; } /* * Scan freelists looking for a tag that satisfies the * allocation. If we're doing BESTFIT we may encounter * sizes below the request. If we're doing FIRSTFIT we * inspect only the first element from each list. */ for (list = first; list < end; list++) { LIST_FOREACH(bt, list, bt_freelist) { if (bt->bt_size >= size) { error = vmem_fit(bt, size, align, phase, nocross, minaddr, maxaddr, addrp); if (error == 0) { vmem_clip(vm, bt, *addrp, size); goto out; } } /* FIRST skips to the next list. */ if (strat == M_FIRSTFIT) break; } } /* * Retry if the fast algorithm failed. */ if (strat == M_FIRSTFIT) { strat = M_BESTFIT; first = bt_freehead_toalloc(vm, size, strat); continue; } /* * XXX it is possible to fail to meet restrictions with the * imported region. It is up to the user to specify the * import quantum such that it can satisfy any allocation. */ if (vmem_import(vm, size, align, flags) == 0) continue; /* * Try to free some space from the quantum cache or reclaim * functions if available. */ if (vm->vm_qcache_max != 0 || vm->vm_reclaimfn != NULL) { avail = vm->vm_size - vm->vm_inuse; VMEM_UNLOCK(vm); if (vm->vm_qcache_max != 0) qc_drain(vm); if (vm->vm_reclaimfn != NULL) vm->vm_reclaimfn(vm, flags); VMEM_LOCK(vm); /* If we were successful retry even NOWAIT. 
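* ("Successful" means vm_size - vm_inuse grew while the lock was dropped, * i.e. qc_drain() or the reclaim callback returned space to the arena.)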
*/ if (vm->vm_size - vm->vm_inuse > avail) continue; } if ((flags & M_NOWAIT) != 0) { error = ENOMEM; break; } VMEM_CONDVAR_WAIT(vm); } out: VMEM_UNLOCK(vm); if (error != 0 && (flags & M_NOWAIT) == 0) panic("failed to allocate waiting allocation\n"); return (error); } /* * vmem_free: free the resource to the arena. */ void vmem_free(vmem_t *vm, vmem_addr_t addr, vmem_size_t size) { qcache_t *qc; MPASS(size > 0); if (size <= vm->vm_qcache_max) { qc = &vm->vm_qcache[(size - 1) >> vm->vm_quantum_shift]; uma_zfree(qc->qc_cache, (void *)addr); } else vmem_xfree(vm, addr, size); } void vmem_xfree(vmem_t *vm, vmem_addr_t addr, vmem_size_t size) { bt_t *bt; bt_t *t; MPASS(size > 0); VMEM_LOCK(vm); bt = bt_lookupbusy(vm, addr); MPASS(bt != NULL); MPASS(bt->bt_start == addr); MPASS(bt->bt_size == vmem_roundup_size(vm, size) || bt->bt_size - vmem_roundup_size(vm, size) <= vm->vm_quantum_mask); MPASS(bt->bt_type == BT_TYPE_BUSY); bt_rembusy(vm, bt); bt->bt_type = BT_TYPE_FREE; /* coalesce */ t = TAILQ_NEXT(bt, bt_seglist); if (t != NULL && t->bt_type == BT_TYPE_FREE) { MPASS(BT_END(bt) < t->bt_start); /* YYY */ bt->bt_size += t->bt_size; bt_remfree(vm, t); bt_remseg(vm, t); } t = TAILQ_PREV(bt, vmem_seglist, bt_seglist); if (t != NULL && t->bt_type == BT_TYPE_FREE) { MPASS(BT_END(t) < bt->bt_start); /* YYY */ bt->bt_size += t->bt_size; bt->bt_start = t->bt_start; bt_remfree(vm, t); bt_remseg(vm, t); } t = TAILQ_PREV(bt, vmem_seglist, bt_seglist); MPASS(t != NULL); MPASS(BT_ISSPAN_P(t) || t->bt_type == BT_TYPE_BUSY); if (vm->vm_releasefn != NULL && t->bt_type == BT_TYPE_SPAN && t->bt_size == bt->bt_size) { vmem_addr_t spanaddr; vmem_size_t spansize; MPASS(t->bt_start == bt->bt_start); spanaddr = bt->bt_start; spansize = bt->bt_size; bt_remseg(vm, bt); bt_remseg(vm, t); vm->vm_size -= spansize; VMEM_CONDVAR_BROADCAST(vm); bt_freetrim(vm, BT_MAXFREE); (*vm->vm_releasefn)(vm->vm_arg, spanaddr, spansize); } else { bt_insfree(vm, bt); VMEM_CONDVAR_BROADCAST(vm); bt_freetrim(vm, BT_MAXFREE); } } /* * vmem_add: add a static span to the arena. */ int vmem_add(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, int flags) { int error; error = 0; flags &= VMEM_FLAGS; VMEM_LOCK(vm); if (vm->vm_nfreetags >= BT_MAXALLOC || bt_fill(vm, flags) == 0) vmem_add1(vm, addr, size, BT_TYPE_SPAN_STATIC); else error = ENOMEM; VMEM_UNLOCK(vm); return (error); } /* * vmem_size: report information about the arena's size. */ vmem_size_t vmem_size(vmem_t *vm, int typemask) { int i; switch (typemask) { case VMEM_ALLOC: return vm->vm_inuse; case VMEM_FREE: return vm->vm_size - vm->vm_inuse; case VMEM_FREE|VMEM_ALLOC: return vm->vm_size; case VMEM_MAXFREE: VMEM_LOCK(vm); for (i = VMEM_MAXORDER - 1; i >= 0; i--) { if (LIST_EMPTY(&vm->vm_freelist[i])) continue; VMEM_UNLOCK(vm); return ((vmem_size_t)ORDER2SIZE(i) << vm->vm_quantum_shift); } VMEM_UNLOCK(vm); return (0); default: panic("vmem_size"); } } /* ---- debug */ #if defined(DDB) || defined(DIAGNOSTIC) static void bt_dump(const bt_t *, int (*)(const char *, ...) __printflike(1, 2)); static const char * bt_type_string(int type) { switch (type) { case BT_TYPE_BUSY: return "busy"; case BT_TYPE_FREE: return "free"; case BT_TYPE_SPAN: return "span"; case BT_TYPE_SPAN_STATIC: return "static span"; default: break; } return "BOGUS"; } static void bt_dump(const bt_t *bt, int (*pr)(const char *, ...)) { (*pr)("\t%p: %jx %jx, %d(%s)\n", bt, (intmax_t)bt->bt_start, (intmax_t)bt->bt_size, bt->bt_type, bt_type_string(bt->bt_type)); } static void vmem_dump(const vmem_t *vm, int (*pr)(const char *, ...)
__printflike(1, 2)) { const bt_t *bt; int i; (*pr)("vmem %p '%s'\n", vm, vm->vm_name); TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { bt_dump(bt, pr); } for (i = 0; i < VMEM_MAXORDER; i++) { const struct vmem_freelist *fl = &vm->vm_freelist[i]; if (LIST_EMPTY(fl)) { continue; } (*pr)("freelist[%d]\n", i); LIST_FOREACH(bt, fl, bt_freelist) { bt_dump(bt, pr); } } } #endif /* defined(DDB) || defined(DIAGNOSTIC) */ #if defined(DDB) #include <ddb/ddb.h> static bt_t * vmem_whatis_lookup(vmem_t *vm, vmem_addr_t addr) { bt_t *bt; TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { if (BT_ISSPAN_P(bt)) { continue; } if (bt->bt_start <= addr && addr <= BT_END(bt)) { return bt; } } return NULL; } void vmem_whatis(vmem_addr_t addr, int (*pr)(const char *, ...)) { vmem_t *vm; LIST_FOREACH(vm, &vmem_list, vm_alllist) { bt_t *bt; bt = vmem_whatis_lookup(vm, addr); if (bt == NULL) { continue; } (*pr)("%p is %p+%zu in VMEM '%s' (%s)\n", (void *)addr, (void *)bt->bt_start, (vmem_size_t)(addr - bt->bt_start), vm->vm_name, (bt->bt_type == BT_TYPE_BUSY) ? "allocated" : "free"); } } void vmem_printall(const char *modif, int (*pr)(const char *, ...)) { const vmem_t *vm; LIST_FOREACH(vm, &vmem_list, vm_alllist) { vmem_dump(vm, pr); } } void vmem_print(vmem_addr_t addr, const char *modif, int (*pr)(const char *, ...)) { const vmem_t *vm = (const void *)addr; vmem_dump(vm, pr); } DB_SHOW_COMMAND(vmemdump, vmemdump) { if (!have_addr) { db_printf("usage: show vmemdump <addr>\n"); return; } vmem_dump((const vmem_t *)addr, db_printf); } DB_SHOW_ALL_COMMAND(vmemdump, vmemdumpall) { const vmem_t *vm; LIST_FOREACH(vm, &vmem_list, vm_alllist) vmem_dump(vm, db_printf); } DB_SHOW_COMMAND(vmem, vmem_summ) { const vmem_t *vm = (const void *)addr; const bt_t *bt; size_t ft[VMEM_MAXORDER], ut[VMEM_MAXORDER]; size_t fs[VMEM_MAXORDER], us[VMEM_MAXORDER]; int ord; if (!have_addr) { db_printf("usage: show vmem <addr>\n"); return; } db_printf("vmem %p '%s'\n", vm, vm->vm_name); db_printf("\tquantum:\t%zu\n", vm->vm_quantum_mask + 1); db_printf("\tsize:\t%zu\n", vm->vm_size); db_printf("\tinuse:\t%zu\n", vm->vm_inuse); db_printf("\tfree:\t%zu\n", vm->vm_size - vm->vm_inuse); db_printf("\tbusy tags:\t%d\n", vm->vm_nbusytag); db_printf("\tfree tags:\t%d\n", vm->vm_nfreetags); memset(&ft, 0, sizeof(ft)); memset(&ut, 0, sizeof(ut)); memset(&fs, 0, sizeof(fs)); memset(&us, 0, sizeof(us)); TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { ord = SIZE2ORDER(bt->bt_size >> vm->vm_quantum_shift); if (bt->bt_type == BT_TYPE_BUSY) { ut[ord]++; us[ord] += bt->bt_size; } else if (bt->bt_type == BT_TYPE_FREE) { ft[ord]++; fs[ord] += bt->bt_size; } } db_printf("\t\t\tinuse\tsize\t\tfree\tsize\n"); for (ord = 0; ord < VMEM_MAXORDER; ord++) { if (ut[ord] == 0 && ft[ord] == 0) continue; db_printf("\t%-15zu %zu\t%-15zu %zu\t%-16zu\n", ORDER2SIZE(ord) << vm->vm_quantum_shift, ut[ord], us[ord], ft[ord], fs[ord]); } } DB_SHOW_ALL_COMMAND(vmem, vmem_summall) { const vmem_t *vm; LIST_FOREACH(vm, &vmem_list, vm_alllist) vmem_summ((db_expr_t)vm, TRUE, count, modif); } #endif /* defined(DDB) */ #define vmem_printf printf #if defined(DIAGNOSTIC) static bool vmem_check_sanity(vmem_t *vm) { const bt_t *bt, *bt2; MPASS(vm != NULL); TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { if (bt->bt_start > BT_END(bt)) { printf("corrupted tag\n"); bt_dump(bt, vmem_printf); return false; } } TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) { TAILQ_FOREACH(bt2, &vm->vm_seglist, bt_seglist) { if (bt == bt2) { continue; } if (BT_ISSPAN_P(bt) != BT_ISSPAN_P(bt2)) { continue; } if (bt->bt_start <=
BT_END(bt2) && bt2->bt_start <= BT_END(bt)) { printf("overwrapped tags\n"); bt_dump(bt, vmem_printf); bt_dump(bt2, vmem_printf); return false; } } } return true; } static void vmem_check(vmem_t *vm) { if (!vmem_check_sanity(vm)) { panic("insanity vmem %p", vm); } } #endif /* defined(DIAGNOSTIC) */ Index: head/sys/kern/subr_witness.c =================================================================== --- head/sys/kern/subr_witness.c (revision 326270) +++ head/sys/kern/subr_witness.c (revision 326271) @@ -1,3056 +1,3058 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 2008 Isilon Systems, Inc. * Copyright (c) 2008 Ilya Maykov * Copyright (c) 1998 Berkeley Software Design, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ */ /* * Implementation of the `witness' lock verifier. Originally implemented for * mutexes in BSD/OS. Extended to handle generic lock objects and lock * classes in FreeBSD. */ /* * Main Entry: witness * Pronunciation: 'wit-n&s * Function: noun * Etymology: Middle English witnesse, from Old English witnes knowledge, * testimony, witness, from 2wit * Date: before 12th century * 1 : attestation of a fact or event : TESTIMONY * 2 : one that gives evidence; specifically : one who testifies in * a cause or before a judicial tribunal * 3 : one asked to be present at a transaction so as to be able to * testify to its having taken place * 4 : one who has personal knowledge of something * 5 a : something serving as evidence or proof : SIGN * b : public affirmation by word or example of usually * religious faith or conviction * 6 capitalized : a member of the Jehovah's Witnesses */ /* * Special rules concerning Giant and lock orders: * * 1) Giant must be acquired before any other mutexes. Stated another way, * no other mutex may be held when Giant is acquired. * * 2) Giant must be released when blocking on a sleepable lock. * * This rule is less obvious, but is a result of Giant providing the same * semantics as spl(). 
Basically, when a thread sleeps, it must release * Giant. When a thread blocks on a sleepable lock, it sleeps. Hence rule * 2). * * 3) Giant may be acquired before or after sleepable locks. * * This rule is also not quite as obvious. Giant may be acquired after * a sleepable lock because it is a non-sleepable lock and non-sleepable * locks may always be acquired while holding a sleepable lock. The second * case, Giant before a sleepable lock, follows from rule 2) above. Suppose * you have two threads T1 and T2 and a sleepable lock X. Suppose that T1 * acquires X and blocks on Giant. Then suppose that T2 acquires Giant and * blocks on X. When T2 blocks on X, T2 will release Giant allowing T1 to * execute. Thus, acquiring Giant both before and after a sleepable lock * will not result in a lock order reversal. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_stack.h" #include "opt_witness.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #if !defined(DDB) && !defined(STACK) #error "DDB or STACK options are required for WITNESS" #endif /* Note that these traces do not work with KTR_ALQ. */ #if 0 #define KTR_WITNESS KTR_SUBSYS #else #define KTR_WITNESS 0 #endif #define LI_RECURSEMASK 0x0000ffff /* Recursion depth of lock instance. */ #define LI_EXCLUSIVE 0x00010000 /* Exclusive lock instance. */ #define LI_NORELEASE 0x00020000 /* Lock not allowed to be released. */ /* Define this to check for blessed mutexes */ #undef BLESSING #ifndef WITNESS_COUNT #define WITNESS_COUNT 1536 #endif #define WITNESS_HASH_SIZE 251 /* Prime, gives load factor < 2 */ #define WITNESS_PENDLIST (2048 + MAXCPU) /* Allocate 256 KB of stack data space */ #define WITNESS_LO_DATA_COUNT 2048 /* Prime, gives load factor of ~2 at full load */ #define WITNESS_LO_HASH_SIZE 1021 /* * XXX: This is somewhat bogus, as we assume here that at most 2048 threads * will hold LOCK_NCHILDREN locks. We handle failure ok, and we should * probably be safe for the most part, but it's still a SWAG. */ #define LOCK_NCHILDREN 5 #define LOCK_CHILDCOUNT 2048 #define MAX_W_NAME 64 #define FULLGRAPH_SBUF_SIZE 512 /* * These flags go in the witness relationship matrix and describe the * relationship between any two struct witness objects. */ #define WITNESS_UNRELATED 0x00 /* No lock order relation. */ #define WITNESS_PARENT 0x01 /* Parent, aka direct ancestor. */ #define WITNESS_ANCESTOR 0x02 /* Direct or indirect ancestor. */ #define WITNESS_CHILD 0x04 /* Child, aka direct descendant. */ #define WITNESS_DESCENDANT 0x08 /* Direct or indirect descendant. */ #define WITNESS_ANCESTOR_MASK (WITNESS_PARENT | WITNESS_ANCESTOR) #define WITNESS_DESCENDANT_MASK (WITNESS_CHILD | WITNESS_DESCENDANT) #define WITNESS_RELATED_MASK \ (WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK) #define WITNESS_REVERSAL 0x10 /* A lock order reversal has been * observed. */ #define WITNESS_RESERVED1 0x20 /* Unused flag, reserved. */ #define WITNESS_RESERVED2 0x40 /* Unused flag, reserved. */ #define WITNESS_LOCK_ORDER_KNOWN 0x80 /* This lock order is known. 
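* (i.e. an entry for this (from, to) pair has been recorded in w_rmatrix * and a stack trace stored in the lock order hash).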
*/ /* Descendant to ancestor flags */ #define WITNESS_DTOA(x) (((x) & WITNESS_RELATED_MASK) >> 2) /* Ancestor to descendant flags */ #define WITNESS_ATOD(x) (((x) & WITNESS_RELATED_MASK) << 2) #define WITNESS_INDEX_ASSERT(i) \ MPASS((i) > 0 && (i) <= w_max_used_index && (i) < witness_count) static MALLOC_DEFINE(M_WITNESS, "Witness", "Witness"); /* * Lock instances. A lock instance is the data associated with a lock while * it is held by witness. For example, a lock instance will hold the * recursion count of a lock. Lock instances are held in lists. Spin locks * are held in a per-cpu list while sleep locks are held in per-thread list. */ struct lock_instance { struct lock_object *li_lock; const char *li_file; int li_line; u_int li_flags; }; /* * A simple list type used to build the list of locks held by a thread * or CPU. We can't simply embed the list in struct lock_object since a * lock may be held by more than one thread if it is a shared lock. Locks * are added to the head of the list, so we fill up each list entry from * "the back" logically. To ease some of the arithmetic, we actually fill * in each list entry the normal way (children[0] then children[1], etc.) but * when we traverse the list we read children[count-1] as the first entry * down to children[0] as the final entry. */ struct lock_list_entry { struct lock_list_entry *ll_next; struct lock_instance ll_children[LOCK_NCHILDREN]; u_int ll_count; }; /* * The main witness structure. One of these per named lock type in the system * (for example, "vnode interlock"). */ struct witness { char w_name[MAX_W_NAME]; uint32_t w_index; /* Index in the relationship matrix */ struct lock_class *w_class; STAILQ_ENTRY(witness) w_list; /* List of all witnesses. */ STAILQ_ENTRY(witness) w_typelist; /* Witnesses of a type. */ struct witness *w_hash_next; /* Linked list in hash buckets. */ const char *w_file; /* File where last acquired */ uint32_t w_line; /* Line where last acquired */ uint32_t w_refcount; uint16_t w_num_ancestors; /* direct/indirect * ancestor count */ uint16_t w_num_descendants; /* direct/indirect * descendant count */ int16_t w_ddb_level; unsigned w_displayed:1; unsigned w_reversed:1; }; STAILQ_HEAD(witness_list, witness); /* * The witness hash table. Keys are witness names (const char *), elements are * witness objects (struct witness *). */ struct witness_hash { struct witness *wh_array[WITNESS_HASH_SIZE]; uint32_t wh_size; uint32_t wh_count; }; /* * Key type for the lock order data hash table. */ struct witness_lock_order_key { uint16_t from; uint16_t to; }; struct witness_lock_order_data { struct stack wlod_stack; struct witness_lock_order_key wlod_key; struct witness_lock_order_data *wlod_next; }; /* * The witness lock order data hash table. Keys are witness index tuples * (struct witness_lock_order_key), elements are lock order data objects * (struct witness_lock_order_data). */ struct witness_lock_order_hash { struct witness_lock_order_data *wloh_array[WITNESS_LO_HASH_SIZE]; u_int wloh_size; u_int wloh_count; }; #ifdef BLESSING struct witness_blessed { const char *b_lock1; const char *b_lock2; }; #endif struct witness_pendhelp { const char *wh_type; struct lock_object *wh_lock; }; struct witness_order_list_entry { const char *w_name; struct lock_class *w_class; }; /* * Returns 0 if one of the locks is a spin lock and the other is not. * Returns 1 otherwise. 
*/ static __inline int witness_lock_type_equal(struct witness *w1, struct witness *w2) { return ((w1->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) == (w2->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK))); } static __inline int witness_lock_order_key_equal(const struct witness_lock_order_key *a, const struct witness_lock_order_key *b) { return (a->from == b->from && a->to == b->to); } static int _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname); static void adopt(struct witness *parent, struct witness *child); #ifdef BLESSING static int blessed(struct witness *, struct witness *); #endif static void depart(struct witness *w); static struct witness *enroll(const char *description, struct lock_class *lock_class); static struct lock_instance *find_instance(struct lock_list_entry *list, const struct lock_object *lock); static int isitmychild(struct witness *parent, struct witness *child); static int isitmydescendant(struct witness *parent, struct witness *child); static void itismychild(struct witness *parent, struct witness *child); static int sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS); static int sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS); static int sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS); static int sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS); static void witness_add_fullgraph(struct sbuf *sb, struct witness *parent); #ifdef DDB static void witness_ddb_compute_levels(void); static void witness_ddb_display(int(*)(const char *fmt, ...)); static void witness_ddb_display_descendants(int(*)(const char *fmt, ...), struct witness *, int indent); static void witness_ddb_display_list(int(*prnt)(const char *fmt, ...), struct witness_list *list); static void witness_ddb_level_descendants(struct witness *parent, int l); static void witness_ddb_list(struct thread *td); #endif static void witness_debugger(int cond, const char *msg); static void witness_free(struct witness *m); static struct witness *witness_get(void); static uint32_t witness_hash_djb2(const uint8_t *key, uint32_t size); static struct witness *witness_hash_get(const char *key); static void witness_hash_put(struct witness *w); static void witness_init_hash_tables(void); static void witness_increment_graph_generation(void); static void witness_lock_list_free(struct lock_list_entry *lle); static struct lock_list_entry *witness_lock_list_get(void); static int witness_lock_order_add(struct witness *parent, struct witness *child); static int witness_lock_order_check(struct witness *parent, struct witness *child); static struct witness_lock_order_data *witness_lock_order_get( struct witness *parent, struct witness *child); static void witness_list_lock(struct lock_instance *instance, int (*prnt)(const char *fmt, ...)); static int witness_output(const char *fmt, ...) __printflike(1, 2); static int witness_voutput(const char *fmt, va_list ap) __printflike(1, 0); static void witness_setflag(struct lock_object *lock, int flag, int set); static SYSCTL_NODE(_debug, OID_AUTO, witness, CTLFLAG_RW, NULL, "Witness Locking"); /* * If set to 0, lock order checking is disabled. If set to -1, * witness is completely disabled. Otherwise witness performs full * lock order checking for all locks. At runtime, lock order checking * may be toggled. However, witness cannot be reenabled once it is * completely disabled. 
*/ static int witness_watch = 1; SYSCTL_PROC(_debug_witness, OID_AUTO, watch, CTLFLAG_RWTUN | CTLTYPE_INT, NULL, 0, sysctl_debug_witness_watch, "I", "witness is watching lock operations"); #ifdef KDB /* * When KDB is enabled and witness_kdb is 1, it will cause the system * to drop into kdebug() when: * - a lock hierarchy violation occurs * - locks are held when going to sleep. */ #ifdef WITNESS_KDB int witness_kdb = 1; #else int witness_kdb = 0; #endif SYSCTL_INT(_debug_witness, OID_AUTO, kdb, CTLFLAG_RWTUN, &witness_kdb, 0, ""); #endif /* KDB */ #if defined(DDB) || defined(KDB) /* * When DDB or KDB is enabled and witness_trace is 1, it will cause the system * to print a stack trace: * - a lock hierarchy violation occurs * - locks are held when going to sleep. */ int witness_trace = 1; SYSCTL_INT(_debug_witness, OID_AUTO, trace, CTLFLAG_RWTUN, &witness_trace, 0, ""); #endif /* DDB || KDB */ #ifdef WITNESS_SKIPSPIN int witness_skipspin = 1; #else int witness_skipspin = 0; #endif SYSCTL_INT(_debug_witness, OID_AUTO, skipspin, CTLFLAG_RDTUN, &witness_skipspin, 0, ""); int badstack_sbuf_size; int witness_count = WITNESS_COUNT; SYSCTL_INT(_debug_witness, OID_AUTO, witness_count, CTLFLAG_RDTUN, &witness_count, 0, ""); /* * Output channel for witness messages. By default we print to the console. */ enum witness_channel { WITNESS_CONSOLE, WITNESS_LOG, WITNESS_NONE, }; static enum witness_channel witness_channel = WITNESS_CONSOLE; SYSCTL_PROC(_debug_witness, OID_AUTO, output_channel, CTLTYPE_STRING | CTLFLAG_RWTUN, NULL, 0, sysctl_debug_witness_channel, "A", "Output channel for warnings"); /* * Call this to print out the relations between locks. */ SYSCTL_PROC(_debug_witness, OID_AUTO, fullgraph, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_debug_witness_fullgraph, "A", "Show locks relation graphs"); /* * Call this to print out the witness faulty stacks. */ SYSCTL_PROC(_debug_witness, OID_AUTO, badstacks, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_debug_witness_badstacks, "A", "Show bad witness stacks"); static struct mtx w_mtx; /* w_list */ static struct witness_list w_free = STAILQ_HEAD_INITIALIZER(w_free); static struct witness_list w_all = STAILQ_HEAD_INITIALIZER(w_all); /* w_typelist */ static struct witness_list w_spin = STAILQ_HEAD_INITIALIZER(w_spin); static struct witness_list w_sleep = STAILQ_HEAD_INITIALIZER(w_sleep); /* lock list */ static struct lock_list_entry *w_lock_list_free = NULL; static struct witness_pendhelp pending_locks[WITNESS_PENDLIST]; static u_int pending_cnt; static int w_free_cnt, w_spin_cnt, w_sleep_cnt; SYSCTL_INT(_debug_witness, OID_AUTO, free_cnt, CTLFLAG_RD, &w_free_cnt, 0, ""); SYSCTL_INT(_debug_witness, OID_AUTO, spin_cnt, CTLFLAG_RD, &w_spin_cnt, 0, ""); SYSCTL_INT(_debug_witness, OID_AUTO, sleep_cnt, CTLFLAG_RD, &w_sleep_cnt, 0, ""); static struct witness *w_data; static uint8_t **w_rmatrix; static struct lock_list_entry w_locklistdata[LOCK_CHILDCOUNT]; static struct witness_hash w_hash; /* The witness hash table. 
*/ /* The lock order data hash */ static struct witness_lock_order_data w_lodata[WITNESS_LO_DATA_COUNT]; static struct witness_lock_order_data *w_lofree = NULL; static struct witness_lock_order_hash w_lohash; static int w_max_used_index = 0; static unsigned int w_generation = 0; static const char w_notrunning[] = "Witness not running\n"; static const char w_stillcold[] = "Witness is still cold\n"; static struct witness_order_list_entry order_lists[] = { /* * sx locks */ { "proctree", &lock_class_sx }, { "allproc", &lock_class_sx }, { "allprison", &lock_class_sx }, { NULL, NULL }, /* * Various mutexes */ { "Giant", &lock_class_mtx_sleep }, { "pipe mutex", &lock_class_mtx_sleep }, { "sigio lock", &lock_class_mtx_sleep }, { "process group", &lock_class_mtx_sleep }, { "process lock", &lock_class_mtx_sleep }, { "session", &lock_class_mtx_sleep }, { "uidinfo hash", &lock_class_rw }, #ifdef HWPMC_HOOKS { "pmc-sleep", &lock_class_mtx_sleep }, #endif { "time lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * umtx */ { "umtx lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * Sockets */ { "accept", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { "so_rcv", &lock_class_mtx_sleep }, { "sellck", &lock_class_mtx_sleep }, { NULL, NULL }, /* * Routing */ { "so_rcv", &lock_class_mtx_sleep }, { "radix node head", &lock_class_rw }, { "rtentry", &lock_class_mtx_sleep }, { "ifaddr", &lock_class_mtx_sleep }, { NULL, NULL }, /* * IPv4 multicast: * protocol locks before interface locks, after UDP locks. */ { "udpinp", &lock_class_rw }, { "in_multi_mtx", &lock_class_mtx_sleep }, { "igmp_mtx", &lock_class_mtx_sleep }, { "if_addr_lock", &lock_class_rw }, { NULL, NULL }, /* * IPv6 multicast: * protocol locks before interface locks, after UDP locks. */ { "udpinp", &lock_class_rw }, { "in6_multi_mtx", &lock_class_mtx_sleep }, { "mld_mtx", &lock_class_mtx_sleep }, { "if_addr_lock", &lock_class_rw }, { NULL, NULL }, /* * UNIX Domain Sockets */ { "unp_link_rwlock", &lock_class_rw }, { "unp_list_lock", &lock_class_mtx_sleep }, { "unp", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * UDP/IP */ { "udp", &lock_class_rw }, { "udpinp", &lock_class_rw }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * TCP/IP */ { "tcp", &lock_class_rw }, { "tcpinp", &lock_class_rw }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * BPF */ { "bpf global lock", &lock_class_mtx_sleep }, { "bpf interface lock", &lock_class_rw }, { "bpf cdev lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * NFS server */ { "nfsd_mtx", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * IEEE 802.11 */ { "802.11 com lock", &lock_class_mtx_sleep}, { NULL, NULL }, /* * Network drivers */ { "network driver", &lock_class_mtx_sleep}, { NULL, NULL }, /* * Netgraph */ { "ng_node", &lock_class_mtx_sleep }, { "ng_worklist", &lock_class_mtx_sleep }, { NULL, NULL }, /* * CDEV */ { "vm map (system)", &lock_class_mtx_sleep }, { "vm pagequeue", &lock_class_mtx_sleep }, { "vnode interlock", &lock_class_mtx_sleep }, { "cdev", &lock_class_mtx_sleep }, { NULL, NULL }, /* * VM */ { "vm map (user)", &lock_class_sx }, { "vm object", &lock_class_rw }, { "vm page", &lock_class_mtx_sleep }, { "vm pagequeue", &lock_class_mtx_sleep }, { "pmap pv global", &lock_class_rw }, { "pmap", &lock_class_mtx_sleep }, { "pmap pv list", &lock_class_rw }, { "vm page free queue", &lock_class_mtx_sleep }, { NULL, NULL }, /* * kqueue/VFS interaction */ { "kqueue", &lock_class_mtx_sleep }, { "struct mount 
mtx", &lock_class_mtx_sleep }, { "vnode interlock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * VFS namecache */ { "ncvn", &lock_class_mtx_sleep }, { "ncbuc", &lock_class_rw }, { "vnode interlock", &lock_class_mtx_sleep }, { "ncneg", &lock_class_mtx_sleep }, { NULL, NULL }, /* * ZFS locking */ { "dn->dn_mtx", &lock_class_sx }, { "dr->dt.di.dr_mtx", &lock_class_sx }, { "db->db_mtx", &lock_class_sx }, { NULL, NULL }, /* * spin locks */ #ifdef SMP { "ap boot", &lock_class_mtx_spin }, #endif { "rm.mutex_mtx", &lock_class_mtx_spin }, { "sio", &lock_class_mtx_spin }, #ifdef __i386__ { "cy", &lock_class_mtx_spin }, #endif #ifdef __sparc64__ { "pcib_mtx", &lock_class_mtx_spin }, { "rtc_mtx", &lock_class_mtx_spin }, #endif { "scc_hwmtx", &lock_class_mtx_spin }, { "uart_hwmtx", &lock_class_mtx_spin }, { "fast_taskqueue", &lock_class_mtx_spin }, { "intr table", &lock_class_mtx_spin }, #ifdef HWPMC_HOOKS { "pmc-per-proc", &lock_class_mtx_spin }, #endif { "process slock", &lock_class_mtx_spin }, { "syscons video lock", &lock_class_mtx_spin }, { "sleepq chain", &lock_class_mtx_spin }, { "rm_spinlock", &lock_class_mtx_spin }, { "turnstile chain", &lock_class_mtx_spin }, { "turnstile lock", &lock_class_mtx_spin }, { "sched lock", &lock_class_mtx_spin }, { "td_contested", &lock_class_mtx_spin }, { "callout", &lock_class_mtx_spin }, { "entropy harvest mutex", &lock_class_mtx_spin }, #ifdef SMP { "smp rendezvous", &lock_class_mtx_spin }, #endif #ifdef __powerpc__ { "tlb0", &lock_class_mtx_spin }, #endif /* * leaf locks */ { "intrcnt", &lock_class_mtx_spin }, { "icu", &lock_class_mtx_spin }, #if defined(SMP) && defined(__sparc64__) { "ipi", &lock_class_mtx_spin }, #endif #ifdef __i386__ { "allpmaps", &lock_class_mtx_spin }, { "descriptor tables", &lock_class_mtx_spin }, #endif { "clk", &lock_class_mtx_spin }, { "cpuset", &lock_class_mtx_spin }, { "mprof lock", &lock_class_mtx_spin }, { "zombie lock", &lock_class_mtx_spin }, { "ALD Queue", &lock_class_mtx_spin }, #if defined(__i386__) || defined(__amd64__) { "pcicfg", &lock_class_mtx_spin }, { "NDIS thread lock", &lock_class_mtx_spin }, #endif { "tw_osl_io_lock", &lock_class_mtx_spin }, { "tw_osl_q_lock", &lock_class_mtx_spin }, { "tw_cl_io_lock", &lock_class_mtx_spin }, { "tw_cl_intr_lock", &lock_class_mtx_spin }, { "tw_cl_gen_lock", &lock_class_mtx_spin }, #ifdef HWPMC_HOOKS { "pmc-leaf", &lock_class_mtx_spin }, #endif { "blocked lock", &lock_class_mtx_spin }, { NULL, NULL }, { NULL, NULL } }; #ifdef BLESSING /* * Pairs of locks which have been blessed * Don't complain about order problems with blessed locks */ static struct witness_blessed blessed_list[] = { }; #endif /* * This global is set to 0 once it becomes safe to use the witness code. */ static int witness_cold = 1; /* * This global is set to 1 once the static lock orders have been enrolled * so that a warning can be issued for any spin locks enrolled later. */ static int witness_spin_warn = 0; /* Trim useless garbage from filenames. */ static const char * fixup_filename(const char *file) { if (file == NULL) return (NULL); while (strncmp(file, "../", 3) == 0) file += 3; return (file); } /* * The WITNESS-enabled diagnostic code. Note that the witness code does * assume that the early boot is single-threaded at least until after this * routine is completed. 
*/ static void witness_initialize(void *dummy __unused) { struct lock_object *lock; struct witness_order_list_entry *order; struct witness *w, *w1; int i; w_data = malloc(sizeof (struct witness) * witness_count, M_WITNESS, M_WAITOK | M_ZERO); w_rmatrix = malloc(sizeof(*w_rmatrix) * (witness_count + 1), M_WITNESS, M_WAITOK | M_ZERO); for (i = 0; i < witness_count + 1; i++) { w_rmatrix[i] = malloc(sizeof(*w_rmatrix[i]) * (witness_count + 1), M_WITNESS, M_WAITOK | M_ZERO); } badstack_sbuf_size = witness_count * 256; /* * We have to release Giant before initializing its witness * structure so that WITNESS doesn't get confused. */ mtx_unlock(&Giant); mtx_assert(&Giant, MA_NOTOWNED); CTR1(KTR_WITNESS, "%s: initializing witness", __func__); mtx_init(&w_mtx, "witness lock", NULL, MTX_SPIN | MTX_QUIET | MTX_NOWITNESS | MTX_NOPROFILE); for (i = witness_count - 1; i >= 0; i--) { w = &w_data[i]; memset(w, 0, sizeof(*w)); w_data[i].w_index = i; /* Witness index never changes. */ witness_free(w); } KASSERT(STAILQ_FIRST(&w_free)->w_index == 0, ("%s: Invalid list of free witness objects", __func__)); /* Witness with index 0 is not used to aid in debugging. */ STAILQ_REMOVE_HEAD(&w_free, w_list); w_free_cnt--; for (i = 0; i < witness_count; i++) { memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) * (witness_count + 1)); } for (i = 0; i < LOCK_CHILDCOUNT; i++) witness_lock_list_free(&w_locklistdata[i]); witness_init_hash_tables(); /* First add in all the specified order lists. */ for (order = order_lists; order->w_name != NULL; order++) { w = enroll(order->w_name, order->w_class); if (w == NULL) continue; w->w_file = "order list"; for (order++; order->w_name != NULL; order++) { w1 = enroll(order->w_name, order->w_class); if (w1 == NULL) continue; w1->w_file = "order list"; itismychild(w, w1); w = w1; } } witness_spin_warn = 1; /* Iterate through all locks and add them to witness. */ for (i = 0; pending_locks[i].wh_lock != NULL; i++) { lock = pending_locks[i].wh_lock; KASSERT(lock->lo_flags & LO_WITNESS, ("%s: lock %s is on pending list but not LO_WITNESS", __func__, lock->lo_name)); lock->lo_witness = enroll(pending_locks[i].wh_type, LOCK_CLASS(lock)); } /* Mark the witness code as being ready for use. */ witness_cold = 0; mtx_lock(&Giant); } SYSINIT(witness_init, SI_SUB_WITNESS, SI_ORDER_FIRST, witness_initialize, NULL); void witness_init(struct lock_object *lock, const char *type) { struct lock_class *class; /* Various sanity checks. */ class = LOCK_CLASS(lock); if ((lock->lo_flags & LO_RECURSABLE) != 0 && (class->lc_flags & LC_RECURSABLE) == 0) kassert_panic("%s: lock (%s) %s can not be recursable", __func__, class->lc_name, lock->lo_name); if ((lock->lo_flags & LO_SLEEPABLE) != 0 && (class->lc_flags & LC_SLEEPABLE) == 0) kassert_panic("%s: lock (%s) %s can not be sleepable", __func__, class->lc_name, lock->lo_name); if ((lock->lo_flags & LO_UPGRADABLE) != 0 && (class->lc_flags & LC_UPGRADABLE) == 0) kassert_panic("%s: lock (%s) %s can not be upgradable", __func__, class->lc_name, lock->lo_name); /* * If we shouldn't watch this lock, then just clear lo_witness. * Otherwise, if witness_cold is set, then it is too early to * enroll this lock, so defer it to witness_initialize() by adding * it to the pending_locks list. If it is not too early, then enroll * the lock now. 
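* For example (hypothetical driver code), a mutex created before * SI_SUB_WITNESS via mtx_init(&sc->sc_mtx, "examplemtx", NULL, MTX_DEF) * arrives here with witness_cold still set and is parked on pending_locks * until witness_initialize() enrolls it.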
*/ if (witness_watch < 1 || panicstr != NULL || (lock->lo_flags & LO_WITNESS) == 0) lock->lo_witness = NULL; else if (witness_cold) { pending_locks[pending_cnt].wh_lock = lock; pending_locks[pending_cnt++].wh_type = type; if (pending_cnt > WITNESS_PENDLIST) panic("%s: pending locks list is too small, " "increase WITNESS_PENDLIST\n", __func__); } else lock->lo_witness = enroll(type, class); } void witness_destroy(struct lock_object *lock) { struct lock_class *class; struct witness *w; class = LOCK_CLASS(lock); if (witness_cold) panic("lock (%s) %s destroyed while witness_cold", class->lc_name, lock->lo_name); /* XXX: need to verify that no one holds the lock */ if ((lock->lo_flags & LO_WITNESS) == 0 || lock->lo_witness == NULL) return; w = lock->lo_witness; mtx_lock_spin(&w_mtx); MPASS(w->w_refcount > 0); w->w_refcount--; if (w->w_refcount == 0) depart(w); mtx_unlock_spin(&w_mtx); } #ifdef DDB static void witness_ddb_compute_levels(void) { struct witness *w; /* * First clear all levels. */ STAILQ_FOREACH(w, &w_all, w_list) w->w_ddb_level = -1; /* * Look for locks with no parents and level all their descendants. */ STAILQ_FOREACH(w, &w_all, w_list) { /* If the witness has ancestors (is not a root), skip it. */ if (w->w_num_ancestors > 0) continue; witness_ddb_level_descendants(w, 0); } } static void witness_ddb_level_descendants(struct witness *w, int l) { int i; if (w->w_ddb_level >= l) return; w->w_ddb_level = l; l++; for (i = 1; i <= w_max_used_index; i++) { if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) witness_ddb_level_descendants(&w_data[i], l); } } static void witness_ddb_display_descendants(int(*prnt)(const char *fmt, ...), struct witness *w, int indent) { int i; for (i = 0; i < indent; i++) prnt(" "); prnt("%s (type: %s, depth: %d, active refs: %d)", w->w_name, w->w_class->lc_name, w->w_ddb_level, w->w_refcount); if (w->w_displayed) { prnt(" -- (already displayed)\n"); return; } w->w_displayed = 1; if (w->w_file != NULL && w->w_line != 0) prnt(" -- last acquired @ %s:%d\n", fixup_filename(w->w_file), w->w_line); else prnt(" -- never acquired\n"); indent++; WITNESS_INDEX_ASSERT(w->w_index); for (i = 1; i <= w_max_used_index; i++) { if (db_pager_quit) return; if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) witness_ddb_display_descendants(prnt, &w_data[i], indent); } } static void witness_ddb_display_list(int(*prnt)(const char *fmt, ...), struct witness_list *list) { struct witness *w; STAILQ_FOREACH(w, list, w_typelist) { if (w->w_file == NULL || w->w_ddb_level > 0) continue; /* This lock has no ancestors - display its descendants. */ witness_ddb_display_descendants(prnt, w, 0); if (db_pager_quit) return; } } static void witness_ddb_display(int(*prnt)(const char *fmt, ...)) { struct witness *w; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); witness_ddb_compute_levels(); /* Clear all the displayed flags. */ STAILQ_FOREACH(w, &w_all, w_list) w->w_displayed = 0; /* * First, handle sleep locks which have been acquired at least * once. */ prnt("Sleep locks:\n"); witness_ddb_display_list(prnt, &w_sleep); if (db_pager_quit) return; /* * Now do spin locks which have been acquired at least once. */ prnt("\nSpin locks:\n"); witness_ddb_display_list(prnt, &w_spin); if (db_pager_quit) return; /* * Finally, any locks which have not been acquired yet.
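* (a live witness that was never acquired has w_file == NULL and a nonzero * w_refcount; stale entries with w_refcount == 0 are skipped).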
*/ prnt("\nLocks which were never acquired:\n"); STAILQ_FOREACH(w, &w_all, w_list) { if (w->w_file != NULL || w->w_refcount == 0) continue; prnt("%s (type: %s, depth: %d)\n", w->w_name, w->w_class->lc_name, w->w_ddb_level); if (db_pager_quit) return; } } #endif /* DDB */ int witness_defineorder(struct lock_object *lock1, struct lock_object *lock2) { if (witness_watch == -1 || panicstr != NULL) return (0); /* Require locks that witness knows about. */ if (lock1 == NULL || lock1->lo_witness == NULL || lock2 == NULL || lock2->lo_witness == NULL) return (EINVAL); mtx_assert(&w_mtx, MA_NOTOWNED); mtx_lock_spin(&w_mtx); /* * If we already have either an explicit or implied lock order that * is the other way around, then return an error. */ if (witness_watch && isitmydescendant(lock2->lo_witness, lock1->lo_witness)) { mtx_unlock_spin(&w_mtx); return (EDOOFUS); } /* Try to add the new order. */ CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__, lock2->lo_witness->w_name, lock1->lo_witness->w_name); itismychild(lock1->lo_witness, lock2->lo_witness); mtx_unlock_spin(&w_mtx); return (0); } void witness_checkorder(struct lock_object *lock, int flags, const char *file, int line, struct lock_object *interlock) { struct lock_list_entry *lock_list, *lle; struct lock_instance *lock1, *lock2, *plock; struct lock_class *class, *iclass; struct witness *w, *w1; struct thread *td; int i, j; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL || panicstr != NULL) return; w = lock->lo_witness; class = LOCK_CLASS(lock); td = curthread; if (class->lc_flags & LC_SLEEPLOCK) { /* * Since spin locks include a critical section, this check * implicitly enforces a lock order of all sleep locks before * all spin locks. */ if (td->td_critnest != 0 && !kdb_active) kassert_panic("acquiring blockable sleep lock with " "spinlock or critical section held (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); /* * If this is the first lock acquired then just return as * no order checking is needed. */ lock_list = td->td_sleeplocks; if (lock_list == NULL || lock_list->ll_count == 0) return; } else { /* * If this is the first lock, just return as no order * checking is needed. Avoid problems with thread * migration pinning the thread while checking if * spinlocks are held. If at least one spinlock is held * the thread is in a safe path and it is allowed to * unpin it. */ sched_pin(); lock_list = PCPU_GET(spinlocks); if (lock_list == NULL || lock_list->ll_count == 0) { sched_unpin(); return; } sched_unpin(); } /* * Check to see if we are recursing on a lock we already own. If * so, make sure that we don't mismatch exclusive and shared lock * acquires. */ lock1 = find_instance(lock_list, lock); if (lock1 != NULL) { if ((lock1->li_flags & LI_EXCLUSIVE) != 0 && (flags & LOP_EXCLUSIVE) == 0) { witness_output("shared lock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while exclusively locked from %s:%d\n", fixup_filename(lock1->li_file), lock1->li_line); kassert_panic("excl->share"); } if ((lock1->li_flags & LI_EXCLUSIVE) == 0 && (flags & LOP_EXCLUSIVE) != 0) { witness_output("exclusive lock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while share locked from %s:%d\n", fixup_filename(lock1->li_file), lock1->li_line); kassert_panic("share->excl"); } return; } /* Warn if the interlock is not locked exactly once. 
*/ if (interlock != NULL) { iclass = LOCK_CLASS(interlock); lock1 = find_instance(lock_list, interlock); if (lock1 == NULL) kassert_panic("interlock (%s) %s not locked @ %s:%d", iclass->lc_name, interlock->lo_name, fixup_filename(file), line); else if ((lock1->li_flags & LI_RECURSEMASK) != 0) kassert_panic("interlock (%s) %s recursed @ %s:%d", iclass->lc_name, interlock->lo_name, fixup_filename(file), line); } /* * Find the previously acquired lock, but ignore interlocks. */ plock = &lock_list->ll_children[lock_list->ll_count - 1]; if (interlock != NULL && plock->li_lock == interlock) { if (lock_list->ll_count > 1) plock = &lock_list->ll_children[lock_list->ll_count - 2]; else { lle = lock_list->ll_next; /* * The interlock is the only lock we hold, so * simply return. */ if (lle == NULL) return; plock = &lle->ll_children[lle->ll_count - 1]; } } /* * Try to perform most checks without a lock. If this succeeds we * can skip acquiring the lock and return success. Otherwise we redo * the check with the lock held to handle races with concurrent updates. */ w1 = plock->li_lock->lo_witness; if (witness_lock_order_check(w1, w)) return; mtx_lock_spin(&w_mtx); if (witness_lock_order_check(w1, w)) { mtx_unlock_spin(&w_mtx); return; } witness_lock_order_add(w1, w); /* * Check for duplicate locks of the same type. Note that we only * have to check for this on the last lock we just acquired. Any * other cases will be caught as lock order violations. */ if (w1 == w) { i = w->w_index; if (!(lock->lo_flags & LO_DUPOK) && !(flags & LOP_DUPOK) && !(w_rmatrix[i][i] & WITNESS_REVERSAL)) { w_rmatrix[i][i] |= WITNESS_REVERSAL; w->w_reversed = 1; mtx_unlock_spin(&w_mtx); witness_output( "acquiring duplicate lock of same type: \"%s\"\n", w->w_name); witness_output(" 1st %s @ %s:%d\n", plock->li_lock->lo_name, fixup_filename(plock->li_file), plock->li_line); witness_output(" 2nd %s @ %s:%d\n", lock->lo_name, fixup_filename(file), line); witness_debugger(1, __func__); } else mtx_unlock_spin(&w_mtx); return; } mtx_assert(&w_mtx, MA_OWNED); /* * If we know that the lock we are acquiring comes after * the lock we most recently acquired in the lock order tree, * then there is no need for any further checks. */ if (isitmychild(w1, w)) goto out; for (j = 0, lle = lock_list; lle != NULL; lle = lle->ll_next) { for (i = lle->ll_count - 1; i >= 0; i--, j++) { MPASS(j < LOCK_CHILDCOUNT * LOCK_NCHILDREN); lock1 = &lle->ll_children[i]; /* * Ignore the interlock. */ if (interlock == lock1->li_lock) continue; /* * If this lock doesn't undergo witness checking, * then skip it. */ w1 = lock1->li_lock->lo_witness; if (w1 == NULL) { KASSERT((lock1->li_lock->lo_flags & LO_WITNESS) == 0, ("lock missing witness structure")); continue; } /* * If we are locking Giant and this is a sleepable * lock, then skip it. */ if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0 && lock == &Giant.lock_object) continue; /* * If we are locking a sleepable lock and this lock * is Giant, then skip it. */ if ((lock->lo_flags & LO_SLEEPABLE) != 0 && lock1->li_lock == &Giant.lock_object) continue; /* * If we are locking a sleepable lock and this lock * isn't sleepable, we want to treat it as a lock * order violation to enforce a general lock order of * sleepable locks before non-sleepable locks. */ if (((lock->lo_flags & LO_SLEEPABLE) != 0 && (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0)) goto reversal; /* * If we are locking Giant and this is a non-sleepable * lock, then treat it as a reversal.
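* A concrete (illustrative) reversal the scan below would catch: if one * thread acquires mutex "foo" and then mutex "bar" while another acquires * "bar" and then "foo", the second ordering contradicts the recorded one * and control transfers to the "reversal" label.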
*/ if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 && lock == &Giant.lock_object) goto reversal; /* * Check the lock order hierarchy for a reversal. */ if (!isitmydescendant(w, w1)) continue; reversal: /* * We have a lock order violation, check to see if it * is allowed or has already been yelled about. */ #ifdef BLESSING /* * If the lock order is blessed, just bail. We don't * look for other lock order violations though, which * may be a bug. */ if (blessed(w, w1)) goto out; #endif /* Bail if this violation is known */ if (w_rmatrix[w1->w_index][w->w_index] & WITNESS_REVERSAL) goto out; /* Record this as a violation */ w_rmatrix[w1->w_index][w->w_index] |= WITNESS_REVERSAL; w_rmatrix[w->w_index][w1->w_index] |= WITNESS_REVERSAL; w->w_reversed = w1->w_reversed = 1; witness_increment_graph_generation(); mtx_unlock_spin(&w_mtx); #ifdef WITNESS_NO_VNODE /* * There are known LORs between VNODE locks. They are * not an indication of a bug. VNODE locks are flagged * as such (LO_IS_VNODE) and we don't yell if the LOR * is between 2 VNODE locks. */ if ((lock->lo_flags & LO_IS_VNODE) != 0 && (lock1->li_lock->lo_flags & LO_IS_VNODE) != 0) return; #endif /* * Ok, yell about it. */ if (((lock->lo_flags & LO_SLEEPABLE) != 0 && (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0)) witness_output( "lock order reversal: (sleepable after non-sleepable)\n"); else if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 && lock == &Giant.lock_object) witness_output( "lock order reversal: (Giant after non-sleepable)\n"); else witness_output("lock order reversal:\n"); /* * Try to locate an earlier lock with * witness w in our list. */ do { lock2 = &lle->ll_children[i]; MPASS(lock2->li_lock != NULL); if (lock2->li_lock->lo_witness == w) break; if (i == 0 && lle->ll_next != NULL) { lle = lle->ll_next; i = lle->ll_count - 1; MPASS(i >= 0 && i < LOCK_NCHILDREN); } else i--; } while (i >= 0); if (i < 0) { witness_output(" 1st %p %s (%s) @ %s:%d\n", lock1->li_lock, lock1->li_lock->lo_name, w1->w_name, fixup_filename(lock1->li_file), lock1->li_line); witness_output(" 2nd %p %s (%s) @ %s:%d\n", lock, lock->lo_name, w->w_name, fixup_filename(file), line); } else { witness_output(" 1st %p %s (%s) @ %s:%d\n", lock2->li_lock, lock2->li_lock->lo_name, lock2->li_lock->lo_witness->w_name, fixup_filename(lock2->li_file), lock2->li_line); witness_output(" 2nd %p %s (%s) @ %s:%d\n", lock1->li_lock, lock1->li_lock->lo_name, w1->w_name, fixup_filename(lock1->li_file), lock1->li_line); witness_output(" 3rd %p %s (%s) @ %s:%d\n", lock, lock->lo_name, w->w_name, fixup_filename(file), line); } witness_debugger(1, __func__); return; } } /* * If requested, build a new lock order. However, don't build a new * relationship between a sleepable lock and Giant if it is in the * wrong direction. The correct lock order is that sleepable locks * always come before Giant. */ if (flags & LOP_NEWORDER && !(plock->li_lock == &Giant.lock_object && (lock->lo_flags & LO_SLEEPABLE) != 0)) { CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__, w->w_name, plock->li_lock->lo_witness->w_name); itismychild(plock->li_lock->lo_witness, w); } out: mtx_unlock_spin(&w_mtx); } void witness_lock(struct lock_object *lock, int flags, const char *file, int line) { struct lock_list_entry **lock_list, *lle; struct lock_instance *instance; struct witness *w; struct thread *td; if (witness_cold || witness_watch == -1 || lock->lo_witness == NULL || panicstr != NULL) return; w = lock->lo_witness; td = curthread; /* Determine lock list for this lock.
*/ if (LOCK_CLASS(lock)->lc_flags & LC_SLEEPLOCK) lock_list = &td->td_sleeplocks; else lock_list = PCPU_PTR(spinlocks); /* Check to see if we are recursing on a lock we already own. */ instance = find_instance(*lock_list, lock); if (instance != NULL) { instance->li_flags++; CTR4(KTR_WITNESS, "%s: pid %d recursed on %s r=%d", __func__, td->td_proc->p_pid, lock->lo_name, instance->li_flags & LI_RECURSEMASK); instance->li_file = file; instance->li_line = line; return; } /* Update per-witness last file and line acquire. */ w->w_file = file; w->w_line = line; /* Find the next open lock instance in the list and fill it. */ lle = *lock_list; if (lle == NULL || lle->ll_count == LOCK_NCHILDREN) { lle = witness_lock_list_get(); if (lle == NULL) return; lle->ll_next = *lock_list; CTR3(KTR_WITNESS, "%s: pid %d added lle %p", __func__, td->td_proc->p_pid, lle); *lock_list = lle; } instance = &lle->ll_children[lle->ll_count++]; instance->li_lock = lock; instance->li_line = line; instance->li_file = file; if ((flags & LOP_EXCLUSIVE) != 0) instance->li_flags = LI_EXCLUSIVE; else instance->li_flags = 0; CTR4(KTR_WITNESS, "%s: pid %d added %s as lle[%d]", __func__, td->td_proc->p_pid, lock->lo_name, lle->ll_count - 1); } void witness_upgrade(struct lock_object *lock, int flags, const char *file, int line) { struct lock_instance *instance; struct lock_class *class; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (witness_watch) { if ((lock->lo_flags & LO_UPGRADABLE) == 0) kassert_panic( "upgrade of non-upgradable lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((class->lc_flags & LC_SLEEPLOCK) == 0) kassert_panic( "upgrade of non-sleep lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); } instance = find_instance(curthread->td_sleeplocks, lock); if (instance == NULL) { kassert_panic("upgrade of unlocked lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); return; } if (witness_watch) { if ((instance->li_flags & LI_EXCLUSIVE) != 0) kassert_panic( "upgrade of exclusive lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((instance->li_flags & LI_RECURSEMASK) != 0) kassert_panic( "upgrade of recursed lock (%s) %s r=%d @ %s:%d", class->lc_name, lock->lo_name, instance->li_flags & LI_RECURSEMASK, fixup_filename(file), line); } instance->li_flags |= LI_EXCLUSIVE; } void witness_downgrade(struct lock_object *lock, int flags, const char *file, int line) { struct lock_instance *instance; struct lock_class *class; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (witness_watch) { if ((lock->lo_flags & LO_UPGRADABLE) == 0) kassert_panic( "downgrade of non-upgradable lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((class->lc_flags & LC_SLEEPLOCK) == 0) kassert_panic( "downgrade of non-sleep lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); } instance = find_instance(curthread->td_sleeplocks, lock); if (instance == NULL) { kassert_panic("downgrade of unlocked lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); return; } if (witness_watch) { if ((instance->li_flags & LI_EXCLUSIVE) == 0) kassert_panic( "downgrade of shared lock (%s) %s @ %s:%d", 
class->lc_name, lock->lo_name, fixup_filename(file), line); if ((instance->li_flags & LI_RECURSEMASK) != 0) kassert_panic( "downgrade of recursed lock (%s) %s r=%d @ %s:%d", class->lc_name, lock->lo_name, instance->li_flags & LI_RECURSEMASK, fixup_filename(file), line); } instance->li_flags &= ~LI_EXCLUSIVE; } void witness_unlock(struct lock_object *lock, int flags, const char *file, int line) { struct lock_list_entry **lock_list, *lle; struct lock_instance *instance; struct lock_class *class; struct thread *td; register_t s; int i, j; if (witness_cold || lock->lo_witness == NULL || panicstr != NULL) return; td = curthread; class = LOCK_CLASS(lock); /* Find lock instance associated with this lock. */ if (class->lc_flags & LC_SLEEPLOCK) lock_list = &td->td_sleeplocks; else lock_list = PCPU_PTR(spinlocks); lle = *lock_list; for (; *lock_list != NULL; lock_list = &(*lock_list)->ll_next) for (i = 0; i < (*lock_list)->ll_count; i++) { instance = &(*lock_list)->ll_children[i]; if (instance->li_lock == lock) goto found; } /* * When WITNESS is disabled via witness_watch, locks may remain * registered in the td_sleeplocks queue. We have to make sure these * queues are flushed, so just search for any leftover registered * locks and remove them. */ if (witness_watch > 0) { kassert_panic("lock (%s) %s not locked @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); return; } else { return; } found: /* First, check for shared/exclusive mismatches. */ if ((instance->li_flags & LI_EXCLUSIVE) != 0 && witness_watch > 0 && (flags & LOP_EXCLUSIVE) == 0) { witness_output("shared unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while exclusively locked from %s:%d\n", fixup_filename(instance->li_file), instance->li_line); kassert_panic("excl->ushare"); } if ((instance->li_flags & LI_EXCLUSIVE) == 0 && witness_watch > 0 && (flags & LOP_EXCLUSIVE) != 0) { witness_output("exclusive unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while share locked from %s:%d\n", fixup_filename(instance->li_file), instance->li_line); kassert_panic("share->uexcl"); } /* If we are recursed, unrecurse. */ if ((instance->li_flags & LI_RECURSEMASK) > 0) { CTR4(KTR_WITNESS, "%s: pid %d unrecursed on %s r=%d", __func__, td->td_proc->p_pid, instance->li_lock->lo_name, instance->li_flags); instance->li_flags--; return; } /* The lock is now being dropped, check for NORELEASE flag */ if ((instance->li_flags & LI_NORELEASE) != 0 && witness_watch > 0) { witness_output("forbidden unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); kassert_panic("lock marked norelease"); } /* Otherwise, remove this item from the list. */ s = intr_disable(); CTR4(KTR_WITNESS, "%s: pid %d removed %s from lle[%d]", __func__, td->td_proc->p_pid, instance->li_lock->lo_name, (*lock_list)->ll_count - 1); for (j = i; j < (*lock_list)->ll_count - 1; j++) (*lock_list)->ll_children[j] = (*lock_list)->ll_children[j + 1]; (*lock_list)->ll_count--; intr_restore(s); /* * To reduce contention on w_mtx, we always want to keep a head * object in each list so that frequent allocation from the free * witness pool (and the locking it entails) is avoided.
* To keep the code simple, an empty head object also means that * there are no further objects in the list, so list ownership needs * to be handed over to another object when the current head is * freed. */ if ((*lock_list)->ll_count == 0) { if (*lock_list == lle) { if (lle->ll_next == NULL) return; } else lle = *lock_list; *lock_list = lle->ll_next; CTR3(KTR_WITNESS, "%s: pid %d removed lle %p", __func__, td->td_proc->p_pid, lle); witness_lock_list_free(lle); } } void witness_thread_exit(struct thread *td) { struct lock_list_entry *lle; int i, n; lle = td->td_sleeplocks; if (lle == NULL || panicstr != NULL) return; if (lle->ll_count != 0) { for (n = 0; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { if (n == 0) witness_output( "Thread %p exiting with the following locks held:\n", td); n++; witness_list_lock(&lle->ll_children[i], witness_output); } kassert_panic( "Thread %p cannot exit while holding sleeplocks\n", td); } witness_lock_list_free(lle); } /* * Warn if any locks other than 'lock' are held. Flags can be passed in to * exempt Giant and sleepable locks from the checks as well. If any * non-exempt locks are held, then a supplied message is printed to the * output channel along with a list of the offending locks. If indicated in the * flags then a failure results in a panic as well. */ int witness_warn(int flags, struct lock_object *lock, const char *fmt, ...) { struct lock_list_entry *lock_list, *lle; struct lock_instance *lock1; struct thread *td; va_list ap; int i, n; if (witness_cold || witness_watch < 1 || panicstr != NULL) return (0); n = 0; td = curthread; for (lle = td->td_sleeplocks; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { lock1 = &lle->ll_children[i]; if (lock1->li_lock == lock) continue; if (flags & WARN_GIANTOK && lock1->li_lock == &Giant.lock_object) continue; if (flags & WARN_SLEEPOK && (lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0) continue; if (n == 0) { va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf(" with the following %slocks held:\n", (flags & WARN_SLEEPOK) != 0 ? "non-sleepable " : ""); } n++; witness_list_lock(lock1, printf); } /* * Pin the thread in order to avoid problems with thread migration. * Once all the checks on spin lock ownership have passed, the * thread is on a safe path and can be unpinned. */ sched_pin(); lock_list = PCPU_GET(spinlocks); if (lock_list != NULL && lock_list->ll_count != 0) { sched_unpin(); /* * We should only have one spin lock and, as the * exemption flags cannot apply to this lock class, * check whether the first spin lock is the one * curthread should hold. */ lock1 = &lock_list->ll_children[lock_list->ll_count - 1]; if (lock_list->ll_count == 1 && lock_list->ll_next == NULL && lock1->li_lock == lock && n == 0) return (0); va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf(" with the following %slocks held:\n", (flags & WARN_SLEEPOK) != 0 ?
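witness_warn() is normally reached through the WITNESS_WARN() macro from <sys/lock.h>; a sketch of the typical call made just before a sleepable operation (hypothetical function, kernel context assumed):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>

static void
prepare_to_sleep(const char *wmesg)
{
	/*
	 * Giant and sleepable locks are exempted; any other held lock is
	 * listed on the witness output channel, and WARN_PANIC would turn
	 * the warning into a panic.
	 */
	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
	    "preparing to sleep on \"%s\"", wmesg);
}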
"non-sleepable " : ""); n += witness_list_locks(&lock_list, printf); } else sched_unpin(); if (flags & WARN_PANIC && n) kassert_panic("%s", __func__); else witness_debugger(n, __func__); return (n); } const char * witness_file(struct lock_object *lock) { struct witness *w; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL) return ("?"); w = lock->lo_witness; return (w->w_file); } int witness_line(struct lock_object *lock) { struct witness *w; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL) return (0); w = lock->lo_witness; return (w->w_line); } static struct witness * enroll(const char *description, struct lock_class *lock_class) { struct witness *w; struct witness_list *typelist; MPASS(description != NULL); if (witness_watch == -1 || panicstr != NULL) return (NULL); if ((lock_class->lc_flags & LC_SPINLOCK)) { if (witness_skipspin) return (NULL); else typelist = &w_spin; } else if ((lock_class->lc_flags & LC_SLEEPLOCK)) { typelist = &w_sleep; } else { kassert_panic("lock class %s is not sleep or spin", lock_class->lc_name); return (NULL); } mtx_lock_spin(&w_mtx); w = witness_hash_get(description); if (w) goto found; if ((w = witness_get()) == NULL) return (NULL); MPASS(strlen(description) < MAX_W_NAME); strcpy(w->w_name, description); w->w_class = lock_class; w->w_refcount = 1; STAILQ_INSERT_HEAD(&w_all, w, w_list); if (lock_class->lc_flags & LC_SPINLOCK) { STAILQ_INSERT_HEAD(&w_spin, w, w_typelist); w_spin_cnt++; } else if (lock_class->lc_flags & LC_SLEEPLOCK) { STAILQ_INSERT_HEAD(&w_sleep, w, w_typelist); w_sleep_cnt++; } /* Insert new witness into the hash */ witness_hash_put(w); witness_increment_graph_generation(); mtx_unlock_spin(&w_mtx); return (w); found: w->w_refcount++; if (w->w_refcount == 1) w->w_class = lock_class; mtx_unlock_spin(&w_mtx); if (lock_class != w->w_class) kassert_panic( "lock (%s) %s does not match earlier (%s) lock", description, lock_class->lc_name, w->w_class->lc_name); return (w); } static void depart(struct witness *w) { struct witness_list *list; MPASS(w->w_refcount == 0); if (w->w_class->lc_flags & LC_SLEEPLOCK) { list = &w_sleep; w_sleep_cnt--; } else { list = &w_spin; w_spin_cnt--; } /* * Set file to NULL as it may point into a loadable module. */ w->w_file = NULL; w->w_line = 0; witness_increment_graph_generation(); } static void adopt(struct witness *parent, struct witness *child) { int pi, ci, i, j; if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); /* If the relationship is already known, there's no work to be done. */ if (isitmychild(parent, child)) return; /* When the structure of the graph changes, bump up the generation. */ witness_increment_graph_generation(); /* * The hard part ... create the direct relationship, then propagate all * indirect relationships. */ pi = parent->w_index; ci = child->w_index; WITNESS_INDEX_ASSERT(pi); WITNESS_INDEX_ASSERT(ci); MPASS(pi != ci); w_rmatrix[pi][ci] |= WITNESS_PARENT; w_rmatrix[ci][pi] |= WITNESS_CHILD; /* * If parent was not already an ancestor of child, * then we increment the descendant and ancestor counters. */ if ((w_rmatrix[pi][ci] & WITNESS_ANCESTOR) == 0) { parent->w_num_descendants++; child->w_num_ancestors++; } /* * Find each ancestor of 'pi'. Note that 'pi' itself is counted as * an ancestor of 'pi' during this loop. */ for (i = 1; i <= w_max_used_index; i++) { if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 && (i != pi)) continue; /* Find each descendant of 'i' and mark it as a descendant. 
*/ for (j = 1; j <= w_max_used_index; j++) { /* * Skip children that are already marked as * descendants of 'i'. */ if (w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) continue; /* * We are only interested in descendants of 'ci'. Note * that 'ci' itself is counted as a descendant of 'ci'. */ if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 && (j != ci)) continue; w_rmatrix[i][j] |= WITNESS_ANCESTOR; w_rmatrix[j][i] |= WITNESS_DESCENDANT; w_data[i].w_num_descendants++; w_data[j].w_num_ancestors++; /* * Make sure we aren't marking a node as both an * ancestor and descendant. We should have caught * this as a lock order reversal earlier. */ if ((w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) && (w_rmatrix[i][j] & WITNESS_DESCENDANT_MASK)) { printf("witness rmatrix paradox! [%d][%d]=%d " "both ancestor and descendant\n", i, j, w_rmatrix[i][j]); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; } if ((w_rmatrix[j][i] & WITNESS_ANCESTOR_MASK) && (w_rmatrix[j][i] & WITNESS_DESCENDANT_MASK)) { printf("witness rmatrix paradox! [%d][%d]=%d " "both ancestor and descendant\n", j, i, w_rmatrix[j][i]); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; } } } } static void itismychild(struct witness *parent, struct witness *child) { int unlocked; MPASS(child != NULL && parent != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); if (!witness_lock_type_equal(parent, child)) { if (witness_cold == 0) { unlocked = 1; mtx_unlock_spin(&w_mtx); } else { unlocked = 0; } kassert_panic( "%s: parent \"%s\" (%s) and child \"%s\" (%s) are not " "the same lock type", __func__, parent->w_name, parent->w_class->lc_name, child->w_name, child->w_class->lc_name); if (unlocked) mtx_lock_spin(&w_mtx); } adopt(parent, child); } /* * Generic code for the isitmy*() functions. The rmask parameter is the * expected relationship of w1 to w2. */ static int _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname) { unsigned char r1, r2; int i1, i2; i1 = w1->w_index; i2 = w2->w_index; WITNESS_INDEX_ASSERT(i1); WITNESS_INDEX_ASSERT(i2); r1 = w_rmatrix[i1][i2] & WITNESS_RELATED_MASK; r2 = w_rmatrix[i2][i1] & WITNESS_RELATED_MASK; /* The flags on one better be the inverse of the flags on the other */ if (!((WITNESS_ATOD(r1) == r2 && WITNESS_DTOA(r2) == r1) || (WITNESS_DTOA(r1) == r2 && WITNESS_ATOD(r2) == r1))) { /* Don't squawk if we're potentially racing with an update. */ if (!mtx_owned(&w_mtx)) return (0); printf("%s: rmatrix mismatch between %s (index %d) and %s " "(index %d): w_rmatrix[%d][%d] == %hhx but " "w_rmatrix[%d][%d] == %hhx\n", fname, w1->w_name, i1, w2->w_name, i2, i1, i2, r1, i2, i1, r2); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; } return (r1 & rmask); } /* * Checks if @child is a direct child of @parent. */ static int isitmychild(struct witness *parent, struct witness *child) { return (_isitmyx(parent, child, WITNESS_PARENT, __func__)); } /* * Checks if @descendant is a direct or indirect descendant of @ancestor.
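The matrix bookkeeping in adopt() amounts to a transitive closure over the ancestor relation; a simplified, single-direction userspace sketch (illustrative only, without the paradox checks or the WITNESS_DESCENDANT mirror):

#include <stdio.h>

#define N		5
#define ANCESTOR	0x1	/* rm[i][j]: i must be locked before j */

static unsigned char rm[N][N];

/* Add the edge p -> c, then mark every ancestor of p (p included) as an
 * ancestor of every descendant of c (c included), as adopt() does. */
static void
add_order(int p, int c)
{
	int i, j;

	for (i = 0; i < N; i++) {
		if (i != p && (rm[i][p] & ANCESTOR) == 0)
			continue;
		for (j = 0; j < N; j++) {
			if (j != c && (rm[c][j] & ANCESTOR) == 0)
				continue;
			rm[i][j] |= ANCESTOR;
		}
	}
}

int
main(void)
{
	add_order(0, 1);
	add_order(1, 2);
	/* The indirect relation 0 -> 2 was propagated automatically. */
	printf("0 before 2: %s\n", (rm[0][2] & ANCESTOR) ? "yes" : "no");
	return (0);
}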
*/ static int isitmydescendant(struct witness *ancestor, struct witness *descendant) { return (_isitmyx(ancestor, descendant, WITNESS_ANCESTOR_MASK, __func__)); } #ifdef BLESSING static int blessed(struct witness *w1, struct witness *w2) { int i; struct witness_blessed *b; for (i = 0; i < nitems(blessed_list); i++) { b = &blessed_list[i]; if (strcmp(w1->w_name, b->b_lock1) == 0) { if (strcmp(w2->w_name, b->b_lock2) == 0) return (1); continue; } if (strcmp(w1->w_name, b->b_lock2) == 0) if (strcmp(w2->w_name, b->b_lock1) == 0) return (1); } return (0); } #endif static struct witness * witness_get(void) { struct witness *w; int index; if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); if (witness_watch == -1) { mtx_unlock_spin(&w_mtx); return (NULL); } if (STAILQ_EMPTY(&w_free)) { witness_watch = -1; mtx_unlock_spin(&w_mtx); printf("WITNESS: unable to allocate a new witness object\n"); return (NULL); } w = STAILQ_FIRST(&w_free); STAILQ_REMOVE_HEAD(&w_free, w_list); w_free_cnt--; index = w->w_index; MPASS(index > 0 && index == w_max_used_index+1 && index < witness_count); bzero(w, sizeof(*w)); w->w_index = index; if (index > w_max_used_index) w_max_used_index = index; return (w); } static void witness_free(struct witness *w) { STAILQ_INSERT_HEAD(&w_free, w, w_list); w_free_cnt++; } static struct lock_list_entry * witness_lock_list_get(void) { struct lock_list_entry *lle; if (witness_watch == -1) return (NULL); mtx_lock_spin(&w_mtx); lle = w_lock_list_free; if (lle == NULL) { witness_watch = -1; mtx_unlock_spin(&w_mtx); printf("%s: witness exhausted\n", __func__); return (NULL); } w_lock_list_free = lle->ll_next; mtx_unlock_spin(&w_mtx); bzero(lle, sizeof(*lle)); return (lle); } static void witness_lock_list_free(struct lock_list_entry *lle) { mtx_lock_spin(&w_mtx); lle->ll_next = w_lock_list_free; w_lock_list_free = lle; mtx_unlock_spin(&w_mtx); } static struct lock_instance * find_instance(struct lock_list_entry *list, const struct lock_object *lock) { struct lock_list_entry *lle; struct lock_instance *instance; int i; for (lle = list; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { instance = &lle->ll_children[i]; if (instance->li_lock == lock) return (instance); } return (NULL); } static void witness_list_lock(struct lock_instance *instance, int (*prnt)(const char *fmt, ...)) { struct lock_object *lock; lock = instance->li_lock; prnt("%s %s %s", (instance->li_flags & LI_EXCLUSIVE) != 0 ? "exclusive" : "shared", LOCK_CLASS(lock)->lc_name, lock->lo_name); if (lock->lo_witness->w_name != lock->lo_name) prnt(" (%s)", lock->lo_witness->w_name); prnt(" r = %d (%p) locked @ %s:%d\n", instance->li_flags & LI_RECURSEMASK, lock, fixup_filename(instance->li_file), instance->li_line); } static int witness_output(const char *fmt, ...) 
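For kernels built with "options BLESSING", blessed() above walks a static table of lock-name pairs; its shape is as below (the entries here are illustrative placeholders, not pairs blessed in the real tree):

static struct witness_blessed blessed_list[] = {
	{ "example lock A", "example lock B" },
	{ "example lock C", "example lock D" },
};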
{ va_list ap; int ret; va_start(ap, fmt); ret = witness_voutput(fmt, ap); va_end(ap); return (ret); } static int witness_voutput(const char *fmt, va_list ap) { int ret; ret = 0; switch (witness_channel) { case WITNESS_CONSOLE: ret = vprintf(fmt, ap); break; case WITNESS_LOG: vlog(LOG_NOTICE, fmt, ap); break; case WITNESS_NONE: break; } return (ret); } #ifdef DDB static int witness_thread_has_locks(struct thread *td) { if (td->td_sleeplocks == NULL) return (0); return (td->td_sleeplocks->ll_count != 0); } static int witness_proc_has_locks(struct proc *p) { struct thread *td; FOREACH_THREAD_IN_PROC(p, td) { if (witness_thread_has_locks(td)) return (1); } return (0); } #endif int witness_list_locks(struct lock_list_entry **lock_list, int (*prnt)(const char *fmt, ...)) { struct lock_list_entry *lle; int i, nheld; nheld = 0; for (lle = *lock_list; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { witness_list_lock(&lle->ll_children[i], prnt); nheld++; } return (nheld); } /* * This is a bit risky at best. We call this function when we have timed * out acquiring a spin lock, and we assume that the other CPU is stuck * with this lock held. So, we go groveling around in the other CPU's * per-cpu data to try to find the lock instance for this spin lock to * see when it was last acquired. */ void witness_display_spinlock(struct lock_object *lock, struct thread *owner, int (*prnt)(const char *fmt, ...)) { struct lock_instance *instance; struct pcpu *pc; if (owner->td_critnest == 0 || owner->td_oncpu == NOCPU) return; pc = pcpu_find(owner->td_oncpu); instance = find_instance(pc->pc_spinlocks, lock); if (instance != NULL) witness_list_lock(instance, prnt); } void witness_save(struct lock_object *lock, const char **filep, int *linep) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; /* * This function is used independently in locking code to deal with * Giant, SCHEDULER_STOPPED() check can be removed here after Giant * is gone. */ if (SCHEDULER_STOPPED()) return; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) { kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); return; } *filep = instance->li_file; *linep = instance->li_line; } void witness_restore(struct lock_object *lock, const char *file, int line) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; /* * This function is used independently in locking code to deal with * Giant, SCHEDULER_STOPPED() check can be removed here after Giant * is gone. 
*/ if (SCHEDULER_STOPPED()) return; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); lock->lo_witness->w_file = file; lock->lo_witness->w_line = line; if (instance == NULL) return; instance->li_file = file; instance->li_line = line; } void witness_assert(const struct lock_object *lock, int flags, const char *file, int line) { #ifdef INVARIANT_SUPPORT struct lock_instance *instance; struct lock_class *class; if (lock->lo_witness == NULL || witness_watch < 1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if ((class->lc_flags & LC_SLEEPLOCK) != 0) instance = find_instance(curthread->td_sleeplocks, lock); else if ((class->lc_flags & LC_SPINLOCK) != 0) instance = find_instance(PCPU_GET(spinlocks), lock); else { kassert_panic("Lock (%s) %s is not sleep or spin!", class->lc_name, lock->lo_name); return; } switch (flags) { case LA_UNLOCKED: if (instance != NULL) kassert_panic("Lock (%s) %s locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; case LA_LOCKED: case LA_LOCKED | LA_RECURSED: case LA_LOCKED | LA_NOTRECURSED: case LA_SLOCKED: case LA_SLOCKED | LA_RECURSED: case LA_SLOCKED | LA_NOTRECURSED: case LA_XLOCKED: case LA_XLOCKED | LA_RECURSED: case LA_XLOCKED | LA_NOTRECURSED: if (instance == NULL) { kassert_panic("Lock (%s) %s not locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; } if ((flags & LA_XLOCKED) != 0 && (instance->li_flags & LI_EXCLUSIVE) == 0) kassert_panic( "Lock (%s) %s not exclusively locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_SLOCKED) != 0 && (instance->li_flags & LI_EXCLUSIVE) != 0) kassert_panic( "Lock (%s) %s exclusively locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_RECURSED) != 0 && (instance->li_flags & LI_RECURSEMASK) == 0) kassert_panic("Lock (%s) %s not recursed @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_NOTRECURSED) != 0 && (instance->li_flags & LI_RECURSEMASK) != 0) kassert_panic("Lock (%s) %s recursed @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; default: kassert_panic("Invalid lock assertion at %s:%d.", fixup_filename(file), line); } #endif /* INVARIANT_SUPPORT */ } static void witness_setflag(struct lock_object *lock, int flag, int set) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) { kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); return; } if (set) instance->li_flags |= flag; else instance->li_flags &= ~flag; } void witness_norelease(struct lock_object *lock) { witness_setflag(lock, LI_NORELEASE, 1); } void witness_releaseok(struct lock_object *lock) { witness_setflag(lock, LI_NORELEASE, 0); } #ifdef DDB static 
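witness_assert() backs the lock assertions that cannot be answered from the lock word alone, shared ownership in particular; a sketch (hypothetical lock, kernel-module context assumed):

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/sx.h>

static struct sx conf_sx;	/* assume sx_init(&conf_sx, "example conf") ran */

static void
conf_read(void)
{
	/*
	 * Shared owners are not recorded in the sx itself, so with
	 * WITNESS compiled in this check is forwarded to witness_assert()
	 * as an LA_SLOCKED assertion against curthread's lock list.
	 */
	sx_assert(&conf_sx, SA_SLOCKED);
	/* ... read state protected by conf_sx ... */
}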
void witness_ddb_list(struct thread *td) { KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); KASSERT(kdb_active, ("%s: not in the debugger", __func__)); if (witness_watch < 1) return; witness_list_locks(&td->td_sleeplocks, db_printf); /* * We only handle spinlocks if td == curthread. This is somewhat broken * if td is currently executing on some other CPU and holds spin locks * as we won't display those locks. If we had a MI way of getting * the per-cpu data for a given cpu then we could use * td->td_oncpu to get the list of spinlocks for this thread * and "fix" this. * * That still wouldn't really fix this unless we locked the scheduler * lock or stopped the other CPU to make sure it wasn't changing the * list out from under us. It is probably best to just not try to * handle threads on other CPU's for now. */ if (td == curthread && PCPU_GET(spinlocks) != NULL) witness_list_locks(PCPU_PTR(spinlocks), db_printf); } DB_SHOW_COMMAND(locks, db_witness_list) { struct thread *td; if (have_addr) td = db_lookup_thread(addr, true); else td = kdb_thread; witness_ddb_list(td); } DB_SHOW_ALL_COMMAND(locks, db_witness_list_all) { struct thread *td; struct proc *p; /* * It would be nice to list only threads and processes that actually * held sleep locks, but that information is currently not exported * by WITNESS. */ FOREACH_PROC_IN_SYSTEM(p) { if (!witness_proc_has_locks(p)) continue; FOREACH_THREAD_IN_PROC(p, td) { if (!witness_thread_has_locks(td)) continue; db_printf("Process %d (%s) thread %p (%d)\n", p->p_pid, p->p_comm, td, td->td_tid); witness_ddb_list(td); if (db_pager_quit) return; } } } DB_SHOW_ALIAS(alllocks, db_witness_list_all) DB_SHOW_COMMAND(witness, db_witness_display) { witness_ddb_display(db_printf); } #endif static void sbuf_print_witness_badstacks(struct sbuf *sb, size_t *oldidx) { struct witness_lock_order_data *data1, *data2, *tmp_data1, *tmp_data2; struct witness *tmp_w1, *tmp_w2, *w1, *w2; u_int w_rmatrix1, w_rmatrix2; int generation, i, j; tmp_data1 = NULL; tmp_data2 = NULL; tmp_w1 = NULL; tmp_w2 = NULL; /* Allocate and init temporary storage space. */ tmp_w1 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO); tmp_w2 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO); tmp_data1 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, M_WAITOK | M_ZERO); tmp_data2 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, M_WAITOK | M_ZERO); stack_zero(&tmp_data1->wlod_stack); stack_zero(&tmp_data2->wlod_stack); restart: mtx_lock_spin(&w_mtx); generation = w_generation; mtx_unlock_spin(&w_mtx); sbuf_printf(sb, "Number of known direct relationships is %d\n", w_lohash.wloh_count); for (i = 1; i < w_max_used_index; i++) { mtx_lock_spin(&w_mtx); if (generation != w_generation) { mtx_unlock_spin(&w_mtx); /* The graph has changed, try again. */ *oldidx = 0; sbuf_clear(sb); goto restart; } w1 = &w_data[i]; if (w1->w_reversed == 0) { mtx_unlock_spin(&w_mtx); continue; } /* Copy w1 locally so we can release the spin lock. */ *tmp_w1 = *w1; mtx_unlock_spin(&w_mtx); if (tmp_w1->w_reversed == 0) continue; for (j = 1; j < w_max_used_index; j++) { if ((w_rmatrix[i][j] & WITNESS_REVERSAL) == 0 || i > j) continue; mtx_lock_spin(&w_mtx); if (generation != w_generation) { mtx_unlock_spin(&w_mtx); /* The graph has changed, try again. */ *oldidx = 0; sbuf_clear(sb); goto restart; } w2 = &w_data[j]; data1 = witness_lock_order_get(w1, w2); data2 = witness_lock_order_get(w2, w1); /* * Copy information locally so we can release the * spin lock. 
*/ *tmp_w2 = *w2; w_rmatrix1 = (unsigned int)w_rmatrix[i][j]; w_rmatrix2 = (unsigned int)w_rmatrix[j][i]; if (data1) { stack_zero(&tmp_data1->wlod_stack); stack_copy(&data1->wlod_stack, &tmp_data1->wlod_stack); } if (data2 && data2 != data1) { stack_zero(&tmp_data2->wlod_stack); stack_copy(&data2->wlod_stack, &tmp_data2->wlod_stack); } mtx_unlock_spin(&w_mtx); sbuf_printf(sb, "\nLock order reversal between \"%s\"(%s) and \"%s\"(%s)!\n", tmp_w1->w_name, tmp_w1->w_class->lc_name, tmp_w2->w_name, tmp_w2->w_class->lc_name); if (data1) { sbuf_printf(sb, "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n", tmp_w1->w_name, tmp_w1->w_class->lc_name, tmp_w2->w_name, tmp_w2->w_class->lc_name); stack_sbuf_print(sb, &tmp_data1->wlod_stack); sbuf_printf(sb, "\n"); } if (data2 && data2 != data1) { sbuf_printf(sb, "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n", tmp_w2->w_name, tmp_w2->w_class->lc_name, tmp_w1->w_name, tmp_w1->w_class->lc_name); stack_sbuf_print(sb, &tmp_data2->wlod_stack); sbuf_printf(sb, "\n"); } } } mtx_lock_spin(&w_mtx); if (generation != w_generation) { mtx_unlock_spin(&w_mtx); /* * The graph changed while we were printing stack data, * try again. */ *oldidx = 0; sbuf_clear(sb); goto restart; } mtx_unlock_spin(&w_mtx); /* Free temporary storage space. */ free(tmp_data1, M_TEMP); free(tmp_data2, M_TEMP); free(tmp_w1, M_TEMP); free(tmp_w2, M_TEMP); } static int sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS) { struct sbuf *sb; int error; if (witness_watch < 1) { error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning)); return (error); } if (witness_cold) { error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold)); return (error); } error = 0; sb = sbuf_new(NULL, NULL, badstack_sbuf_size, SBUF_AUTOEXTEND); if (sb == NULL) return (ENOMEM); sbuf_print_witness_badstacks(sb, &req->oldidx); sbuf_finish(sb); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); return (error); } #ifdef DDB static int sbuf_db_printf_drain(void *arg __unused, const char *data, int len) { return (db_printf("%.*s", len, data)); } DB_SHOW_COMMAND(badstacks, db_witness_badstacks) { struct sbuf sb; char buffer[128]; size_t dummy; sbuf_new(&sb, buffer, sizeof(buffer), SBUF_FIXEDLEN); sbuf_set_drain(&sb, sbuf_db_printf_drain, NULL); sbuf_print_witness_badstacks(&sb, &dummy); sbuf_finish(&sb); } #endif static int sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS) { static const struct { enum witness_channel channel; const char *name; } channels[] = { { WITNESS_CONSOLE, "console" }, { WITNESS_LOG, "log" }, { WITNESS_NONE, "none" }, }; char buf[16]; u_int i; int error; buf[0] = '\0'; for (i = 0; i < nitems(channels); i++) if (witness_channel == channels[i].channel) { snprintf(buf, sizeof(buf), "%s", channels[i].name); break; } error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); error = EINVAL; for (i = 0; i < nitems(channels); i++) if (strcmp(channels[i].name, buf) == 0) { witness_channel = channels[i].channel; error = 0; break; } return (error); } static int sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS) { struct witness *w; struct sbuf *sb; int error; if (witness_watch < 1) { error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning)); return (error); } if (witness_cold) { error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold)); return (error); } error = 0; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, FULLGRAPH_SBUF_SIZE, req); if (sb == NULL) return 
(ENOMEM); sbuf_printf(sb, "\n"); mtx_lock_spin(&w_mtx); STAILQ_FOREACH(w, &w_all, w_list) w->w_displayed = 0; STAILQ_FOREACH(w, &w_all, w_list) witness_add_fullgraph(sb, w); mtx_unlock_spin(&w_mtx); /* * Close the sbuf and return to userland. */ error = sbuf_finish(sb); sbuf_delete(sb); return (error); } static int sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS) { int error, value; value = witness_watch; error = sysctl_handle_int(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (value > 1 || value < -1 || (witness_watch == -1 && value != witness_watch)) return (EINVAL); witness_watch = value; return (0); } static void witness_add_fullgraph(struct sbuf *sb, struct witness *w) { int i; if (w->w_displayed != 0 || (w->w_file == NULL && w->w_line == 0)) return; w->w_displayed = 1; WITNESS_INDEX_ASSERT(w->w_index); for (i = 1; i <= w_max_used_index; i++) { if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) { sbuf_printf(sb, "\"%s\",\"%s\"\n", w->w_name, w_data[i].w_name); witness_add_fullgraph(sb, &w_data[i]); } } } /* * A simple hash function. Takes a key pointer and a key size. If size == 0, * interprets the key as a string and reads until the null * terminator. Otherwise, reads the first size bytes. Returns an unsigned 32-bit * hash value computed from the key. */ static uint32_t witness_hash_djb2(const uint8_t *key, uint32_t size) { unsigned int hash = 5381; int i; /* hash = hash * 33 + key[i] */ if (size) for (i = 0; i < size; i++) hash = ((hash << 5) + hash) + (unsigned int)key[i]; else for (i = 0; key[i] != 0; i++) hash = ((hash << 5) + hash) + (unsigned int)key[i]; return (hash); } /* * Initializes the two witness hash tables. Called exactly once from * witness_initialize(). */ static void witness_init_hash_tables(void) { int i; MPASS(witness_cold); /* Initialize the hash tables. */ for (i = 0; i < WITNESS_HASH_SIZE; i++) w_hash.wh_array[i] = NULL; w_hash.wh_size = WITNESS_HASH_SIZE; w_hash.wh_count = 0; /* Initialize the lock order data hash. 
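The recurrence in witness_hash_djb2() is the classic djb2 string hash; a standalone sketch of the same computation and the bucket selection done by the name hash (the table size below is a stand-in, not the kernel's constant):

#include <stdio.h>
#include <stdint.h>

static uint32_t
djb2(const char *s)
{
	uint32_t hash = 5381;

	/* hash = hash * 33 + byte, exactly as in witness_hash_djb2(). */
	while (*s != '\0')
		hash = ((hash << 5) + hash) + (uint8_t)*s++;
	return (hash);
}

int
main(void)
{
	/* Bucket index, as in witness_hash_get(); 251 is a sample size. */
	printf("bucket %u\n", djb2("Giant") % 251);
	return (0);
}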
*/ w_lofree = NULL; for (i = 0; i < WITNESS_LO_DATA_COUNT; i++) { memset(&w_lodata[i], 0, sizeof(w_lodata[i])); w_lodata[i].wlod_next = w_lofree; w_lofree = &w_lodata[i]; } w_lohash.wloh_size = WITNESS_LO_HASH_SIZE; w_lohash.wloh_count = 0; for (i = 0; i < WITNESS_LO_HASH_SIZE; i++) w_lohash.wloh_array[i] = NULL; } static struct witness * witness_hash_get(const char *key) { struct witness *w; uint32_t hash; MPASS(key != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); hash = witness_hash_djb2(key, 0) % w_hash.wh_size; w = w_hash.wh_array[hash]; while (w != NULL) { if (strcmp(w->w_name, key) == 0) goto out; w = w->w_hash_next; } out: return (w); } static void witness_hash_put(struct witness *w) { uint32_t hash; MPASS(w != NULL); MPASS(w->w_name != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); KASSERT(witness_hash_get(w->w_name) == NULL, ("%s: trying to add a hash entry that already exists!", __func__)); KASSERT(w->w_hash_next == NULL, ("%s: w->w_hash_next != NULL", __func__)); hash = witness_hash_djb2(w->w_name, 0) % w_hash.wh_size; w->w_hash_next = w_hash.wh_array[hash]; w_hash.wh_array[hash] = w; w_hash.wh_count++; } static struct witness_lock_order_data * witness_lock_order_get(struct witness *parent, struct witness *child) { struct witness_lock_order_data *data = NULL; struct witness_lock_order_key key; unsigned int hash; MPASS(parent != NULL && child != NULL); key.from = parent->w_index; key.to = child->w_index; WITNESS_INDEX_ASSERT(key.from); WITNESS_INDEX_ASSERT(key.to); if ((w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN) == 0) goto out; hash = witness_hash_djb2((const char*)&key, sizeof(key)) % w_lohash.wloh_size; data = w_lohash.wloh_array[hash]; while (data != NULL) { if (witness_lock_order_key_equal(&data->wlod_key, &key)) break; data = data->wlod_next; } out: return (data); } /* * Verify that parent and child have a known relationship, are not the same, * and child is actually a child of parent. This is done without w_mtx * to avoid contention in the common case. */ static int witness_lock_order_check(struct witness *parent, struct witness *child) { if (parent != child && w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN && isitmychild(parent, child)) return (1); return (0); } static int witness_lock_order_add(struct witness *parent, struct witness *child) { struct witness_lock_order_data *data = NULL; struct witness_lock_order_key key; unsigned int hash; MPASS(parent != NULL && child != NULL); key.from = parent->w_index; key.to = child->w_index; WITNESS_INDEX_ASSERT(key.from); WITNESS_INDEX_ASSERT(key.to); if (w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN) return (1); hash = witness_hash_djb2((const char*)&key, sizeof(key)) % w_lohash.wloh_size; w_rmatrix[parent->w_index][child->w_index] |= WITNESS_LOCK_ORDER_KNOWN; data = w_lofree; if (data == NULL) return (0); w_lofree = data->wlod_next; data->wlod_next = w_lohash.wloh_array[hash]; data->wlod_key = key; w_lohash.wloh_array[hash] = data; w_lohash.wloh_count++; stack_zero(&data->wlod_stack); stack_save(&data->wlod_stack); return (1); } /* Call this whenever the structure of the witness graph changes. 
*/ static void witness_increment_graph_generation(void) { if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); w_generation++; } static int witness_output_drain(void *arg __unused, const char *data, int len) { witness_output("%.*s", len, data); return (len); } static void witness_debugger(int cond, const char *msg) { char buf[32]; struct sbuf sb; struct stack st; if (!cond) return; if (witness_trace) { sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); sbuf_set_drain(&sb, witness_output_drain, NULL); stack_zero(&st); stack_save(&st); witness_output("stack backtrace:\n"); stack_sbuf_print_ddb(&sb, &st); sbuf_finish(&sb); } #ifdef KDB if (witness_kdb) kdb_enter(KDB_WHY_WITNESS, msg); #endif } Index: head/sys/kern/sys_capability.c =================================================================== --- head/sys/kern/sys_capability.c (revision 326270) +++ head/sys/kern/sys_capability.c (revision 326271) @@ -1,655 +1,657 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2008-2011 Robert N. M. Watson * Copyright (c) 2010-2011 Jonathan Anderson * Copyright (c) 2012 FreeBSD Foundation * All rights reserved. * * This software was developed at the University of Cambridge Computer * Laboratory with support from a grant from Google, Inc. * * Portions of this software were developed by Pawel Jakub Dawidek under * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * FreeBSD kernel capability facility. * * Two kernel features are implemented here: capability mode, a sandboxed mode * of execution for processes, and capabilities, a refinement on file * descriptors that allows fine-grained control over operations on the file * descriptor. Collectively, these allow processes to run in the style of a * historic "capability system" in which they can use only resources * explicitly delegated to them. This model is enforced by restricting access * to global namespaces in capability mode. * * Capabilities wrap other file descriptor types, binding them to a constant * rights mask set when the capability is created. New capabilities may be * derived from existing capabilities, but only if they have the same or a * strict subset of the rights on the original capability. 
* * System calls permitted in capability mode are defined in capabilities.conf; * calls must be carefully audited for safety to ensure that they don't allow * escape from a sandbox. Some calls permit only a subset of operations in * capability mode -- for example, shm_open(2) is limited to creating * anonymous, rather than named, POSIX shared memory objects. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int trap_enotcap; SYSCTL_INT(_kern, OID_AUTO, trap_enotcap, CTLFLAG_RW, &trap_enotcap, 0, "Deliver SIGTRAP on ENOTCAPABLE"); #ifdef CAPABILITY_MODE #define IOCTLS_MAX_COUNT 256 /* XXX: Is 256 sane? */ FEATURE(security_capability_mode, "Capsicum Capability Mode"); /* * System call to enter capability mode for the process. */ int sys_cap_enter(struct thread *td, struct cap_enter_args *uap) { struct ucred *newcred, *oldcred; struct proc *p; if (IN_CAPABILITY_MODE(td)) return (0); newcred = crget(); p = td->td_proc; PROC_LOCK(p); oldcred = crcopysafe(p, newcred); newcred->cr_flags |= CRED_FLAG_CAPMODE; proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); } /* * System call to query whether the process is in capability mode. */ int sys_cap_getmode(struct thread *td, struct cap_getmode_args *uap) { u_int i; i = IN_CAPABILITY_MODE(td) ? 1 : 0; return (copyout(&i, uap->modep, sizeof(i))); } #else /* !CAPABILITY_MODE */ int sys_cap_enter(struct thread *td, struct cap_enter_args *uap) { return (ENOSYS); } int sys_cap_getmode(struct thread *td, struct cap_getmode_args *uap) { return (ENOSYS); } #endif /* CAPABILITY_MODE */ #ifdef CAPABILITIES FEATURE(security_capabilities, "Capsicum Capabilities"); MALLOC_DECLARE(M_FILECAPS); static inline int _cap_check(const cap_rights_t *havep, const cap_rights_t *needp, enum ktr_cap_fail_type type) { if (!cap_rights_contains(havep, needp)) { #ifdef KTRACE if (KTRPOINT(curthread, KTR_CAPFAIL)) ktrcapfail(type, needp, havep); #endif return (ENOTCAPABLE); } return (0); } /* * Test whether a capability grants the requested rights. */ int cap_check(const cap_rights_t *havep, const cap_rights_t *needp) { return (_cap_check(havep, needp, CAPFAIL_NOTCAPABLE)); } /* * Convert capability rights into VM access flags. */ u_char cap_rights_to_vmprot(cap_rights_t *havep) { u_char maxprot; maxprot = VM_PROT_NONE; if (cap_rights_is_set(havep, CAP_MMAP_R)) maxprot |= VM_PROT_READ; if (cap_rights_is_set(havep, CAP_MMAP_W)) maxprot |= VM_PROT_WRITE; if (cap_rights_is_set(havep, CAP_MMAP_X)) maxprot |= VM_PROT_EXECUTE; return (maxprot); } /* * Extract rights from a capability for monitoring purposes -- not for use in * any other way, as we want to keep all capability permission evaluation in * this one file. 
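From userland the capability-mode machinery above is reached via cap_enter(2); a minimal demonstration (path chosen for illustration):

#include <sys/capsicum.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>

int
main(void)
{
	if (cap_enter() == -1)
		err(1, "cap_enter");
	/* Global namespaces are now closed: opening a path must fail. */
	if (open("/etc/passwd", O_RDONLY) == -1 && errno == ECAPMODE)
		printf("open(2) rejected with ECAPMODE\n");
	return (0);
}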
*/ cap_rights_t * cap_rights_fde(struct filedescent *fde) { return (&fde->fde_rights); } cap_rights_t * cap_rights(struct filedesc *fdp, int fd) { return (cap_rights_fde(&fdp->fd_ofiles[fd])); } int kern_cap_rights_limit(struct thread *td, int fd, cap_rights_t *rights) { struct filedesc *fdp; int error; fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if (fget_locked(fdp, fd) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } error = _cap_check(cap_rights(fdp, fd), rights, CAPFAIL_INCREASE); if (error == 0) { fdp->fd_ofiles[fd].fde_rights = *rights; if (!cap_rights_is_set(rights, CAP_IOCTL)) { free(fdp->fd_ofiles[fd].fde_ioctls, M_FILECAPS); fdp->fd_ofiles[fd].fde_ioctls = NULL; fdp->fd_ofiles[fd].fde_nioctls = 0; } if (!cap_rights_is_set(rights, CAP_FCNTL)) fdp->fd_ofiles[fd].fde_fcntls = 0; } FILEDESC_XUNLOCK(fdp); return (error); } /* * System call to limit rights of the given capability. */ int sys_cap_rights_limit(struct thread *td, struct cap_rights_limit_args *uap) { cap_rights_t rights; int error, version; cap_rights_init(&rights); error = copyin(uap->rightsp, &rights, sizeof(rights.cr_rights[0])); if (error != 0) return (error); version = CAPVER(&rights); if (version != CAP_RIGHTS_VERSION_00) return (EINVAL); error = copyin(uap->rightsp, &rights, sizeof(rights.cr_rights[0]) * CAPARSIZE(&rights)); if (error != 0) return (error); /* Check for race. */ if (CAPVER(&rights) != version) return (EINVAL); if (!cap_rights_is_valid(&rights)) return (EINVAL); if (version != CAP_RIGHTS_VERSION) { rights.cr_rights[0] &= ~(0x3ULL << 62); rights.cr_rights[0] |= ((uint64_t)CAP_RIGHTS_VERSION << 62); } #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrcaprights(&rights); #endif AUDIT_ARG_FD(uap->fd); AUDIT_ARG_RIGHTS(&rights); return (kern_cap_rights_limit(td, uap->fd, &rights)); } /* * System call to query the rights mask associated with a capability. */ int sys___cap_rights_get(struct thread *td, struct __cap_rights_get_args *uap) { struct filedesc *fdp; cap_rights_t rights; int error, fd, i, n; if (uap->version != CAP_RIGHTS_VERSION_00) return (EINVAL); fd = uap->fd; AUDIT_ARG_FD(fd); fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); if (fget_locked(fdp, fd) == NULL) { FILEDESC_SUNLOCK(fdp); return (EBADF); } rights = *cap_rights(fdp, fd); FILEDESC_SUNLOCK(fdp); n = uap->version + 2; if (uap->version != CAPVER(&rights)) { /* * For older versions we need to check if the descriptor * doesn't contain rights not understood by the caller. * If it does, we have to return an error. */ for (i = n; i < CAPARSIZE(&rights); i++) { if ((rights.cr_rights[i] & ~(0x7FULL << 57)) != 0) return (EINVAL); } } error = copyout(&rights, uap->rightsp, sizeof(rights.cr_rights[0]) * n); #ifdef KTRACE if (error == 0 && KTRPOINT(td, KTR_STRUCT)) ktrcaprights(&rights); #endif return (error); } /* * Test whether a capability grants the given ioctl command. * If descriptor doesn't have CAP_IOCTL, then ioctls list is empty and * ENOTCAPABLE will be returned. */ int cap_ioctl_check(struct filedesc *fdp, int fd, u_long cmd) { u_long *cmds; ssize_t ncmds; long i; FILEDESC_LOCK_ASSERT(fdp); KASSERT(fd >= 0 && fd < fdp->fd_nfiles, ("%s: invalid fd=%d", __func__, fd)); ncmds = fdp->fd_ofiles[fd].fde_nioctls; if (ncmds == -1) return (0); cmds = fdp->fd_ofiles[fd].fde_ioctls; for (i = 0; i < ncmds; i++) { if (cmds[i] == cmd) return (0); } return (ENOTCAPABLE); } /* * Check if the current ioctls list can be replaced by the new one. 
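The kern_cap_rights_limit() path above is driven from userland by cap_rights_limit(2); a sketch (file name illustrative):

#include <sys/capsicum.h>
#include <err.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	cap_rights_t rights;
	int fd;

	fd = open("/tmp/example", O_RDWR | O_CREAT, 0600);
	if (fd == -1)
		err(1, "open");
	/* Shrink the descriptor to read-only; a later write(2) then
	 * fails with ENOTCAPABLE. */
	cap_rights_init(&rights, CAP_READ, CAP_SEEK);
	if (cap_rights_limit(fd, &rights) == -1)
		err(1, "cap_rights_limit");
	if (write(fd, "x", 1) == -1)
		warn("write rejected as expected");
	return (0);
}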
*/ static int cap_ioctl_limit_check(struct filedesc *fdp, int fd, const u_long *cmds, size_t ncmds) { u_long *ocmds; ssize_t oncmds; u_long i; long j; oncmds = fdp->fd_ofiles[fd].fde_nioctls; if (oncmds == -1) return (0); if (oncmds < (ssize_t)ncmds) return (ENOTCAPABLE); ocmds = fdp->fd_ofiles[fd].fde_ioctls; for (i = 0; i < ncmds; i++) { for (j = 0; j < oncmds; j++) { if (cmds[i] == ocmds[j]) break; } if (j == oncmds) return (ENOTCAPABLE); } return (0); } int kern_cap_ioctls_limit(struct thread *td, int fd, u_long *cmds, size_t ncmds) { struct filedesc *fdp; u_long *ocmds; int error; AUDIT_ARG_FD(fd); if (ncmds > IOCTLS_MAX_COUNT) { error = EINVAL; goto out_free; } fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if (fget_locked(fdp, fd) == NULL) { error = EBADF; goto out; } error = cap_ioctl_limit_check(fdp, fd, cmds, ncmds); if (error != 0) goto out; ocmds = fdp->fd_ofiles[fd].fde_ioctls; fdp->fd_ofiles[fd].fde_ioctls = cmds; fdp->fd_ofiles[fd].fde_nioctls = ncmds; cmds = ocmds; error = 0; out: FILEDESC_XUNLOCK(fdp); out_free: free(cmds, M_FILECAPS); return (error); } int sys_cap_ioctls_limit(struct thread *td, struct cap_ioctls_limit_args *uap) { u_long *cmds; size_t ncmds; int error; ncmds = uap->ncmds; if (ncmds > IOCTLS_MAX_COUNT) return (EINVAL); if (ncmds == 0) { cmds = NULL; } else { cmds = malloc(sizeof(cmds[0]) * ncmds, M_FILECAPS, M_WAITOK); error = copyin(uap->cmds, cmds, sizeof(cmds[0]) * ncmds); if (error != 0) { free(cmds, M_FILECAPS); return (error); } } return (kern_cap_ioctls_limit(td, uap->fd, cmds, ncmds)); } int sys_cap_ioctls_get(struct thread *td, struct cap_ioctls_get_args *uap) { struct filedesc *fdp; struct filedescent *fdep; u_long *cmdsp, *dstcmds; size_t maxcmds, ncmds; int16_t count; int error, fd; fd = uap->fd; dstcmds = uap->cmds; maxcmds = uap->maxcmds; AUDIT_ARG_FD(fd); fdp = td->td_proc->p_fd; cmdsp = NULL; if (dstcmds != NULL) { cmdsp = malloc(sizeof(cmdsp[0]) * IOCTLS_MAX_COUNT, M_FILECAPS, M_WAITOK | M_ZERO); } FILEDESC_SLOCK(fdp); fdep = fdeget_locked(fdp, fd); if (fdep == NULL) { error = EBADF; FILEDESC_SUNLOCK(fdp); goto out; } count = fdep->fde_nioctls; if (count != -1 && cmdsp != NULL) { ncmds = MIN(count, maxcmds); memcpy(cmdsp, fdep->fde_ioctls, sizeof(cmdsp[0]) * ncmds); } FILEDESC_SUNLOCK(fdp); /* * If all ioctls are allowed (fde_nioctls == -1 && fde_ioctls == NULL) * the only sane thing we can do is to not populate the given array and * return CAP_IOCTLS_ALL. */ if (count != -1) { if (cmdsp != NULL) { error = copyout(cmdsp, dstcmds, sizeof(cmdsp[0]) * ncmds); if (error != 0) goto out; } td->td_retval[0] = count; } else { td->td_retval[0] = CAP_IOCTLS_ALL; } error = 0; out: free(cmdsp, M_FILECAPS); return (error); } /* * Test whether a capability grants the given fcntl command. 
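The corresponding userland entry point for the ioctl whitelist managed above is cap_ioctls_limit(2); a sketch (FIONREAD chosen arbitrarily):

#include <sys/capsicum.h>
#include <sys/ioctl.h>
#include <err.h>
#include <fcntl.h>

int
main(void)
{
	const unsigned long cmds[] = { FIONREAD };
	int fd;

	fd = open("/dev/null", O_RDONLY);
	if (fd == -1)
		err(1, "open");
	/* Only FIONREAD survives; any other ioctl now gets ENOTCAPABLE. */
	if (cap_ioctls_limit(fd, cmds, 1) == -1)
		err(1, "cap_ioctls_limit");
	return (0);
}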
*/ int cap_fcntl_check_fde(struct filedescent *fde, int cmd) { uint32_t fcntlcap; fcntlcap = (1 << cmd); KASSERT((CAP_FCNTL_ALL & fcntlcap) != 0, ("Unsupported fcntl=%d.", cmd)); if ((fde->fde_fcntls & fcntlcap) != 0) return (0); return (ENOTCAPABLE); } int cap_fcntl_check(struct filedesc *fdp, int fd, int cmd) { KASSERT(fd >= 0 && fd < fdp->fd_nfiles, ("%s: invalid fd=%d", __func__, fd)); return (cap_fcntl_check_fde(&fdp->fd_ofiles[fd], cmd)); } int sys_cap_fcntls_limit(struct thread *td, struct cap_fcntls_limit_args *uap) { struct filedesc *fdp; uint32_t fcntlrights; int fd; fd = uap->fd; fcntlrights = uap->fcntlrights; AUDIT_ARG_FD(fd); AUDIT_ARG_FCNTL_RIGHTS(fcntlrights); if ((fcntlrights & ~CAP_FCNTL_ALL) != 0) return (EINVAL); fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if (fget_locked(fdp, fd) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } if ((fcntlrights & ~fdp->fd_ofiles[fd].fde_fcntls) != 0) { FILEDESC_XUNLOCK(fdp); return (ENOTCAPABLE); } fdp->fd_ofiles[fd].fde_fcntls = fcntlrights; FILEDESC_XUNLOCK(fdp); return (0); } int sys_cap_fcntls_get(struct thread *td, struct cap_fcntls_get_args *uap) { struct filedesc *fdp; uint32_t rights; int fd; fd = uap->fd; AUDIT_ARG_FD(fd); fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); if (fget_locked(fdp, fd) == NULL) { FILEDESC_SUNLOCK(fdp); return (EBADF); } rights = fdp->fd_ofiles[fd].fde_fcntls; FILEDESC_SUNLOCK(fdp); return (copyout(&rights, uap->fcntlrightsp, sizeof(rights))); } #else /* !CAPABILITIES */ /* * Stub Capability functions for when options CAPABILITIES isn't compiled * into the kernel. */ int sys_cap_rights_limit(struct thread *td, struct cap_rights_limit_args *uap) { return (ENOSYS); } int sys___cap_rights_get(struct thread *td, struct __cap_rights_get_args *uap) { return (ENOSYS); } int sys_cap_ioctls_limit(struct thread *td, struct cap_ioctls_limit_args *uap) { return (ENOSYS); } int sys_cap_ioctls_get(struct thread *td, struct cap_ioctls_get_args *uap) { return (ENOSYS); } int sys_cap_fcntls_limit(struct thread *td, struct cap_fcntls_limit_args *uap) { return (ENOSYS); } int sys_cap_fcntls_get(struct thread *td, struct cap_fcntls_get_args *uap) { return (ENOSYS); } #endif /* CAPABILITIES */ Index: head/sys/kern/sys_pipe.c =================================================================== --- head/sys/kern/sys_pipe.c (revision 326270) +++ head/sys/kern/sys_pipe.c (revision 326271) @@ -1,1841 +1,1843 @@ /*- + * SPDX-License-Identifier: BSD-4-Clause + * * Copyright (c) 1996 John S. Dyson * Copyright (c) 2012 Giovanni Trematerra * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. Modifications may be freely made to this file if the above conditions * are met. */ /* * This file contains a high-performance replacement for the socket-based * pipes scheme originally used in FreeBSD/4.4Lite. It does not support * all features of sockets, but does do everything that pipes normally * do. 
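For completeness, the fcntl rights checked by cap_fcntl_check_fde() above are limited from userland with cap_fcntls_limit(2); a sketch:

#include <sys/capsicum.h>
#include <err.h>
#include <fcntl.h>

int
main(void)
{
	int fd;

	fd = open("/dev/null", O_RDONLY);
	if (fd == -1)
		err(1, "open");
	/* Permit F_GETFL only; F_SETFL on fd now fails with ENOTCAPABLE. */
	if (cap_fcntls_limit(fd, CAP_FCNTL_GETFL) == -1)
		err(1, "cap_fcntls_limit");
	return (0);
}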
*/ /* * This code has two modes of operation, a small write mode and a large * write mode. The small write mode acts like conventional pipes with * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT * and PIPE_SIZE in size, the sending process pins the underlying pages in * memory, and the receiving process copies directly from these pinned pages * in the sending process. * * If the sending process receives a signal, it is possible that it will * go away, and certainly its address space can change, because control * is returned back to the user-mode side. In that case, the pipe code * arranges to copy the buffer supplied by the user process, to a pageable * kernel buffer, and the receiving process will grab the data from the * pageable kernel buffer. Since signals don't happen all that often, * the copy operation is normally eliminated. * * The constant PIPE_MINDIRECT is chosen to make sure that buffering will * happen for small transfers so that the system will not spend all of * its time context switching. * * In order to limit the resource use of pipes, two sysctls exist: * * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable * address space available to us in pipe_map. This value is normally * autotuned, but may also be loader tuned. * * kern.ipc.pipekva - This read-only sysctl tracks the current amount of * memory in use by pipes. * * Based on how large pipekva is relative to maxpipekva, the following * will happen: * * 0% - 50%: * New pipes are given 16K of memory backing, pipes may dynamically * grow to as large as 64K where needed. * 50% - 75%: * New pipes are given 4K (or PAGE_SIZE) of memory backing, * existing pipes may NOT grow. * 75% - 100%: * New pipes are given 4K (or PAGE_SIZE) of memory backing, * existing pipes will be shrunk down to 4K whenever possible. * * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0. If * that is set, the only resize that will occur is the 0 -> SMALL_PIPE_SIZE * resize which MUST occur for reverse-direction pipes when they are * first used. * * Additional information about the current state of pipes may be obtained * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail, * and kern.ipc.piperesizefail. * * Locking rules: There are two locks present here: A mutex, used via * PIPE_LOCK, and a flag, used via pipelock(). All locking is done via * the flag, as mutexes can not persist over uiomove. The mutex * exists only to guard access to the flag, and is not in itself a * locking mechanism. Also note that there is only a single mutex for * both directions of a pipe. * * As pipelock() may have to sleep before it can acquire the flag, it * is important to reread all data after a call to pipelock(); everything * in the structure may have changed. */ #include "opt_compat.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Use this define if you want to disable *fancy* VM things. Expect an * approx 30% decrease in transfer rate. This could be useful for * NetBSD or OpenBSD. */ /* #define PIPE_NODIRECT */ #define PIPE_PEER(pipe) \ (((pipe)->pipe_state & PIPE_NAMED) ? 
(pipe) : ((pipe)->pipe_peer)) /* * interfaces to the outside world */ static fo_rdwr_t pipe_read; static fo_rdwr_t pipe_write; static fo_truncate_t pipe_truncate; static fo_ioctl_t pipe_ioctl; static fo_poll_t pipe_poll; static fo_kqfilter_t pipe_kqfilter; static fo_stat_t pipe_stat; static fo_close_t pipe_close; static fo_chmod_t pipe_chmod; static fo_chown_t pipe_chown; static fo_fill_kinfo_t pipe_fill_kinfo; struct fileops pipeops = { .fo_read = pipe_read, .fo_write = pipe_write, .fo_truncate = pipe_truncate, .fo_ioctl = pipe_ioctl, .fo_poll = pipe_poll, .fo_kqfilter = pipe_kqfilter, .fo_stat = pipe_stat, .fo_close = pipe_close, .fo_chmod = pipe_chmod, .fo_chown = pipe_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = pipe_fill_kinfo, .fo_flags = DFLAG_PASSABLE }; static void filt_pipedetach(struct knote *kn); static void filt_pipedetach_notsup(struct knote *kn); static int filt_pipenotsup(struct knote *kn, long hint); static int filt_piperead(struct knote *kn, long hint); static int filt_pipewrite(struct knote *kn, long hint); static struct filterops pipe_nfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach_notsup, .f_event = filt_pipenotsup }; static struct filterops pipe_rfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach, .f_event = filt_piperead }; static struct filterops pipe_wfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach, .f_event = filt_pipewrite }; /* * Default pipe buffer size(s), this can be kind-of large now because pipe * space is pageable. The pipe code will try to maintain locality of * reference for performance reasons, so small amounts of outstanding I/O * will not wipe the cache. */ #define MINPIPESIZE (PIPE_SIZE/3) #define MAXPIPESIZE (2*PIPE_SIZE/3) static long amountpipekva; static int pipefragretry; static int pipeallocfail; static int piperesizefail; static int piperesizeallowed = 1; SYSCTL_LONG(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxpipekva, 0, "Pipe KVA limit"); SYSCTL_LONG(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD, &amountpipekva, 0, "Pipe KVA usage"); SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD, &pipefragretry, 0, "Pipe allocation retries due to fragmentation"); SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD, &pipeallocfail, 0, "Pipe allocation failures"); SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD, &piperesizefail, 0, "Pipe resize failures"); SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW, &piperesizeallowed, 0, "Pipe resizing allowed"); static void pipeinit(void *dummy __unused); static void pipeclose(struct pipe *cpipe); static void pipe_free_kmem(struct pipe *cpipe); static void pipe_create(struct pipe *pipe, int backing); static void pipe_paircreate(struct thread *td, struct pipepair **p_pp); static __inline int pipelock(struct pipe *cpipe, int catch); static __inline void pipeunlock(struct pipe *cpipe); #ifndef PIPE_NODIRECT static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio); static void pipe_destroy_write_buffer(struct pipe *wpipe); static int pipe_direct_write(struct pipe *wpipe, struct uio *uio); static void pipe_clone_write_buffer(struct pipe *wpipe); #endif static int pipespace(struct pipe *cpipe, int size); static int pipespace_new(struct pipe *cpipe, int size); static int pipe_zone_ctor(void *mem, int size, void *arg, int flags); static int pipe_zone_init(void *mem, int size, int flags); static void pipe_zone_fini(void *mem, int size); static uma_zone_t pipe_zone; static struct unrhdr *pipeino_unr; static dev_t pipedev_ino; 
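A compressed sketch of the kva sizing policy described in the comment above (the real decisions live in pipespace_new() and pipe_create(), which are outside this hunk; the function and parameter names here are hypothetical):

/* Pick an initial backing size from current pipe kva pressure. */
static int
pick_backing_size(long kva_inuse, long kva_limit)
{
	if (kva_inuse > kva_limit / 2)
		return (SMALL_PIPE_SIZE);	/* above 50%: PAGE_SIZE backing */
	return (PIPE_SIZE);			/* default 16K; may grow to 64K */
}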
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL); static void pipeinit(void *dummy __unused) { pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair), pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini, UMA_ALIGN_PTR, 0); KASSERT(pipe_zone != NULL, ("pipe_zone not initialized")); pipeino_unr = new_unrhdr(1, INT32_MAX, NULL); KASSERT(pipeino_unr != NULL, ("pipe fake inodes not initialized")); pipedev_ino = devfs_alloc_cdp_inode(); KASSERT(pipedev_ino > 0, ("pipe dev inode not initialized")); } static int pipe_zone_ctor(void *mem, int size, void *arg, int flags) { struct pipepair *pp; struct pipe *rpipe, *wpipe; KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size")); pp = (struct pipepair *)mem; /* * We zero both pipe endpoints to make sure all the kmem pointers * are NULL, flag fields are zero'd, etc. We timestamp both * endpoints with the same time. */ rpipe = &pp->pp_rpipe; bzero(rpipe, sizeof(*rpipe)); vfs_timestamp(&rpipe->pipe_ctime); rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime; wpipe = &pp->pp_wpipe; bzero(wpipe, sizeof(*wpipe)); wpipe->pipe_ctime = rpipe->pipe_ctime; wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime; rpipe->pipe_peer = wpipe; rpipe->pipe_pair = pp; wpipe->pipe_peer = rpipe; wpipe->pipe_pair = pp; /* * Mark both endpoints as present; they will later get free'd * one at a time. When both are free'd, then the whole pair * is released. */ rpipe->pipe_present = PIPE_ACTIVE; wpipe->pipe_present = PIPE_ACTIVE; /* * Eventually, the MAC Framework may initialize the label * in ctor or init, but for now we do it elsewhere to avoid * blocking in ctor or init. */ pp->pp_label = NULL; return (0); } static int pipe_zone_init(void *mem, int size, int flags) { struct pipepair *pp; KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size")); pp = (struct pipepair *)mem; mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_NEW); return (0); } static void pipe_zone_fini(void *mem, int size) { struct pipepair *pp; KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size")); pp = (struct pipepair *)mem; mtx_destroy(&pp->pp_mtx); } static void pipe_paircreate(struct thread *td, struct pipepair **p_pp) { struct pipepair *pp; struct pipe *rpipe, *wpipe; *p_pp = pp = uma_zalloc(pipe_zone, M_WAITOK); #ifdef MAC /* * The MAC label is shared between the connected endpoints. As a * result mac_pipe_init() and mac_pipe_create() are called once * for the pair, and not on the endpoints. */ mac_pipe_init(pp); mac_pipe_create(td->td_ucred, pp); #endif rpipe = &pp->pp_rpipe; wpipe = &pp->pp_wpipe; knlist_init_mtx(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe)); knlist_init_mtx(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe)); /* Only the forward direction pipe is backed by default */ pipe_create(rpipe, 1); pipe_create(wpipe, 0); rpipe->pipe_state |= PIPE_DIRECTOK; wpipe->pipe_state |= PIPE_DIRECTOK; } void pipe_named_ctor(struct pipe **ppipe, struct thread *td) { struct pipepair *pp; pipe_paircreate(td, &pp); pp->pp_rpipe.pipe_state |= PIPE_NAMED; *ppipe = &pp->pp_rpipe; } void pipe_dtor(struct pipe *dpipe) { struct pipe *peer; ino_t ino; ino = dpipe->pipe_ino; peer = (dpipe->pipe_state & PIPE_NAMED) != 0 ? dpipe->pipe_peer : NULL; funsetown(&dpipe->pipe_sigio); pipeclose(dpipe); if (peer != NULL) { funsetown(&peer->pipe_sigio); pipeclose(peer); } if (ino != 0 && ino != (ino_t)-1) free_unr(pipeino_unr, ino); } /* * The pipe system call for the DTYPE_PIPE type of pipes. If we fail, let * the zone pick up the pieces via pipeclose().
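 *
 * From userland this path is reached through pipe(2) and pipe2(2); a
 * minimal sketch, assuming the documented pipe2(2) interface:
 *
 *	int fds[2];
 *
 *	if (pipe2(fds, O_CLOEXEC | O_NONBLOCK) == -1)
 *		err(1, "pipe2");
 *
 * (fds is an illustrative name; <unistd.h>, <fcntl.h> and <err.h>
 * supply the declarations.) Flags other than O_CLOEXEC and O_NONBLOCK
 * are rejected with EINVAL by sys_pipe2() below.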
*/ int kern_pipe(struct thread *td, int fildes[2], int flags, struct filecaps *fcaps1, struct filecaps *fcaps2) { struct file *rf, *wf; struct pipe *rpipe, *wpipe; struct pipepair *pp; int fd, fflags, error; pipe_paircreate(td, &pp); rpipe = &pp->pp_rpipe; wpipe = &pp->pp_wpipe; error = falloc_caps(td, &rf, &fd, flags, fcaps1); if (error) { pipeclose(rpipe); pipeclose(wpipe); return (error); } /* An extra reference on `rf' has been held for us by falloc_caps(). */ fildes[0] = fd; fflags = FREAD | FWRITE; if ((flags & O_NONBLOCK) != 0) fflags |= FNONBLOCK; /* * Warning: once we've gotten past allocation of the fd for the * read-side, we can only drop the read side via fdrop() in order * to avoid races against processes which manage to dup() the read * side while we are blocked trying to allocate the write side. */ finit(rf, fflags, DTYPE_PIPE, rpipe, &pipeops); error = falloc_caps(td, &wf, &fd, flags, fcaps2); if (error) { fdclose(td, rf, fildes[0]); fdrop(rf, td); /* rpipe has been closed by fdrop(). */ pipeclose(wpipe); return (error); } /* An extra reference on `wf' has been held for us by falloc_caps(). */ finit(wf, fflags, DTYPE_PIPE, wpipe, &pipeops); fdrop(wf, td); fildes[1] = fd; fdrop(rf, td); return (0); } #ifdef COMPAT_FREEBSD10 /* ARGSUSED */ int freebsd10_pipe(struct thread *td, struct freebsd10_pipe_args *uap __unused) { int error; int fildes[2]; error = kern_pipe(td, fildes, 0, NULL, NULL); if (error) return (error); td->td_retval[0] = fildes[0]; td->td_retval[1] = fildes[1]; return (0); } #endif int sys_pipe2(struct thread *td, struct pipe2_args *uap) { int error, fildes[2]; if (uap->flags & ~(O_CLOEXEC | O_NONBLOCK)) return (EINVAL); error = kern_pipe(td, fildes, uap->flags, NULL, NULL); if (error) return (error); error = copyout(fildes, uap->fildes, 2 * sizeof(int)); if (error) { (void)kern_close(td, fildes[0]); (void)kern_close(td, fildes[1]); } return (error); } /* * Allocate kva for pipe circular buffer, the space is pageable * This routine will 'realloc' the size of a pipe safely, if it fails * it will retain the old buffer. * If it fails it will return ENOMEM. 
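 *
 * The resize sites below rely on that guarantee: pipe_read() and
 * pipe_write() simply ignore a failed resize, e.g.
 *
 *	(void)pipespace(rpipe, SMALL_PIPE_SIZE);
 *
 * since on ENOMEM the old buffer, and any data queued in it, are left
 * untouched.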
*/ static int pipespace_new(cpipe, size) struct pipe *cpipe; int size; { caddr_t buffer; int error, cnt, firstseg; static int curfail = 0; static struct timeval lastfail; KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked")); KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW), ("pipespace: resize of direct writes not allowed")); retry: cnt = cpipe->pipe_buffer.cnt; if (cnt > size) size = cnt; size = round_page(size); buffer = (caddr_t) vm_map_min(pipe_map); error = vm_map_find(pipe_map, NULL, 0, (vm_offset_t *) &buffer, size, 0, VMFS_ANY_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error != KERN_SUCCESS) { if ((cpipe->pipe_buffer.buffer == NULL) && (size > SMALL_PIPE_SIZE)) { size = SMALL_PIPE_SIZE; pipefragretry++; goto retry; } if (cpipe->pipe_buffer.buffer == NULL) { pipeallocfail++; if (ppsratecheck(&lastfail, &curfail, 1)) printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n"); } else { piperesizefail++; } return (ENOMEM); } /* copy data, then free old resources if we're resizing */ if (cnt > 0) { if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) { firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out; bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out], buffer, firstseg); if ((cnt - firstseg) > 0) bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg], cpipe->pipe_buffer.in); } else { bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out], buffer, cnt); } } pipe_free_kmem(cpipe); cpipe->pipe_buffer.buffer = buffer; cpipe->pipe_buffer.size = size; cpipe->pipe_buffer.in = cnt; cpipe->pipe_buffer.out = 0; cpipe->pipe_buffer.cnt = cnt; atomic_add_long(&amountpipekva, cpipe->pipe_buffer.size); return (0); } /* * Wrapper for pipespace_new() that performs locking assertions. */ static int pipespace(cpipe, size) struct pipe *cpipe; int size; { KASSERT(cpipe->pipe_state & PIPE_LOCKFL, ("Unlocked pipe passed to pipespace")); return (pipespace_new(cpipe, size)); } /* * lock a pipe for I/O, blocking other access */ static __inline int pipelock(cpipe, catch) struct pipe *cpipe; int catch; { int error; PIPE_LOCK_ASSERT(cpipe, MA_OWNED); while (cpipe->pipe_state & PIPE_LOCKFL) { cpipe->pipe_state |= PIPE_LWANT; error = msleep(cpipe, PIPE_MTX(cpipe), catch ? (PRIBIO | PCATCH) : PRIBIO, "pipelk", 0); if (error != 0) return (error); } cpipe->pipe_state |= PIPE_LOCKFL; return (0); } /* * unlock a pipe I/O lock */ static __inline void pipeunlock(cpipe) struct pipe *cpipe; { PIPE_LOCK_ASSERT(cpipe, MA_OWNED); KASSERT(cpipe->pipe_state & PIPE_LOCKFL, ("Unlocked pipe passed to pipeunlock")); cpipe->pipe_state &= ~PIPE_LOCKFL; if (cpipe->pipe_state & PIPE_LWANT) { cpipe->pipe_state &= ~PIPE_LWANT; wakeup(cpipe); } } void pipeselwakeup(cpipe) struct pipe *cpipe; { PIPE_LOCK_ASSERT(cpipe, MA_OWNED); if (cpipe->pipe_state & PIPE_SEL) { selwakeuppri(&cpipe->pipe_sel, PSOCK); if (!SEL_WAITING(&cpipe->pipe_sel)) cpipe->pipe_state &= ~PIPE_SEL; } if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) pgsigio(&cpipe->pipe_sigio, SIGIO, 0); KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0); } /* * Initialize and allocate VM and memory for pipe. The structure * will start out zero'd from the ctor, so we just manage the kmem. */ static void pipe_create(pipe, backing) struct pipe *pipe; int backing; { if (backing) { /* * Note that these functions can fail if pipe map is exhausted * (as a result of too many pipes created), but we ignore the * error as it is not fatal and could be provoked by * unprivileged users. The only consequence is worse performance * with given pipe. 
*/ if (amountpipekva > maxpipekva / 2) (void)pipespace_new(pipe, SMALL_PIPE_SIZE); else (void)pipespace_new(pipe, PIPE_SIZE); } pipe->pipe_ino = -1; } /* ARGSUSED */ static int pipe_read(fp, uio, active_cred, flags, td) struct file *fp; struct uio *uio; struct ucred *active_cred; struct thread *td; int flags; { struct pipe *rpipe; int error; int nread = 0; int size; rpipe = fp->f_data; PIPE_LOCK(rpipe); ++rpipe->pipe_busy; error = pipelock(rpipe, 1); if (error) goto unlocked_error; #ifdef MAC error = mac_pipe_check_read(active_cred, rpipe->pipe_pair); if (error) goto locked_error; #endif if (amountpipekva > (3 * maxpipekva) / 4) { if (!(rpipe->pipe_state & PIPE_DIRECTW) && (rpipe->pipe_buffer.size > SMALL_PIPE_SIZE) && (rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) && (piperesizeallowed == 1)) { PIPE_UNLOCK(rpipe); pipespace(rpipe, SMALL_PIPE_SIZE); PIPE_LOCK(rpipe); } } while (uio->uio_resid) { /* * normal pipe buffer receive */ if (rpipe->pipe_buffer.cnt > 0) { size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; if (size > rpipe->pipe_buffer.cnt) size = rpipe->pipe_buffer.cnt; if (size > uio->uio_resid) size = uio->uio_resid; PIPE_UNLOCK(rpipe); error = uiomove( &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], size, uio); PIPE_LOCK(rpipe); if (error) break; rpipe->pipe_buffer.out += size; if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) rpipe->pipe_buffer.out = 0; rpipe->pipe_buffer.cnt -= size; /* * If there is no more to read in the pipe, reset * its pointers to the beginning. This improves * cache hit stats. */ if (rpipe->pipe_buffer.cnt == 0) { rpipe->pipe_buffer.in = 0; rpipe->pipe_buffer.out = 0; } nread += size; #ifndef PIPE_NODIRECT /* * Direct copy, bypassing a kernel buffer. */ } else if ((size = rpipe->pipe_map.cnt) && (rpipe->pipe_state & PIPE_DIRECTW)) { if (size > uio->uio_resid) size = (u_int) uio->uio_resid; PIPE_UNLOCK(rpipe); error = uiomove_fromphys(rpipe->pipe_map.ms, rpipe->pipe_map.pos, size, uio); PIPE_LOCK(rpipe); if (error) break; nread += size; rpipe->pipe_map.pos += size; rpipe->pipe_map.cnt -= size; if (rpipe->pipe_map.cnt == 0) { rpipe->pipe_state &= ~(PIPE_DIRECTW|PIPE_WANTW); wakeup(rpipe); } #endif } else { /* * detect EOF condition * read returns 0 on EOF, no need to set error */ if (rpipe->pipe_state & PIPE_EOF) break; /* * If the "write-side" has been blocked, wake it up now. */ if (rpipe->pipe_state & PIPE_WANTW) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } /* * Break if some data was read. */ if (nread > 0) break; /* * Unlock the pipe buffer for our remaining processing. * We will either break out with an error or we will * sleep and relock to loop. */ pipeunlock(rpipe); /* * Handle non-blocking mode operation or * wait for more data. */ if (fp->f_flag & FNONBLOCK) { error = EAGAIN; } else { rpipe->pipe_state |= PIPE_WANTR; if ((error = msleep(rpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "piperd", 0)) == 0) error = pipelock(rpipe, 1); } if (error) goto unlocked_error; } } #ifdef MAC locked_error: #endif pipeunlock(rpipe); /* XXX: should probably do this before getting any locks. */ if (error == 0) vfs_timestamp(&rpipe->pipe_atime); unlocked_error: --rpipe->pipe_busy; /* * PIPE_WANT processing only makes sense if pipe_busy is 0. */ if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); wakeup(rpipe); } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { /* * Handle write blocking hysteresis. 
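 *
 * That is, a sleeping writer is only woken once the buffer has drained
 * below MINPIPESIZE (one third of PIPE_SIZE), rather than after every
 * read; e.g. with a PIPE_SIZE of 16384 the wakeup fires once fewer
 * than 5461 bytes remain queued, which avoids ping-ponging between
 * reader and writer.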
*/ if (rpipe->pipe_state & PIPE_WANTW) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } } if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF) pipeselwakeup(rpipe); PIPE_UNLOCK(rpipe); return (error); } #ifndef PIPE_NODIRECT /* * Map the sending processes' buffer into kernel space and wire it. * This is similar to a physical write operation. */ static int pipe_build_write_buffer(wpipe, uio) struct pipe *wpipe; struct uio *uio; { u_int size; int i; PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED); KASSERT(wpipe->pipe_state & PIPE_DIRECTW, ("Clone attempt on non-direct write pipe!")); if (uio->uio_iov->iov_len > wpipe->pipe_buffer.size) size = wpipe->pipe_buffer.size; else size = uio->uio_iov->iov_len; if ((i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)uio->uio_iov->iov_base, size, VM_PROT_READ, wpipe->pipe_map.ms, PIPENPAGES)) < 0) return (EFAULT); /* * set up the control block */ wpipe->pipe_map.npages = i; wpipe->pipe_map.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; wpipe->pipe_map.cnt = size; /* * and update the uio data */ uio->uio_iov->iov_len -= size; uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size; if (uio->uio_iov->iov_len == 0) uio->uio_iov++; uio->uio_resid -= size; uio->uio_offset += size; return (0); } /* * unmap and unwire the process buffer */ static void pipe_destroy_write_buffer(wpipe) struct pipe *wpipe; { PIPE_LOCK_ASSERT(wpipe, MA_OWNED); vm_page_unhold_pages(wpipe->pipe_map.ms, wpipe->pipe_map.npages); wpipe->pipe_map.npages = 0; } /* * In the case of a signal, the writing process might go away. This * code copies the data into the circular buffer so that the source * pages can be freed without loss of data. */ static void pipe_clone_write_buffer(wpipe) struct pipe *wpipe; { struct uio uio; struct iovec iov; int size; int pos; PIPE_LOCK_ASSERT(wpipe, MA_OWNED); size = wpipe->pipe_map.cnt; pos = wpipe->pipe_map.pos; wpipe->pipe_buffer.in = size; wpipe->pipe_buffer.out = 0; wpipe->pipe_buffer.cnt = size; wpipe->pipe_state &= ~PIPE_DIRECTW; PIPE_UNLOCK(wpipe); iov.iov_base = wpipe->pipe_buffer.buffer; iov.iov_len = size; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = size; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = curthread; uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio); PIPE_LOCK(wpipe); pipe_destroy_write_buffer(wpipe); } /* * This implements the pipe buffer write mechanism. Note that only * a direct write OR a normal pipe write can be pending at any given time. * If there are any characters in the pipe buffer, the direct write will * be deferred until the receiving process grabs all of the bytes from * the pipe buffer. Then the direct mapping write is set-up. 
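 *
 * For illustration, a single blocking write(2) of at least
 * PIPE_MINDIRECT bytes is what selects this path from pipe_write():
 *
 *	char big[64 * 1024];	(illustrative buffer)
 *
 *	write(fds[1], big, sizeof(big));
 *
 * Smaller writes and writes on non-blocking descriptors take the
 * buffered path instead; if the pipe buffer still holds data, the
 * direct write is deferred until the reader drains it, as described
 * above.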
*/ static int pipe_direct_write(wpipe, uio) struct pipe *wpipe; struct uio *uio; { int error; retry: PIPE_LOCK_ASSERT(wpipe, MA_OWNED); error = pipelock(wpipe, 1); if (error != 0) goto error1; if ((wpipe->pipe_state & PIPE_EOF) != 0) { error = EPIPE; pipeunlock(wpipe); goto error1; } while (wpipe->pipe_state & PIPE_DIRECTW) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdww", 0); if (error) goto error1; else goto retry; } wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ if (wpipe->pipe_buffer.cnt > 0) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdwc", 0); if (error) goto error1; else goto retry; } wpipe->pipe_state |= PIPE_DIRECTW; PIPE_UNLOCK(wpipe); error = pipe_build_write_buffer(wpipe, uio); PIPE_LOCK(wpipe); if (error) { wpipe->pipe_state &= ~PIPE_DIRECTW; pipeunlock(wpipe); goto error1; } error = 0; while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { if (wpipe->pipe_state & PIPE_EOF) { pipe_destroy_write_buffer(wpipe); pipeselwakeup(wpipe); pipeunlock(wpipe); error = EPIPE; goto error1; } if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdwt", 0); pipelock(wpipe, 0); } if (wpipe->pipe_state & PIPE_EOF) error = EPIPE; if (wpipe->pipe_state & PIPE_DIRECTW) { /* * this bit of trickery substitutes a kernel buffer for * the process that might be going away. */ pipe_clone_write_buffer(wpipe); } else { pipe_destroy_write_buffer(wpipe); } pipeunlock(wpipe); return (error); error1: wakeup(wpipe); return (error); } #endif static int pipe_write(fp, uio, active_cred, flags, td) struct file *fp; struct uio *uio; struct ucred *active_cred; struct thread *td; int flags; { int error = 0; int desiredsize; ssize_t orig_resid; struct pipe *wpipe, *rpipe; rpipe = fp->f_data; wpipe = PIPE_PEER(rpipe); PIPE_LOCK(rpipe); error = pipelock(wpipe, 1); if (error) { PIPE_UNLOCK(rpipe); return (error); } /* * detect loss of pipe read side, issue SIGPIPE if lost. 
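 *
 * The EPIPE returned here is converted into SIGPIPE delivery by the
 * generic write path; a writer that prefers the errno can ignore the
 * signal, e.g. (illustrative sketch):
 *
 *	signal(SIGPIPE, SIG_IGN);
 *	if (write(fds[1], "x", 1) == -1 && errno == EPIPE)
 *		...	(reader side has gone away)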
*/ if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) { pipeunlock(wpipe); PIPE_UNLOCK(rpipe); return (EPIPE); } #ifdef MAC error = mac_pipe_check_write(active_cred, wpipe->pipe_pair); if (error) { pipeunlock(wpipe); PIPE_UNLOCK(rpipe); return (error); } #endif ++wpipe->pipe_busy; /* Choose a larger size if it's advantageous */ desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size); while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) { if (piperesizeallowed != 1) break; if (amountpipekva > maxpipekva / 2) break; if (desiredsize == BIG_PIPE_SIZE) break; desiredsize = desiredsize * 2; } /* Choose a smaller size if we're in a OOM situation */ if ((amountpipekva > (3 * maxpipekva) / 4) && (wpipe->pipe_buffer.size > SMALL_PIPE_SIZE) && (wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) && (piperesizeallowed == 1)) desiredsize = SMALL_PIPE_SIZE; /* Resize if the above determined that a new size was necessary */ if ((desiredsize != wpipe->pipe_buffer.size) && ((wpipe->pipe_state & PIPE_DIRECTW) == 0)) { PIPE_UNLOCK(wpipe); pipespace(wpipe, desiredsize); PIPE_LOCK(wpipe); } if (wpipe->pipe_buffer.size == 0) { /* * This can only happen for reverse direction use of pipes * in a complete OOM situation. */ error = ENOMEM; --wpipe->pipe_busy; pipeunlock(wpipe); PIPE_UNLOCK(wpipe); return (error); } pipeunlock(wpipe); orig_resid = uio->uio_resid; while (uio->uio_resid) { int space; pipelock(wpipe, 0); if (wpipe->pipe_state & PIPE_EOF) { pipeunlock(wpipe); error = EPIPE; break; } #ifndef PIPE_NODIRECT /* * If the transfer is large, we can gain performance if * we do process-to-process copies directly. * If the write is non-blocking, we don't use the * direct write mechanism. * * The direct write mechanism will detect the reader going * away on us. */ if (uio->uio_segflg == UIO_USERSPACE && uio->uio_iov->iov_len >= PIPE_MINDIRECT && wpipe->pipe_buffer.size >= PIPE_MINDIRECT && (fp->f_flag & FNONBLOCK) == 0) { pipeunlock(wpipe); error = pipe_direct_write(wpipe, uio); if (error) break; continue; } #endif /* * Pipe buffered writes cannot be coincidental with * direct writes. We wait until the currently executing * direct write is completed before we start filling the * pipe buffer. We break out if a signal occurs or the * reader goes away. */ if (wpipe->pipe_state & PIPE_DIRECTW) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "pipbww", 0); if (error) break; else continue; } space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; /* Writes of size <= PIPE_BUF must be atomic. */ if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF)) space = 0; if (space > 0) { int size; /* Transfer size */ int segsize; /* first segment to transfer */ /* * Transfer size is minimum of uio transfer * and free space in pipe buffer. */ if (space > uio->uio_resid) size = uio->uio_resid; else size = space; /* * First segment to transfer is minimum of * transfer size and contiguous space in * pipe buffer. If first segment to transfer * is less than the transfer size, we've got * a wraparound in the buffer. 
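 *
 * Worked example: with a 16384-byte buffer, in == 12288 and 8192
 * bytes to move, segsize is 16384 - 12288 = 4096; the remaining
 * 8192 - 4096 = 4096 bytes are then copied to offset 0, and .in
 * ends up at 4096.
 */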
*/ segsize = wpipe->pipe_buffer.size - wpipe->pipe_buffer.in; if (segsize > size) segsize = size; /* Transfer first segment */ PIPE_UNLOCK(rpipe); error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], segsize, uio); PIPE_LOCK(rpipe); if (error == 0 && segsize < size) { KASSERT(wpipe->pipe_buffer.in + segsize == wpipe->pipe_buffer.size, ("Pipe buffer wraparound disappeared")); /* * Transfer remaining part now, to * support atomic writes. Wraparound * happened. */ PIPE_UNLOCK(rpipe); error = uiomove( &wpipe->pipe_buffer.buffer[0], size - segsize, uio); PIPE_LOCK(rpipe); } if (error == 0) { wpipe->pipe_buffer.in += size; if (wpipe->pipe_buffer.in >= wpipe->pipe_buffer.size) { KASSERT(wpipe->pipe_buffer.in == size - segsize + wpipe->pipe_buffer.size, ("Expected wraparound bad")); wpipe->pipe_buffer.in = size - segsize; } wpipe->pipe_buffer.cnt += size; KASSERT(wpipe->pipe_buffer.cnt <= wpipe->pipe_buffer.size, ("Pipe buffer overflow")); } pipeunlock(wpipe); if (error != 0) break; } else { /* * If the "read-side" has been blocked, wake it up now. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } /* * don't block on non-blocking I/O */ if (fp->f_flag & FNONBLOCK) { error = EAGAIN; pipeunlock(wpipe); break; } /* * We have no more space and have something to offer, * wake up select/poll. */ pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "pipewr", 0); if (error != 0) break; } } pipelock(wpipe, 0); --wpipe->pipe_busy; if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) { wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); wakeup(wpipe); } else if (wpipe->pipe_buffer.cnt > 0) { /* * If we have put any characters in the buffer, we wake up * the reader. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } } /* * Don't return EPIPE if any byte was written. * EINTR and other interrupts are handled by generic I/O layer. * Do not pretend that I/O succeeded for obvious user error * like EFAULT. */ if (uio->uio_resid != orig_resid && error == EPIPE) error = 0; if (error == 0) vfs_timestamp(&wpipe->pipe_mtime); /* * We have something to offer, * wake up select/poll. */ if (wpipe->pipe_buffer.cnt) pipeselwakeup(wpipe); pipeunlock(wpipe); PIPE_UNLOCK(rpipe); return (error); } /* ARGSUSED */ static int pipe_truncate(fp, length, active_cred, td) struct file *fp; off_t length; struct ucred *active_cred; struct thread *td; { struct pipe *cpipe; int error; cpipe = fp->f_data; if (cpipe->pipe_state & PIPE_NAMED) error = vnops.fo_truncate(fp, length, active_cred, td); else error = invfo_truncate(fp, length, active_cred, td); return (error); } /* * we implement a very minimal set of ioctls for compatibility with sockets. 
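 *
 * E.g. FIONREAD mirrors its socket semantics, reporting the bytes
 * currently readable (a userland sketch; ioctl(2) and <sys/filio.h>
 * declare what is needed):
 *
 *	int avail;
 *
 *	if (ioctl(fds[0], FIONREAD, &avail) == 0)
 *		printf("%d bytes queued\n", avail);
 *
 * As the code below shows, a write-only descriptor reports 0.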
*/ static int pipe_ioctl(fp, cmd, data, active_cred, td) struct file *fp; u_long cmd; void *data; struct ucred *active_cred; struct thread *td; { struct pipe *mpipe = fp->f_data; int error; PIPE_LOCK(mpipe); #ifdef MAC error = mac_pipe_check_ioctl(active_cred, mpipe->pipe_pair, cmd, data); if (error) { PIPE_UNLOCK(mpipe); return (error); } #endif error = 0; switch (cmd) { case FIONBIO: break; case FIOASYNC: if (*(int *)data) { mpipe->pipe_state |= PIPE_ASYNC; } else { mpipe->pipe_state &= ~PIPE_ASYNC; } break; case FIONREAD: if (!(fp->f_flag & FREAD)) { *(int *)data = 0; PIPE_UNLOCK(mpipe); return (0); } if (mpipe->pipe_state & PIPE_DIRECTW) *(int *)data = mpipe->pipe_map.cnt; else *(int *)data = mpipe->pipe_buffer.cnt; break; case FIOSETOWN: PIPE_UNLOCK(mpipe); error = fsetown(*(int *)data, &mpipe->pipe_sigio); goto out_unlocked; case FIOGETOWN: *(int *)data = fgetown(&mpipe->pipe_sigio); break; /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: PIPE_UNLOCK(mpipe); error = fsetown(-(*(int *)data), &mpipe->pipe_sigio); goto out_unlocked; /* This is deprecated, FIOGETOWN should be used instead. */ case TIOCGPGRP: *(int *)data = -fgetown(&mpipe->pipe_sigio); break; default: error = ENOTTY; break; } PIPE_UNLOCK(mpipe); out_unlocked: return (error); } static int pipe_poll(fp, events, active_cred, td) struct file *fp; int events; struct ucred *active_cred; struct thread *td; { struct pipe *rpipe; struct pipe *wpipe; int levents, revents; #ifdef MAC int error; #endif revents = 0; rpipe = fp->f_data; wpipe = PIPE_PEER(rpipe); PIPE_LOCK(rpipe); #ifdef MAC error = mac_pipe_check_poll(active_cred, rpipe->pipe_pair); if (error) goto locked_error; #endif if (fp->f_flag & FREAD && events & (POLLIN | POLLRDNORM)) if ((rpipe->pipe_state & PIPE_DIRECTW) || (rpipe->pipe_buffer.cnt > 0)) revents |= events & (POLLIN | POLLRDNORM); if (fp->f_flag & FWRITE && events & (POLLOUT | POLLWRNORM)) if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF) || (((wpipe->pipe_state & PIPE_DIRECTW) == 0) && ((wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF || wpipe->pipe_buffer.size == 0))) revents |= events & (POLLOUT | POLLWRNORM); levents = events & (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | POLLRDBAND); if (rpipe->pipe_state & PIPE_NAMED && fp->f_flag & FREAD && levents && fp->f_seqcount == rpipe->pipe_wgen) events |= POLLINIGNEOF; if ((events & POLLINIGNEOF) == 0) { if (rpipe->pipe_state & PIPE_EOF) { revents |= (events & (POLLIN | POLLRDNORM)); if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) revents |= POLLHUP; } } if (revents == 0) { if (fp->f_flag & FREAD && events & (POLLIN | POLLRDNORM)) { selrecord(td, &rpipe->pipe_sel); if (SEL_WAITING(&rpipe->pipe_sel)) rpipe->pipe_state |= PIPE_SEL; } if (fp->f_flag & FWRITE && events & (POLLOUT | POLLWRNORM)) { selrecord(td, &wpipe->pipe_sel); if (SEL_WAITING(&wpipe->pipe_sel)) wpipe->pipe_state |= PIPE_SEL; } } #ifdef MAC locked_error: #endif PIPE_UNLOCK(rpipe); return (revents); } /* * We shouldn't need locks here as we're doing a read and this should * be a natural race. */ static int pipe_stat(fp, ub, active_cred, td) struct file *fp; struct stat *ub; struct ucred *active_cred; struct thread *td; { struct pipe *pipe; int new_unr; #ifdef MAC int error; #endif pipe = fp->f_data; PIPE_LOCK(pipe); #ifdef MAC error = mac_pipe_check_stat(active_cred, pipe->pipe_pair); if (error) { PIPE_UNLOCK(pipe); return (error); } #endif /* For named pipes ask the underlying filesystem. 
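 *
 * For anonymous pipes the fields are filled in below, so fstat(2)
 * reports S_IFIFO with st_size equal to the unread byte count, e.g.:
 *
 *	struct stat sb;
 *
 *	if (fstat(fds[0], &sb) == 0 && S_ISFIFO(sb.st_mode))
 *		printf("%jd bytes unread\n", (intmax_t)sb.st_size);
 */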
if (pipe->pipe_state & PIPE_NAMED) { PIPE_UNLOCK(pipe); return (vnops.fo_stat(fp, ub, active_cred, td)); } /* * Lazily allocate an inode number for the pipe. Most pipe * users do not call fstat(2) on the pipe, which means that * postponing the inode allocation until it must be * returned to userland is useful. If alloc_unr failed, * assign st_ino zero instead of returning an error. * Special pipe_ino values: * -1 - not yet initialized; * 0 - alloc_unr failed, return 0 as st_ino forever. */ if (pipe->pipe_ino == (ino_t)-1) { new_unr = alloc_unr(pipeino_unr); if (new_unr != -1) pipe->pipe_ino = new_unr; else pipe->pipe_ino = 0; } PIPE_UNLOCK(pipe); bzero(ub, sizeof(*ub)); ub->st_mode = S_IFIFO; ub->st_blksize = PAGE_SIZE; if (pipe->pipe_state & PIPE_DIRECTW) ub->st_size = pipe->pipe_map.cnt; else ub->st_size = pipe->pipe_buffer.cnt; ub->st_blocks = howmany(ub->st_size, ub->st_blksize); ub->st_atim = pipe->pipe_atime; ub->st_mtim = pipe->pipe_mtime; ub->st_ctim = pipe->pipe_ctime; ub->st_uid = fp->f_cred->cr_uid; ub->st_gid = fp->f_cred->cr_gid; ub->st_dev = pipedev_ino; ub->st_ino = pipe->pipe_ino; /* * Left as 0: st_nlink, st_rdev, st_flags, st_gen. */ return (0); } /* ARGSUSED */ static int pipe_close(fp, td) struct file *fp; struct thread *td; { if (fp->f_vnode != NULL) return vnops.fo_close(fp, td); fp->f_ops = &badfileops; pipe_dtor(fp->f_data); fp->f_data = NULL; return (0); } static int pipe_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct pipe *cpipe; int error; cpipe = fp->f_data; if (cpipe->pipe_state & PIPE_NAMED) error = vn_chmod(fp, mode, active_cred, td); else error = invfo_chmod(fp, mode, active_cred, td); return (error); } static int pipe_chown(fp, uid, gid, active_cred, td) struct file *fp; uid_t uid; gid_t gid; struct ucred *active_cred; struct thread *td; { struct pipe *cpipe; int error; cpipe = fp->f_data; if (cpipe->pipe_state & PIPE_NAMED) error = vn_chown(fp, uid, gid, active_cred, td); else error = invfo_chown(fp, uid, gid, active_cred, td); return (error); } static int pipe_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct pipe *pi; if (fp->f_type == DTYPE_FIFO) return (vn_fill_kinfo(fp, kif, fdp)); kif->kf_type = KF_TYPE_PIPE; pi = fp->f_data; kif->kf_un.kf_pipe.kf_pipe_addr = (uintptr_t)pi; kif->kf_un.kf_pipe.kf_pipe_peer = (uintptr_t)pi->pipe_peer; kif->kf_un.kf_pipe.kf_pipe_buffer_cnt = pi->pipe_buffer.cnt; return (0); } static void pipe_free_kmem(cpipe) struct pipe *cpipe; { KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipe_free_kmem: pipe mutex locked")); if (cpipe->pipe_buffer.buffer != NULL) { atomic_subtract_long(&amountpipekva, cpipe->pipe_buffer.size); vm_map_remove(pipe_map, (vm_offset_t)cpipe->pipe_buffer.buffer, (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size); cpipe->pipe_buffer.buffer = NULL; } #ifndef PIPE_NODIRECT { cpipe->pipe_map.cnt = 0; cpipe->pipe_map.pos = 0; cpipe->pipe_map.npages = 0; } #endif } /* * shutdown the pipe */ static void pipeclose(cpipe) struct pipe *cpipe; { struct pipepair *pp; struct pipe *ppipe; KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL")); PIPE_LOCK(cpipe); pipelock(cpipe, 0); pp = cpipe->pipe_pair; pipeselwakeup(cpipe); /* * If the other side is blocked, wake it up saying that * we want to close it down. */
*/ cpipe->pipe_state |= PIPE_EOF; while (cpipe->pipe_busy) { wakeup(cpipe); cpipe->pipe_state |= PIPE_WANT; pipeunlock(cpipe); msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0); pipelock(cpipe, 0); } /* * Disconnect from peer, if any. */ ppipe = cpipe->pipe_peer; if (ppipe->pipe_present == PIPE_ACTIVE) { pipeselwakeup(ppipe); ppipe->pipe_state |= PIPE_EOF; wakeup(ppipe); KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0); } /* * Mark this endpoint as free. Release kmem resources. We * don't mark this endpoint as unused until we've finished * doing that, or the pipe might disappear out from under * us. */ PIPE_UNLOCK(cpipe); pipe_free_kmem(cpipe); PIPE_LOCK(cpipe); cpipe->pipe_present = PIPE_CLOSING; pipeunlock(cpipe); /* * knlist_clear() may sleep dropping the PIPE_MTX. Set the * PIPE_FINALIZED, that allows other end to free the * pipe_pair, only after the knotes are completely dismantled. */ knlist_clear(&cpipe->pipe_sel.si_note, 1); cpipe->pipe_present = PIPE_FINALIZED; seldrain(&cpipe->pipe_sel); knlist_destroy(&cpipe->pipe_sel.si_note); /* * If both endpoints are now closed, release the memory for the * pipe pair. If not, unlock. */ if (ppipe->pipe_present == PIPE_FINALIZED) { PIPE_UNLOCK(cpipe); #ifdef MAC mac_pipe_destroy(pp); #endif uma_zfree(pipe_zone, cpipe->pipe_pair); } else PIPE_UNLOCK(cpipe); } /*ARGSUSED*/ static int pipe_kqfilter(struct file *fp, struct knote *kn) { struct pipe *cpipe; /* * If a filter is requested that is not supported by this file * descriptor, don't return an error, but also don't ever generate an * event. */ if ((kn->kn_filter == EVFILT_READ) && !(fp->f_flag & FREAD)) { kn->kn_fop = &pipe_nfiltops; return (0); } if ((kn->kn_filter == EVFILT_WRITE) && !(fp->f_flag & FWRITE)) { kn->kn_fop = &pipe_nfiltops; return (0); } cpipe = fp->f_data; PIPE_LOCK(cpipe); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &pipe_rfiltops; break; case EVFILT_WRITE: kn->kn_fop = &pipe_wfiltops; if (cpipe->pipe_peer->pipe_present != PIPE_ACTIVE) { /* other end of pipe has been closed */ PIPE_UNLOCK(cpipe); return (EPIPE); } cpipe = PIPE_PEER(cpipe); break; default: PIPE_UNLOCK(cpipe); return (EINVAL); } kn->kn_hook = cpipe; knlist_add(&cpipe->pipe_sel.si_note, kn, 1); PIPE_UNLOCK(cpipe); return (0); } static void filt_pipedetach(struct knote *kn) { struct pipe *cpipe = kn->kn_hook; PIPE_LOCK(cpipe); knlist_remove(&cpipe->pipe_sel.si_note, kn, 1); PIPE_UNLOCK(cpipe); } /*ARGSUSED*/ static int filt_piperead(struct knote *kn, long hint) { struct pipe *rpipe = kn->kn_hook; struct pipe *wpipe = rpipe->pipe_peer; int ret; PIPE_LOCK_ASSERT(rpipe, MA_OWNED); kn->kn_data = rpipe->pipe_buffer.cnt; if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW)) kn->kn_data = rpipe->pipe_map.cnt; if ((rpipe->pipe_state & PIPE_EOF) || wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) { kn->kn_flags |= EV_EOF; return (1); } ret = kn->kn_data > 0; return ret; } /*ARGSUSED*/ static int filt_pipewrite(struct knote *kn, long hint) { struct pipe *wpipe; wpipe = kn->kn_hook; PIPE_LOCK_ASSERT(wpipe, MA_OWNED); if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) { kn->kn_data = 0; kn->kn_flags |= EV_EOF; return (1); } kn->kn_data = (wpipe->pipe_buffer.size > 0) ? 
(wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) : PIPE_BUF; if (wpipe->pipe_state & PIPE_DIRECTW) kn->kn_data = 0; return (kn->kn_data >= PIPE_BUF); } static void filt_pipedetach_notsup(struct knote *kn) { } static int filt_pipenotsup(struct knote *kn, long hint) { return (0); } Index: head/sys/kern/sys_procdesc.c =================================================================== --- head/sys/kern/sys_procdesc.c (revision 326270) +++ head/sys/kern/sys_procdesc.c (revision 326271) @@ -1,571 +1,573 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2009, 2016 Robert N. M. Watson * All rights reserved. * * This software was developed at the University of Cambridge Computer * Laboratory with support from a grant from Google, Inc. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * FreeBSD process descriptor facility. * * Some processes are represented by a file descriptor, which will be used in * preference to signaling and pids for the purposes of process management, * and is, in effect, a form of capability. When a process descriptor is * used with a process, it ceases to be visible to certain traditional UNIX * process facilities, such as waitpid(2). * * Some semantics: * * - At most one process descriptor will exist for any process, although * references to that descriptor may be held from many processes (or even * be in flight between processes over a local domain socket). * - Last close on the process descriptor will terminate the process using * SIGKILL and reparent it to init so that there's a process to reap it * when it's done exiting. * - If the process exits before the descriptor is closed, it will not * generate SIGCHLD on termination, or be picked up by waitpid(). * - The pdkill(2) system call may be used to deliver a signal to the process * using its process descriptor. * - The pdwait4(2) system call may be used to block (or not) on a process * descriptor to collect termination information. * * Open questions: * * - How to handle ptrace(2)? 
* - Will we want to add a pidtoprocdesc(2) system call to allow process * descriptors to be created for processes without pdfork(2)? */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(process_descriptors, "Process Descriptors"); static uma_zone_t procdesc_zone; static fo_poll_t procdesc_poll; static fo_kqfilter_t procdesc_kqfilter; static fo_stat_t procdesc_stat; static fo_close_t procdesc_close; static fo_fill_kinfo_t procdesc_fill_kinfo; static struct fileops procdesc_ops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = procdesc_poll, .fo_kqfilter = procdesc_kqfilter, .fo_stat = procdesc_stat, .fo_close = procdesc_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = procdesc_fill_kinfo, .fo_flags = DFLAG_PASSABLE, }; /* * Initialize with VFS so that process descriptors are available along with * other file descriptor types. As long as it runs before init(8) starts, * there shouldn't be a problem. */ static void procdesc_init(void *dummy __unused) { procdesc_zone = uma_zcreate("procdesc", sizeof(struct procdesc), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); if (procdesc_zone == NULL) panic("procdesc_init: procdesc_zone not initialized"); } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, procdesc_init, NULL); /* * Return a locked process given a process descriptor, or ESRCH if it has * died. */ int procdesc_find(struct thread *td, int fd, cap_rights_t *rightsp, struct proc **p) { struct procdesc *pd; struct file *fp; int error; error = fget(td, fd, rightsp, &fp); if (error) return (error); if (fp->f_type != DTYPE_PROCDESC) { error = EBADF; goto out; } pd = fp->f_data; sx_slock(&proctree_lock); if (pd->pd_proc != NULL) { *p = pd->pd_proc; PROC_LOCK(*p); } else error = ESRCH; sx_sunlock(&proctree_lock); out: fdrop(fp, td); return (error); } /* * Function to be used by procstat(1) sysctls when returning procdesc * information. */ pid_t procdesc_pid(struct file *fp_procdesc) { struct procdesc *pd; KASSERT(fp_procdesc->f_type == DTYPE_PROCDESC, ("procdesc_pid: !procdesc")); pd = fp_procdesc->f_data; return (pd->pd_pid); } /* * Retrieve the PID associated with a process descriptor. */ int kern_pdgetpid(struct thread *td, int fd, cap_rights_t *rightsp, pid_t *pidp) { struct file *fp; int error; error = fget(td, fd, rightsp, &fp); if (error) return (error); if (fp->f_type != DTYPE_PROCDESC) { error = EBADF; goto out; } *pidp = procdesc_pid(fp); out: fdrop(fp, td); return (error); } /* * System call to return the pid of a process given its process descriptor. */ int sys_pdgetpid(struct thread *td, struct pdgetpid_args *uap) { cap_rights_t rights; pid_t pid; int error; AUDIT_ARG_FD(uap->fd); error = kern_pdgetpid(td, uap->fd, cap_rights_init(&rights, CAP_PDGETPID), &pid); if (error == 0) error = copyout(&pid, uap->pidp, sizeof(pid)); return (error); } /* * When a new process is forked by pdfork(), a file descriptor is allocated * by the fork code first, then the process is forked, and then we get a * chance to set up the process descriptor. Failure is not permitted at this * point, so procdesc_new() must succeed. 
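 *
 * For context, the usual userland pattern for the facility (a hedged
 * sketch assuming the documented pdfork(2)/pdkill(2) interfaces from
 * <sys/procdesc.h>):
 *
 *	int pd;
 *	pid_t pid;
 *
 *	pid = pdfork(&pd, 0);
 *	if (pid == 0)
 *		_exit(run_child());	(run_child is illustrative)
 *	pdkill(pd, SIGTERM);
 *	close(pd);	(last close kills a non-PD_DAEMON child)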
*/ void procdesc_new(struct proc *p, int flags) { struct procdesc *pd; pd = uma_zalloc(procdesc_zone, M_WAITOK | M_ZERO); pd->pd_proc = p; pd->pd_pid = p->p_pid; p->p_procdesc = pd; pd->pd_flags = 0; if (flags & PD_DAEMON) pd->pd_flags |= PDF_DAEMON; PROCDESC_LOCK_INIT(pd); knlist_init_mtx(&pd->pd_selinfo.si_note, &pd->pd_lock); /* * Process descriptors start out with two references: one from their * struct file, and the other from their struct proc. */ refcount_init(&pd->pd_refcount, 2); } /* * Create a new process descriptor for the process that refers to it. */ int procdesc_falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags, struct filecaps *fcaps) { int fflags; fflags = 0; if (flags & PD_CLOEXEC) fflags = O_CLOEXEC; return (falloc_caps(td, resultfp, resultfd, fflags, fcaps)); } /* * Initialize a file with a process descriptor. */ void procdesc_finit(struct procdesc *pdp, struct file *fp) { finit(fp, FREAD | FWRITE, DTYPE_PROCDESC, pdp, &procdesc_ops); } static void procdesc_free(struct procdesc *pd) { /* * When the last reference is released, we assert that the descriptor * has been closed, but not that the process has exited, as we will * detach the descriptor before the process dies if the descriptor is * closed, as we can't wait synchronously. */ if (refcount_release(&pd->pd_refcount)) { KASSERT(pd->pd_proc == NULL, ("procdesc_free: pd_proc != NULL")); KASSERT((pd->pd_flags & PDF_CLOSED), ("procdesc_free: !PDF_CLOSED")); knlist_destroy(&pd->pd_selinfo.si_note); PROCDESC_LOCK_DESTROY(pd); uma_zfree(procdesc_zone, pd); } } /* * procdesc_exit() - notify a process descriptor that its process is exiting. * We use the proctree_lock to ensure that process exit either happens * strictly before or strictly after a concurrent call to procdesc_close(). */ int procdesc_exit(struct proc *p) { struct procdesc *pd; sx_assert(&proctree_lock, SA_XLOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p->p_procdesc != NULL, ("procdesc_exit: p_procdesc NULL")); pd = p->p_procdesc; PROCDESC_LOCK(pd); KASSERT((pd->pd_flags & PDF_CLOSED) == 0 || p->p_pptr == initproc, ("procdesc_exit: closed && parent not init")); pd->pd_flags |= PDF_EXITED; pd->pd_xstat = KW_EXITCODE(p->p_xexit, p->p_xsig); /* * If the process descriptor has been closed, then we have nothing * to do; return 1 so that init will get SIGCHLD and do the reaping. * Clean up the procdesc now rather than letting it happen during * that reap. */ if (pd->pd_flags & PDF_CLOSED) { PROCDESC_UNLOCK(pd); pd->pd_proc = NULL; p->p_procdesc = NULL; procdesc_free(pd); return (1); } if (pd->pd_flags & PDF_SELECTED) { pd->pd_flags &= ~PDF_SELECTED; selwakeup(&pd->pd_selinfo); } KNOTE_LOCKED(&pd->pd_selinfo.si_note, NOTE_EXIT); PROCDESC_UNLOCK(pd); return (0); } /* * When a process descriptor is reaped, perhaps as a result of close() or * pdwait4(), release the process's reference on the process descriptor. */ void procdesc_reap(struct proc *p) { struct procdesc *pd; sx_assert(&proctree_lock, SA_XLOCKED); KASSERT(p->p_procdesc != NULL, ("procdesc_reap: p_procdesc == NULL")); pd = p->p_procdesc; pd->pd_proc = NULL; p->p_procdesc = NULL; procdesc_free(pd); } /* * procdesc_close() - last close on a process descriptor. If the process is * still running, terminate with SIGKILL (unless PDF_DAEMON is set) and let * init(8) clean up the mess; if not, we have to clean up the zombie ourselves.
*/ static int procdesc_close(struct file *fp, struct thread *td) { struct procdesc *pd; struct proc *p; KASSERT(fp->f_type == DTYPE_PROCDESC, ("procdesc_close: !procdesc")); pd = fp->f_data; fp->f_ops = &badfileops; fp->f_data = NULL; sx_xlock(&proctree_lock); PROCDESC_LOCK(pd); pd->pd_flags |= PDF_CLOSED; PROCDESC_UNLOCK(pd); p = pd->pd_proc; if (p == NULL) { /* * This is the case where process' exit status was already * collected and procdesc_reap() was already called. */ sx_xunlock(&proctree_lock); } else { PROC_LOCK(p); AUDIT_ARG_PROCESS(p); if (p->p_state == PRS_ZOMBIE) { /* * If the process is already dead and just awaiting * reaping, do that now. This will release the * process's reference to the process descriptor when it * calls back into procdesc_reap(). */ PROC_SLOCK(p); proc_reap(curthread, p, NULL, 0); } else { /* * If the process is not yet dead, we need to kill it, * but we can't wait around synchronously for it to go * away, as that path leads to madness (and deadlocks). * First, detach the process from its descriptor so that * its exit status will be reported normally. */ pd->pd_proc = NULL; p->p_procdesc = NULL; procdesc_free(pd); /* * Next, reparent it to init(8) so that there's someone * to pick up the pieces; finally, terminate with * prejudice. */ p->p_sigparent = SIGCHLD; proc_reparent(p, initproc); if ((pd->pd_flags & PDF_DAEMON) == 0) kern_psignal(p, SIGKILL); PROC_UNLOCK(p); sx_xunlock(&proctree_lock); } } /* * Release the file descriptor's reference on the process descriptor. */ procdesc_free(pd); return (0); } static int procdesc_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct procdesc *pd; int revents; revents = 0; pd = fp->f_data; PROCDESC_LOCK(pd); if (pd->pd_flags & PDF_EXITED) revents |= POLLHUP; if (revents == 0) { selrecord(td, &pd->pd_selinfo); pd->pd_flags |= PDF_SELECTED; } PROCDESC_UNLOCK(pd); return (revents); } static void procdesc_kqops_detach(struct knote *kn) { struct procdesc *pd; pd = kn->kn_fp->f_data; knlist_remove(&pd->pd_selinfo.si_note, kn, 0); } static int procdesc_kqops_event(struct knote *kn, long hint) { struct procdesc *pd; u_int event; pd = kn->kn_fp->f_data; if (hint == 0) { /* * Initial test after registration. Generate a NOTE_EXIT in * case the process already terminated before registration. */ event = pd->pd_flags & PDF_EXITED ? NOTE_EXIT : 0; } else { /* Mask off extra data. */ event = (u_int)hint & NOTE_PCTRLMASK; } /* If the user is interested in this event, record it. */ if (kn->kn_sfflags & event) kn->kn_fflags |= event; /* Process is gone, so flag the event as finished. 
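 *
 * The userland side of this, as a hedged sketch: after registering
 *
 *	struct kevent ev;
 *
 *	EV_SET(&ev, pd, EVFILT_PROCDESC, EV_ADD, NOTE_EXIT, 0, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);
 *
 * the returned NOTE_EXIT event carries the exit status in ev.data,
 * taken from pd_xstat below (pd and kq are illustrative descriptors).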
*/ if (event == NOTE_EXIT) { kn->kn_flags |= EV_EOF | EV_ONESHOT; if (kn->kn_fflags & NOTE_EXIT) kn->kn_data = pd->pd_xstat; if (kn->kn_fflags == 0) kn->kn_flags |= EV_DROP; return (1); } return (kn->kn_fflags != 0); } static struct filterops procdesc_kqops = { .f_isfd = 1, .f_detach = procdesc_kqops_detach, .f_event = procdesc_kqops_event, }; static int procdesc_kqfilter(struct file *fp, struct knote *kn) { struct procdesc *pd; pd = fp->f_data; switch (kn->kn_filter) { case EVFILT_PROCDESC: kn->kn_fop = &procdesc_kqops; kn->kn_flags |= EV_CLEAR; knlist_add(&pd->pd_selinfo.si_note, kn, 0); return (0); default: return (EINVAL); } } static int procdesc_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { struct procdesc *pd; struct timeval pstart, boottime; /* * XXXRW: Perhaps we should cache some more information from the * process so that we can return it reliably here even after it has * died. For example, caching its credential data. */ bzero(sb, sizeof(*sb)); pd = fp->f_data; sx_slock(&proctree_lock); if (pd->pd_proc != NULL) { PROC_LOCK(pd->pd_proc); AUDIT_ARG_PROCESS(pd->pd_proc); /* Set birth and [acm] times to process start time. */ pstart = pd->pd_proc->p_stats->p_start; getboottime(&boottime); timevaladd(&pstart, &boottime); TIMEVAL_TO_TIMESPEC(&pstart, &sb->st_birthtim); sb->st_atim = sb->st_birthtim; sb->st_ctim = sb->st_birthtim; sb->st_mtim = sb->st_birthtim; if (pd->pd_proc->p_state != PRS_ZOMBIE) sb->st_mode = S_IFREG | S_IRWXU; else sb->st_mode = S_IFREG; sb->st_uid = pd->pd_proc->p_ucred->cr_ruid; sb->st_gid = pd->pd_proc->p_ucred->cr_rgid; PROC_UNLOCK(pd->pd_proc); } else sb->st_mode = S_IFREG; sx_sunlock(&proctree_lock); return (0); } static int procdesc_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct procdesc *pdp; kif->kf_type = KF_TYPE_PROCDESC; pdp = fp->f_data; kif->kf_un.kf_proc.kf_pid = pdp->pd_pid; return (0); } Index: head/sys/kern/sys_process.c =================================================================== --- head/sys/kern/sys_process.c (revision 326270) +++ head/sys/kern/sys_process.c (revision 326271) @@ -1,1486 +1,1488 @@ /*- + * SPDX-License-Identifier: BSD-4-Clause + * * Copyright (c) 1994, Sean Eric Fagan * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Sean Eric Fagan. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #include struct ptrace_io_desc32 { int piod_op; uint32_t piod_offs; uint32_t piod_addr; uint32_t piod_len; }; struct ptrace_vm_entry32 { int pve_entry; int pve_timestamp; uint32_t pve_start; uint32_t pve_end; uint32_t pve_offset; u_int pve_prot; u_int pve_pathlen; int32_t pve_fileid; u_int pve_fsid; uint32_t pve_path; }; #endif /* * Functions implemented using PROC_ACTION(): * * proc_read_regs(proc, regs) * Get the current user-visible register set from the process * and copy it into the regs structure (). * The process is stopped at the time read_regs is called. * * proc_write_regs(proc, regs) * Update the current register set from the passed in regs * structure. Take care to avoid clobbering special CPU * registers or privileged bits in the PSL. * Depending on the architecture this may have fix-up work to do, * especially if the IAR or PCW are modified. * The process is stopped at the time write_regs is called. * * proc_read_fpregs, proc_write_fpregs * deal with the floating point register set, otherwise as above. * * proc_read_dbregs, proc_write_dbregs * deal with the processor debug register set, otherwise as above. * * proc_sstep(proc) * Arrange for the process to trap after executing a single instruction. */ #define PROC_ACTION(action) do { \ int error; \ \ PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); \ if ((td->td_proc->p_flag & P_INMEM) == 0) \ error = EIO; \ else \ error = (action); \ return (error); \ } while(0) int proc_read_regs(struct thread *td, struct reg *regs) { PROC_ACTION(fill_regs(td, regs)); } int proc_write_regs(struct thread *td, struct reg *regs) { PROC_ACTION(set_regs(td, regs)); } int proc_read_dbregs(struct thread *td, struct dbreg *dbregs) { PROC_ACTION(fill_dbregs(td, dbregs)); } int proc_write_dbregs(struct thread *td, struct dbreg *dbregs) { PROC_ACTION(set_dbregs(td, dbregs)); } /* * Ptrace doesn't support fpregs at all, and there are no security holes * or translations for fpregs, so we can just copy them. */ int proc_read_fpregs(struct thread *td, struct fpreg *fpregs) { PROC_ACTION(fill_fpregs(td, fpregs)); } int proc_write_fpregs(struct thread *td, struct fpreg *fpregs) { PROC_ACTION(set_fpregs(td, fpregs)); } #ifdef COMPAT_FREEBSD32 /* For 32 bit binaries, we need to expose the 32 bit regs layouts. 
*/ int proc_read_regs32(struct thread *td, struct reg32 *regs32) { PROC_ACTION(fill_regs32(td, regs32)); } int proc_write_regs32(struct thread *td, struct reg32 *regs32) { PROC_ACTION(set_regs32(td, regs32)); } int proc_read_dbregs32(struct thread *td, struct dbreg32 *dbregs32) { PROC_ACTION(fill_dbregs32(td, dbregs32)); } int proc_write_dbregs32(struct thread *td, struct dbreg32 *dbregs32) { PROC_ACTION(set_dbregs32(td, dbregs32)); } int proc_read_fpregs32(struct thread *td, struct fpreg32 *fpregs32) { PROC_ACTION(fill_fpregs32(td, fpregs32)); } int proc_write_fpregs32(struct thread *td, struct fpreg32 *fpregs32) { PROC_ACTION(set_fpregs32(td, fpregs32)); } #endif int proc_sstep(struct thread *td) { PROC_ACTION(ptrace_single_step(td)); } int proc_rwmem(struct proc *p, struct uio *uio) { vm_map_t map; vm_offset_t pageno; /* page number */ vm_prot_t reqprot; int error, fault_flags, page_offset, writing; /* * Assert that someone has locked this vmspace. (Should be * curthread but we can't assert that.) This keeps the process * from exiting out from under us until this operation completes. */ PROC_ASSERT_HELD(p); PROC_LOCK_ASSERT(p, MA_NOTOWNED); /* * The map we want... */ map = &p->p_vmspace->vm_map; /* * If we are writing, then we request vm_fault() to create a private * copy of each page. Since these copies will not be writeable by the * process, we must explicitly request that they be dirtied. */ writing = uio->uio_rw == UIO_WRITE; reqprot = writing ? VM_PROT_COPY | VM_PROT_READ : VM_PROT_READ; fault_flags = writing ? VM_FAULT_DIRTY : VM_FAULT_NORMAL; /* * Only map in one page at a time. We don't have to, but it * makes things easier. This way is trivial - right? */ do { vm_offset_t uva; u_int len; vm_page_t m; uva = (vm_offset_t)uio->uio_offset; /* * Get the page number of this segment. */ pageno = trunc_page(uva); page_offset = uva - pageno; /* * How many bytes to copy */ len = min(PAGE_SIZE - page_offset, uio->uio_resid); /* * Fault and hold the page on behalf of the process. */ error = vm_fault_hold(map, pageno, reqprot, fault_flags, &m); if (error != KERN_SUCCESS) { if (error == KERN_RESOURCE_SHORTAGE) error = ENOMEM; else error = EFAULT; break; } /* * Now do the i/o move. */ error = uiomove_fromphys(&m, page_offset, len, uio); /* Make the I-cache coherent for breakpoints. */ if (writing && error == 0) { vm_map_lock_read(map); if (vm_map_check_protection(map, pageno, pageno + PAGE_SIZE, VM_PROT_EXECUTE)) vm_sync_icache(map, uva, len); vm_map_unlock_read(map); } /* * Release the page.
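 *
 * (For the enclosing one-page-at-a-time loop, a worked example: a
 * 100-byte transfer starting at uva 0x1ffc with 4 KB pages moves
 * min(4096 - 4092, 100) = 4 bytes from the first page, then 96 from
 * the next.)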
*/ vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); } while (error == 0 && uio->uio_resid > 0); return (error); } static ssize_t proc_iop(struct thread *td, struct proc *p, vm_offset_t va, void *buf, size_t len, enum uio_rw rw) { struct iovec iov; struct uio uio; ssize_t slen; int error; MPASS(len < SSIZE_MAX); slen = (ssize_t)len; iov.iov_base = (caddr_t)buf; iov.iov_len = len; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = va; uio.uio_resid = slen; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = rw; uio.uio_td = td; error = proc_rwmem(p, &uio); if (uio.uio_resid == slen) return (-1); return (slen - uio.uio_resid); } ssize_t proc_readmem(struct thread *td, struct proc *p, vm_offset_t va, void *buf, size_t len) { return (proc_iop(td, p, va, buf, len, UIO_READ)); } ssize_t proc_writemem(struct thread *td, struct proc *p, vm_offset_t va, void *buf, size_t len) { return (proc_iop(td, p, va, buf, len, UIO_WRITE)); } static int ptrace_vm_entry(struct thread *td, struct proc *p, struct ptrace_vm_entry *pve) { struct vattr vattr; vm_map_t map; vm_map_entry_t entry; vm_object_t obj, tobj, lobj; struct vmspace *vm; struct vnode *vp; char *freepath, *fullpath; u_int pathlen; int error, index; error = 0; obj = NULL; vm = vmspace_acquire_ref(p); map = &vm->vm_map; vm_map_lock_read(map); do { entry = map->header.next; index = 0; while (index < pve->pve_entry && entry != &map->header) { entry = entry->next; index++; } if (index != pve->pve_entry) { error = EINVAL; break; } while (entry != &map->header && (entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) { entry = entry->next; index++; } if (entry == &map->header) { error = ENOENT; break; } /* We got an entry. */ pve->pve_entry = index + 1; pve->pve_timestamp = map->timestamp; pve->pve_start = entry->start; pve->pve_end = entry->end - 1; pve->pve_offset = entry->offset; pve->pve_prot = entry->protection; /* Backing object's path needed? 
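 *
 * (Protocol, for illustration: the caller presets pve_pathlen to the
 * size of the buffer at pve_path; on success it is rewritten to
 * strlen(path) + 1, and ENAMETOOLONG means the supplied buffer was
 * too small.)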
*/ if (pve->pve_pathlen == 0) break; pathlen = pve->pve_pathlen; pve->pve_pathlen = 0; obj = entry->object.vm_object; if (obj != NULL) VM_OBJECT_RLOCK(obj); } while (0); vm_map_unlock_read(map); pve->pve_fsid = VNOVAL; pve->pve_fileid = VNOVAL; if (error == 0 && obj != NULL) { lobj = obj; for (tobj = obj; tobj != NULL; tobj = tobj->backing_object) { if (tobj != obj) VM_OBJECT_RLOCK(tobj); if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); lobj = tobj; pve->pve_offset += tobj->backing_object_offset; } vp = vm_object_vnode(lobj); if (vp != NULL) vref(vp); if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { freepath = NULL; fullpath = NULL; vn_fullpath(td, vp, &fullpath, &freepath); vn_lock(vp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(vp, &vattr, td->td_ucred) == 0) { pve->pve_fileid = vattr.va_fileid; pve->pve_fsid = vattr.va_fsid; } vput(vp); if (fullpath != NULL) { pve->pve_pathlen = strlen(fullpath) + 1; if (pve->pve_pathlen <= pathlen) { error = copyout(fullpath, pve->pve_path, pve->pve_pathlen); } else error = ENAMETOOLONG; } if (freepath != NULL) free(freepath, M_TEMP); } } vmspace_free(vm); if (error == 0) CTR3(KTR_PTRACE, "PT_VM_ENTRY: pid %d, entry %d, start %p", p->p_pid, pve->pve_entry, pve->pve_start); return (error); } #ifdef COMPAT_FREEBSD32 static int ptrace_vm_entry32(struct thread *td, struct proc *p, struct ptrace_vm_entry32 *pve32) { struct ptrace_vm_entry pve; int error; pve.pve_entry = pve32->pve_entry; pve.pve_pathlen = pve32->pve_pathlen; pve.pve_path = (void *)(uintptr_t)pve32->pve_path; error = ptrace_vm_entry(td, p, &pve); if (error == 0) { pve32->pve_entry = pve.pve_entry; pve32->pve_timestamp = pve.pve_timestamp; pve32->pve_start = pve.pve_start; pve32->pve_end = pve.pve_end; pve32->pve_offset = pve.pve_offset; pve32->pve_prot = pve.pve_prot; pve32->pve_fileid = pve.pve_fileid; pve32->pve_fsid = pve.pve_fsid; } pve32->pve_pathlen = pve.pve_pathlen; return (error); } static void ptrace_lwpinfo_to32(const struct ptrace_lwpinfo *pl, struct ptrace_lwpinfo32 *pl32) { bzero(pl32, sizeof(*pl32)); pl32->pl_lwpid = pl->pl_lwpid; pl32->pl_event = pl->pl_event; pl32->pl_flags = pl->pl_flags; pl32->pl_sigmask = pl->pl_sigmask; pl32->pl_siglist = pl->pl_siglist; siginfo_to_siginfo32(&pl->pl_siginfo, &pl32->pl_siginfo); strcpy(pl32->pl_tdname, pl->pl_tdname); pl32->pl_child_pid = pl->pl_child_pid; pl32->pl_syscall_code = pl->pl_syscall_code; pl32->pl_syscall_narg = pl->pl_syscall_narg; } #endif /* COMPAT_FREEBSD32 */ /* * Process debugging system call. */ #ifndef _SYS_SYSPROTO_H_ struct ptrace_args { int req; pid_t pid; caddr_t addr; int data; }; #endif #ifdef COMPAT_FREEBSD32 /* * This CPP subterfuge is to try and reduce the number of ifdefs in * the body of the code. * COPYIN(uap->addr, &r.reg, sizeof r.reg); * becomes either: * copyin(uap->addr, &r.reg, sizeof r.reg); * or * copyin(uap->addr, &r.reg32, sizeof r.reg32); * .. except this is done at runtime. */ #define COPYIN(u, k, s) wrap32 ? \ copyin(u, k ## 32, s ## 32) : \ copyin(u, k, s) #define COPYOUT(k, u, s) wrap32 ? \ copyout(k ## 32, u, s ## 32) : \ copyout(k, u, s) #else #define COPYIN(u, k, s) copyin(u, k, s) #define COPYOUT(k, u, s) copyout(k, u, s) #endif int sys_ptrace(struct thread *td, struct ptrace_args *uap) { /* * XXX this obfuscation is to reduce stack usage, but the register * structs may be too large to put on the stack anyway. 
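/*
 * Editor's sketch (not part of this revision): a userland walk of a
 * stopped target's VM map using the PT_VM_ENTRY operation implemented by
 * ptrace_vm_entry() above. The kernel advances pve_entry past the entry
 * it just reported and returns ENOENT once the map is exhausted, so the
 * tracer only needs to reset the path buffer each iteration. Assumes
 * `pid` is already attached and stopped.
 */
#include <sys/types.h>
#include <sys/ptrace.h>
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void
dump_vm_map(pid_t pid)
{
	struct ptrace_vm_entry pve;
	char path[1024];

	memset(&pve, 0, sizeof(pve));	/* pve_entry == 0: start of map */
	for (;;) {
		pve.pve_path = path;
		pve.pve_pathlen = sizeof(path);
		path[0] = '\0';
		if (ptrace(PT_VM_ENTRY, pid, (caddr_t)&pve, 0) == -1) {
			if (errno != ENOENT)
				perror("PT_VM_ENTRY");
			break;	/* ENOENT: past the last map entry */
		}
		printf("%#jx-%#jx prot %#x %s\n", (uintmax_t)pve.pve_start,
		    (uintmax_t)pve.pve_end, pve.pve_prot, path);
	}
}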
*/ union { struct ptrace_io_desc piod; struct ptrace_lwpinfo pl; struct ptrace_vm_entry pve; struct dbreg dbreg; struct fpreg fpreg; struct reg reg; #ifdef COMPAT_FREEBSD32 struct dbreg32 dbreg32; struct fpreg32 fpreg32; struct reg32 reg32; struct ptrace_io_desc32 piod32; struct ptrace_lwpinfo32 pl32; struct ptrace_vm_entry32 pve32; #endif char args[nitems(td->td_sa.args) * sizeof(register_t)]; int ptevents; } r; void *addr; int error = 0; #ifdef COMPAT_FREEBSD32 int wrap32 = 0; if (SV_CURPROC_FLAG(SV_ILP32)) wrap32 = 1; #endif AUDIT_ARG_PID(uap->pid); AUDIT_ARG_CMD(uap->req); AUDIT_ARG_VALUE(uap->data); addr = &r; switch (uap->req) { case PT_GET_EVENT_MASK: case PT_GETREGS: case PT_GETFPREGS: case PT_GETDBREGS: case PT_LWPINFO: case PT_GET_SC_ARGS: break; case PT_SETREGS: error = COPYIN(uap->addr, &r.reg, sizeof r.reg); break; case PT_SETFPREGS: error = COPYIN(uap->addr, &r.fpreg, sizeof r.fpreg); break; case PT_SETDBREGS: error = COPYIN(uap->addr, &r.dbreg, sizeof r.dbreg); break; case PT_SET_EVENT_MASK: if (uap->data != sizeof(r.ptevents)) error = EINVAL; else error = copyin(uap->addr, &r.ptevents, uap->data); break; case PT_IO: error = COPYIN(uap->addr, &r.piod, sizeof r.piod); break; case PT_VM_ENTRY: error = COPYIN(uap->addr, &r.pve, sizeof r.pve); break; default: addr = uap->addr; break; } if (error) return (error); error = kern_ptrace(td, uap->req, uap->pid, addr, uap->data); if (error) return (error); switch (uap->req) { case PT_VM_ENTRY: error = COPYOUT(&r.pve, uap->addr, sizeof r.pve); break; case PT_IO: error = COPYOUT(&r.piod, uap->addr, sizeof r.piod); break; case PT_GETREGS: error = COPYOUT(&r.reg, uap->addr, sizeof r.reg); break; case PT_GETFPREGS: error = COPYOUT(&r.fpreg, uap->addr, sizeof r.fpreg); break; case PT_GETDBREGS: error = COPYOUT(&r.dbreg, uap->addr, sizeof r.dbreg); break; case PT_GET_EVENT_MASK: /* NB: The size in uap->data is validated in kern_ptrace(). */ error = copyout(&r.ptevents, uap->addr, uap->data); break; case PT_LWPINFO: /* NB: The size in uap->data is validated in kern_ptrace(). */ error = copyout(&r.pl, uap->addr, uap->data); break; case PT_GET_SC_ARGS: error = copyout(r.args, uap->addr, MIN(uap->data, sizeof(r.args))); break; } return (error); } #undef COPYIN #undef COPYOUT #ifdef COMPAT_FREEBSD32 /* * PROC_READ(regs, td2, addr); * becomes either: * proc_read_regs(td2, addr); * or * proc_read_regs32(td2, addr); * .. except this is done at runtime. There is an additional * complication in that PROC_WRITE disallows 32 bit consumers * from writing to 64 bit address space targets. */ #define PROC_READ(w, t, a) wrap32 ? \ proc_read_ ## w ## 32(t, a) : \ proc_read_ ## w (t, a) #define PROC_WRITE(w, t, a) wrap32 ? \ (safe ? 
proc_write_ ## w ## 32(t, a) : EINVAL ) : \ proc_write_ ## w (t, a) #else #define PROC_READ(w, t, a) proc_read_ ## w (t, a) #define PROC_WRITE(w, t, a) proc_write_ ## w (t, a) #endif void proc_set_traced(struct proc *p, bool stop) { PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag |= P_TRACED; if (stop) p->p_flag2 |= P2_PTRACE_FSTP; p->p_ptevents = PTRACE_DEFAULT; p->p_oppid = p->p_pptr->p_pid; } int kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data) { struct iovec iov; struct uio uio; struct proc *curp, *p, *pp; struct thread *td2 = NULL, *td3; struct ptrace_io_desc *piod = NULL; struct ptrace_lwpinfo *pl; int error, num, tmp; int proctree_locked = 0; lwpid_t tid = 0, *buf; #ifdef COMPAT_FREEBSD32 int wrap32 = 0, safe = 0; struct ptrace_io_desc32 *piod32 = NULL; struct ptrace_lwpinfo32 *pl32 = NULL; struct ptrace_lwpinfo plr; #endif curp = td->td_proc; /* Lock proctree before locking the process. */ switch (req) { case PT_TRACE_ME: case PT_ATTACH: case PT_STEP: case PT_CONTINUE: case PT_TO_SCE: case PT_TO_SCX: case PT_SYSCALL: case PT_FOLLOW_FORK: case PT_LWP_EVENTS: case PT_GET_EVENT_MASK: case PT_SET_EVENT_MASK: case PT_DETACH: case PT_GET_SC_ARGS: sx_xlock(&proctree_lock); proctree_locked = 1; break; default: break; } if (req == PT_TRACE_ME) { p = td->td_proc; PROC_LOCK(p); } else { if (pid <= PID_MAX) { if ((p = pfind(pid)) == NULL) { if (proctree_locked) sx_xunlock(&proctree_lock); return (ESRCH); } } else { td2 = tdfind(pid, -1); if (td2 == NULL) { if (proctree_locked) sx_xunlock(&proctree_lock); return (ESRCH); } p = td2->td_proc; tid = pid; pid = p->p_pid; } } AUDIT_ARG_PROCESS(p); if ((p->p_flag & P_WEXIT) != 0) { error = ESRCH; goto fail; } if ((error = p_cansee(td, p)) != 0) goto fail; if ((error = p_candebug(td, p)) != 0) goto fail; /* * System processes can't be debugged. */ if ((p->p_flag & P_SYSTEM) != 0) { error = EINVAL; goto fail; } if (tid == 0) { if ((p->p_flag & P_STOPPED_TRACE) != 0) { KASSERT(p->p_xthread != NULL, ("NULL p_xthread")); td2 = p->p_xthread; } else { td2 = FIRST_THREAD_IN_PROC(p); } tid = td2->td_tid; } #ifdef COMPAT_FREEBSD32 /* * Test if we're a 32 bit client and what the target is. * Set the wrap controls accordingly. */ if (SV_CURPROC_FLAG(SV_ILP32)) { if (SV_PROC_FLAG(td2->td_proc, SV_ILP32)) safe = 1; wrap32 = 1; } #endif /* * Permissions check */ switch (req) { case PT_TRACE_ME: /* * Always legal, when there is a parent process which * could trace us. Otherwise, reject. */ if ((p->p_flag & P_TRACED) != 0) { error = EBUSY; goto fail; } if (p->p_pptr == initproc) { error = EPERM; goto fail; } break; case PT_ATTACH: /* Self */ if (p == td->td_proc) { error = EINVAL; goto fail; } /* Already traced */ if (p->p_flag & P_TRACED) { error = EBUSY; goto fail; } /* Can't trace an ancestor if you're being traced. */ if (curp->p_flag & P_TRACED) { for (pp = curp->p_pptr; pp != NULL; pp = pp->p_pptr) { if (pp == p) { error = EINVAL; goto fail; } } } /* OK */ break; case PT_CLEARSTEP: /* Allow thread to clear single step for itself */ if (td->td_tid == tid) break; /* FALLTHROUGH */ default: /* not being traced... */ if ((p->p_flag & P_TRACED) == 0) { error = EPERM; goto fail; } /* not being traced by YOU */ if (p->p_pptr != td->td_proc) { error = EBUSY; goto fail; } /* not currently stopped */ if ((p->p_flag & P_STOPPED_TRACE) == 0 || p->p_suspcount != p->p_numthreads || (p->p_flag & P_WAITED) == 0) { error = EBUSY; goto fail; } /* OK */ break; } /* Keep this process around until we finish this request. 
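/*
 * Editor's sketch (not part of this revision): the child side of the
 * PT_TRACE_ME request whose permission checks appear above. A debugger
 * typically forks, has the child volunteer for tracing, and execs the
 * target; the caller then waitpid()s for the post-exec stop.
 */
#include <sys/types.h>
#include <sys/ptrace.h>
#include <unistd.h>

static pid_t
spawn_traced(char *const argv[])
{
	pid_t pid = fork();

	if (pid == 0) {
		if (ptrace(PT_TRACE_ME, 0, NULL, 0) == -1)
			_exit(127);
		execvp(argv[0], argv);
		_exit(127);	/* exec failed */
	}
	return (pid);	/* -1 on fork failure; caller waits for the stop */
}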
*/ _PHOLD(p); #ifdef FIX_SSTEP /* * Single step fixup ala procfs */ FIX_SSTEP(td2); #endif /* * Actually do the requests */ td->td_retval[0] = 0; switch (req) { case PT_TRACE_ME: /* set my trace flag and "owner" so it can read/write me */ proc_set_traced(p, false); if (p->p_flag & P_PPWAIT) p->p_flag |= P_PPTRACE; CTR1(KTR_PTRACE, "PT_TRACE_ME: pid %d", p->p_pid); break; case PT_ATTACH: /* security check done above */ /* * It would be nice if the tracing relationship was separate * from the parent relationship but that would require * another set of links in the proc struct or for "wait" * to scan the entire proc table. To make life easier, * we just re-parent the process we're trying to trace. * The old parent is remembered so we can put things back * on a "detach". */ proc_set_traced(p, true); if (p->p_pptr != td->td_proc) { proc_reparent(p, td->td_proc); } CTR2(KTR_PTRACE, "PT_ATTACH: pid %d, oppid %d", p->p_pid, p->p_oppid); sx_xunlock(&proctree_lock); proctree_locked = 0; MPASS(p->p_xthread == NULL); MPASS((p->p_flag & P_STOPPED_TRACE) == 0); /* * If already stopped due to a stop signal, clear the * existing stop before triggering a traced SIGSTOP. */ if ((p->p_flag & P_STOPPED_SIG) != 0) { PROC_SLOCK(p); p->p_flag &= ~(P_STOPPED_SIG | P_WAITED); thread_unsuspend(p); PROC_SUNLOCK(p); } kern_psignal(p, SIGSTOP); break; case PT_CLEARSTEP: CTR2(KTR_PTRACE, "PT_CLEARSTEP: tid %d (pid %d)", td2->td_tid, p->p_pid); error = ptrace_clear_single_step(td2); break; case PT_SETSTEP: CTR2(KTR_PTRACE, "PT_SETSTEP: tid %d (pid %d)", td2->td_tid, p->p_pid); error = ptrace_single_step(td2); break; case PT_SUSPEND: CTR2(KTR_PTRACE, "PT_SUSPEND: tid %d (pid %d)", td2->td_tid, p->p_pid); td2->td_dbgflags |= TDB_SUSPEND; thread_lock(td2); td2->td_flags |= TDF_NEEDSUSPCHK; thread_unlock(td2); break; case PT_RESUME: CTR2(KTR_PTRACE, "PT_RESUME: tid %d (pid %d)", td2->td_tid, p->p_pid); td2->td_dbgflags &= ~TDB_SUSPEND; break; case PT_FOLLOW_FORK: CTR3(KTR_PTRACE, "PT_FOLLOW_FORK: pid %d %s -> %s", p->p_pid, p->p_ptevents & PTRACE_FORK ? "enabled" : "disabled", data ? "enabled" : "disabled"); if (data) p->p_ptevents |= PTRACE_FORK; else p->p_ptevents &= ~PTRACE_FORK; break; case PT_LWP_EVENTS: CTR3(KTR_PTRACE, "PT_LWP_EVENTS: pid %d %s -> %s", p->p_pid, p->p_ptevents & PTRACE_LWP ? "enabled" : "disabled", data ? 
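/*
 * Editor's sketch (not part of this revision): the tracer-side
 * counterpart of the PT_ATTACH handling above. PT_ATTACH queues a traced
 * SIGSTOP, so the tracer must waitpid() for the stop before issuing
 * further requests; addr == (caddr_t)1 on detach means "resume at the
 * current PC".
 */
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <machine/reg.h>
#include <err.h>

static void
peek_registers(pid_t pid)
{
	struct reg regs;
	int status;

	if (ptrace(PT_ATTACH, pid, NULL, 0) == -1)
		err(1, "PT_ATTACH");
	if (waitpid(pid, &status, 0) == -1)
		err(1, "waitpid");
	if (ptrace(PT_GETREGS, pid, (caddr_t)&regs, 0) == -1)
		err(1, "PT_GETREGS");
	/* ... inspect regs here ... */
	if (ptrace(PT_DETACH, pid, (caddr_t)1, 0) == -1)
		err(1, "PT_DETACH");
}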
"enabled" : "disabled"); if (data) p->p_ptevents |= PTRACE_LWP; else p->p_ptevents &= ~PTRACE_LWP; break; case PT_GET_EVENT_MASK: if (data != sizeof(p->p_ptevents)) { error = EINVAL; break; } CTR2(KTR_PTRACE, "PT_GET_EVENT_MASK: pid %d mask %#x", p->p_pid, p->p_ptevents); *(int *)addr = p->p_ptevents; break; case PT_SET_EVENT_MASK: if (data != sizeof(p->p_ptevents)) { error = EINVAL; break; } tmp = *(int *)addr; if ((tmp & ~(PTRACE_EXEC | PTRACE_SCE | PTRACE_SCX | PTRACE_FORK | PTRACE_LWP | PTRACE_VFORK)) != 0) { error = EINVAL; break; } CTR3(KTR_PTRACE, "PT_SET_EVENT_MASK: pid %d mask %#x -> %#x", p->p_pid, p->p_ptevents, tmp); p->p_ptevents = tmp; break; case PT_GET_SC_ARGS: CTR1(KTR_PTRACE, "PT_GET_SC_ARGS: pid %d", p->p_pid); if ((td2->td_dbgflags & (TDB_SCE | TDB_SCX)) == 0 #ifdef COMPAT_FREEBSD32 || (wrap32 && !safe) #endif ) { error = EINVAL; break; } bzero(addr, sizeof(td2->td_sa.args)); #ifdef COMPAT_FREEBSD32 if (wrap32) for (num = 0; num < nitems(td2->td_sa.args); num++) ((uint32_t *)addr)[num] = (uint32_t) td2->td_sa.args[num]; else #endif bcopy(td2->td_sa.args, addr, td2->td_sa.narg * sizeof(register_t)); break; case PT_STEP: case PT_CONTINUE: case PT_TO_SCE: case PT_TO_SCX: case PT_SYSCALL: case PT_DETACH: /* Zero means do not send any signal */ if (data < 0 || data > _SIG_MAXSIG) { error = EINVAL; break; } switch (req) { case PT_STEP: CTR3(KTR_PTRACE, "PT_STEP: tid %d (pid %d), sig = %d", td2->td_tid, p->p_pid, data); error = ptrace_single_step(td2); if (error) goto out; break; case PT_CONTINUE: case PT_TO_SCE: case PT_TO_SCX: case PT_SYSCALL: if (addr != (void *)1) { error = ptrace_set_pc(td2, (u_long)(uintfptr_t)addr); if (error) goto out; } switch (req) { case PT_TO_SCE: p->p_ptevents |= PTRACE_SCE; CTR4(KTR_PTRACE, "PT_TO_SCE: pid %d, events = %#x, PC = %#lx, sig = %d", p->p_pid, p->p_ptevents, (u_long)(uintfptr_t)addr, data); break; case PT_TO_SCX: p->p_ptevents |= PTRACE_SCX; CTR4(KTR_PTRACE, "PT_TO_SCX: pid %d, events = %#x, PC = %#lx, sig = %d", p->p_pid, p->p_ptevents, (u_long)(uintfptr_t)addr, data); break; case PT_SYSCALL: p->p_ptevents |= PTRACE_SYSCALL; CTR4(KTR_PTRACE, "PT_SYSCALL: pid %d, events = %#x, PC = %#lx, sig = %d", p->p_pid, p->p_ptevents, (u_long)(uintfptr_t)addr, data); break; case PT_CONTINUE: CTR3(KTR_PTRACE, "PT_CONTINUE: pid %d, PC = %#lx, sig = %d", p->p_pid, (u_long)(uintfptr_t)addr, data); break; } break; case PT_DETACH: /* * Reset the process parent. * * NB: This clears P_TRACED before reparenting * a detached process back to its original * parent. Otherwise the debugee will be set * as an orphan of the debugger. */ p->p_flag &= ~(P_TRACED | P_WAITED); if (p->p_oppid != p->p_pptr->p_pid) { PROC_LOCK(p->p_pptr); sigqueue_take(p->p_ksi); PROC_UNLOCK(p->p_pptr); pp = proc_realparent(p); proc_reparent(p, pp); if (pp == initproc) p->p_sigparent = SIGCHLD; CTR3(KTR_PTRACE, "PT_DETACH: pid %d reparented to pid %d, sig %d", p->p_pid, pp->p_pid, data); } else CTR2(KTR_PTRACE, "PT_DETACH: pid %d, sig %d", p->p_pid, data); p->p_oppid = 0; p->p_ptevents = 0; FOREACH_THREAD_IN_PROC(p, td3) { if ((td3->td_dbgflags & TDB_FSTP) != 0) { sigqueue_delete(&td3->td_sigqueue, SIGSTOP); } td3->td_dbgflags &= ~(TDB_XSIG | TDB_FSTP | TDB_SUSPEND); } if ((p->p_flag2 & P2_PTRACE_FSTP) != 0) { sigqueue_delete(&p->p_sigqueue, SIGSTOP); p->p_flag2 &= ~P2_PTRACE_FSTP; } /* should we send SIGCHLD? 
*/ /* childproc_continued(p); */ break; } sx_xunlock(&proctree_lock); proctree_locked = 0; sendsig: MPASS(proctree_locked == 0); /* * Clear the pending event for the thread that just * reported its event (p_xthread). This may not be * the thread passed to PT_CONTINUE, PT_STEP, etc. if * the debugger is resuming a different thread. * * Deliver any pending signal via the reporting thread. */ MPASS(p->p_xthread != NULL); p->p_xthread->td_dbgflags &= ~TDB_XSIG; p->p_xthread->td_xsig = data; p->p_xthread = NULL; p->p_xsig = data; /* * P_WKILLED is insurance that a PT_KILL/SIGKILL * always works immediately, even if another thread is * unsuspended first and attempts to handle a * different signal or if the POSIX.1b style signal * queue cannot accommodate any new signals. */ if (data == SIGKILL) p->p_flag |= P_WKILLED; /* * Unsuspend all threads. To leave a thread * suspended, use PT_SUSPEND to suspend it before * continuing the process. */ PROC_SLOCK(p); p->p_flag &= ~(P_STOPPED_TRACE | P_STOPPED_SIG | P_WAITED); thread_unsuspend(p); PROC_SUNLOCK(p); break; case PT_WRITE_I: case PT_WRITE_D: td2->td_dbgflags |= TDB_USERWR; PROC_UNLOCK(p); error = 0; if (proc_writemem(td, p, (off_t)(uintptr_t)addr, &data, sizeof(int)) != sizeof(int)) error = ENOMEM; else CTR3(KTR_PTRACE, "PT_WRITE: pid %d: %p <= %#x", p->p_pid, addr, data); PROC_LOCK(p); break; case PT_READ_I: case PT_READ_D: PROC_UNLOCK(p); error = tmp = 0; if (proc_readmem(td, p, (off_t)(uintptr_t)addr, &tmp, sizeof(int)) != sizeof(int)) error = ENOMEM; else CTR3(KTR_PTRACE, "PT_READ: pid %d: %p >= %#x", p->p_pid, addr, tmp); td->td_retval[0] = tmp; PROC_LOCK(p); break; case PT_IO: #ifdef COMPAT_FREEBSD32 if (wrap32) { piod32 = addr; iov.iov_base = (void *)(uintptr_t)piod32->piod_addr; iov.iov_len = piod32->piod_len; uio.uio_offset = (off_t)(uintptr_t)piod32->piod_offs; uio.uio_resid = piod32->piod_len; } else #endif { piod = addr; iov.iov_base = piod->piod_addr; iov.iov_len = piod->piod_len; uio.uio_offset = (off_t)(uintptr_t)piod->piod_offs; uio.uio_resid = piod->piod_len; } uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_segflg = UIO_USERSPACE; uio.uio_td = td; #ifdef COMPAT_FREEBSD32 tmp = wrap32 ? 
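/*
 * Editor's sketch (not part of this revision): PT_READ_D as handled
 * above returns the word read in the syscall return value, so -1 is both
 * a valid datum and the error indicator; clearing errno first
 * disambiguates. PT_WRITE_D takes the word to store in `data`.
 */
#include <sys/types.h>
#include <sys/ptrace.h>
#include <errno.h>
#include <err.h>

static int
peek_word(pid_t pid, void *addr)
{
	int word;

	errno = 0;
	word = ptrace(PT_READ_D, pid, (caddr_t)addr, 0);
	if (word == -1 && errno != 0)
		err(1, "PT_READ_D");
	return (word);
}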
piod32->piod_op : piod->piod_op; #else tmp = piod->piod_op; #endif switch (tmp) { case PIOD_READ_D: case PIOD_READ_I: CTR3(KTR_PTRACE, "PT_IO: pid %d: READ (%p, %#x)", p->p_pid, (uintptr_t)uio.uio_offset, uio.uio_resid); uio.uio_rw = UIO_READ; break; case PIOD_WRITE_D: case PIOD_WRITE_I: CTR3(KTR_PTRACE, "PT_IO: pid %d: WRITE (%p, %#x)", p->p_pid, (uintptr_t)uio.uio_offset, uio.uio_resid); td2->td_dbgflags |= TDB_USERWR; uio.uio_rw = UIO_WRITE; break; default: error = EINVAL; goto out; } PROC_UNLOCK(p); error = proc_rwmem(p, &uio); #ifdef COMPAT_FREEBSD32 if (wrap32) piod32->piod_len -= uio.uio_resid; else #endif piod->piod_len -= uio.uio_resid; PROC_LOCK(p); break; case PT_KILL: CTR1(KTR_PTRACE, "PT_KILL: pid %d", p->p_pid); data = SIGKILL; goto sendsig; /* in PT_CONTINUE above */ case PT_SETREGS: CTR2(KTR_PTRACE, "PT_SETREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); td2->td_dbgflags |= TDB_USERWR; error = PROC_WRITE(regs, td2, addr); break; case PT_GETREGS: CTR2(KTR_PTRACE, "PT_GETREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); error = PROC_READ(regs, td2, addr); break; case PT_SETFPREGS: CTR2(KTR_PTRACE, "PT_SETFPREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); td2->td_dbgflags |= TDB_USERWR; error = PROC_WRITE(fpregs, td2, addr); break; case PT_GETFPREGS: CTR2(KTR_PTRACE, "PT_GETFPREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); error = PROC_READ(fpregs, td2, addr); break; case PT_SETDBREGS: CTR2(KTR_PTRACE, "PT_SETDBREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); td2->td_dbgflags |= TDB_USERWR; error = PROC_WRITE(dbregs, td2, addr); break; case PT_GETDBREGS: CTR2(KTR_PTRACE, "PT_GETDBREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); error = PROC_READ(dbregs, td2, addr); break; case PT_LWPINFO: if (data <= 0 || #ifdef COMPAT_FREEBSD32 (!wrap32 && data > sizeof(*pl)) || (wrap32 && data > sizeof(*pl32))) { #else data > sizeof(*pl)) { #endif error = EINVAL; break; } #ifdef COMPAT_FREEBSD32 if (wrap32) { pl = &plr; pl32 = addr; } else #endif pl = addr; bzero(pl, sizeof(*pl)); pl->pl_lwpid = td2->td_tid; pl->pl_event = PL_EVENT_NONE; pl->pl_flags = 0; if (td2->td_dbgflags & TDB_XSIG) { pl->pl_event = PL_EVENT_SIGNAL; if (td2->td_si.si_signo != 0 && #ifdef COMPAT_FREEBSD32 ((!wrap32 && data >= offsetof(struct ptrace_lwpinfo, pl_siginfo) + sizeof(pl->pl_siginfo)) || (wrap32 && data >= offsetof(struct ptrace_lwpinfo32, pl_siginfo) + sizeof(struct siginfo32))) #else data >= offsetof(struct ptrace_lwpinfo, pl_siginfo) + sizeof(pl->pl_siginfo) #endif ){ pl->pl_flags |= PL_FLAG_SI; pl->pl_siginfo = td2->td_si; } } if (td2->td_dbgflags & TDB_SCE) pl->pl_flags |= PL_FLAG_SCE; else if (td2->td_dbgflags & TDB_SCX) pl->pl_flags |= PL_FLAG_SCX; if (td2->td_dbgflags & TDB_EXEC) pl->pl_flags |= PL_FLAG_EXEC; if (td2->td_dbgflags & TDB_FORK) { pl->pl_flags |= PL_FLAG_FORKED; pl->pl_child_pid = td2->td_dbg_forked; if (td2->td_dbgflags & TDB_VFORK) pl->pl_flags |= PL_FLAG_VFORKED; } else if ((td2->td_dbgflags & (TDB_SCX | TDB_VFORK)) == TDB_VFORK) pl->pl_flags |= PL_FLAG_VFORK_DONE; if (td2->td_dbgflags & TDB_CHILD) pl->pl_flags |= PL_FLAG_CHILD; if (td2->td_dbgflags & TDB_BORN) pl->pl_flags |= PL_FLAG_BORN; if (td2->td_dbgflags & TDB_EXIT) pl->pl_flags |= PL_FLAG_EXITED; pl->pl_sigmask = td2->td_sigmask; pl->pl_siglist = td2->td_siglist; strcpy(pl->pl_tdname, td2->td_name); if ((td2->td_dbgflags & (TDB_SCE | TDB_SCX)) != 0) { pl->pl_syscall_code = td2->td_sa.code; pl->pl_syscall_narg = td2->td_sa.narg; } else { pl->pl_syscall_code = 0; pl->pl_syscall_narg = 0; } #ifdef COMPAT_FREEBSD32 if (wrap32) 
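/*
 * Editor's sketch (not part of this revision): bulk transfers via the
 * PT_IO case above avoid the word-at-a-time PT_READ/PT_WRITE loop. The
 * kernel subtracts the untransferred residue from piod_len, so a short
 * count can be detected on return. Target assumed stopped.
 */
#include <sys/types.h>
#include <sys/ptrace.h>
#include <err.h>

static size_t
read_block(pid_t pid, void *remote, void *local, size_t len)
{
	struct ptrace_io_desc piod;

	piod.piod_op = PIOD_READ_D;
	piod.piod_offs = remote;	/* address in the target */
	piod.piod_addr = local;		/* buffer in the tracer */
	piod.piod_len = len;
	if (ptrace(PT_IO, pid, (caddr_t)&piod, 0) == -1)
		err(1, "PT_IO");
	return (piod.piod_len);		/* bytes actually moved */
}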
ptrace_lwpinfo_to32(pl, pl32); #endif CTR6(KTR_PTRACE, "PT_LWPINFO: tid %d (pid %d) event %d flags %#x child pid %d syscall %d", td2->td_tid, p->p_pid, pl->pl_event, pl->pl_flags, pl->pl_child_pid, pl->pl_syscall_code); break; case PT_GETNUMLWPS: CTR2(KTR_PTRACE, "PT_GETNUMLWPS: pid %d: %d threads", p->p_pid, p->p_numthreads); td->td_retval[0] = p->p_numthreads; break; case PT_GETLWPLIST: CTR3(KTR_PTRACE, "PT_GETLWPLIST: pid %d: data %d, actual %d", p->p_pid, data, p->p_numthreads); if (data <= 0) { error = EINVAL; break; } num = imin(p->p_numthreads, data); PROC_UNLOCK(p); buf = malloc(num * sizeof(lwpid_t), M_TEMP, M_WAITOK); tmp = 0; PROC_LOCK(p); FOREACH_THREAD_IN_PROC(p, td2) { if (tmp >= num) break; buf[tmp++] = td2->td_tid; } PROC_UNLOCK(p); error = copyout(buf, addr, tmp * sizeof(lwpid_t)); free(buf, M_TEMP); if (!error) td->td_retval[0] = tmp; PROC_LOCK(p); break; case PT_VM_TIMESTAMP: CTR2(KTR_PTRACE, "PT_VM_TIMESTAMP: pid %d: timestamp %d", p->p_pid, p->p_vmspace->vm_map.timestamp); td->td_retval[0] = p->p_vmspace->vm_map.timestamp; break; case PT_VM_ENTRY: PROC_UNLOCK(p); #ifdef COMPAT_FREEBSD32 if (wrap32) error = ptrace_vm_entry32(td, p, addr); else #endif error = ptrace_vm_entry(td, p, addr); PROC_LOCK(p); break; default: #ifdef __HAVE_PTRACE_MACHDEP if (req >= PT_FIRSTMACH) { PROC_UNLOCK(p); error = cpu_ptrace(td2, req, addr, data); PROC_LOCK(p); } else #endif /* Unknown request. */ error = EINVAL; break; } out: /* Drop our hold on this process now that the request has completed. */ _PRELE(p); fail: PROC_UNLOCK(p); if (proctree_locked) sx_xunlock(&proctree_lock); return (error); } #undef PROC_READ #undef PROC_WRITE /* * Stop a process because of a debugging event; * stay stopped until p->p_step is cleared * (cleared by PIOCCONT in procfs). */ void stopevent(struct proc *p, unsigned int event, unsigned int val) { PROC_LOCK_ASSERT(p, MA_OWNED); p->p_step = 1; CTR3(KTR_PTRACE, "stopevent: pid %d event %u val %u", p->p_pid, event, val); do { if (event != S_EXIT) p->p_xsig = val; p->p_xthread = NULL; p->p_stype = event; /* Which event caused the stop? */ wakeup(&p->p_stype); /* Wake up any PIOCWAIT'ing procs */ msleep(&p->p_step, &p->p_mtx, PWAIT, "stopevent", 0); } while (p->p_step); } Index: head/sys/kern/sysv_ipc.c =================================================================== --- head/sys/kern/sysv_ipc.c (revision 326270) +++ head/sys/kern/sysv_ipc.c (revision 326271) @@ -1,246 +1,248 @@ /* $NetBSD: sysv_ipc.c,v 1.7 1994/06/29 06:33:11 cgd Exp $ */ /*- + * SPDX-License-Identifier: BSD-4-Clause + * * Copyright (c) 1994 Herb Peyerl * Copyright (c) 2006 nCircle Network Security, Inc. * All rights reserved. * * This software was developed by Robert N. M. Watson for the TrustedBSD * Project under contract to nCircle Network Security, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Herb Peyerl. * 4. 
The name of Herb Peyerl may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_sysvipc.h" #include #include #include #include #include #include #include #include void (*shmfork_hook)(struct proc *, struct proc *) = NULL; void (*shmexit_hook)(struct vmspace *) = NULL; /* called from kern_fork.c */ void shmfork(p1, p2) struct proc *p1, *p2; { if (shmfork_hook != NULL) shmfork_hook(p1, p2); return; } /* called from kern_exit.c */ void shmexit(struct vmspace *vm) { if (shmexit_hook != NULL) shmexit_hook(vm); return; } /* * Check for IPC permission. * * Note: The MAC Framework does not require any modifications to the * ipcperm() function, as access control checks are performed throughout the * implementation of each primitive. Those entry point calls complement the * ipcperm() discretionary checks. Unlike file system discretionary access * control, the original creator of an object is given the same rights as the * current owner. */ int ipcperm(struct thread *td, struct ipc_perm *perm, int acc_mode) { struct ucred *cred = td->td_ucred; int error, obj_mode, dac_granted, priv_granted; dac_granted = 0; if (cred->cr_uid == perm->cuid || cred->cr_uid == perm->uid) { obj_mode = perm->mode; dac_granted |= IPC_M; } else if (groupmember(perm->gid, cred) || groupmember(perm->cgid, cred)) { obj_mode = perm->mode; obj_mode <<= 3; } else { obj_mode = perm->mode; obj_mode <<= 6; } /* * While the System V IPC permission model allows IPC_M to be * granted, as part of the mode, our implementation requires * privilege to administer the object if not the owner or creator. */ #if 0 if (obj_mode & IPC_M) dac_granted |= IPC_M; #endif if (obj_mode & IPC_R) dac_granted |= IPC_R; if (obj_mode & IPC_W) dac_granted |= IPC_W; /* * Simple case: all required rights are granted by DAC. */ if ((dac_granted & acc_mode) == acc_mode) return (0); /* * Privilege is required to satisfy the request.
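/*
 * Editor's sketch (not part of this revision): the shifts in ipcperm()
 * above line the relevant rwx triplet up under the owner bits so the
 * IPC_R/IPC_W masks (octal 0400/0200) test it directly. This pure
 * function restates that computation for illustration only; real checks
 * must go through ipcperm() so privilege and MAC policy still apply.
 */
#include <sys/types.h>
#include <sys/ipc.h>

static int
example_dac_bits(mode_t mode, int is_owner, int is_group)
{
	int obj_mode, granted = 0;

	if (is_owner) {
		obj_mode = mode;	/* owner bits already at 0700 */
		granted |= IPC_M;	/* owner/creator may administer */
	} else if (is_group)
		obj_mode = mode << 3;	/* group bits 0070 -> 0700 */
	else
		obj_mode = mode << 6;	/* other bits 0007 -> 0700 */
	if (obj_mode & IPC_R)
		granted |= IPC_R;
	if (obj_mode & IPC_W)
		granted |= IPC_W;
	/* e.g. mode 0620, group member: granted == IPC_W only */
	return (granted);
}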
*/ priv_granted = 0; if ((acc_mode & IPC_M) && !(dac_granted & IPC_M)) { error = priv_check(td, PRIV_IPC_ADMIN); if (error == 0) priv_granted |= IPC_M; } if ((acc_mode & IPC_R) && !(dac_granted & IPC_R)) { error = priv_check(td, PRIV_IPC_READ); if (error == 0) priv_granted |= IPC_R; } if ((acc_mode & IPC_W) && !(dac_granted & IPC_W)) { error = priv_check(td, PRIV_IPC_WRITE); if (error == 0) priv_granted |= IPC_W; } if (((dac_granted | priv_granted) & acc_mode) == acc_mode) return (0); else return (EACCES); } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) void ipcperm_old2new(struct ipc_perm_old *old, struct ipc_perm *new) { new->cuid = old->cuid; new->cgid = old->cgid; new->uid = old->uid; new->gid = old->gid; new->mode = old->mode; new->seq = old->seq; new->key = old->key; } void ipcperm_new2old(struct ipc_perm *new, struct ipc_perm_old *old) { /* XXX: How to handle ID's > USHORT_MAX? */ old->cuid = new->cuid; old->cgid = new->cgid; old->uid = new->uid; old->gid = new->gid; old->mode = new->mode; old->seq = new->seq; old->key = new->key; } #endif #ifdef COMPAT_FREEBSD32 #include #include #include #include #include #include #include #include #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) void freebsd32_ipcperm_old_in(struct ipc_perm32_old *ip32, struct ipc_perm *ip) { CP(*ip32, *ip, cuid); CP(*ip32, *ip, cgid); CP(*ip32, *ip, uid); CP(*ip32, *ip, gid); CP(*ip32, *ip, mode); CP(*ip32, *ip, seq); CP(*ip32, *ip, key); } void freebsd32_ipcperm_old_out(struct ipc_perm *ip, struct ipc_perm32_old *ip32) { CP(*ip, *ip32, cuid); CP(*ip, *ip32, cgid); CP(*ip, *ip32, uid); CP(*ip, *ip32, gid); CP(*ip, *ip32, mode); CP(*ip, *ip32, seq); CP(*ip, *ip32, key); } #endif void freebsd32_ipcperm_in(struct ipc_perm32 *ip32, struct ipc_perm *ip) { CP(*ip32, *ip, cuid); CP(*ip32, *ip, cgid); CP(*ip32, *ip, uid); CP(*ip32, *ip, gid); CP(*ip32, *ip, mode); CP(*ip32, *ip, seq); CP(*ip32, *ip, key); } void freebsd32_ipcperm_out(struct ipc_perm *ip, struct ipc_perm32 *ip32) { CP(*ip, *ip32, cuid); CP(*ip, *ip32, cgid); CP(*ip, *ip32, uid); CP(*ip, *ip32, gid); CP(*ip, *ip32, mode); CP(*ip, *ip32, seq); CP(*ip, *ip32, key); } #endif Index: head/sys/kern/sysv_msg.c =================================================================== --- head/sys/kern/sysv_msg.c (revision 326270) +++ head/sys/kern/sysv_msg.c (revision 326271) @@ -1,1880 +1,1882 @@ /*- * Implementation of SVID messages * * Author: Daniel Boulet * * Copyright 1993 Daniel Boulet and RTMX Inc. * * This system call was implemented by Daniel Boulet under contract from RTMX. * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. * * Redistribution in binary form may occur without any restrictions. * Obviously, it would be nice if you gave credit where credit is due * but requiring it would be too onerous. * * This software is provided ``AS IS'' without any warranties of any kind. */ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2003-2005 McAfee, Inc. * Copyright (c) 2016-2017 Robert N. M. Watson * All rights reserved. * * This software was developed for the FreeBSD Project in part by McAfee * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research * program. 
* * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_sysvipc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(sysv_msg, "System V message queues support"); static MALLOC_DEFINE(M_MSG, "msg", "SVID compatible message queues"); static int msginit(void); static int msgunload(void); static int sysvmsg_modload(struct module *, int, void *); static void msq_remove(struct msqid_kernel *); static struct prison *msg_find_prison(struct ucred *); static int msq_prison_cansee(struct prison *, struct msqid_kernel *); static int msg_prison_check(void *, void *); static int msg_prison_set(void *, void *); static int msg_prison_get(void *, void *); static int msg_prison_remove(void *, void *); static void msg_prison_cleanup(struct prison *); #ifdef MSG_DEBUG #define DPRINTF(a) printf a #else #define DPRINTF(a) (void)0 #endif static void msg_freehdr(struct msg *msghdr); #ifndef MSGSSZ #define MSGSSZ 8 /* Each segment must be 2^N long */ #endif #ifndef MSGSEG #define MSGSEG 2048 /* must be less than 32767 */ #endif #define MSGMAX (MSGSSZ*MSGSEG) #ifndef MSGMNB #define MSGMNB 2048 /* max # of bytes in a queue */ #endif #ifndef MSGMNI #define MSGMNI 40 #endif #ifndef MSGTQL #define MSGTQL 40 #endif /* * Based on the configuration parameters described in an SVR2 (yes, two) * config(1m) man page. * * Each message is broken up and stored in segments that are msgssz bytes * long. For efficiency reasons, this should be a power of two. Also, * it doesn't make sense if it is less than 8 or greater than about 256. * Consequently, msginit in kern/sysv_msg.c checks that msgssz is a power of * two between 8 and 1024 inclusive (and panics if it isn't).
*/ struct msginfo msginfo = { MSGMAX, /* max chars in a message */ MSGMNI, /* # of message queue identifiers */ MSGMNB, /* max chars in a queue */ MSGTQL, /* max messages in system */ MSGSSZ, /* size of a message segment */ /* (must be small power of 2 greater than 4) */ MSGSEG /* number of message segments */ }; /* * macros to convert between msqid_ds's and msqid's. * (specific to this implementation) */ #define MSQID(ix,ds) ((ix) & 0xffff | (((ds).msg_perm.seq << 16) & 0xffff0000)) #define MSQID_IX(id) ((id) & 0xffff) #define MSQID_SEQ(id) (((id) >> 16) & 0xffff) /* * The rest of this file is specific to this particular implementation. */ struct msgmap { short next; /* next segment in buffer */ /* -1 -> available */ /* 0..(MSGSEG-1) -> index of next segment */ }; #define MSG_LOCKED 01000 /* Is this msqid_ds locked? */ static int nfree_msgmaps; /* # of free map entries */ static short free_msgmaps; /* head of linked list of free map entries */ static struct msg *free_msghdrs;/* list of free msg headers */ static char *msgpool; /* MSGMAX byte long msg buffer pool */ static struct msgmap *msgmaps; /* MSGSEG msgmap structures */ static struct msg *msghdrs; /* MSGTQL msg headers */ static struct msqid_kernel *msqids; /* MSGMNI msqid_kernel struct's */ static struct mtx msq_mtx; /* global mutex for message queues. */ static unsigned msg_prison_slot;/* prison OSD slot */ static struct syscall_helper_data msg_syscalls[] = { SYSCALL_INIT_HELPER(msgctl), SYSCALL_INIT_HELPER(msgget), SYSCALL_INIT_HELPER(msgsnd), SYSCALL_INIT_HELPER(msgrcv), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL_INIT_HELPER(msgsys), SYSCALL_INIT_HELPER_COMPAT(freebsd7_msgctl), #endif SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include #include static struct syscall_helper_data msg32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_msgctl), SYSCALL32_INIT_HELPER(freebsd32_msgsnd), SYSCALL32_INIT_HELPER(freebsd32_msgrcv), SYSCALL32_INIT_HELPER_COMPAT(msgget), SYSCALL32_INIT_HELPER(freebsd32_msgsys), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL32_INIT_HELPER(freebsd7_freebsd32_msgctl), #endif SYSCALL_INIT_LAST }; #endif static int msginit() { struct prison *pr; void **rsv; int i, error; osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_CHECK] = msg_prison_check, [PR_METHOD_SET] = msg_prison_set, [PR_METHOD_GET] = msg_prison_get, [PR_METHOD_REMOVE] = msg_prison_remove, }; msginfo.msgmax = msginfo.msgseg * msginfo.msgssz; msgpool = malloc(msginfo.msgmax, M_MSG, M_WAITOK); msgmaps = malloc(sizeof(struct msgmap) * msginfo.msgseg, M_MSG, M_WAITOK); msghdrs = malloc(sizeof(struct msg) * msginfo.msgtql, M_MSG, M_WAITOK); msqids = malloc(sizeof(struct msqid_kernel) * msginfo.msgmni, M_MSG, M_WAITOK); /* * msginfo.msgssz should be a power of two for efficiency reasons. * It is also pretty silly if msginfo.msgssz is less than 8 * or greater than about 256 so ... 
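/*
 * Editor's sketch (not part of this revision): the MSQID()/MSQID_IX()/
 * MSQID_SEQ() macros above pack a table index into the low 16 bits of
 * the identifier and a per-slot sequence number into the high 16 bits,
 * so a stale id naming a recycled slot is caught by a sequence mismatch.
 * A round trip through the same arithmetic:
 */
#include <assert.h>

static void
example_msqid_roundtrip(void)
{
	int ix = 7;		/* slot in msqids[] */
	int seq = 0x1234;	/* msg_perm.seq for that slot */
	int id = (ix & 0xffff) | ((seq << 16) & 0xffff0000);

	assert((id & 0xffff) == ix);		/* MSQID_IX(id) */
	assert(((id >> 16) & 0xffff) == seq);	/* MSQID_SEQ(id) */
}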
*/ i = 8; while (i < 1024 && i != msginfo.msgssz) i <<= 1; if (i != msginfo.msgssz) { DPRINTF(("msginfo.msgssz=%d (0x%x)\n", msginfo.msgssz, msginfo.msgssz)); panic("msginfo.msgssz not a small power of 2"); } if (msginfo.msgseg > 32767) { DPRINTF(("msginfo.msgseg=%d\n", msginfo.msgseg)); panic("msginfo.msgseg > 32767"); } for (i = 0; i < msginfo.msgseg; i++) { if (i > 0) msgmaps[i-1].next = i; msgmaps[i].next = -1; /* implies entry is available */ } free_msgmaps = 0; nfree_msgmaps = msginfo.msgseg; for (i = 0; i < msginfo.msgtql; i++) { msghdrs[i].msg_type = 0; if (i > 0) msghdrs[i-1].msg_next = &msghdrs[i]; msghdrs[i].msg_next = NULL; #ifdef MAC mac_sysvmsg_init(&msghdrs[i]); #endif } free_msghdrs = &msghdrs[0]; for (i = 0; i < msginfo.msgmni; i++) { msqids[i].u.msg_qbytes = 0; /* implies entry is available */ msqids[i].u.msg_perm.seq = 0; /* reset to a known value */ msqids[i].u.msg_perm.mode = 0; #ifdef MAC mac_sysvmsq_init(&msqids[i]); #endif } mtx_init(&msq_mtx, "msq", NULL, MTX_DEF); /* Set current prisons according to their allow.sysvipc. */ msg_prison_slot = osd_jail_register(NULL, methods); rsv = osd_reserve(msg_prison_slot); prison_lock(&prison0); (void)osd_jail_set_reserved(&prison0, msg_prison_slot, rsv, &prison0); prison_unlock(&prison0); rsv = NULL; sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) { if (rsv == NULL) rsv = osd_reserve(msg_prison_slot); prison_lock(pr); if ((pr->pr_allow & PR_ALLOW_SYSVIPC) && pr->pr_ref > 0) { (void)osd_jail_set_reserved(pr, msg_prison_slot, rsv, &prison0); rsv = NULL; } prison_unlock(pr); } if (rsv != NULL) osd_free_reserved(rsv); sx_sunlock(&allprison_lock); error = syscall_helper_register(msg_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(msg32_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #endif return (0); } static int msgunload() { struct msqid_kernel *msqkptr; int msqid; #ifdef MAC int i; #endif syscall_helper_unregister(msg_syscalls); #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(msg32_syscalls); #endif for (msqid = 0; msqid < msginfo.msgmni; msqid++) { msqkptr = &msqids[msqid]; if (msqkptr->u.msg_qbytes != 0 || (msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) break; } if (msqid != msginfo.msgmni) return (EBUSY); if (msg_prison_slot != 0) osd_jail_deregister(msg_prison_slot); #ifdef MAC for (i = 0; i < msginfo.msgtql; i++) mac_sysvmsg_destroy(&msghdrs[i]); for (msqid = 0; msqid < msginfo.msgmni; msqid++) mac_sysvmsq_destroy(&msqids[msqid]); #endif free(msgpool, M_MSG); free(msgmaps, M_MSG); free(msghdrs, M_MSG); free(msqids, M_MSG); mtx_destroy(&msq_mtx); return (0); } static int sysvmsg_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: error = msginit(); if (error != 0) msgunload(); break; case MOD_UNLOAD: error = msgunload(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sysvmsg_mod = { "sysvmsg", &sysvmsg_modload, NULL }; DECLARE_MODULE(sysvmsg, sysvmsg_mod, SI_SUB_SYSV_MSG, SI_ORDER_FIRST); MODULE_VERSION(sysvmsg, 1); static void msg_freehdr(msghdr) struct msg *msghdr; { while (msghdr->msg_ts > 0) { short next; if (msghdr->msg_spot < 0 || msghdr->msg_spot >= msginfo.msgseg) panic("msghdr->msg_spot out of range"); next = msgmaps[msghdr->msg_spot].next; msgmaps[msghdr->msg_spot].next = free_msgmaps; free_msgmaps = msghdr->msg_spot; nfree_msgmaps++; msghdr->msg_spot = next; if (msghdr->msg_ts >= msginfo.msgssz) 
msghdr->msg_ts -= msginfo.msgssz; else msghdr->msg_ts = 0; } if (msghdr->msg_spot != -1) panic("msghdr->msg_spot != -1"); msghdr->msg_next = free_msghdrs; free_msghdrs = msghdr; #ifdef MAC mac_sysvmsg_cleanup(msghdr); #endif } static void msq_remove(struct msqid_kernel *msqkptr) { struct msg *msghdr; racct_sub_cred(msqkptr->cred, RACCT_NMSGQ, 1); racct_sub_cred(msqkptr->cred, RACCT_MSGQQUEUED, msqkptr->u.msg_qnum); racct_sub_cred(msqkptr->cred, RACCT_MSGQSIZE, msqkptr->u.msg_cbytes); crfree(msqkptr->cred); msqkptr->cred = NULL; /* Free the message headers */ msghdr = msqkptr->u.msg_first; while (msghdr != NULL) { struct msg *msghdr_tmp; /* Free the segments of each message */ msqkptr->u.msg_cbytes -= msghdr->msg_ts; msqkptr->u.msg_qnum--; msghdr_tmp = msghdr; msghdr = msghdr->msg_next; msg_freehdr(msghdr_tmp); } if (msqkptr->u.msg_cbytes != 0) panic("msg_cbytes is screwed up"); if (msqkptr->u.msg_qnum != 0) panic("msg_qnum is screwed up"); msqkptr->u.msg_qbytes = 0; /* Mark it as free */ #ifdef MAC mac_sysvmsq_cleanup(msqkptr); #endif wakeup(msqkptr); } static struct prison * msg_find_prison(struct ucred *cred) { struct prison *pr, *rpr; pr = cred->cr_prison; prison_lock(pr); rpr = osd_jail_get(pr, msg_prison_slot); prison_unlock(pr); return rpr; } static int msq_prison_cansee(struct prison *rpr, struct msqid_kernel *msqkptr) { if (msqkptr->cred == NULL || !(rpr == msqkptr->cred->cr_prison || prison_ischild(rpr, msqkptr->cred->cr_prison))) return (EINVAL); return (0); } #ifndef _SYS_SYSPROTO_H_ struct msgctl_args { int msqid; int cmd; struct msqid_ds *buf; }; #endif int sys_msgctl(struct thread *td, struct msgctl_args *uap) { int msqid = uap->msqid; int cmd = uap->cmd; struct msqid_ds msqbuf; int error; DPRINTF(("call to msgctl(%d, %d, %p)\n", msqid, cmd, uap->buf)); if (cmd == IPC_SET && (error = copyin(uap->buf, &msqbuf, sizeof(msqbuf))) != 0) return (error); error = kern_msgctl(td, msqid, cmd, &msqbuf); if (cmd == IPC_STAT && error == 0) error = copyout(&msqbuf, uap->buf, sizeof(struct msqid_ds)); return (error); } int kern_msgctl(td, msqid, cmd, msqbuf) struct thread *td; int msqid; int cmd; struct msqid_ds *msqbuf; { int rval, error, msqix; struct msqid_kernel *msqkptr; struct prison *rpr; rpr = msg_find_prison(td->td_ucred); if (rpr == NULL) return (ENOSYS); AUDIT_ARG_SVIPC_CMD(cmd); AUDIT_ARG_SVIPC_ID(msqid); msqix = IPCID_TO_IX(msqid); if (msqix < 0 || msqix >= msginfo.msgmni) { DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix, msginfo.msgmni)); return (EINVAL); } msqkptr = &msqids[msqix]; mtx_lock(&msq_mtx); if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("no such msqid\n")); error = EINVAL; goto done2; } if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("wrong sequence number\n")); error = EINVAL; goto done2; } error = msq_prison_cansee(rpr, msqkptr); if (error != 0) { DPRINTF(("requester can't see prison\n")); goto done2; } #ifdef MAC error = mac_sysvmsq_check_msqctl(td->td_ucred, msqkptr, cmd); if (error != 0) goto done2; #endif error = 0; rval = 0; switch (cmd) { case IPC_RMID: { #ifdef MAC struct msg *msghdr; #endif if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M))) goto done2; #ifdef MAC /* * Check that the thread has MAC access permissions to * individual msghdrs. Note: We need to do this in a * separate loop because the actual loop alters the * msq/msghdr info as it progresses, and there is no going * back if half the way through we discover that the * thread cannot free a certain msghdr. The msq will get * into an inconsistent state. 
*/ for (msghdr = msqkptr->u.msg_first; msghdr != NULL; msghdr = msghdr->msg_next) { error = mac_sysvmsq_check_msgrmid(td->td_ucred, msghdr); if (error != 0) goto done2; } #endif msq_remove(msqkptr); } break; case IPC_SET: AUDIT_ARG_SVIPC_PERM(&msqbuf->msg_perm); if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M))) goto done2; if (msqbuf->msg_qbytes > msqkptr->u.msg_qbytes) { error = priv_check(td, PRIV_IPC_MSGSIZE); if (error) goto done2; } if (msqbuf->msg_qbytes > msginfo.msgmnb) { DPRINTF(("can't increase msg_qbytes beyond %d" "(truncating)\n", msginfo.msgmnb)); msqbuf->msg_qbytes = msginfo.msgmnb; /* silently restrict qbytes to system limit */ } if (msqbuf->msg_qbytes == 0) { DPRINTF(("can't reduce msg_qbytes to 0\n")); error = EINVAL; /* non-standard errno! */ goto done2; } msqkptr->u.msg_perm.uid = msqbuf->msg_perm.uid; /* change the owner */ msqkptr->u.msg_perm.gid = msqbuf->msg_perm.gid; /* change the owner */ msqkptr->u.msg_perm.mode = (msqkptr->u.msg_perm.mode & ~0777) | (msqbuf->msg_perm.mode & 0777); msqkptr->u.msg_qbytes = msqbuf->msg_qbytes; msqkptr->u.msg_ctime = time_second; break; case IPC_STAT: if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) { DPRINTF(("requester doesn't have read access\n")); goto done2; } *msqbuf = msqkptr->u; if (td->td_ucred->cr_prison != msqkptr->cred->cr_prison) msqbuf->msg_perm.key = IPC_PRIVATE; break; default: DPRINTF(("invalid command %d\n", cmd)); error = EINVAL; goto done2; } if (error == 0) td->td_retval[0] = rval; done2: mtx_unlock(&msq_mtx); return (error); } #ifndef _SYS_SYSPROTO_H_ struct msgget_args { key_t key; int msgflg; }; #endif int sys_msgget(struct thread *td, struct msgget_args *uap) { int msqid, error = 0; int key = uap->key; int msgflg = uap->msgflg; struct ucred *cred = td->td_ucred; struct msqid_kernel *msqkptr = NULL; DPRINTF(("msgget(0x%x, 0%o)\n", key, msgflg)); if (msg_find_prison(cred) == NULL) return (ENOSYS); mtx_lock(&msq_mtx); if (key != IPC_PRIVATE) { for (msqid = 0; msqid < msginfo.msgmni; msqid++) { msqkptr = &msqids[msqid]; if (msqkptr->u.msg_qbytes != 0 && msqkptr->cred != NULL && msqkptr->cred->cr_prison == cred->cr_prison && msqkptr->u.msg_perm.key == key) break; } if (msqid < msginfo.msgmni) { DPRINTF(("found public key\n")); if ((msgflg & IPC_CREAT) && (msgflg & IPC_EXCL)) { DPRINTF(("not exclusive\n")); error = EEXIST; goto done2; } AUDIT_ARG_SVIPC_ID(IXSEQ_TO_IPCID(msqid, msqkptr->u.msg_perm)); if ((error = ipcperm(td, &msqkptr->u.msg_perm, msgflg & 0700))) { DPRINTF(("requester doesn't have 0%o access\n", msgflg & 0700)); goto done2; } #ifdef MAC error = mac_sysvmsq_check_msqget(cred, msqkptr); if (error != 0) goto done2; #endif goto found; } } DPRINTF(("need to allocate the msqid_ds\n")); if (key == IPC_PRIVATE || (msgflg & IPC_CREAT)) { for (msqid = 0; msqid < msginfo.msgmni; msqid++) { /* * Look for an unallocated and unlocked msqid_ds. * msqid_ds's can be locked by msgsnd or msgrcv while * they are copying the message in/out. We can't * re-use the entry until they release it. 
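/*
 * Editor's sketch (not part of this revision): the userland view of the
 * IPC_STAT/IPC_SET paths in kern_msgctl() above. Per the code: raising
 * msg_qbytes needs privilege, values above the kern.ipc.msgmnb limit are
 * silently clamped, qbytes of zero is rejected, and IPC_SET honors only
 * the owner ids, the low permission bits and msg_qbytes.
 */
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <err.h>

static void
shrink_queue(int msqid)
{
	struct msqid_ds ds;

	if (msgctl(msqid, IPC_STAT, &ds) == -1)
		err(1, "IPC_STAT");
	ds.msg_qbytes /= 2;	/* lowering needs no privilege; 0 is EINVAL */
	if (msgctl(msqid, IPC_SET, &ds) == -1)
		err(1, "IPC_SET");
}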
*/ msqkptr = &msqids[msqid]; if (msqkptr->u.msg_qbytes == 0 && (msqkptr->u.msg_perm.mode & MSG_LOCKED) == 0) break; } if (msqid == msginfo.msgmni) { DPRINTF(("no more msqid_ds's available\n")); error = ENOSPC; goto done2; } #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); error = racct_add(td->td_proc, RACCT_NMSGQ, 1); PROC_UNLOCK(td->td_proc); if (error != 0) { error = ENOSPC; goto done2; } } #endif DPRINTF(("msqid %d is available\n", msqid)); msqkptr->u.msg_perm.key = key; msqkptr->u.msg_perm.cuid = cred->cr_uid; msqkptr->u.msg_perm.uid = cred->cr_uid; msqkptr->u.msg_perm.cgid = cred->cr_gid; msqkptr->u.msg_perm.gid = cred->cr_gid; msqkptr->u.msg_perm.mode = (msgflg & 0777); msqkptr->cred = crhold(cred); /* Make sure that the returned msqid is unique */ msqkptr->u.msg_perm.seq = (msqkptr->u.msg_perm.seq + 1) & 0x7fff; msqkptr->u.msg_first = NULL; msqkptr->u.msg_last = NULL; msqkptr->u.msg_cbytes = 0; msqkptr->u.msg_qnum = 0; msqkptr->u.msg_qbytes = msginfo.msgmnb; msqkptr->u.msg_lspid = 0; msqkptr->u.msg_lrpid = 0; msqkptr->u.msg_stime = 0; msqkptr->u.msg_rtime = 0; msqkptr->u.msg_ctime = time_second; #ifdef MAC mac_sysvmsq_create(cred, msqkptr); #endif AUDIT_ARG_SVIPC_PERM(&msqkptr->u.msg_perm); } else { DPRINTF(("didn't find it and wasn't asked to create it\n")); error = ENOENT; goto done2; } found: /* Construct the unique msqid */ td->td_retval[0] = IXSEQ_TO_IPCID(msqid, msqkptr->u.msg_perm); done2: mtx_unlock(&msq_mtx); return (error); } #ifndef _SYS_SYSPROTO_H_ struct msgsnd_args { int msqid; const void *msgp; /* XXX msgp is actually mtext. */ size_t msgsz; int msgflg; }; #endif int kern_msgsnd(struct thread *td, int msqid, const void *msgp, size_t msgsz, int msgflg, long mtype) { int msqix, segs_needed, error = 0; struct msqid_kernel *msqkptr; struct msg *msghdr; struct prison *rpr; short next; #ifdef RACCT size_t saved_msgsz; #endif rpr = msg_find_prison(td->td_ucred); if (rpr == NULL) return (ENOSYS); mtx_lock(&msq_mtx); AUDIT_ARG_SVIPC_ID(msqid); msqix = IPCID_TO_IX(msqid); if (msqix < 0 || msqix >= msginfo.msgmni) { DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix, msginfo.msgmni)); error = EINVAL; goto done2; } msqkptr = &msqids[msqix]; AUDIT_ARG_SVIPC_PERM(&msqkptr->u.msg_perm); if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("no such message queue id\n")); error = EINVAL; goto done2; } if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("wrong sequence number\n")); error = EINVAL; goto done2; } if ((error = msq_prison_cansee(rpr, msqkptr))) { DPRINTF(("requester can't see prison\n")); goto done2; } if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_W))) { DPRINTF(("requester doesn't have write access\n")); goto done2; } #ifdef MAC error = mac_sysvmsq_check_msqsnd(td->td_ucred, msqkptr); if (error != 0) goto done2; #endif #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); if (racct_add(td->td_proc, RACCT_MSGQQUEUED, 1)) { PROC_UNLOCK(td->td_proc); error = EAGAIN; goto done2; } saved_msgsz = msgsz; if (racct_add(td->td_proc, RACCT_MSGQSIZE, msgsz)) { racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1); PROC_UNLOCK(td->td_proc); error = EAGAIN; goto done2; } PROC_UNLOCK(td->td_proc); } #endif segs_needed = howmany(msgsz, msginfo.msgssz); DPRINTF(("msgsz=%zu, msgssz=%d, segs_needed=%d\n", msgsz, msginfo.msgssz, segs_needed)); for (;;) { int need_more_resources = 0; /* * check msgsz * (inside this loop in case msg_qbytes changes while we sleep) */ if (msgsz > msqkptr->u.msg_qbytes) { DPRINTF(("msgsz > msqkptr->u.msg_qbytes\n")); error = EINVAL; goto done3; } 
if (msqkptr->u.msg_perm.mode & MSG_LOCKED) { DPRINTF(("msqid is locked\n")); need_more_resources = 1; } if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes) { DPRINTF(("msgsz + msg_cbytes > msg_qbytes\n")); need_more_resources = 1; } if (segs_needed > nfree_msgmaps) { DPRINTF(("segs_needed > nfree_msgmaps\n")); need_more_resources = 1; } if (free_msghdrs == NULL) { DPRINTF(("no more msghdrs\n")); need_more_resources = 1; } if (need_more_resources) { int we_own_it; if ((msgflg & IPC_NOWAIT) != 0) { DPRINTF(("need more resources but caller " "doesn't want to wait\n")); error = EAGAIN; goto done3; } if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) { DPRINTF(("we don't own the msqid_ds\n")); we_own_it = 0; } else { /* Force later arrivals to wait for our request */ DPRINTF(("we own the msqid_ds\n")); msqkptr->u.msg_perm.mode |= MSG_LOCKED; we_own_it = 1; } DPRINTF(("msgsnd: goodnight\n")); error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH, "msgsnd", hz); DPRINTF(("msgsnd: good morning, error=%d\n", error)); if (we_own_it) msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; if (error == EWOULDBLOCK) { DPRINTF(("msgsnd: timed out\n")); continue; } if (error != 0) { DPRINTF(("msgsnd: interrupted system call\n")); error = EINTR; goto done3; } /* * Make sure that the msq queue still exists */ if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("msqid deleted\n")); error = EIDRM; goto done3; } } else { DPRINTF(("got all the resources that we need\n")); break; } } /* * We have the resources that we need. * Make sure! */ if (msqkptr->u.msg_perm.mode & MSG_LOCKED) panic("msg_perm.mode & MSG_LOCKED"); if (segs_needed > nfree_msgmaps) panic("segs_needed > nfree_msgmaps"); if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes) panic("msgsz + msg_cbytes > msg_qbytes"); if (free_msghdrs == NULL) panic("no more msghdrs"); /* * Re-lock the msqid_ds in case we page-fault when copying in the * message */ if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) panic("msqid_ds is already locked"); msqkptr->u.msg_perm.mode |= MSG_LOCKED; /* * Allocate a message header */ msghdr = free_msghdrs; free_msghdrs = msghdr->msg_next; msghdr->msg_spot = -1; msghdr->msg_ts = msgsz; msghdr->msg_type = mtype; #ifdef MAC /* * XXXMAC: Should the mac_sysvmsq_check_msgmsq check follow here * immediately? Or, should it be checked just before the msg is * enqueued in the msgq (as it is done now)? 
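/*
 * Editor's sketch (not part of this revision): the sender side of the
 * IPC_NOWAIT policy implemented in the loop above. When the queue is
 * full, locked, or segments/headers are exhausted, a non-blocking sender
 * gets EAGAIN instead of sleeping on the msqid.
 */
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <errno.h>

static int
try_send(int id, const void *msgp, size_t msgsz)
{
	if (msgsnd(id, msgp, msgsz, IPC_NOWAIT) == 0)
		return (0);
	return (errno);	/* EAGAIN: queue full; anything else is a hard error */
}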
*/ mac_sysvmsg_create(td->td_ucred, msqkptr, msghdr); #endif /* * Allocate space for the message */ while (segs_needed > 0) { if (nfree_msgmaps <= 0) panic("not enough msgmaps"); if (free_msgmaps == -1) panic("nil free_msgmaps"); next = free_msgmaps; if (next <= -1) panic("next too low #1"); if (next >= msginfo.msgseg) panic("next out of range #1"); DPRINTF(("allocating segment %d to message\n", next)); free_msgmaps = msgmaps[next].next; nfree_msgmaps--; msgmaps[next].next = msghdr->msg_spot; msghdr->msg_spot = next; segs_needed--; } /* * Validate the message type */ if (msghdr->msg_type < 1) { msg_freehdr(msghdr); msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; wakeup(msqkptr); DPRINTF(("mtype (%ld) < 1\n", msghdr->msg_type)); error = EINVAL; goto done3; } /* * Copy in the message body */ next = msghdr->msg_spot; while (msgsz > 0) { size_t tlen; if (msgsz > msginfo.msgssz) tlen = msginfo.msgssz; else tlen = msgsz; if (next <= -1) panic("next too low #2"); if (next >= msginfo.msgseg) panic("next out of range #2"); mtx_unlock(&msq_mtx); if ((error = copyin(msgp, &msgpool[next * msginfo.msgssz], tlen)) != 0) { mtx_lock(&msq_mtx); DPRINTF(("error %d copying in message segment\n", error)); msg_freehdr(msghdr); msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; wakeup(msqkptr); goto done3; } mtx_lock(&msq_mtx); msgsz -= tlen; msgp = (const char *)msgp + tlen; next = msgmaps[next].next; } if (next != -1) panic("didn't use all the msg segments"); /* * We've got the message. Unlock the msqid_ds. */ msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; /* * Make sure that the msqid_ds is still allocated. */ if (msqkptr->u.msg_qbytes == 0) { msg_freehdr(msghdr); wakeup(msqkptr); error = EIDRM; goto done3; } #ifdef MAC /* * Note: Since the task/thread allocates the msghdr and usually * primes it with its own MAC label, for a majority of policies, it * won't be necessary to check whether the msghdr has access * permissions to the msgq. The mac_sysvmsq_check_msqsnd check would * suffice in that case. However, this hook may be required where * individual policies derive a non-identical label for the msghdr * from the current thread label and may want to check the msghdr * enqueue permissions, along with read/write permissions to the * msgq. 
*/ error = mac_sysvmsq_check_msgmsq(td->td_ucred, msghdr, msqkptr); if (error != 0) { msg_freehdr(msghdr); wakeup(msqkptr); goto done3; } #endif /* * Put the message into the queue */ if (msqkptr->u.msg_first == NULL) { msqkptr->u.msg_first = msghdr; msqkptr->u.msg_last = msghdr; } else { msqkptr->u.msg_last->msg_next = msghdr; msqkptr->u.msg_last = msghdr; } msqkptr->u.msg_last->msg_next = NULL; msqkptr->u.msg_cbytes += msghdr->msg_ts; msqkptr->u.msg_qnum++; msqkptr->u.msg_lspid = td->td_proc->p_pid; msqkptr->u.msg_stime = time_second; wakeup(msqkptr); td->td_retval[0] = 0; done3: #ifdef RACCT if (racct_enable && error != 0) { PROC_LOCK(td->td_proc); racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1); racct_sub(td->td_proc, RACCT_MSGQSIZE, saved_msgsz); PROC_UNLOCK(td->td_proc); } #endif done2: mtx_unlock(&msq_mtx); return (error); } int sys_msgsnd(struct thread *td, struct msgsnd_args *uap) { int error; long mtype; DPRINTF(("call to msgsnd(%d, %p, %zu, %d)\n", uap->msqid, uap->msgp, uap->msgsz, uap->msgflg)); if ((error = copyin(uap->msgp, &mtype, sizeof(mtype))) != 0) { DPRINTF(("error %d copying the message type\n", error)); return (error); } return (kern_msgsnd(td, uap->msqid, (const char *)uap->msgp + sizeof(mtype), uap->msgsz, uap->msgflg, mtype)); } #ifndef _SYS_SYSPROTO_H_ struct msgrcv_args { int msqid; void *msgp; size_t msgsz; long msgtyp; int msgflg; }; #endif /* XXX msgp is actually mtext. */ int kern_msgrcv(struct thread *td, int msqid, void *msgp, size_t msgsz, long msgtyp, int msgflg, long *mtype) { size_t len; struct msqid_kernel *msqkptr; struct msg *msghdr; struct prison *rpr; int msqix, error = 0; short next; rpr = msg_find_prison(td->td_ucred); if (rpr == NULL) return (ENOSYS); AUDIT_ARG_SVIPC_ID(msqid); msqix = IPCID_TO_IX(msqid); if (msqix < 0 || msqix >= msginfo.msgmni) { DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix, msginfo.msgmni)); return (EINVAL); } msqkptr = &msqids[msqix]; mtx_lock(&msq_mtx); AUDIT_ARG_SVIPC_PERM(&msqkptr->u.msg_perm); if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("no such message queue id\n")); error = EINVAL; goto done2; } if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("wrong sequence number\n")); error = EINVAL; goto done2; } if ((error = msq_prison_cansee(rpr, msqkptr))) { DPRINTF(("requester can't see prison\n")); goto done2; } if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) { DPRINTF(("requester doesn't have read access\n")); goto done2; } #ifdef MAC error = mac_sysvmsq_check_msqrcv(td->td_ucred, msqkptr); if (error != 0) goto done2; #endif msghdr = NULL; while (msghdr == NULL) { if (msgtyp == 0) { msghdr = msqkptr->u.msg_first; if (msghdr != NULL) { if (msgsz < msghdr->msg_ts && (msgflg & MSG_NOERROR) == 0) { DPRINTF(("first message on the queue " "is too big (want %zu, got %d)\n", msgsz, msghdr->msg_ts)); error = E2BIG; goto done2; } #ifdef MAC error = mac_sysvmsq_check_msgrcv(td->td_ucred, msghdr); if (error != 0) goto done2; #endif if (msqkptr->u.msg_first == msqkptr->u.msg_last) { msqkptr->u.msg_first = NULL; msqkptr->u.msg_last = NULL; } else { msqkptr->u.msg_first = msghdr->msg_next; if (msqkptr->u.msg_first == NULL) panic("msg_first/last screwed up #1"); } } } else { struct msg *previous; struct msg **prev; previous = NULL; prev = &(msqkptr->u.msg_first); while ((msghdr = *prev) != NULL) { /* * Is this message's type an exact match or is * this message's type less than or equal to * the absolute value of a negative msgtyp? 
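/*
 * Editor's sketch (not part of this revision): a full userland round
 * trip through sys_msgsnd()/sys_msgrcv() above. The long mtype leads the
 * buffer and is split off by the syscall wrappers; msgsz counts only the
 * mtext portion.
 */
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <err.h>
#include <stdio.h>
#include <string.h>

struct mymsg {
	long mtype;
	char mtext[64];
};

int
main(void)
{
	struct mymsg m = { .mtype = 1 };
	int id;

	if ((id = msgget(IPC_PRIVATE, IPC_CREAT | 0600)) == -1)
		err(1, "msgget");
	strlcpy(m.mtext, "hello", sizeof(m.mtext));
	if (msgsnd(id, &m, sizeof(m.mtext), 0) == -1)
		err(1, "msgsnd");
	if (msgrcv(id, &m, sizeof(m.mtext), 1, 0) == -1)	/* type 1 only */
		err(1, "msgrcv");
	printf("got type %ld: %s\n", m.mtype, m.mtext);
	(void)msgctl(id, IPC_RMID, NULL);
	return (0);
}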
* Note that the second half of this test can * NEVER be true if msgtyp is positive since * msg_type is always positive! */ if (msgtyp == msghdr->msg_type || msghdr->msg_type <= -msgtyp) { DPRINTF(("found message type %ld, " "requested %ld\n", msghdr->msg_type, msgtyp)); if (msgsz < msghdr->msg_ts && (msgflg & MSG_NOERROR) == 0) { DPRINTF(("requested message " "on the queue is too big " "(want %zu, got %hu)\n", msgsz, msghdr->msg_ts)); error = E2BIG; goto done2; } #ifdef MAC error = mac_sysvmsq_check_msgrcv( td->td_ucred, msghdr); if (error != 0) goto done2; #endif *prev = msghdr->msg_next; if (msghdr == msqkptr->u.msg_last) { if (previous == NULL) { if (prev != &msqkptr->u.msg_first) panic("msg_first/last screwed up #2"); msqkptr->u.msg_first = NULL; msqkptr->u.msg_last = NULL; } else { if (prev == &msqkptr->u.msg_first) panic("msg_first/last screwed up #3"); msqkptr->u.msg_last = previous; } } break; } previous = msghdr; prev = &(msghdr->msg_next); } } /* * We've either extracted the msghdr for the appropriate * message or there isn't one. * If there is one then bail out of this loop. */ if (msghdr != NULL) break; /* * Hmph! No message found. Does the user want to wait? */ if ((msgflg & IPC_NOWAIT) != 0) { DPRINTF(("no appropriate message found (msgtyp=%ld)\n", msgtyp)); /* The SVID says to return ENOMSG. */ error = ENOMSG; goto done2; } /* * Wait for something to happen */ DPRINTF(("msgrcv: goodnight\n")); error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH, "msgrcv", 0); DPRINTF(("msgrcv: good morning (error=%d)\n", error)); if (error != 0) { DPRINTF(("msgrcv: interrupted system call\n")); error = EINTR; goto done2; } /* * Make sure that the msq queue still exists */ if (msqkptr->u.msg_qbytes == 0 || msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("msqid deleted\n")); error = EIDRM; goto done2; } } /* * Return the message to the user. * * First, do the bookkeeping (before we risk being interrupted). */ msqkptr->u.msg_cbytes -= msghdr->msg_ts; msqkptr->u.msg_qnum--; msqkptr->u.msg_lrpid = td->td_proc->p_pid; msqkptr->u.msg_rtime = time_second; racct_sub_cred(msqkptr->cred, RACCT_MSGQQUEUED, 1); racct_sub_cred(msqkptr->cred, RACCT_MSGQSIZE, msghdr->msg_ts); /* * Make msgsz the actual amount that we'll be returning. * Note that this effectively truncates the message if it is too long * (since msgsz is never increased). */ DPRINTF(("found a message, msgsz=%zu, msg_ts=%hu\n", msgsz, msghdr->msg_ts)); if (msgsz > msghdr->msg_ts) msgsz = msghdr->msg_ts; *mtype = msghdr->msg_type; /* * Return the segments to the user */ next = msghdr->msg_spot; for (len = 0; len < msgsz; len += msginfo.msgssz) { size_t tlen; if (msgsz - len > msginfo.msgssz) tlen = msginfo.msgssz; else tlen = msgsz - len; if (next <= -1) panic("next too low #3"); if (next >= msginfo.msgseg) panic("next out of range #3"); mtx_unlock(&msq_mtx); error = copyout(&msgpool[next * msginfo.msgssz], msgp, tlen); mtx_lock(&msq_mtx); if (error != 0) { DPRINTF(("error (%d) copying out message segment\n", error)); msg_freehdr(msghdr); wakeup(msqkptr); goto done2; } msgp = (char *)msgp + tlen; next = msgmaps[next].next; } /* * Done, return the actual number of bytes copied out. 
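 *
 * An illustrative (non-normative) userland counterpart, reusing the
 * assumed "qid": msgtyp 0 takes the first message, a positive value
 * demands an exact type match, and a negative value accepts any type
 * up to its absolute value, exactly as the selection loop above does:
 *
 *	struct { long mtype; char mtext[64]; } m;
 *	ssize_t n;
 *
 *	n = msgrcv(qid, &m, sizeof(m.mtext), 0, MSG_NOERROR);
 *
 * MSG_NOERROR requests the silent truncation implemented just above.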
*/ msg_freehdr(msghdr); wakeup(msqkptr); td->td_retval[0] = msgsz; done2: mtx_unlock(&msq_mtx); return (error); } int sys_msgrcv(struct thread *td, struct msgrcv_args *uap) { int error; long mtype; DPRINTF(("call to msgrcv(%d, %p, %zu, %ld, %d)\n", uap->msqid, uap->msgp, uap->msgsz, uap->msgtyp, uap->msgflg)); if ((error = kern_msgrcv(td, uap->msqid, (char *)uap->msgp + sizeof(mtype), uap->msgsz, uap->msgtyp, uap->msgflg, &mtype)) != 0) return (error); if ((error = copyout(&mtype, uap->msgp, sizeof(mtype))) != 0) DPRINTF(("error %d copying the message type\n", error)); return (error); } static int sysctl_msqids(SYSCTL_HANDLER_ARGS) { struct msqid_kernel tmsqk; struct prison *pr, *rpr; int error, i; pr = req->td->td_ucred->cr_prison; rpr = msg_find_prison(req->td->td_ucred); error = 0; for (i = 0; i < msginfo.msgmni; i++) { mtx_lock(&msq_mtx); if (msqids[i].u.msg_qbytes == 0 || rpr == NULL || msq_prison_cansee(rpr, &msqids[i]) != 0) bzero(&tmsqk, sizeof(tmsqk)); else { tmsqk = msqids[i]; if (tmsqk.cred->cr_prison != pr) tmsqk.u.msg_perm.key = IPC_PRIVATE; } mtx_unlock(&msq_mtx); error = SYSCTL_OUT(req, &tmsqk, sizeof(tmsqk)); if (error != 0) break; } return (error); } SYSCTL_INT(_kern_ipc, OID_AUTO, msgmax, CTLFLAG_RD, &msginfo.msgmax, 0, "Maximum message size"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgmni, CTLFLAG_RDTUN, &msginfo.msgmni, 0, "Number of message queue identifiers"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgmnb, CTLFLAG_RDTUN, &msginfo.msgmnb, 0, "Maximum number of bytes in a queue"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgtql, CTLFLAG_RDTUN, &msginfo.msgtql, 0, "Maximum number of messages in the system"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgssz, CTLFLAG_RDTUN, &msginfo.msgssz, 0, "Size of a message segment"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgseg, CTLFLAG_RDTUN, &msginfo.msgseg, 0, "Number of message segments"); SYSCTL_PROC(_kern_ipc, OID_AUTO, msqids, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_msqids, "", "Message queue IDs"); static int msg_prison_check(void *obj, void *data) { struct prison *pr = obj; struct prison *prpr; struct vfsoptlist *opts = data; int error, jsys; /* * sysvmsg is a jailsys integer. * It must be "disable" if the parent jail is disabled. */ error = vfs_copyopt(opts, "sysvmsg", &jsys, sizeof(jsys)); if (error != ENOENT) { if (error != 0) return (error); switch (jsys) { case JAIL_SYS_DISABLE: break; case JAIL_SYS_NEW: case JAIL_SYS_INHERIT: prison_lock(pr->pr_parent); prpr = osd_jail_get(pr->pr_parent, msg_prison_slot); prison_unlock(pr->pr_parent); if (prpr == NULL) return (EPERM); break; default: return (EINVAL); } } return (0); } static int msg_prison_set(void *obj, void *data) { struct prison *pr = obj; struct prison *tpr, *orpr, *nrpr, *trpr; struct vfsoptlist *opts = data; void *rsv; int jsys, descend; /* * sysvmsg controls which jail is the root of the associated msgs (this * jail or same as the parent), or if the feature is available at all. */ if (vfs_copyopt(opts, "sysvmsg", &jsys, sizeof(jsys)) == ENOENT) jsys = vfs_flagopt(opts, "allow.sysvipc", NULL, 0) ? JAIL_SYS_INHERIT : vfs_flagopt(opts, "allow.nosysvipc", NULL, 0) ? JAIL_SYS_DISABLE : -1; if (jsys == JAIL_SYS_DISABLE) { prison_lock(pr); orpr = osd_jail_get(pr, msg_prison_slot); if (orpr != NULL) osd_jail_del(pr, msg_prison_slot); prison_unlock(pr); if (orpr != NULL) { if (orpr == pr) msg_prison_cleanup(pr); /* Disable all child jails as well. 
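 * This is the path taken when, for example, "jail -m name=j1
 * sysvmsg=disable" is applied to a running jail (the jail name here
 * is purely illustrative).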
*/ FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { prison_lock(tpr); trpr = osd_jail_get(tpr, msg_prison_slot); if (trpr != NULL) { osd_jail_del(tpr, msg_prison_slot); prison_unlock(tpr); if (trpr == tpr) msg_prison_cleanup(tpr); } else { prison_unlock(tpr); descend = 0; } } } } else if (jsys != -1) { if (jsys == JAIL_SYS_NEW) nrpr = pr; else { prison_lock(pr->pr_parent); nrpr = osd_jail_get(pr->pr_parent, msg_prison_slot); prison_unlock(pr->pr_parent); } rsv = osd_reserve(msg_prison_slot); prison_lock(pr); orpr = osd_jail_get(pr, msg_prison_slot); if (orpr != nrpr) (void)osd_jail_set_reserved(pr, msg_prison_slot, rsv, nrpr); else osd_free_reserved(rsv); prison_unlock(pr); if (orpr != nrpr) { if (orpr == pr) msg_prison_cleanup(pr); if (orpr != NULL) { /* Change child jails matching the old root, */ FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { prison_lock(tpr); trpr = osd_jail_get(tpr, msg_prison_slot); if (trpr == orpr) { (void)osd_jail_set(tpr, msg_prison_slot, nrpr); prison_unlock(tpr); if (trpr == tpr) msg_prison_cleanup(tpr); } else { prison_unlock(tpr); descend = 0; } } } } } return (0); } static int msg_prison_get(void *obj, void *data) { struct prison *pr = obj; struct prison *rpr; struct vfsoptlist *opts = data; int error, jsys; /* Set sysvmsg based on the jail's root prison. */ prison_lock(pr); rpr = osd_jail_get(pr, msg_prison_slot); prison_unlock(pr); jsys = rpr == NULL ? JAIL_SYS_DISABLE : rpr == pr ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; error = vfs_setopt(opts, "sysvmsg", &jsys, sizeof(jsys)); if (error == ENOENT) error = 0; return (error); } static int msg_prison_remove(void *obj, void *data __unused) { struct prison *pr = obj; struct prison *rpr; prison_lock(pr); rpr = osd_jail_get(pr, msg_prison_slot); prison_unlock(pr); if (rpr == pr) msg_prison_cleanup(pr); return (0); } static void msg_prison_cleanup(struct prison *pr) { struct msqid_kernel *msqkptr; int i; /* Remove any msqs that belong to this jail. 
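 * Sleeping senders and receivers are not torn down here; they notice
 * the removal through the msg_qbytes/seq revalidation above and fail
 * with EIDRM.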
*/ mtx_lock(&msq_mtx); for (i = 0; i < msginfo.msgmni; i++) { msqkptr = &msqids[i]; if (msqkptr->u.msg_qbytes != 0 && msqkptr->cred != NULL && msqkptr->cred->cr_prison == pr) msq_remove(msqkptr); } mtx_unlock(&msq_mtx); } SYSCTL_JAIL_PARAM_SYS_NODE(sysvmsg, CTLFLAG_RW, "SYSV message queues"); #ifdef COMPAT_FREEBSD32 int freebsd32_msgsys(struct thread *td, struct freebsd32_msgsys_args *uap) { #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) AUDIT_ARG_SVIPC_WHICH(uap->which); switch (uap->which) { case 0: return (freebsd7_freebsd32_msgctl(td, (struct freebsd7_freebsd32_msgctl_args *)&uap->a2)); case 2: return (freebsd32_msgsnd(td, (struct freebsd32_msgsnd_args *)&uap->a2)); case 3: return (freebsd32_msgrcv(td, (struct freebsd32_msgrcv_args *)&uap->a2)); default: return (sys_msgsys(td, (struct msgsys_args *)uap)); } #else return (nosys(td, NULL)); #endif } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) int freebsd7_freebsd32_msgctl(struct thread *td, struct freebsd7_freebsd32_msgctl_args *uap) { struct msqid_ds msqbuf; struct msqid_ds32_old msqbuf32; int error; if (uap->cmd == IPC_SET) { error = copyin(uap->buf, &msqbuf32, sizeof(msqbuf32)); if (error) return (error); freebsd32_ipcperm_old_in(&msqbuf32.msg_perm, &msqbuf.msg_perm); PTRIN_CP(msqbuf32, msqbuf, msg_first); PTRIN_CP(msqbuf32, msqbuf, msg_last); CP(msqbuf32, msqbuf, msg_cbytes); CP(msqbuf32, msqbuf, msg_qnum); CP(msqbuf32, msqbuf, msg_qbytes); CP(msqbuf32, msqbuf, msg_lspid); CP(msqbuf32, msqbuf, msg_lrpid); CP(msqbuf32, msqbuf, msg_stime); CP(msqbuf32, msqbuf, msg_rtime); CP(msqbuf32, msqbuf, msg_ctime); } error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf); if (error) return (error); if (uap->cmd == IPC_STAT) { bzero(&msqbuf32, sizeof(msqbuf32)); freebsd32_ipcperm_old_out(&msqbuf.msg_perm, &msqbuf32.msg_perm); PTROUT_CP(msqbuf, msqbuf32, msg_first); PTROUT_CP(msqbuf, msqbuf32, msg_last); CP(msqbuf, msqbuf32, msg_cbytes); CP(msqbuf, msqbuf32, msg_qnum); CP(msqbuf, msqbuf32, msg_qbytes); CP(msqbuf, msqbuf32, msg_lspid); CP(msqbuf, msqbuf32, msg_lrpid); CP(msqbuf, msqbuf32, msg_stime); CP(msqbuf, msqbuf32, msg_rtime); CP(msqbuf, msqbuf32, msg_ctime); error = copyout(&msqbuf32, uap->buf, sizeof(struct msqid_ds32)); } return (error); } #endif int freebsd32_msgctl(struct thread *td, struct freebsd32_msgctl_args *uap) { struct msqid_ds msqbuf; struct msqid_ds32 msqbuf32; int error; if (uap->cmd == IPC_SET) { error = copyin(uap->buf, &msqbuf32, sizeof(msqbuf32)); if (error) return (error); freebsd32_ipcperm_in(&msqbuf32.msg_perm, &msqbuf.msg_perm); PTRIN_CP(msqbuf32, msqbuf, msg_first); PTRIN_CP(msqbuf32, msqbuf, msg_last); CP(msqbuf32, msqbuf, msg_cbytes); CP(msqbuf32, msqbuf, msg_qnum); CP(msqbuf32, msqbuf, msg_qbytes); CP(msqbuf32, msqbuf, msg_lspid); CP(msqbuf32, msqbuf, msg_lrpid); CP(msqbuf32, msqbuf, msg_stime); CP(msqbuf32, msqbuf, msg_rtime); CP(msqbuf32, msqbuf, msg_ctime); } error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf); if (error) return (error); if (uap->cmd == IPC_STAT) { freebsd32_ipcperm_out(&msqbuf.msg_perm, &msqbuf32.msg_perm); PTROUT_CP(msqbuf, msqbuf32, msg_first); PTROUT_CP(msqbuf, msqbuf32, msg_last); CP(msqbuf, msqbuf32, msg_cbytes); CP(msqbuf, msqbuf32, msg_qnum); CP(msqbuf, msqbuf32, msg_qbytes); CP(msqbuf, msqbuf32, msg_lspid); CP(msqbuf, msqbuf32, msg_lrpid); CP(msqbuf, msqbuf32, msg_stime); CP(msqbuf, msqbuf32, msg_rtime); CP(msqbuf, msqbuf32, msg_ctime); error = 
copyout(&msqbuf32, uap->buf, sizeof(struct msqid_ds32)); } return (error); } int freebsd32_msgsnd(struct thread *td, struct freebsd32_msgsnd_args *uap) { const void *msgp; long mtype; int32_t mtype32; int error; msgp = PTRIN(uap->msgp); if ((error = copyin(msgp, &mtype32, sizeof(mtype32))) != 0) return (error); mtype = mtype32; return (kern_msgsnd(td, uap->msqid, (const char *)msgp + sizeof(mtype32), uap->msgsz, uap->msgflg, mtype)); } int freebsd32_msgrcv(struct thread *td, struct freebsd32_msgrcv_args *uap) { void *msgp; long mtype; int32_t mtype32; int error; msgp = PTRIN(uap->msgp); if ((error = kern_msgrcv(td, uap->msqid, (char *)msgp + sizeof(mtype32), uap->msgsz, uap->msgtyp, uap->msgflg, &mtype)) != 0) return (error); mtype32 = (int32_t)mtype; return (copyout(&mtype32, msgp, sizeof(mtype32))); } #endif #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) /* XXX casting to (sy_call_t *) is bogus, as usual. */ static sy_call_t *msgcalls[] = { (sy_call_t *)freebsd7_msgctl, (sy_call_t *)sys_msgget, (sy_call_t *)sys_msgsnd, (sy_call_t *)sys_msgrcv }; /* * Entry point for all MSG calls. * * XXX actually varargs. * struct msgsys_args { * int which; * int a2; * int a3; * int a4; * int a5; * int a6; * } *uap; */ int sys_msgsys(struct thread *td, struct msgsys_args *uap) { int error; AUDIT_ARG_SVIPC_WHICH(uap->which); if (uap->which < 0 || uap->which >= nitems(msgcalls)) return (EINVAL); error = (*msgcalls[uap->which])(td, &uap->a2); return (error); } #ifndef CP #define CP(src, dst, fld) do { (dst).fld = (src).fld; } while (0) #endif #ifndef _SYS_SYSPROTO_H_ struct freebsd7_msgctl_args { int msqid; int cmd; struct msqid_ds_old *buf; }; #endif int freebsd7_msgctl(struct thread *td, struct freebsd7_msgctl_args *uap) { struct msqid_ds_old msqold; struct msqid_ds msqbuf; int error; DPRINTF(("call to freebsd7_msgctl(%d, %d, %p)\n", uap->msqid, uap->cmd, uap->buf)); if (uap->cmd == IPC_SET) { error = copyin(uap->buf, &msqold, sizeof(msqold)); if (error) return (error); ipcperm_old2new(&msqold.msg_perm, &msqbuf.msg_perm); CP(msqold, msqbuf, msg_first); CP(msqold, msqbuf, msg_last); CP(msqold, msqbuf, msg_cbytes); CP(msqold, msqbuf, msg_qnum); CP(msqold, msqbuf, msg_qbytes); CP(msqold, msqbuf, msg_lspid); CP(msqold, msqbuf, msg_lrpid); CP(msqold, msqbuf, msg_stime); CP(msqold, msqbuf, msg_rtime); CP(msqold, msqbuf, msg_ctime); } error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf); if (error) return (error); if (uap->cmd == IPC_STAT) { bzero(&msqold, sizeof(msqold)); ipcperm_new2old(&msqbuf.msg_perm, &msqold.msg_perm); CP(msqbuf, msqold, msg_first); CP(msqbuf, msqold, msg_last); CP(msqbuf, msqold, msg_cbytes); CP(msqbuf, msqold, msg_qnum); CP(msqbuf, msqold, msg_qbytes); CP(msqbuf, msqold, msg_lspid); CP(msqbuf, msqold, msg_lrpid); CP(msqbuf, msqold, msg_stime); CP(msqbuf, msqold, msg_rtime); CP(msqbuf, msqold, msg_ctime); error = copyout(&msqold, uap->buf, sizeof(struct msqid_ds_old)); } return (error); } #undef CP #endif /* COMPAT_FREEBSD4 || COMPAT_FREEBSD5 || COMPAT_FREEBSD6 || COMPAT_FREEBSD7 */ Index: head/sys/kern/sysv_sem.c =================================================================== --- head/sys/kern/sysv_sem.c (revision 326270) +++ head/sys/kern/sysv_sem.c (revision 326271) @@ -1,1972 +1,1974 @@ /*- * Implementation of SVID semaphores * * Author: Daniel Boulet * * This software is provided ``AS IS'' without any warranties of any kind. 
*/ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2003-2005 McAfee, Inc. * Copyright (c) 2016-2017 Robert N. M. Watson * All rights reserved. * * This software was developed for the FreeBSD Project in part by McAfee * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research * program. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_sysvipc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(sysv_sem, "System V semaphores support"); static MALLOC_DEFINE(M_SEM, "sem", "SVID compatible semaphores"); #ifdef SEM_DEBUG #define DPRINTF(a) printf a #else #define DPRINTF(a) #endif static int seminit(void); static int sysvsem_modload(struct module *, int, void *); static int semunload(void); static void semexit_myhook(void *arg, struct proc *p); static int sysctl_sema(SYSCTL_HANDLER_ARGS); static int semvalid(int semid, struct prison *rpr, struct semid_kernel *semakptr); static void sem_remove(int semidx, struct ucred *cred); static struct prison *sem_find_prison(struct ucred *); static int sem_prison_cansee(struct prison *, struct semid_kernel *); static int sem_prison_check(void *, void *); static int sem_prison_set(void *, void *); static int sem_prison_get(void *, void *); static int sem_prison_remove(void *, void *); static void sem_prison_cleanup(struct prison *); #ifndef _SYS_SYSPROTO_H_ struct __semctl_args; int __semctl(struct thread *td, struct __semctl_args *uap); struct semget_args; int semget(struct thread *td, struct semget_args *uap); struct semop_args; int semop(struct thread *td, struct semop_args *uap); #endif static struct sem_undo *semu_alloc(struct thread *td); static int semundo_adjust(struct thread *td, struct sem_undo **supptr, int semid, int semseq, int semnum, int adjval); static void semundo_clear(int semid, int semnum); static struct mtx sem_mtx; /* semaphore global lock */ static struct mtx sem_undo_mtx; static int semtot = 0; static struct semid_kernel *sema; /* semaphore id pool */ static struct mtx *sema_mtx; /* semaphore id pool mutexes*/ static struct sem *sem; /* semaphore pool */ LIST_HEAD(, sem_undo) semu_list; /* list of active undo structures */ LIST_HEAD(, sem_undo) semu_free_list; /* list of free undo structures */ static int *semu; /* undo structure pool */ static eventhandler_tag semexit_tag; static unsigned sem_prison_slot; /* prison OSD slot */ #define SEMUNDO_MTX sem_undo_mtx #define SEMUNDO_LOCK() mtx_lock(&SEMUNDO_MTX); #define SEMUNDO_UNLOCK() mtx_unlock(&SEMUNDO_MTX); #define SEMUNDO_LOCKASSERT(how) mtx_assert(&SEMUNDO_MTX, (how)); struct sem { u_short semval; /* semaphore value */ pid_t sempid; /* pid of last operation */ u_short semncnt; /* # awaiting semval > cval */ u_short semzcnt; /* # awaiting semval = 0 */ }; /* * Undo structure (one per process) */ struct sem_undo { LIST_ENTRY(sem_undo) un_next; /* ptr to next active undo structure */ struct proc *un_proc; /* owner of this structure */ short un_cnt; /* # of active entries */ struct undo { short un_adjval; /* adjust on exit values */ short un_num; /* semaphore # */ int un_id; /* semid */ unsigned short un_seq; } un_ent[1]; /* undo entries */ }; /* * Configuration parameters */ #ifndef SEMMNI #define SEMMNI 50 /* # of semaphore identifiers */ #endif #ifndef SEMMNS #define SEMMNS 340 /* # of semaphores in system */ #endif #ifndef SEMUME #define SEMUME 50 /* max # of undo entries per process */ #endif #ifndef SEMMNU #define SEMMNU 150 /* # of undo structures in system */ #endif /* shouldn't need tuning */ #ifndef SEMMSL #define SEMMSL SEMMNS /* max # of semaphores per id */ #endif #ifndef SEMOPM #define SEMOPM 100 /* max # of operations per semop call */ #endif #define SEMVMX 32767 /* semaphore maximum 
value */ #define SEMAEM 16384 /* adjust on exit max value */ /* * Due to the way semaphore memory is allocated, we have to ensure that * SEMUSZ is properly aligned. */ #define SEM_ALIGN(bytes) roundup2(bytes, sizeof(long)) /* actual size of an undo structure */ #define SEMUSZ SEM_ALIGN(offsetof(struct sem_undo, un_ent[SEMUME])) /* * Macro to find a particular sem_undo vector */ #define SEMU(ix) \ ((struct sem_undo *)(((intptr_t)semu)+ix * seminfo.semusz)) /* * semaphore info struct */ struct seminfo seminfo = { SEMMNI, /* # of semaphore identifiers */ SEMMNS, /* # of semaphores in system */ SEMMNU, /* # of undo structures in system */ SEMMSL, /* max # of semaphores per id */ SEMOPM, /* max # of operations per semop call */ SEMUME, /* max # of undo entries per process */ SEMUSZ, /* size in bytes of undo structure */ SEMVMX, /* semaphore maximum value */ SEMAEM /* adjust on exit max value */ }; SYSCTL_INT(_kern_ipc, OID_AUTO, semmni, CTLFLAG_RDTUN, &seminfo.semmni, 0, "Number of semaphore identifiers"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmns, CTLFLAG_RDTUN, &seminfo.semmns, 0, "Maximum number of semaphores in the system"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmnu, CTLFLAG_RDTUN, &seminfo.semmnu, 0, "Maximum number of undo structures in the system"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmsl, CTLFLAG_RWTUN, &seminfo.semmsl, 0, "Max semaphores per id"); SYSCTL_INT(_kern_ipc, OID_AUTO, semopm, CTLFLAG_RDTUN, &seminfo.semopm, 0, "Max operations per semop call"); SYSCTL_INT(_kern_ipc, OID_AUTO, semume, CTLFLAG_RDTUN, &seminfo.semume, 0, "Max undo entries per process"); SYSCTL_INT(_kern_ipc, OID_AUTO, semusz, CTLFLAG_RDTUN, &seminfo.semusz, 0, "Size in bytes of undo structure"); SYSCTL_INT(_kern_ipc, OID_AUTO, semvmx, CTLFLAG_RWTUN, &seminfo.semvmx, 0, "Semaphore maximum value"); SYSCTL_INT(_kern_ipc, OID_AUTO, semaem, CTLFLAG_RWTUN, &seminfo.semaem, 0, "Adjust on exit max value"); SYSCTL_PROC(_kern_ipc, OID_AUTO, sema, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_sema, "", "Semaphore id pool"); static struct syscall_helper_data sem_syscalls[] = { SYSCALL_INIT_HELPER(__semctl), SYSCALL_INIT_HELPER(semget), SYSCALL_INIT_HELPER(semop), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL_INIT_HELPER(semsys), SYSCALL_INIT_HELPER_COMPAT(freebsd7___semctl), #endif SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include #include static struct syscall_helper_data sem32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_semctl), SYSCALL32_INIT_HELPER_COMPAT(semget), SYSCALL32_INIT_HELPER_COMPAT(semop), SYSCALL32_INIT_HELPER(freebsd32_semsys), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL32_INIT_HELPER(freebsd7_freebsd32_semctl), #endif SYSCALL_INIT_LAST }; #endif static int seminit(void) { struct prison *pr; void **rsv; int i, error; osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_CHECK] = sem_prison_check, [PR_METHOD_SET] = sem_prison_set, [PR_METHOD_GET] = sem_prison_get, [PR_METHOD_REMOVE] = sem_prison_remove, }; sem = malloc(sizeof(struct sem) * seminfo.semmns, M_SEM, M_WAITOK); sema = malloc(sizeof(struct semid_kernel) * seminfo.semmni, M_SEM, M_WAITOK); sema_mtx = malloc(sizeof(struct mtx) * seminfo.semmni, M_SEM, M_WAITOK | M_ZERO); semu = malloc(seminfo.semmnu * seminfo.semusz, M_SEM, M_WAITOK); for (i = 0; i < seminfo.semmni; i++) { sema[i].u.sem_base = 0; sema[i].u.sem_perm.mode = 0; sema[i].u.sem_perm.seq = 
0; #ifdef MAC mac_sysvsem_init(&sema[i]); #endif } for (i = 0; i < seminfo.semmni; i++) mtx_init(&sema_mtx[i], "semid", NULL, MTX_DEF); LIST_INIT(&semu_free_list); for (i = 0; i < seminfo.semmnu; i++) { struct sem_undo *suptr = SEMU(i); suptr->un_proc = NULL; LIST_INSERT_HEAD(&semu_free_list, suptr, un_next); } LIST_INIT(&semu_list); mtx_init(&sem_mtx, "sem", NULL, MTX_DEF); mtx_init(&sem_undo_mtx, "semu", NULL, MTX_DEF); semexit_tag = EVENTHANDLER_REGISTER(process_exit, semexit_myhook, NULL, EVENTHANDLER_PRI_ANY); /* Set current prisons according to their allow.sysvipc. */ sem_prison_slot = osd_jail_register(NULL, methods); rsv = osd_reserve(sem_prison_slot); prison_lock(&prison0); (void)osd_jail_set_reserved(&prison0, sem_prison_slot, rsv, &prison0); prison_unlock(&prison0); rsv = NULL; sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) { if (rsv == NULL) rsv = osd_reserve(sem_prison_slot); prison_lock(pr); if ((pr->pr_allow & PR_ALLOW_SYSVIPC) && pr->pr_ref > 0) { (void)osd_jail_set_reserved(pr, sem_prison_slot, rsv, &prison0); rsv = NULL; } prison_unlock(pr); } if (rsv != NULL) osd_free_reserved(rsv); sx_sunlock(&allprison_lock); error = syscall_helper_register(sem_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(sem32_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #endif return (0); } static int semunload(void) { int i; /* XXXKIB */ if (semtot != 0) return (EBUSY); #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(sem32_syscalls); #endif syscall_helper_unregister(sem_syscalls); EVENTHANDLER_DEREGISTER(process_exit, semexit_tag); if (sem_prison_slot != 0) osd_jail_deregister(sem_prison_slot); #ifdef MAC for (i = 0; i < seminfo.semmni; i++) mac_sysvsem_destroy(&sema[i]); #endif free(sem, M_SEM); free(sema, M_SEM); free(semu, M_SEM); for (i = 0; i < seminfo.semmni; i++) mtx_destroy(&sema_mtx[i]); free(sema_mtx, M_SEM); mtx_destroy(&sem_mtx); mtx_destroy(&sem_undo_mtx); return (0); } static int sysvsem_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: error = seminit(); if (error != 0) semunload(); break; case MOD_UNLOAD: error = semunload(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sysvsem_mod = { "sysvsem", &sysvsem_modload, NULL }; DECLARE_MODULE(sysvsem, sysvsem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST); MODULE_VERSION(sysvsem, 1); /* * Allocate a new sem_undo structure for a process * (returns ptr to structure or NULL if no more room) */ static struct sem_undo * semu_alloc(struct thread *td) { struct sem_undo *suptr; SEMUNDO_LOCKASSERT(MA_OWNED); if ((suptr = LIST_FIRST(&semu_free_list)) == NULL) return (NULL); LIST_REMOVE(suptr, un_next); LIST_INSERT_HEAD(&semu_list, suptr, un_next); suptr->un_cnt = 0; suptr->un_proc = td->td_proc; return (suptr); } static int semu_try_free(struct sem_undo *suptr) { SEMUNDO_LOCKASSERT(MA_OWNED); if (suptr->un_cnt != 0) return (0); LIST_REMOVE(suptr, un_next); LIST_INSERT_HEAD(&semu_free_list, suptr, un_next); return (1); } /* * Adjust a particular entry for a particular proc */ static int semundo_adjust(struct thread *td, struct sem_undo **supptr, int semid, int semseq, int semnum, int adjval) { struct proc *p = td->td_proc; struct sem_undo *suptr; struct undo *sunptr; int i; SEMUNDO_LOCKASSERT(MA_OWNED); /* Look for and remember the sem_undo if the caller doesn't provide it */ suptr = *supptr; if (suptr == NULL) { LIST_FOREACH(suptr, 
&semu_list, un_next) { if (suptr->un_proc == p) { *supptr = suptr; break; } } if (suptr == NULL) { if (adjval == 0) return(0); suptr = semu_alloc(td); if (suptr == NULL) return (ENOSPC); *supptr = suptr; } } /* * Look for the requested entry and adjust it (delete if adjval becomes * 0). */ sunptr = &suptr->un_ent[0]; for (i = 0; i < suptr->un_cnt; i++, sunptr++) { if (sunptr->un_id != semid || sunptr->un_num != semnum) continue; if (adjval != 0) { adjval += sunptr->un_adjval; if (adjval > seminfo.semaem || adjval < -seminfo.semaem) return (ERANGE); } sunptr->un_adjval = adjval; if (sunptr->un_adjval == 0) { suptr->un_cnt--; if (i < suptr->un_cnt) suptr->un_ent[i] = suptr->un_ent[suptr->un_cnt]; if (suptr->un_cnt == 0) semu_try_free(suptr); } return (0); } /* Didn't find the right entry - create it */ if (adjval == 0) return (0); if (adjval > seminfo.semaem || adjval < -seminfo.semaem) return (ERANGE); if (suptr->un_cnt != seminfo.semume) { sunptr = &suptr->un_ent[suptr->un_cnt]; suptr->un_cnt++; sunptr->un_adjval = adjval; sunptr->un_id = semid; sunptr->un_num = semnum; sunptr->un_seq = semseq; } else return (EINVAL); return (0); } static void semundo_clear(int semid, int semnum) { struct sem_undo *suptr, *suptr1; struct undo *sunptr; int i; SEMUNDO_LOCKASSERT(MA_OWNED); LIST_FOREACH_SAFE(suptr, &semu_list, un_next, suptr1) { sunptr = &suptr->un_ent[0]; for (i = 0; i < suptr->un_cnt; i++, sunptr++) { if (sunptr->un_id != semid) continue; if (semnum == -1 || sunptr->un_num == semnum) { suptr->un_cnt--; if (i < suptr->un_cnt) { suptr->un_ent[i] = suptr->un_ent[suptr->un_cnt]; continue; } semu_try_free(suptr); } if (semnum != -1) break; } } } static int semvalid(int semid, struct prison *rpr, struct semid_kernel *semakptr) { return ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 || semakptr->u.sem_perm.seq != IPCID_TO_SEQ(semid) || sem_prison_cansee(rpr, semakptr) ? EINVAL : 0); } static void sem_remove(int semidx, struct ucred *cred) { struct semid_kernel *semakptr; int i; KASSERT(semidx >= 0 && semidx < seminfo.semmni, ("semidx out of bounds")); semakptr = &sema[semidx]; semakptr->u.sem_perm.cuid = cred ? cred->cr_uid : 0; semakptr->u.sem_perm.uid = cred ? cred->cr_uid : 0; semakptr->u.sem_perm.mode = 0; racct_sub_cred(semakptr->cred, RACCT_NSEM, semakptr->u.sem_nsems); crfree(semakptr->cred); semakptr->cred = NULL; SEMUNDO_LOCK(); semundo_clear(semidx, -1); SEMUNDO_UNLOCK(); #ifdef MAC mac_sysvsem_cleanup(semakptr); #endif wakeup(semakptr); for (i = 0; i < seminfo.semmni; i++) { if ((sema[i].u.sem_perm.mode & SEM_ALLOC) && sema[i].u.sem_base > semakptr->u.sem_base) mtx_lock_flags(&sema_mtx[i], LOP_DUPOK); } for (i = semakptr->u.sem_base - sem; i < semtot; i++) sem[i] = sem[i + semakptr->u.sem_nsems]; for (i = 0; i < seminfo.semmni; i++) { if ((sema[i].u.sem_perm.mode & SEM_ALLOC) && sema[i].u.sem_base > semakptr->u.sem_base) { sema[i].u.sem_base -= semakptr->u.sem_nsems; mtx_unlock(&sema_mtx[i]); } } semtot -= semakptr->u.sem_nsems; } static struct prison * sem_find_prison(struct ucred *cred) { struct prison *pr, *rpr; pr = cred->cr_prison; prison_lock(pr); rpr = osd_jail_get(pr, sem_prison_slot); prison_unlock(pr); return rpr; } static int sem_prison_cansee(struct prison *rpr, struct semid_kernel *semakptr) { if (semakptr->cred == NULL || !(rpr == semakptr->cred->cr_prison || prison_ischild(rpr, semakptr->cred->cr_prison))) return (EINVAL); return (0); } /* * Note that the user-mode half of this passes a union, not a pointer. 
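 *
 * An illustrative (non-normative) fragment of that user-mode half,
 * with "id" assumed to come from semget(); the union itself is passed
 * by value as the fourth argument:
 *
 *	union semun un;
 *
 *	un.val = 1;
 *	if (semctl(id, 0, SETVAL, un) == -1)
 *		warn("semctl");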
*/ #ifndef _SYS_SYSPROTO_H_ struct __semctl_args { int semid; int semnum; int cmd; union semun *arg; }; #endif int sys___semctl(struct thread *td, struct __semctl_args *uap) { struct semid_ds dsbuf; union semun arg, semun; register_t rval; int error; switch (uap->cmd) { case SEM_STAT: case IPC_SET: case IPC_STAT: case GETALL: case SETVAL: case SETALL: error = copyin(uap->arg, &arg, sizeof(arg)); if (error) return (error); break; } switch (uap->cmd) { case SEM_STAT: case IPC_STAT: semun.buf = &dsbuf; break; case IPC_SET: error = copyin(arg.buf, &dsbuf, sizeof(dsbuf)); if (error) return (error); semun.buf = &dsbuf; break; case GETALL: case SETALL: semun.array = arg.array; break; case SETVAL: semun.val = arg.val; break; } error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun, &rval); if (error) return (error); switch (uap->cmd) { case SEM_STAT: case IPC_STAT: error = copyout(&dsbuf, arg.buf, sizeof(dsbuf)); break; } if (error == 0) td->td_retval[0] = rval; return (error); } int kern_semctl(struct thread *td, int semid, int semnum, int cmd, union semun *arg, register_t *rval) { u_short *array; struct ucred *cred = td->td_ucred; int i, error; struct prison *rpr; struct semid_ds *sbuf; struct semid_kernel *semakptr; struct mtx *sema_mtxp; u_short usval, count; int semidx; DPRINTF(("call to semctl(%d, %d, %d, 0x%p)\n", semid, semnum, cmd, arg)); AUDIT_ARG_SVIPC_CMD(cmd); AUDIT_ARG_SVIPC_ID(semid); rpr = sem_find_prison(td->td_ucred); if (sem == NULL) return (ENOSYS); array = NULL; switch(cmd) { case SEM_STAT: /* * For this command we assume semid is an array index * rather than an IPC id. */ if (semid < 0 || semid >= seminfo.semmni) return (EINVAL); semakptr = &sema[semid]; sema_mtxp = &sema_mtx[semid]; mtx_lock(sema_mtxp); if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) { error = EINVAL; goto done2; } if ((error = sem_prison_cansee(rpr, semakptr))) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; #ifdef MAC error = mac_sysvsem_check_semctl(cred, semakptr, cmd); if (error != 0) goto done2; #endif bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds)); if (cred->cr_prison != semakptr->cred->cr_prison) arg->buf->sem_perm.key = IPC_PRIVATE; *rval = IXSEQ_TO_IPCID(semid, semakptr->u.sem_perm); mtx_unlock(sema_mtxp); return (0); } semidx = IPCID_TO_IX(semid); if (semidx < 0 || semidx >= seminfo.semmni) return (EINVAL); semakptr = &sema[semidx]; sema_mtxp = &sema_mtx[semidx]; if (cmd == IPC_RMID) mtx_lock(&sem_mtx); mtx_lock(sema_mtxp); #ifdef MAC error = mac_sysvsem_check_semctl(cred, semakptr, cmd); if (error != 0) goto done2; #endif error = 0; *rval = 0; switch (cmd) { case IPC_RMID: if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M))) goto done2; sem_remove(semidx, cred); break; case IPC_SET: AUDIT_ARG_SVIPC_PERM(&arg->buf->sem_perm); if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M))) goto done2; sbuf = arg->buf; semakptr->u.sem_perm.uid = sbuf->sem_perm.uid; semakptr->u.sem_perm.gid = sbuf->sem_perm.gid; semakptr->u.sem_perm.mode = (semakptr->u.sem_perm.mode & ~0777) | (sbuf->sem_perm.mode & 0777); semakptr->u.sem_ctime = time_second; break; case IPC_STAT: if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds)); if (cred->cr_prison != semakptr->cred->cr_prison) arg->buf->sem_perm.key = 
IPC_PRIVATE; break; case GETNCNT: if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].semncnt; break; case GETPID: if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].sempid; break; case GETVAL: if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].semval; break; case GETALL: /* * Unfortunately, callers of this function don't know * in advance how many semaphores are in this set. * While we could just allocate the maximum size array * and pass the actual size back to the caller, that * won't work for SETALL since we can't copyin() more * data than the user specified as we may return a * spurious EFAULT. * * Note that the number of semaphores in a set is * fixed for the life of that set. The only way that * the 'count' could change while are blocked in * malloc() is if this semaphore set were destroyed * and a new one created with the same index. * However, semvalid() will catch that due to the * sequence number unless exactly 0x8000 (or a * multiple thereof) semaphore sets for the same index * are created and destroyed while we are in malloc! * */ count = semakptr->u.sem_nsems; mtx_unlock(sema_mtxp); array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK); mtx_lock(sema_mtxp); if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; KASSERT(count == semakptr->u.sem_nsems, ("nsems changed")); if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; for (i = 0; i < semakptr->u.sem_nsems; i++) array[i] = semakptr->u.sem_base[i].semval; mtx_unlock(sema_mtxp); error = copyout(array, arg->array, count * sizeof(*array)); mtx_lock(sema_mtxp); break; case GETZCNT: if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].semzcnt; break; case SETVAL: if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } if (arg->val < 0 || arg->val > seminfo.semvmx) { error = ERANGE; goto done2; } semakptr->u.sem_base[semnum].semval = arg->val; SEMUNDO_LOCK(); semundo_clear(semidx, semnum); SEMUNDO_UNLOCK(); wakeup(semakptr); break; case SETALL: /* * See comment on GETALL for why 'count' shouldn't change * and why we require a userland buffer. 
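 *
 * An illustrative (non-normative) userland SETALL under those
 * constraints, with "id" and "nsems" assumed known from semget():
 *
 *	u_short *vals = calloc(nsems, sizeof(*vals));
 *	union semun un;
 *
 *	un.array = vals;
 *	if (semctl(id, 0, SETALL, un) == -1)
 *		warn("semctl");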
*/ count = semakptr->u.sem_nsems; mtx_unlock(sema_mtxp); array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK); error = copyin(arg->array, array, count * sizeof(*array)); mtx_lock(sema_mtxp); if (error) break; if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; KASSERT(count == semakptr->u.sem_nsems, ("nsems changed")); if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W))) goto done2; for (i = 0; i < semakptr->u.sem_nsems; i++) { usval = array[i]; if (usval > seminfo.semvmx) { error = ERANGE; break; } semakptr->u.sem_base[i].semval = usval; } SEMUNDO_LOCK(); semundo_clear(semidx, -1); SEMUNDO_UNLOCK(); wakeup(semakptr); break; default: error = EINVAL; break; } done2: mtx_unlock(sema_mtxp); if (cmd == IPC_RMID) mtx_unlock(&sem_mtx); if (array != NULL) free(array, M_TEMP); return(error); } #ifndef _SYS_SYSPROTO_H_ struct semget_args { key_t key; int nsems; int semflg; }; #endif int sys_semget(struct thread *td, struct semget_args *uap) { int semid, error = 0; int key = uap->key; int nsems = uap->nsems; int semflg = uap->semflg; struct ucred *cred = td->td_ucred; DPRINTF(("semget(0x%x, %d, 0%o)\n", key, nsems, semflg)); AUDIT_ARG_VALUE(semflg); if (sem_find_prison(cred) == NULL) return (ENOSYS); mtx_lock(&sem_mtx); if (key != IPC_PRIVATE) { for (semid = 0; semid < seminfo.semmni; semid++) { if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) && sema[semid].cred != NULL && sema[semid].cred->cr_prison == cred->cr_prison && sema[semid].u.sem_perm.key == key) break; } if (semid < seminfo.semmni) { AUDIT_ARG_SVIPC_ID(semid); DPRINTF(("found public key\n")); if ((semflg & IPC_CREAT) && (semflg & IPC_EXCL)) { DPRINTF(("not exclusive\n")); error = EEXIST; goto done2; } if ((error = ipcperm(td, &sema[semid].u.sem_perm, semflg & 0700))) { goto done2; } if (nsems > 0 && sema[semid].u.sem_nsems < nsems) { DPRINTF(("too small\n")); error = EINVAL; goto done2; } #ifdef MAC error = mac_sysvsem_check_semget(cred, &sema[semid]); if (error != 0) goto done2; #endif goto found; } } DPRINTF(("need to allocate the semid_kernel\n")); if (key == IPC_PRIVATE || (semflg & IPC_CREAT)) { if (nsems <= 0 || nsems > seminfo.semmsl) { DPRINTF(("nsems out of range (0<%d<=%d)\n", nsems, seminfo.semmsl)); error = EINVAL; goto done2; } if (nsems > seminfo.semmns - semtot) { DPRINTF(( "not enough semaphores left (need %d, got %d)\n", nsems, seminfo.semmns - semtot)); error = ENOSPC; goto done2; } for (semid = 0; semid < seminfo.semmni; semid++) { if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0) break; } if (semid == seminfo.semmni) { DPRINTF(("no more semid_kernel's available\n")); error = ENOSPC; goto done2; } #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); error = racct_add(td->td_proc, RACCT_NSEM, nsems); PROC_UNLOCK(td->td_proc); if (error != 0) { error = ENOSPC; goto done2; } } #endif DPRINTF(("semid %d is available\n", semid)); mtx_lock(&sema_mtx[semid]); KASSERT((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0, ("Lost semaphore %d", semid)); sema[semid].u.sem_perm.key = key; sema[semid].u.sem_perm.cuid = cred->cr_uid; sema[semid].u.sem_perm.uid = cred->cr_uid; sema[semid].u.sem_perm.cgid = cred->cr_gid; sema[semid].u.sem_perm.gid = cred->cr_gid; sema[semid].u.sem_perm.mode = (semflg & 0777) | SEM_ALLOC; sema[semid].cred = crhold(cred); sema[semid].u.sem_perm.seq = (sema[semid].u.sem_perm.seq + 1) & 0x7fff; sema[semid].u.sem_nsems = nsems; sema[semid].u.sem_otime = 0; sema[semid].u.sem_ctime = time_second; sema[semid].u.sem_base = &sem[semtot]; semtot += nsems; bzero(sema[semid].u.sem_base, 
sizeof(sema[semid].u.sem_base[0])*nsems); #ifdef MAC mac_sysvsem_create(cred, &sema[semid]); #endif mtx_unlock(&sema_mtx[semid]); DPRINTF(("sembase = %p, next = %p\n", sema[semid].u.sem_base, &sem[semtot])); } else { DPRINTF(("didn't find it and wasn't asked to create it\n")); error = ENOENT; goto done2; } found: td->td_retval[0] = IXSEQ_TO_IPCID(semid, sema[semid].u.sem_perm); done2: mtx_unlock(&sem_mtx); return (error); } #ifndef _SYS_SYSPROTO_H_ struct semop_args { int semid; struct sembuf *sops; size_t nsops; }; #endif int sys_semop(struct thread *td, struct semop_args *uap) { #define SMALL_SOPS 8 struct sembuf small_sops[SMALL_SOPS]; int semid = uap->semid; size_t nsops = uap->nsops; struct prison *rpr; struct sembuf *sops; struct semid_kernel *semakptr; struct sembuf *sopptr = NULL; struct sem *semptr = NULL; struct sem_undo *suptr; struct mtx *sema_mtxp; size_t i, j, k; int error; int do_wakeup, do_undos; unsigned short seq; #ifdef SEM_DEBUG sops = NULL; #endif DPRINTF(("call to semop(%d, %p, %u)\n", semid, sops, nsops)); AUDIT_ARG_SVIPC_ID(semid); rpr = sem_find_prison(td->td_ucred); if (sem == NULL) return (ENOSYS); semid = IPCID_TO_IX(semid); /* Convert back to zero origin */ if (semid < 0 || semid >= seminfo.semmni) return (EINVAL); /* Allocate memory for sem_ops */ if (nsops <= SMALL_SOPS) sops = small_sops; else if (nsops > seminfo.semopm) { DPRINTF(("too many sops (max=%d, nsops=%d)\n", seminfo.semopm, nsops)); return (E2BIG); } else { #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); if (nsops > racct_get_available(td->td_proc, RACCT_NSEMOP)) { PROC_UNLOCK(td->td_proc); return (E2BIG); } PROC_UNLOCK(td->td_proc); } #endif sops = malloc(nsops * sizeof(*sops), M_TEMP, M_WAITOK); } if ((error = copyin(uap->sops, sops, nsops * sizeof(sops[0]))) != 0) { DPRINTF(("error = %d from copyin(%p, %p, %d)\n", error, uap->sops, sops, nsops * sizeof(sops[0]))); if (sops != small_sops) free(sops, M_SEM); return (error); } semakptr = &sema[semid]; sema_mtxp = &sema_mtx[semid]; mtx_lock(sema_mtxp); if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) { error = EINVAL; goto done2; } seq = semakptr->u.sem_perm.seq; if (seq != IPCID_TO_SEQ(uap->semid)) { error = EINVAL; goto done2; } if ((error = sem_prison_cansee(rpr, semakptr)) != 0) goto done2; /* * Initial pass through sops to see what permissions are needed. * Also perform any checks that don't need repeating on each * attempt to satisfy the request vector. */ j = 0; /* permission needed */ do_undos = 0; for (i = 0; i < nsops; i++) { sopptr = &sops[i]; if (sopptr->sem_num >= semakptr->u.sem_nsems) { error = EFBIG; goto done2; } if (sopptr->sem_flg & SEM_UNDO && sopptr->sem_op != 0) do_undos = 1; j |= (sopptr->sem_op == 0) ? SEM_R : SEM_A; } if ((error = ipcperm(td, &semakptr->u.sem_perm, j))) { DPRINTF(("error = %d from ipaccess\n", error)); goto done2; } #ifdef MAC error = mac_sysvsem_check_semop(td->td_ucred, semakptr, j); if (error != 0) goto done2; #endif /* * Loop trying to satisfy the vector of requests. * If we reach a point where we must wait, any requests already * performed are rolled back and we go to sleep until some other * process wakes us up. At this point, we start all over again. * * This ensures that from the perspective of other tasks, a set * of requests is atomic (never partially satisfied). 
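 *
 * An illustrative (non-normative) userland vector that relies on this
 * atomicity: wait for semaphore 0 to reach zero, then raise semaphore
 * 1, with both ops taking effect together or not at all ("id" assumed
 * from semget()):
 *
 *	struct sembuf v[2] = {
 *		{ .sem_num = 0, .sem_op = 0, .sem_flg = 0 },
 *		{ .sem_num = 1, .sem_op = 1, .sem_flg = SEM_UNDO },
 *	};
 *
 *	if (semop(id, v, 2) == -1)
 *		warn("semop");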
*/ for (;;) { do_wakeup = 0; error = 0; /* error return if necessary */ for (i = 0; i < nsops; i++) { sopptr = &sops[i]; semptr = &semakptr->u.sem_base[sopptr->sem_num]; DPRINTF(( "semop: semakptr=%p, sem_base=%p, " "semptr=%p, sem[%d]=%d : op=%d, flag=%s\n", semakptr, semakptr->u.sem_base, semptr, sopptr->sem_num, semptr->semval, sopptr->sem_op, (sopptr->sem_flg & IPC_NOWAIT) ? "nowait" : "wait")); if (sopptr->sem_op < 0) { if (semptr->semval + sopptr->sem_op < 0) { DPRINTF(("semop: can't do it now\n")); break; } else { semptr->semval += sopptr->sem_op; if (semptr->semval == 0 && semptr->semzcnt > 0) do_wakeup = 1; } } else if (sopptr->sem_op == 0) { if (semptr->semval != 0) { DPRINTF(("semop: not zero now\n")); break; } } else if (semptr->semval + sopptr->sem_op > seminfo.semvmx) { error = ERANGE; break; } else { if (semptr->semncnt > 0) do_wakeup = 1; semptr->semval += sopptr->sem_op; } } /* * Did we get through the entire vector? */ if (i >= nsops) goto done; /* * No ... rollback anything that we've already done */ DPRINTF(("semop: rollback 0 through %d\n", i-1)); for (j = 0; j < i; j++) semakptr->u.sem_base[sops[j].sem_num].semval -= sops[j].sem_op; /* If we detected an error, return it */ if (error != 0) goto done2; /* * If the request that we couldn't satisfy has the * NOWAIT flag set then return with EAGAIN. */ if (sopptr->sem_flg & IPC_NOWAIT) { error = EAGAIN; goto done2; } if (sopptr->sem_op == 0) semptr->semzcnt++; else semptr->semncnt++; DPRINTF(("semop: good night!\n")); error = msleep(semakptr, sema_mtxp, (PZERO - 4) | PCATCH, "semwait", 0); DPRINTF(("semop: good morning (error=%d)!\n", error)); /* return code is checked below, after sem[nz]cnt-- */ /* * Make sure that the semaphore still exists */ seq = semakptr->u.sem_perm.seq; if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 || seq != IPCID_TO_SEQ(uap->semid)) { error = EIDRM; goto done2; } /* * Renew the semaphore's pointer after wakeup since * during msleep sem_base may have been modified and semptr * is not valid any more */ semptr = &semakptr->u.sem_base[sopptr->sem_num]; /* * The semaphore is still alive. Readjust the count of * waiting processes. */ if (sopptr->sem_op == 0) semptr->semzcnt--; else semptr->semncnt--; /* * Is it really morning, or was our sleep interrupted? * (Delayed check of msleep() return code because we * need to decrement sem[nz]cnt either way.) */ if (error != 0) { error = EINTR; goto done2; } DPRINTF(("semop: good morning!\n")); } done: /* * Process any SEM_UNDO requests. */ if (do_undos) { SEMUNDO_LOCK(); suptr = NULL; for (i = 0; i < nsops; i++) { /* * We only need to deal with SEM_UNDO's for non-zero * op's. */ int adjval; if ((sops[i].sem_flg & SEM_UNDO) == 0) continue; adjval = sops[i].sem_op; if (adjval == 0) continue; error = semundo_adjust(td, &suptr, semid, seq, sops[i].sem_num, -adjval); if (error == 0) continue; /* * Oh-Oh! We ran out of either sem_undo's or undo's. * Rollback the adjustments to this point and then * rollback the semaphore ups and down so we can return * with an error with all structures restored. We * rollback the undo's in the exact reverse order that * we applied them. This guarantees that we won't run * out of space as we roll things back out. 
*/ for (j = 0; j < i; j++) { k = i - j - 1; if ((sops[k].sem_flg & SEM_UNDO) == 0) continue; adjval = sops[k].sem_op; if (adjval == 0) continue; if (semundo_adjust(td, &suptr, semid, seq, sops[k].sem_num, adjval) != 0) panic("semop - can't undo undos"); } for (j = 0; j < nsops; j++) semakptr->u.sem_base[sops[j].sem_num].semval -= sops[j].sem_op; DPRINTF(("error = %d from semundo_adjust\n", error)); SEMUNDO_UNLOCK(); goto done2; } /* loop through the sops */ SEMUNDO_UNLOCK(); } /* if (do_undos) */ /* We're definitely done - set the sempid's and time */ for (i = 0; i < nsops; i++) { sopptr = &sops[i]; semptr = &semakptr->u.sem_base[sopptr->sem_num]; semptr->sempid = td->td_proc->p_pid; } semakptr->u.sem_otime = time_second; /* * Do a wakeup if any semaphore was up'd whilst something was * sleeping on it. */ if (do_wakeup) { DPRINTF(("semop: doing wakeup\n")); wakeup(semakptr); DPRINTF(("semop: back from wakeup\n")); } DPRINTF(("semop: done\n")); td->td_retval[0] = 0; done2: mtx_unlock(sema_mtxp); if (sops != small_sops) free(sops, M_SEM); return (error); } /* * Go through the undo structures for this process and apply the adjustments to * semaphores. */ static void semexit_myhook(void *arg, struct proc *p) { struct sem_undo *suptr; struct semid_kernel *semakptr; struct mtx *sema_mtxp; int semid, semnum, adjval, ix; unsigned short seq; /* * Go through the chain of undo vectors looking for one * associated with this process. */ if (LIST_EMPTY(&semu_list)) return; SEMUNDO_LOCK(); LIST_FOREACH(suptr, &semu_list, un_next) { if (suptr->un_proc == p) break; } if (suptr == NULL) { SEMUNDO_UNLOCK(); return; } LIST_REMOVE(suptr, un_next); DPRINTF(("proc @%p has undo structure with %d entries\n", p, suptr->un_cnt)); /* * If there are any active undo elements then process them. */ if (suptr->un_cnt > 0) { SEMUNDO_UNLOCK(); for (ix = 0; ix < suptr->un_cnt; ix++) { semid = suptr->un_ent[ix].un_id; semnum = suptr->un_ent[ix].un_num; adjval = suptr->un_ent[ix].un_adjval; seq = suptr->un_ent[ix].un_seq; semakptr = &sema[semid]; sema_mtxp = &sema_mtx[semid]; mtx_lock(sema_mtxp); if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 || (semakptr->u.sem_perm.seq != seq)) { mtx_unlock(sema_mtxp); continue; } if (semnum >= semakptr->u.sem_nsems) panic("semexit - semnum out of range"); DPRINTF(( "semexit: %p id=%d num=%d(adj=%d) ; sem=%d\n", suptr->un_proc, suptr->un_ent[ix].un_id, suptr->un_ent[ix].un_num, suptr->un_ent[ix].un_adjval, semakptr->u.sem_base[semnum].semval)); if (adjval < 0 && semakptr->u.sem_base[semnum].semval < -adjval) semakptr->u.sem_base[semnum].semval = 0; else semakptr->u.sem_base[semnum].semval += adjval; wakeup(semakptr); DPRINTF(("semexit: back from wakeup\n")); mtx_unlock(sema_mtxp); } SEMUNDO_LOCK(); } /* * Deallocate the undo vector. 
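 * Every surviving SEM_UNDO adjustment has been applied (clamped at
 * zero above), so the structure can simply go back on the free list.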
*/ DPRINTF(("removing vector\n")); suptr->un_proc = NULL; suptr->un_cnt = 0; LIST_INSERT_HEAD(&semu_free_list, suptr, un_next); SEMUNDO_UNLOCK(); } static int sysctl_sema(SYSCTL_HANDLER_ARGS) { struct prison *pr, *rpr; struct semid_kernel tsemak; int error, i; pr = req->td->td_ucred->cr_prison; rpr = sem_find_prison(req->td->td_ucred); error = 0; for (i = 0; i < seminfo.semmni; i++) { mtx_lock(&sema_mtx[i]); if ((sema[i].u.sem_perm.mode & SEM_ALLOC) == 0 || rpr == NULL || sem_prison_cansee(rpr, &sema[i]) != 0) bzero(&tsemak, sizeof(tsemak)); else { tsemak = sema[i]; if (tsemak.cred->cr_prison != pr) tsemak.u.sem_perm.key = IPC_PRIVATE; } mtx_unlock(&sema_mtx[i]); error = SYSCTL_OUT(req, &tsemak, sizeof(tsemak)); if (error != 0) break; } return (error); } static int sem_prison_check(void *obj, void *data) { struct prison *pr = obj; struct prison *prpr; struct vfsoptlist *opts = data; int error, jsys; /* * sysvsem is a jailsys integer. * It must be "disable" if the parent jail is disabled. */ error = vfs_copyopt(opts, "sysvsem", &jsys, sizeof(jsys)); if (error != ENOENT) { if (error != 0) return (error); switch (jsys) { case JAIL_SYS_DISABLE: break; case JAIL_SYS_NEW: case JAIL_SYS_INHERIT: prison_lock(pr->pr_parent); prpr = osd_jail_get(pr->pr_parent, sem_prison_slot); prison_unlock(pr->pr_parent); if (prpr == NULL) return (EPERM); break; default: return (EINVAL); } } return (0); } static int sem_prison_set(void *obj, void *data) { struct prison *pr = obj; struct prison *tpr, *orpr, *nrpr, *trpr; struct vfsoptlist *opts = data; void *rsv; int jsys, descend; /* * sysvsem controls which jail is the root of the associated sems (this * jail or same as the parent), or if the feature is available at all. */ if (vfs_copyopt(opts, "sysvsem", &jsys, sizeof(jsys)) == ENOENT) jsys = vfs_flagopt(opts, "allow.sysvipc", NULL, 0) ? JAIL_SYS_INHERIT : vfs_flagopt(opts, "allow.nosysvipc", NULL, 0) ? JAIL_SYS_DISABLE : -1; if (jsys == JAIL_SYS_DISABLE) { prison_lock(pr); orpr = osd_jail_get(pr, sem_prison_slot); if (orpr != NULL) osd_jail_del(pr, sem_prison_slot); prison_unlock(pr); if (orpr != NULL) { if (orpr == pr) sem_prison_cleanup(pr); /* Disable all child jails as well. */ FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { prison_lock(tpr); trpr = osd_jail_get(tpr, sem_prison_slot); if (trpr != NULL) { osd_jail_del(tpr, sem_prison_slot); prison_unlock(tpr); if (trpr == tpr) sem_prison_cleanup(tpr); } else { prison_unlock(tpr); descend = 0; } } } } else if (jsys != -1) { if (jsys == JAIL_SYS_NEW) nrpr = pr; else { prison_lock(pr->pr_parent); nrpr = osd_jail_get(pr->pr_parent, sem_prison_slot); prison_unlock(pr->pr_parent); } rsv = osd_reserve(sem_prison_slot); prison_lock(pr); orpr = osd_jail_get(pr, sem_prison_slot); if (orpr != nrpr) (void)osd_jail_set_reserved(pr, sem_prison_slot, rsv, nrpr); else osd_free_reserved(rsv); prison_unlock(pr); if (orpr != nrpr) { if (orpr == pr) sem_prison_cleanup(pr); if (orpr != NULL) { /* Change child jails matching the old root, */ FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { prison_lock(tpr); trpr = osd_jail_get(tpr, sem_prison_slot); if (trpr == orpr) { (void)osd_jail_set(tpr, sem_prison_slot, nrpr); prison_unlock(tpr); if (trpr == tpr) sem_prison_cleanup(tpr); } else { prison_unlock(tpr); descend = 0; } } } } } return (0); } static int sem_prison_get(void *obj, void *data) { struct prison *pr = obj; struct prison *rpr; struct vfsoptlist *opts = data; int error, jsys; /* Set sysvsem based on the jail's root prison. 
*/ prison_lock(pr); rpr = osd_jail_get(pr, sem_prison_slot); prison_unlock(pr); jsys = rpr == NULL ? JAIL_SYS_DISABLE : rpr == pr ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; error = vfs_setopt(opts, "sysvsem", &jsys, sizeof(jsys)); if (error == ENOENT) error = 0; return (error); } static int sem_prison_remove(void *obj, void *data __unused) { struct prison *pr = obj; struct prison *rpr; prison_lock(pr); rpr = osd_jail_get(pr, sem_prison_slot); prison_unlock(pr); if (rpr == pr) sem_prison_cleanup(pr); return (0); } static void sem_prison_cleanup(struct prison *pr) { int i; /* Remove any sems that belong to this jail. */ mtx_lock(&sem_mtx); for (i = 0; i < seminfo.semmni; i++) { if ((sema[i].u.sem_perm.mode & SEM_ALLOC) && sema[i].cred != NULL && sema[i].cred->cr_prison == pr) { mtx_lock(&sema_mtx[i]); sem_remove(i, NULL); mtx_unlock(&sema_mtx[i]); } } mtx_unlock(&sem_mtx); } SYSCTL_JAIL_PARAM_SYS_NODE(sysvsem, CTLFLAG_RW, "SYSV semaphores"); #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) /* XXX casting to (sy_call_t *) is bogus, as usual. */ static sy_call_t *semcalls[] = { (sy_call_t *)freebsd7___semctl, (sy_call_t *)sys_semget, (sy_call_t *)sys_semop }; /* * Entry point for all SEM calls. */ int sys_semsys(td, uap) struct thread *td; /* XXX actually varargs. */ struct semsys_args /* { int which; int a2; int a3; int a4; int a5; } */ *uap; { int error; AUDIT_ARG_SVIPC_WHICH(uap->which); if (uap->which < 0 || uap->which >= nitems(semcalls)) return (EINVAL); error = (*semcalls[uap->which])(td, &uap->a2); return (error); } #ifndef CP #define CP(src, dst, fld) do { (dst).fld = (src).fld; } while (0) #endif #ifndef _SYS_SYSPROTO_H_ struct freebsd7___semctl_args { int semid; int semnum; int cmd; union semun_old *arg; }; #endif int freebsd7___semctl(struct thread *td, struct freebsd7___semctl_args *uap) { struct semid_ds_old dsold; struct semid_ds dsbuf; union semun_old arg; union semun semun; register_t rval; int error; switch (uap->cmd) { case SEM_STAT: case IPC_SET: case IPC_STAT: case GETALL: case SETVAL: case SETALL: error = copyin(uap->arg, &arg, sizeof(arg)); if (error) return (error); break; } switch (uap->cmd) { case SEM_STAT: case IPC_STAT: semun.buf = &dsbuf; break; case IPC_SET: error = copyin(arg.buf, &dsold, sizeof(dsold)); if (error) return (error); ipcperm_old2new(&dsold.sem_perm, &dsbuf.sem_perm); CP(dsold, dsbuf, sem_base); CP(dsold, dsbuf, sem_nsems); CP(dsold, dsbuf, sem_otime); CP(dsold, dsbuf, sem_ctime); semun.buf = &dsbuf; break; case GETALL: case SETALL: semun.array = arg.array; break; case SETVAL: semun.val = arg.val; break; } error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun, &rval); if (error) return (error); switch (uap->cmd) { case SEM_STAT: case IPC_STAT: bzero(&dsold, sizeof(dsold)); ipcperm_new2old(&dsbuf.sem_perm, &dsold.sem_perm); CP(dsbuf, dsold, sem_base); CP(dsbuf, dsold, sem_nsems); CP(dsbuf, dsold, sem_otime); CP(dsbuf, dsold, sem_ctime); error = copyout(&dsold, arg.buf, sizeof(dsold)); break; } if (error == 0) td->td_retval[0] = rval; return (error); } #endif /* COMPAT_FREEBSD{4,5,6,7} */ #ifdef COMPAT_FREEBSD32 int freebsd32_semsys(struct thread *td, struct freebsd32_semsys_args *uap) { #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) AUDIT_ARG_SVIPC_WHICH(uap->which); switch (uap->which) { case 0: return (freebsd7_freebsd32_semctl(td, (struct freebsd7_freebsd32_semctl_args *)&uap->a2)); default: return 
(sys_semsys(td, (struct semsys_args *)uap)); } #else return (nosys(td, NULL)); #endif } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) int freebsd7_freebsd32_semctl(struct thread *td, struct freebsd7_freebsd32_semctl_args *uap) { struct semid_ds32_old dsbuf32; struct semid_ds dsbuf; union semun semun; union semun32 arg; register_t rval; int error; switch (uap->cmd) { case SEM_STAT: case IPC_SET: case IPC_STAT: case GETALL: case SETVAL: case SETALL: error = copyin(uap->arg, &arg, sizeof(arg)); if (error) return (error); break; } switch (uap->cmd) { case SEM_STAT: case IPC_STAT: semun.buf = &dsbuf; break; case IPC_SET: error = copyin(PTRIN(arg.buf), &dsbuf32, sizeof(dsbuf32)); if (error) return (error); freebsd32_ipcperm_old_in(&dsbuf32.sem_perm, &dsbuf.sem_perm); PTRIN_CP(dsbuf32, dsbuf, sem_base); CP(dsbuf32, dsbuf, sem_nsems); CP(dsbuf32, dsbuf, sem_otime); CP(dsbuf32, dsbuf, sem_ctime); semun.buf = &dsbuf; break; case GETALL: case SETALL: semun.array = PTRIN(arg.array); break; case SETVAL: semun.val = arg.val; break; } error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun, &rval); if (error) return (error); switch (uap->cmd) { case SEM_STAT: case IPC_STAT: bzero(&dsbuf32, sizeof(dsbuf32)); freebsd32_ipcperm_old_out(&dsbuf.sem_perm, &dsbuf32.sem_perm); PTROUT_CP(dsbuf, dsbuf32, sem_base); CP(dsbuf, dsbuf32, sem_nsems); CP(dsbuf, dsbuf32, sem_otime); CP(dsbuf, dsbuf32, sem_ctime); error = copyout(&dsbuf32, PTRIN(arg.buf), sizeof(dsbuf32)); break; } if (error == 0) td->td_retval[0] = rval; return (error); } #endif int freebsd32_semctl(struct thread *td, struct freebsd32_semctl_args *uap) { struct semid_ds32 dsbuf32; struct semid_ds dsbuf; union semun semun; union semun32 arg; register_t rval; int error; switch (uap->cmd) { case SEM_STAT: case IPC_SET: case IPC_STAT: case GETALL: case SETVAL: case SETALL: error = copyin(uap->arg, &arg, sizeof(arg)); if (error) return (error); break; } switch (uap->cmd) { case SEM_STAT: case IPC_STAT: semun.buf = &dsbuf; break; case IPC_SET: error = copyin(PTRIN(arg.buf), &dsbuf32, sizeof(dsbuf32)); if (error) return (error); freebsd32_ipcperm_in(&dsbuf32.sem_perm, &dsbuf.sem_perm); PTRIN_CP(dsbuf32, dsbuf, sem_base); CP(dsbuf32, dsbuf, sem_nsems); CP(dsbuf32, dsbuf, sem_otime); CP(dsbuf32, dsbuf, sem_ctime); semun.buf = &dsbuf; break; case GETALL: case SETALL: semun.array = PTRIN(arg.array); break; case SETVAL: semun.val = arg.val; break; } error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun, &rval); if (error) return (error); switch (uap->cmd) { case SEM_STAT: case IPC_STAT: bzero(&dsbuf32, sizeof(dsbuf32)); freebsd32_ipcperm_out(&dsbuf.sem_perm, &dsbuf32.sem_perm); PTROUT_CP(dsbuf, dsbuf32, sem_base); CP(dsbuf, dsbuf32, sem_nsems); CP(dsbuf, dsbuf32, sem_otime); CP(dsbuf, dsbuf32, sem_ctime); error = copyout(&dsbuf32, PTRIN(arg.buf), sizeof(dsbuf32)); break; } if (error == 0) td->td_retval[0] = rval; return (error); } #endif /* COMPAT_FREEBSD32 */ Index: head/sys/kern/sysv_shm.c =================================================================== --- head/sys/kern/sysv_shm.c (revision 326270) +++ head/sys/kern/sysv_shm.c (revision 326271) @@ -1,1663 +1,1665 @@ /* $NetBSD: sysv_shm.c,v 1.23 1994/07/04 23:25:12 glass Exp $ */ /*- + * SPDX-License-Identifier: BSD-4-Clause AND BSD-2-Clause-FreeBSD + * * Copyright (c) 1994 Adam Glass and Charles Hannum. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Adam Glass and Charles * Hannum. * 4. The names of the authors may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 2003-2005 McAfee, Inc. * Copyright (c) 2016-2017 Robert N. M. Watson * All rights reserved. * * This software was developed for the FreeBSD Project in part by McAfee * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research * program. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_sysvipc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(sysv_shm, "System V shared memory segments support"); static MALLOC_DEFINE(M_SHM, "shm", "SVID compatible shared memory segments"); static int shmget_allocate_segment(struct thread *td, struct shmget_args *uap, int mode); static int shmget_existing(struct thread *td, struct shmget_args *uap, int mode, int segnum); #define SHMSEG_FREE 0x0200 #define SHMSEG_REMOVED 0x0400 #define SHMSEG_ALLOCATED 0x0800 static int shm_last_free, shm_nused, shmalloced; vm_size_t shm_committed; static struct shmid_kernel *shmsegs; static unsigned shm_prison_slot; struct shmmap_state { vm_offset_t va; int shmid; }; static void shm_deallocate_segment(struct shmid_kernel *); static int shm_find_segment_by_key(struct prison *, key_t); static struct shmid_kernel *shm_find_segment(struct prison *, int, bool); static int shm_delete_mapping(struct vmspace *vm, struct shmmap_state *); static void shmrealloc(void); static int shminit(void); static int sysvshm_modload(struct module *, int, void *); static int shmunload(void); static void shmexit_myhook(struct vmspace *vm); static void shmfork_myhook(struct proc *p1, struct proc *p2); static int sysctl_shmsegs(SYSCTL_HANDLER_ARGS); static void shm_remove(struct shmid_kernel *, int); static struct prison *shm_find_prison(struct ucred *); static int shm_prison_cansee(struct prison *, struct shmid_kernel *); static int shm_prison_check(void *, void *); static int shm_prison_set(void *, void *); static int shm_prison_get(void *, void *); static int shm_prison_remove(void *, void *); static void shm_prison_cleanup(struct prison *); /* * Tuneable values. */ #ifndef SHMMAXPGS #define SHMMAXPGS 131072 /* Note: sysv shared memory is swap backed. 
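The compile-time defaults here are exported through the kern.ipc.* sysctls declared just below. A short userland probe of two of them, shmmax (bytes) and shmall (pages); a sketch only, assuming a stock kernel where both OIDs are present:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
        unsigned long shmmax, shmall;
        size_t len;

        len = sizeof(shmmax);           /* kern.ipc.shmmax: max segment size */
        if (sysctlbyname("kern.ipc.shmmax", &shmmax, &len, NULL, 0) == -1)
                return (1);
        len = sizeof(shmall);           /* kern.ipc.shmall: total page budget */
        if (sysctlbyname("kern.ipc.shmall", &shmall, &len, NULL, 0) == -1)
                return (1);
        printf("shmmax=%lu bytes, shmall=%lu pages\n", shmmax, shmall);
        return (0);
}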
*/ #endif #ifndef SHMMAX #define SHMMAX (SHMMAXPGS*PAGE_SIZE) #endif #ifndef SHMMIN #define SHMMIN 1 #endif #ifndef SHMMNI #define SHMMNI 192 #endif #ifndef SHMSEG #define SHMSEG 128 #endif #ifndef SHMALL #define SHMALL (SHMMAXPGS) #endif struct shminfo shminfo = { .shmmax = SHMMAX, .shmmin = SHMMIN, .shmmni = SHMMNI, .shmseg = SHMSEG, .shmall = SHMALL }; static int shm_use_phys; static int shm_allow_removed = 1; SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmax, CTLFLAG_RWTUN, &shminfo.shmmax, 0, "Maximum shared memory segment size"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmin, CTLFLAG_RWTUN, &shminfo.shmmin, 0, "Minimum shared memory segment size"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmni, CTLFLAG_RDTUN, &shminfo.shmmni, 0, "Number of shared memory identifiers"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmseg, CTLFLAG_RDTUN, &shminfo.shmseg, 0, "Number of segments per process"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmall, CTLFLAG_RWTUN, &shminfo.shmall, 0, "Maximum number of pages available for shared memory"); SYSCTL_INT(_kern_ipc, OID_AUTO, shm_use_phys, CTLFLAG_RWTUN, &shm_use_phys, 0, "Enable/Disable locking of shared memory pages in core"); SYSCTL_INT(_kern_ipc, OID_AUTO, shm_allow_removed, CTLFLAG_RWTUN, &shm_allow_removed, 0, "Enable/Disable attachment to attached segments marked for removal"); SYSCTL_PROC(_kern_ipc, OID_AUTO, shmsegs, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_shmsegs, "", "Current number of shared memory segments allocated"); static struct sx sysvshmsx; #define SYSVSHM_LOCK() sx_xlock(&sysvshmsx) #define SYSVSHM_UNLOCK() sx_xunlock(&sysvshmsx) #define SYSVSHM_ASSERT_LOCKED() sx_assert(&sysvshmsx, SA_XLOCKED) static int shm_find_segment_by_key(struct prison *pr, key_t key) { int i; for (i = 0; i < shmalloced; i++) if ((shmsegs[i].u.shm_perm.mode & SHMSEG_ALLOCATED) && shmsegs[i].cred != NULL && shmsegs[i].cred->cr_prison == pr && shmsegs[i].u.shm_perm.key == key) return (i); return (-1); } /* * Finds segment either by shmid if is_shmid is true, or by segnum if * is_shmid is false. */ static struct shmid_kernel * shm_find_segment(struct prison *rpr, int arg, bool is_shmid) { struct shmid_kernel *shmseg; int segnum; segnum = is_shmid ? 
IPCID_TO_IX(arg) : arg; if (segnum < 0 || segnum >= shmalloced) return (NULL); shmseg = &shmsegs[segnum]; if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 || (!shm_allow_removed && (shmseg->u.shm_perm.mode & SHMSEG_REMOVED) != 0) || (is_shmid && shmseg->u.shm_perm.seq != IPCID_TO_SEQ(arg)) || shm_prison_cansee(rpr, shmseg) != 0) return (NULL); return (shmseg); } static void shm_deallocate_segment(struct shmid_kernel *shmseg) { vm_size_t size; SYSVSHM_ASSERT_LOCKED(); vm_object_deallocate(shmseg->object); shmseg->object = NULL; size = round_page(shmseg->u.shm_segsz); shm_committed -= btoc(size); shm_nused--; shmseg->u.shm_perm.mode = SHMSEG_FREE; #ifdef MAC mac_sysvshm_cleanup(shmseg); #endif racct_sub_cred(shmseg->cred, RACCT_NSHM, 1); racct_sub_cred(shmseg->cred, RACCT_SHMSIZE, size); crfree(shmseg->cred); shmseg->cred = NULL; } static int shm_delete_mapping(struct vmspace *vm, struct shmmap_state *shmmap_s) { struct shmid_kernel *shmseg; int segnum, result; vm_size_t size; SYSVSHM_ASSERT_LOCKED(); segnum = IPCID_TO_IX(shmmap_s->shmid); KASSERT(segnum >= 0 && segnum < shmalloced, ("segnum %d shmalloced %d", segnum, shmalloced)); shmseg = &shmsegs[segnum]; size = round_page(shmseg->u.shm_segsz); result = vm_map_remove(&vm->vm_map, shmmap_s->va, shmmap_s->va + size); if (result != KERN_SUCCESS) return (EINVAL); shmmap_s->shmid = -1; shmseg->u.shm_dtime = time_second; if (--shmseg->u.shm_nattch == 0 && (shmseg->u.shm_perm.mode & SHMSEG_REMOVED)) { shm_deallocate_segment(shmseg); shm_last_free = segnum; } return (0); } static void shm_remove(struct shmid_kernel *shmseg, int segnum) { shmseg->u.shm_perm.key = IPC_PRIVATE; shmseg->u.shm_perm.mode |= SHMSEG_REMOVED; if (shmseg->u.shm_nattch == 0) { shm_deallocate_segment(shmseg); shm_last_free = segnum; } } static struct prison * shm_find_prison(struct ucred *cred) { struct prison *pr, *rpr; pr = cred->cr_prison; prison_lock(pr); rpr = osd_jail_get(pr, shm_prison_slot); prison_unlock(pr); return rpr; } static int shm_prison_cansee(struct prison *rpr, struct shmid_kernel *shmseg) { if (shmseg->cred == NULL || !(rpr == shmseg->cred->cr_prison || prison_ischild(rpr, shmseg->cred->cr_prison))) return (EINVAL); return (0); } static int kern_shmdt_locked(struct thread *td, const void *shmaddr) { struct proc *p = td->td_proc; struct shmmap_state *shmmap_s; #if defined(AUDIT) || defined(MAC) struct shmid_kernel *shmsegptr; #endif #ifdef MAC int error; #endif int i; SYSVSHM_ASSERT_LOCKED(); if (shm_find_prison(td->td_ucred) == NULL) return (ENOSYS); shmmap_s = p->p_vmspace->vm_shm; if (shmmap_s == NULL) return (EINVAL); AUDIT_ARG_SVIPC_ID(shmmap_s->shmid); for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) { if (shmmap_s->shmid != -1 && shmmap_s->va == (vm_offset_t)shmaddr) { break; } } if (i == shminfo.shmseg) return (EINVAL); #if (defined(AUDIT) && defined(KDTRACE_HOOKS)) || defined(MAC) shmsegptr = &shmsegs[IPCID_TO_IX(shmmap_s->shmid)]; #endif #ifdef MAC error = mac_sysvshm_check_shmdt(td->td_ucred, shmsegptr); if (error != 0) return (error); #endif return (shm_delete_mapping(p->p_vmspace, shmmap_s)); } #ifndef _SYS_SYSPROTO_H_ struct shmdt_args { const void *shmaddr; }; #endif int sys_shmdt(struct thread *td, struct shmdt_args *uap) { int error; SYSVSHM_LOCK(); error = kern_shmdt_locked(td, uap->shmaddr); SYSVSHM_UNLOCK(); return (error); } static int kern_shmat_locked(struct thread *td, int shmid, const void *shmaddr, int shmflg) { struct prison *rpr; struct proc *p = td->td_proc; struct shmid_kernel *shmseg; struct shmmap_state *shmmap_s; 
vm_offset_t attach_va; vm_prot_t prot; vm_size_t size; int error, i, rv; AUDIT_ARG_SVIPC_ID(shmid); AUDIT_ARG_VALUE(shmflg); SYSVSHM_ASSERT_LOCKED(); rpr = shm_find_prison(td->td_ucred); if (rpr == NULL) return (ENOSYS); shmmap_s = p->p_vmspace->vm_shm; if (shmmap_s == NULL) { shmmap_s = malloc(shminfo.shmseg * sizeof(struct shmmap_state), M_SHM, M_WAITOK); for (i = 0; i < shminfo.shmseg; i++) shmmap_s[i].shmid = -1; KASSERT(p->p_vmspace->vm_shm == NULL, ("raced")); p->p_vmspace->vm_shm = shmmap_s; } shmseg = shm_find_segment(rpr, shmid, true); if (shmseg == NULL) return (EINVAL); error = ipcperm(td, &shmseg->u.shm_perm, (shmflg & SHM_RDONLY) ? IPC_R : IPC_R|IPC_W); if (error != 0) return (error); #ifdef MAC error = mac_sysvshm_check_shmat(td->td_ucred, shmseg, shmflg); if (error != 0) return (error); #endif for (i = 0; i < shminfo.shmseg; i++) { if (shmmap_s->shmid == -1) break; shmmap_s++; } if (i >= shminfo.shmseg) return (EMFILE); size = round_page(shmseg->u.shm_segsz); prot = VM_PROT_READ; if ((shmflg & SHM_RDONLY) == 0) prot |= VM_PROT_WRITE; if (shmaddr != NULL) { if ((shmflg & SHM_RND) != 0) attach_va = rounddown2((vm_offset_t)shmaddr, SHMLBA); else if (((vm_offset_t)shmaddr & (SHMLBA-1)) == 0) attach_va = (vm_offset_t)shmaddr; else return (EINVAL); } else { /* * This is just a hint to vm_map_find() about where to * put it. */ attach_va = round_page((vm_offset_t)p->p_vmspace->vm_daddr + lim_max(td, RLIMIT_DATA)); } vm_object_reference(shmseg->object); rv = vm_map_find(&p->p_vmspace->vm_map, shmseg->object, 0, &attach_va, size, 0, shmaddr != NULL ? VMFS_NO_SPACE : VMFS_OPTIMAL_SPACE, prot, prot, MAP_INHERIT_SHARE | MAP_PREFAULT_PARTIAL); if (rv != KERN_SUCCESS) { vm_object_deallocate(shmseg->object); return (ENOMEM); } shmmap_s->va = attach_va; shmmap_s->shmid = shmid; shmseg->u.shm_lpid = p->p_pid; shmseg->u.shm_atime = time_second; shmseg->u.shm_nattch++; td->td_retval[0] = attach_va; return (error); } int kern_shmat(struct thread *td, int shmid, const void *shmaddr, int shmflg) { int error; SYSVSHM_LOCK(); error = kern_shmat_locked(td, shmid, shmaddr, shmflg); SYSVSHM_UNLOCK(); return (error); } #ifndef _SYS_SYSPROTO_H_ struct shmat_args { int shmid; const void *shmaddr; int shmflg; }; #endif int sys_shmat(struct thread *td, struct shmat_args *uap) { return (kern_shmat(td, uap->shmid, uap->shmaddr, uap->shmflg)); } static int kern_shmctl_locked(struct thread *td, int shmid, int cmd, void *buf, size_t *bufsz) { struct prison *rpr; struct shmid_kernel *shmseg; struct shmid_ds *shmidp; struct shm_info shm_info; int error; SYSVSHM_ASSERT_LOCKED(); rpr = shm_find_prison(td->td_ucred); if (rpr == NULL) return (ENOSYS); AUDIT_ARG_SVIPC_ID(shmid); AUDIT_ARG_SVIPC_CMD(cmd); switch (cmd) { /* * It is possible that kern_shmctl is being called from the Linux ABI * layer, in which case, we will need to implement IPC_INFO. It should * be noted that other shmctl calls will be funneled through here for * Linux binaries as well. * * NB: The Linux ABI layer will convert this data to structure(s) more * consistent with the Linux ABI. */ case IPC_INFO: memcpy(buf, &shminfo, sizeof(shminfo)); if (bufsz) *bufsz = sizeof(shminfo); td->td_retval[0] = shmalloced; return (0); case SHM_INFO: { shm_info.used_ids = shm_nused; shm_info.shm_rss = 0; /*XXX where to get from ? */ shm_info.shm_tot = 0; /*XXX where to get from ? */ shm_info.shm_swp = 0; /*XXX where to get from ? */ shm_info.swap_attempts = 0; /*XXX where to get from ? */ shm_info.swap_successes = 0; /*XXX where to get from ? 
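The attach path above is exactly what a plain shmget()/shmat() pair from userland exercises. A minimal round trip as an illustration (the 4096-byte size and 0600 mode are arbitrary choices, and error handling is reduced to perror()):

#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
        int shmid;
        char *p;

        /* Private segment: allocated by shmget_allocate_segment(). */
        shmid = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);
        if (shmid == -1) { perror("shmget"); return (1); }

        /* NULL address lets kern_shmat_locked() choose its own hint. */
        p = shmat(shmid, NULL, 0);
        if (p == (char *)-1) { perror("shmat"); return (1); }
        strcpy(p, "hello");

        (void)shmdt(p);                         /* drops shm_nattch */
        (void)shmctl(shmid, IPC_RMID, NULL);    /* freed at last detach */
        return (0);
}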
*/ memcpy(buf, &shm_info, sizeof(shm_info)); if (bufsz != NULL) *bufsz = sizeof(shm_info); td->td_retval[0] = shmalloced; return (0); } } shmseg = shm_find_segment(rpr, shmid, cmd != SHM_STAT); if (shmseg == NULL) return (EINVAL); #ifdef MAC error = mac_sysvshm_check_shmctl(td->td_ucred, shmseg, cmd); if (error != 0) return (error); #endif switch (cmd) { case SHM_STAT: case IPC_STAT: shmidp = (struct shmid_ds *)buf; error = ipcperm(td, &shmseg->u.shm_perm, IPC_R); if (error != 0) return (error); memcpy(shmidp, &shmseg->u, sizeof(struct shmid_ds)); if (td->td_ucred->cr_prison != shmseg->cred->cr_prison) shmidp->shm_perm.key = IPC_PRIVATE; if (bufsz != NULL) *bufsz = sizeof(struct shmid_ds); if (cmd == SHM_STAT) { td->td_retval[0] = IXSEQ_TO_IPCID(shmid, shmseg->u.shm_perm); } break; case IPC_SET: shmidp = (struct shmid_ds *)buf; AUDIT_ARG_SVIPC_PERM(&shmidp->shm_perm); error = ipcperm(td, &shmseg->u.shm_perm, IPC_M); if (error != 0) return (error); shmseg->u.shm_perm.uid = shmidp->shm_perm.uid; shmseg->u.shm_perm.gid = shmidp->shm_perm.gid; shmseg->u.shm_perm.mode = (shmseg->u.shm_perm.mode & ~ACCESSPERMS) | (shmidp->shm_perm.mode & ACCESSPERMS); shmseg->u.shm_ctime = time_second; break; case IPC_RMID: error = ipcperm(td, &shmseg->u.shm_perm, IPC_M); if (error != 0) return (error); shm_remove(shmseg, IPCID_TO_IX(shmid)); break; #if 0 case SHM_LOCK: case SHM_UNLOCK: #endif default: error = EINVAL; break; } return (error); } int kern_shmctl(struct thread *td, int shmid, int cmd, void *buf, size_t *bufsz) { int error; SYSVSHM_LOCK(); error = kern_shmctl_locked(td, shmid, cmd, buf, bufsz); SYSVSHM_UNLOCK(); return (error); } #ifndef _SYS_SYSPROTO_H_ struct shmctl_args { int shmid; int cmd; struct shmid_ds *buf; }; #endif int sys_shmctl(struct thread *td, struct shmctl_args *uap) { int error; struct shmid_ds buf; size_t bufsz; /* * The only reason IPC_INFO, SHM_INFO, SHM_STAT exist is to support * Linux binaries. If we see the call come through the FreeBSD ABI, * return an error back to the user since we do not support this. 
*/ if (uap->cmd == IPC_INFO || uap->cmd == SHM_INFO || uap->cmd == SHM_STAT) return (EINVAL); /* IPC_SET needs to copyin the buffer before calling kern_shmctl */ if (uap->cmd == IPC_SET) { if ((error = copyin(uap->buf, &buf, sizeof(struct shmid_ds)))) goto done; } error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&buf, &bufsz); if (error) goto done; /* Cases in which we need to copyout */ switch (uap->cmd) { case IPC_STAT: error = copyout(&buf, uap->buf, bufsz); break; } done: if (error) { /* Invalidate the return value */ td->td_retval[0] = -1; } return (error); } static int shmget_existing(struct thread *td, struct shmget_args *uap, int mode, int segnum) { struct shmid_kernel *shmseg; #ifdef MAC int error; #endif SYSVSHM_ASSERT_LOCKED(); KASSERT(segnum >= 0 && segnum < shmalloced, ("segnum %d shmalloced %d", segnum, shmalloced)); shmseg = &shmsegs[segnum]; if ((uap->shmflg & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL)) return (EEXIST); #ifdef MAC error = mac_sysvshm_check_shmget(td->td_ucred, shmseg, uap->shmflg); if (error != 0) return (error); #endif if (uap->size != 0 && uap->size > shmseg->u.shm_segsz) return (EINVAL); td->td_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm); return (0); } static int shmget_allocate_segment(struct thread *td, struct shmget_args *uap, int mode) { struct ucred *cred = td->td_ucred; struct shmid_kernel *shmseg; vm_object_t shm_object; int i, segnum; size_t size; SYSVSHM_ASSERT_LOCKED(); if (uap->size < shminfo.shmmin || uap->size > shminfo.shmmax) return (EINVAL); if (shm_nused >= shminfo.shmmni) /* Any shmids left? */ return (ENOSPC); size = round_page(uap->size); if (shm_committed + btoc(size) > shminfo.shmall) return (ENOMEM); if (shm_last_free < 0) { shmrealloc(); /* Maybe expand the shmsegs[] array. */ for (i = 0; i < shmalloced; i++) if (shmsegs[i].u.shm_perm.mode & SHMSEG_FREE) break; if (i == shmalloced) return (ENOSPC); segnum = i; } else { segnum = shm_last_free; shm_last_free = -1; } KASSERT(segnum >= 0 && segnum < shmalloced, ("segnum %d shmalloced %d", segnum, shmalloced)); shmseg = &shmsegs[segnum]; #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); if (racct_add(td->td_proc, RACCT_NSHM, 1)) { PROC_UNLOCK(td->td_proc); return (ENOSPC); } if (racct_add(td->td_proc, RACCT_SHMSIZE, size)) { racct_sub(td->td_proc, RACCT_NSHM, 1); PROC_UNLOCK(td->td_proc); return (ENOMEM); } PROC_UNLOCK(td->td_proc); } #endif /* * We make sure that we have allocated a pager before we need * to. */ shm_object = vm_pager_allocate(shm_use_phys ? 
OBJT_PHYS : OBJT_SWAP, 0, size, VM_PROT_DEFAULT, 0, cred); if (shm_object == NULL) { #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); racct_sub(td->td_proc, RACCT_NSHM, 1); racct_sub(td->td_proc, RACCT_SHMSIZE, size); PROC_UNLOCK(td->td_proc); } #endif return (ENOMEM); } shm_object->pg_color = 0; VM_OBJECT_WLOCK(shm_object); vm_object_clear_flag(shm_object, OBJ_ONEMAPPING); vm_object_set_flag(shm_object, OBJ_COLORED | OBJ_NOSPLIT); VM_OBJECT_WUNLOCK(shm_object); shmseg->object = shm_object; shmseg->u.shm_perm.cuid = shmseg->u.shm_perm.uid = cred->cr_uid; shmseg->u.shm_perm.cgid = shmseg->u.shm_perm.gid = cred->cr_gid; shmseg->u.shm_perm.mode = (mode & ACCESSPERMS) | SHMSEG_ALLOCATED; shmseg->u.shm_perm.key = uap->key; shmseg->u.shm_perm.seq = (shmseg->u.shm_perm.seq + 1) & 0x7fff; shmseg->cred = crhold(cred); shmseg->u.shm_segsz = uap->size; shmseg->u.shm_cpid = td->td_proc->p_pid; shmseg->u.shm_lpid = shmseg->u.shm_nattch = 0; shmseg->u.shm_atime = shmseg->u.shm_dtime = 0; #ifdef MAC mac_sysvshm_create(cred, shmseg); #endif shmseg->u.shm_ctime = time_second; shm_committed += btoc(size); shm_nused++; td->td_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm); return (0); } #ifndef _SYS_SYSPROTO_H_ struct shmget_args { key_t key; size_t size; int shmflg; }; #endif int sys_shmget(struct thread *td, struct shmget_args *uap) { int segnum, mode; int error; if (shm_find_prison(td->td_ucred) == NULL) return (ENOSYS); mode = uap->shmflg & ACCESSPERMS; SYSVSHM_LOCK(); if (uap->key == IPC_PRIVATE) { error = shmget_allocate_segment(td, uap, mode); } else { segnum = shm_find_segment_by_key(td->td_ucred->cr_prison, uap->key); if (segnum >= 0) error = shmget_existing(td, uap, mode, segnum); else if ((uap->shmflg & IPC_CREAT) == 0) error = ENOENT; else error = shmget_allocate_segment(td, uap, mode); } SYSVSHM_UNLOCK(); return (error); } static void shmfork_myhook(struct proc *p1, struct proc *p2) { struct shmmap_state *shmmap_s; size_t size; int i; SYSVSHM_LOCK(); size = shminfo.shmseg * sizeof(struct shmmap_state); shmmap_s = malloc(size, M_SHM, M_WAITOK); bcopy(p1->p_vmspace->vm_shm, shmmap_s, size); p2->p_vmspace->vm_shm = shmmap_s; for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) { if (shmmap_s->shmid != -1) { KASSERT(IPCID_TO_IX(shmmap_s->shmid) >= 0 && IPCID_TO_IX(shmmap_s->shmid) < shmalloced, ("segnum %d shmalloced %d", IPCID_TO_IX(shmmap_s->shmid), shmalloced)); shmsegs[IPCID_TO_IX(shmmap_s->shmid)].u.shm_nattch++; } } SYSVSHM_UNLOCK(); } static void shmexit_myhook(struct vmspace *vm) { struct shmmap_state *base, *shm; int i; base = vm->vm_shm; if (base != NULL) { vm->vm_shm = NULL; SYSVSHM_LOCK(); for (i = 0, shm = base; i < shminfo.shmseg; i++, shm++) { if (shm->shmid != -1) shm_delete_mapping(vm, shm); } SYSVSHM_UNLOCK(); free(base, M_SHM); } } static void shmrealloc(void) { struct shmid_kernel *newsegs; int i; SYSVSHM_ASSERT_LOCKED(); if (shmalloced >= shminfo.shmmni) return; newsegs = malloc(shminfo.shmmni * sizeof(*newsegs), M_SHM, M_WAITOK); for (i = 0; i < shmalloced; i++) bcopy(&shmsegs[i], &newsegs[i], sizeof(newsegs[0])); for (; i < shminfo.shmmni; i++) { newsegs[i].u.shm_perm.mode = SHMSEG_FREE; newsegs[i].u.shm_perm.seq = 0; #ifdef MAC mac_sysvshm_init(&newsegs[i]); #endif } free(shmsegs, M_SHM); shmsegs = newsegs; shmalloced = shminfo.shmmni; } static struct syscall_helper_data shm_syscalls[] = { SYSCALL_INIT_HELPER(shmat), SYSCALL_INIT_HELPER(shmctl), SYSCALL_INIT_HELPER(shmdt), SYSCALL_INIT_HELPER(shmget), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ 
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL_INIT_HELPER_COMPAT(freebsd7_shmctl), #endif #if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43)) SYSCALL_INIT_HELPER(shmsys), #endif SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include #include static struct syscall_helper_data shm32_syscalls[] = { SYSCALL32_INIT_HELPER_COMPAT(shmat), SYSCALL32_INIT_HELPER_COMPAT(shmdt), SYSCALL32_INIT_HELPER_COMPAT(shmget), SYSCALL32_INIT_HELPER(freebsd32_shmsys), SYSCALL32_INIT_HELPER(freebsd32_shmctl), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL32_INIT_HELPER(freebsd7_freebsd32_shmctl), #endif SYSCALL_INIT_LAST }; #endif static int shminit(void) { struct prison *pr; void **rsv; int i, error; osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_CHECK] = shm_prison_check, [PR_METHOD_SET] = shm_prison_set, [PR_METHOD_GET] = shm_prison_get, [PR_METHOD_REMOVE] = shm_prison_remove, }; #ifndef BURN_BRIDGES if (TUNABLE_ULONG_FETCH("kern.ipc.shmmaxpgs", &shminfo.shmall) != 0) printf("kern.ipc.shmmaxpgs is now called kern.ipc.shmall!\n"); #endif if (shminfo.shmmax == SHMMAX) { /* Initialize shmmax dealing with possible overflow. */ for (i = PAGE_SIZE; i != 0; i--) { shminfo.shmmax = shminfo.shmall * i; if ((shminfo.shmmax / shminfo.shmall) == (u_long)i) break; } } shmalloced = shminfo.shmmni; shmsegs = malloc(shmalloced * sizeof(shmsegs[0]), M_SHM, M_WAITOK); for (i = 0; i < shmalloced; i++) { shmsegs[i].u.shm_perm.mode = SHMSEG_FREE; shmsegs[i].u.shm_perm.seq = 0; #ifdef MAC mac_sysvshm_init(&shmsegs[i]); #endif } shm_last_free = 0; shm_nused = 0; shm_committed = 0; sx_init(&sysvshmsx, "sysvshmsx"); shmexit_hook = &shmexit_myhook; shmfork_hook = &shmfork_myhook; /* Set current prisons according to their allow.sysvipc. */ shm_prison_slot = osd_jail_register(NULL, methods); rsv = osd_reserve(shm_prison_slot); prison_lock(&prison0); (void)osd_jail_set_reserved(&prison0, shm_prison_slot, rsv, &prison0); prison_unlock(&prison0); rsv = NULL; sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) { if (rsv == NULL) rsv = osd_reserve(shm_prison_slot); prison_lock(pr); if ((pr->pr_allow & PR_ALLOW_SYSVIPC) && pr->pr_ref > 0) { (void)osd_jail_set_reserved(pr, shm_prison_slot, rsv, &prison0); rsv = NULL; } prison_unlock(pr); } if (rsv != NULL) osd_free_reserved(rsv); sx_sunlock(&allprison_lock); error = syscall_helper_register(shm_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(shm32_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #endif return (0); } static int shmunload(void) { int i; if (shm_nused > 0) return (EBUSY); #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(shm32_syscalls); #endif syscall_helper_unregister(shm_syscalls); if (shm_prison_slot != 0) osd_jail_deregister(shm_prison_slot); for (i = 0; i < shmalloced; i++) { #ifdef MAC mac_sysvshm_destroy(&shmsegs[i]); #endif /* * Objects might be still mapped into the processes * address spaces. Actual free would happen on the * last mapping destruction. 
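One subtlety in shminit() above: shmmax defaults to shmall times the page size, but the u_long product can wrap, so the loop retries smaller multipliers until dividing the product back recovers the multiplier. The same guard restated standalone, with an extra zero check that the kernel context makes unnecessary:

static unsigned long
scale_shmmax(unsigned long shmall, unsigned long pagesize)
{
        unsigned long i, max = 0;

        if (shmall == 0)
                return (0);
        /* Largest i <= pagesize for which shmall * i does not wrap,
         * detected by dividing the product back out. */
        for (i = pagesize; i != 0; i--) {
                max = shmall * i;
                if (max / shmall == i)
                        break;
        }
        return (max);
}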
*/ if (shmsegs[i].u.shm_perm.mode != SHMSEG_FREE) vm_object_deallocate(shmsegs[i].object); } free(shmsegs, M_SHM); shmexit_hook = NULL; shmfork_hook = NULL; sx_destroy(&sysvshmsx); return (0); } static int sysctl_shmsegs(SYSCTL_HANDLER_ARGS) { struct shmid_kernel tshmseg; struct prison *pr, *rpr; int error, i; SYSVSHM_LOCK(); pr = req->td->td_ucred->cr_prison; rpr = shm_find_prison(req->td->td_ucred); error = 0; for (i = 0; i < shmalloced; i++) { if ((shmsegs[i].u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 || rpr == NULL || shm_prison_cansee(rpr, &shmsegs[i]) != 0) { bzero(&tshmseg, sizeof(tshmseg)); tshmseg.u.shm_perm.mode = SHMSEG_FREE; } else { tshmseg = shmsegs[i]; if (tshmseg.cred->cr_prison != pr) tshmseg.u.shm_perm.key = IPC_PRIVATE; } error = SYSCTL_OUT(req, &tshmseg, sizeof(tshmseg)); if (error != 0) break; } SYSVSHM_UNLOCK(); return (error); } static int shm_prison_check(void *obj, void *data) { struct prison *pr = obj; struct prison *prpr; struct vfsoptlist *opts = data; int error, jsys; /* * sysvshm is a jailsys integer. * It must be "disable" if the parent jail is disabled. */ error = vfs_copyopt(opts, "sysvshm", &jsys, sizeof(jsys)); if (error != ENOENT) { if (error != 0) return (error); switch (jsys) { case JAIL_SYS_DISABLE: break; case JAIL_SYS_NEW: case JAIL_SYS_INHERIT: prison_lock(pr->pr_parent); prpr = osd_jail_get(pr->pr_parent, shm_prison_slot); prison_unlock(pr->pr_parent); if (prpr == NULL) return (EPERM); break; default: return (EINVAL); } } return (0); } static int shm_prison_set(void *obj, void *data) { struct prison *pr = obj; struct prison *tpr, *orpr, *nrpr, *trpr; struct vfsoptlist *opts = data; void *rsv; int jsys, descend; /* * sysvshm controls which jail is the root of the associated segments * (this jail or same as the parent), or if the feature is available * at all. */ if (vfs_copyopt(opts, "sysvshm", &jsys, sizeof(jsys)) == ENOENT) jsys = vfs_flagopt(opts, "allow.sysvipc", NULL, 0) ? JAIL_SYS_INHERIT : vfs_flagopt(opts, "allow.nosysvipc", NULL, 0) ? JAIL_SYS_DISABLE : -1; if (jsys == JAIL_SYS_DISABLE) { prison_lock(pr); orpr = osd_jail_get(pr, shm_prison_slot); if (orpr != NULL) osd_jail_del(pr, shm_prison_slot); prison_unlock(pr); if (orpr != NULL) { if (orpr == pr) shm_prison_cleanup(pr); /* Disable all child jails as well. 
*/ FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { prison_lock(tpr); trpr = osd_jail_get(tpr, shm_prison_slot); if (trpr != NULL) { osd_jail_del(tpr, shm_prison_slot); prison_unlock(tpr); if (trpr == tpr) shm_prison_cleanup(tpr); } else { prison_unlock(tpr); descend = 0; } } } } else if (jsys != -1) { if (jsys == JAIL_SYS_NEW) nrpr = pr; else { prison_lock(pr->pr_parent); nrpr = osd_jail_get(pr->pr_parent, shm_prison_slot); prison_unlock(pr->pr_parent); } rsv = osd_reserve(shm_prison_slot); prison_lock(pr); orpr = osd_jail_get(pr, shm_prison_slot); if (orpr != nrpr) (void)osd_jail_set_reserved(pr, shm_prison_slot, rsv, nrpr); else osd_free_reserved(rsv); prison_unlock(pr); if (orpr != nrpr) { if (orpr == pr) shm_prison_cleanup(pr); if (orpr != NULL) { /* Change child jails matching the old root, */ FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { prison_lock(tpr); trpr = osd_jail_get(tpr, shm_prison_slot); if (trpr == orpr) { (void)osd_jail_set(tpr, shm_prison_slot, nrpr); prison_unlock(tpr); if (trpr == tpr) shm_prison_cleanup(tpr); } else { prison_unlock(tpr); descend = 0; } } } } } return (0); } static int shm_prison_get(void *obj, void *data) { struct prison *pr = obj; struct prison *rpr; struct vfsoptlist *opts = data; int error, jsys; /* Set sysvshm based on the jail's root prison. */ prison_lock(pr); rpr = osd_jail_get(pr, shm_prison_slot); prison_unlock(pr); jsys = rpr == NULL ? JAIL_SYS_DISABLE : rpr == pr ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; error = vfs_setopt(opts, "sysvshm", &jsys, sizeof(jsys)); if (error == ENOENT) error = 0; return (error); } static int shm_prison_remove(void *obj, void *data __unused) { struct prison *pr = obj; struct prison *rpr; SYSVSHM_LOCK(); prison_lock(pr); rpr = osd_jail_get(pr, shm_prison_slot); prison_unlock(pr); if (rpr == pr) shm_prison_cleanup(pr); SYSVSHM_UNLOCK(); return (0); } static void shm_prison_cleanup(struct prison *pr) { struct shmid_kernel *shmseg; int i; /* Remove any segments that belong to this jail. */ for (i = 0; i < shmalloced; i++) { shmseg = &shmsegs[i]; if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) && shmseg->cred != NULL && shmseg->cred->cr_prison == pr) { shm_remove(shmseg, i); } } } SYSCTL_JAIL_PARAM_SYS_NODE(sysvshm, CTLFLAG_RW, "SYSV shared memory"); #if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43)) struct oshmid_ds { struct ipc_perm_old shm_perm; /* operation perms */ int shm_segsz; /* size of segment (bytes) */ u_short shm_cpid; /* pid, creator */ u_short shm_lpid; /* pid, last operation */ short shm_nattch; /* no. 
of current attaches */ time_t shm_atime; /* last attach time */ time_t shm_dtime; /* last detach time */ time_t shm_ctime; /* last change time */ void *shm_handle; /* internal handle for shm segment */ }; struct oshmctl_args { int shmid; int cmd; struct oshmid_ds *ubuf; }; static int oshmctl(struct thread *td, struct oshmctl_args *uap) { #ifdef COMPAT_43 int error = 0; struct prison *rpr; struct shmid_kernel *shmseg; struct oshmid_ds outbuf; rpr = shm_find_prison(td->td_ucred); if (rpr == NULL) return (ENOSYS); if (uap->cmd != IPC_STAT) { return (freebsd7_shmctl(td, (struct freebsd7_shmctl_args *)uap)); } SYSVSHM_LOCK(); shmseg = shm_find_segment(rpr, uap->shmid, true); if (shmseg == NULL) { SYSVSHM_UNLOCK(); return (EINVAL); } error = ipcperm(td, &shmseg->u.shm_perm, IPC_R); if (error != 0) { SYSVSHM_UNLOCK(); return (error); } #ifdef MAC error = mac_sysvshm_check_shmctl(td->td_ucred, shmseg, uap->cmd); if (error != 0) { SYSVSHM_UNLOCK(); return (error); } #endif ipcperm_new2old(&shmseg->u.shm_perm, &outbuf.shm_perm); outbuf.shm_segsz = shmseg->u.shm_segsz; outbuf.shm_cpid = shmseg->u.shm_cpid; outbuf.shm_lpid = shmseg->u.shm_lpid; outbuf.shm_nattch = shmseg->u.shm_nattch; outbuf.shm_atime = shmseg->u.shm_atime; outbuf.shm_dtime = shmseg->u.shm_dtime; outbuf.shm_ctime = shmseg->u.shm_ctime; outbuf.shm_handle = shmseg->object; SYSVSHM_UNLOCK(); return (copyout(&outbuf, uap->ubuf, sizeof(outbuf))); #else return (EINVAL); #endif } /* XXX casting to (sy_call_t *) is bogus, as usual. */ static sy_call_t *shmcalls[] = { (sy_call_t *)sys_shmat, (sy_call_t *)oshmctl, (sy_call_t *)sys_shmdt, (sy_call_t *)sys_shmget, (sy_call_t *)freebsd7_shmctl }; #ifndef _SYS_SYSPROTO_H_ /* XXX actually varargs. */ struct shmsys_args { int which; int a2; int a3; int a4; }; #endif int sys_shmsys(struct thread *td, struct shmsys_args *uap) { AUDIT_ARG_SVIPC_WHICH(uap->which); if (uap->which < 0 || uap->which >= nitems(shmcalls)) return (EINVAL); return ((*shmcalls[uap->which])(td, &uap->a2)); } #endif /* i386 && (COMPAT_FREEBSD4 || COMPAT_43) */ #ifdef COMPAT_FREEBSD32 int freebsd32_shmsys(struct thread *td, struct freebsd32_shmsys_args *uap) { #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) AUDIT_ARG_SVIPC_WHICH(uap->which); switch (uap->which) { case 0: { /* shmat */ struct shmat_args ap; ap.shmid = uap->a2; ap.shmaddr = PTRIN(uap->a3); ap.shmflg = uap->a4; return (sysent[SYS_shmat].sy_call(td, &ap)); } case 2: { /* shmdt */ struct shmdt_args ap; ap.shmaddr = PTRIN(uap->a2); return (sysent[SYS_shmdt].sy_call(td, &ap)); } case 3: { /* shmget */ struct shmget_args ap; ap.key = uap->a2; ap.size = uap->a3; ap.shmflg = uap->a4; return (sysent[SYS_shmget].sy_call(td, &ap)); } case 4: { /* shmctl */ struct freebsd7_freebsd32_shmctl_args ap; ap.shmid = uap->a2; ap.cmd = uap->a3; ap.buf = PTRIN(uap->a4); return (freebsd7_freebsd32_shmctl(td, &ap)); } case 1: /* oshmctl */ default: return (EINVAL); } #else return (nosys(td, NULL)); #endif } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) int freebsd7_freebsd32_shmctl(struct thread *td, struct freebsd7_freebsd32_shmctl_args *uap) { int error; union { struct shmid_ds shmid_ds; struct shm_info shm_info; struct shminfo shminfo; } u; union { struct shmid_ds32_old shmid_ds32; struct shm_info32 shm_info32; struct shminfo32 shminfo32; } u32; size_t sz; if (uap->cmd == IPC_SET) { if ((error = copyin(uap->buf, &u32.shmid_ds32, 
sizeof(u32.shmid_ds32)))) goto done; freebsd32_ipcperm_old_in(&u32.shmid_ds32.shm_perm, &u.shmid_ds.shm_perm); CP(u32.shmid_ds32, u.shmid_ds, shm_segsz); CP(u32.shmid_ds32, u.shmid_ds, shm_lpid); CP(u32.shmid_ds32, u.shmid_ds, shm_cpid); CP(u32.shmid_ds32, u.shmid_ds, shm_nattch); CP(u32.shmid_ds32, u.shmid_ds, shm_atime); CP(u32.shmid_ds32, u.shmid_ds, shm_dtime); CP(u32.shmid_ds32, u.shmid_ds, shm_ctime); } error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&u, &sz); if (error) goto done; /* Cases in which we need to copyout */ switch (uap->cmd) { case IPC_INFO: CP(u.shminfo, u32.shminfo32, shmmax); CP(u.shminfo, u32.shminfo32, shmmin); CP(u.shminfo, u32.shminfo32, shmmni); CP(u.shminfo, u32.shminfo32, shmseg); CP(u.shminfo, u32.shminfo32, shmall); error = copyout(&u32.shminfo32, uap->buf, sizeof(u32.shminfo32)); break; case SHM_INFO: CP(u.shm_info, u32.shm_info32, used_ids); CP(u.shm_info, u32.shm_info32, shm_rss); CP(u.shm_info, u32.shm_info32, shm_tot); CP(u.shm_info, u32.shm_info32, shm_swp); CP(u.shm_info, u32.shm_info32, swap_attempts); CP(u.shm_info, u32.shm_info32, swap_successes); error = copyout(&u32.shm_info32, uap->buf, sizeof(u32.shm_info32)); break; case SHM_STAT: case IPC_STAT: freebsd32_ipcperm_old_out(&u.shmid_ds.shm_perm, &u32.shmid_ds32.shm_perm); if (u.shmid_ds.shm_segsz > INT32_MAX) u32.shmid_ds32.shm_segsz = INT32_MAX; else CP(u.shmid_ds, u32.shmid_ds32, shm_segsz); CP(u.shmid_ds, u32.shmid_ds32, shm_lpid); CP(u.shmid_ds, u32.shmid_ds32, shm_cpid); CP(u.shmid_ds, u32.shmid_ds32, shm_nattch); CP(u.shmid_ds, u32.shmid_ds32, shm_atime); CP(u.shmid_ds, u32.shmid_ds32, shm_dtime); CP(u.shmid_ds, u32.shmid_ds32, shm_ctime); u32.shmid_ds32.shm_internal = 0; error = copyout(&u32.shmid_ds32, uap->buf, sizeof(u32.shmid_ds32)); break; } done: if (error) { /* Invalidate the return value */ td->td_retval[0] = -1; } return (error); } #endif int freebsd32_shmctl(struct thread *td, struct freebsd32_shmctl_args *uap) { int error; union { struct shmid_ds shmid_ds; struct shm_info shm_info; struct shminfo shminfo; } u; union { struct shmid_ds32 shmid_ds32; struct shm_info32 shm_info32; struct shminfo32 shminfo32; } u32; size_t sz; if (uap->cmd == IPC_SET) { if ((error = copyin(uap->buf, &u32.shmid_ds32, sizeof(u32.shmid_ds32)))) goto done; freebsd32_ipcperm_in(&u32.shmid_ds32.shm_perm, &u.shmid_ds.shm_perm); CP(u32.shmid_ds32, u.shmid_ds, shm_segsz); CP(u32.shmid_ds32, u.shmid_ds, shm_lpid); CP(u32.shmid_ds32, u.shmid_ds, shm_cpid); CP(u32.shmid_ds32, u.shmid_ds, shm_nattch); CP(u32.shmid_ds32, u.shmid_ds, shm_atime); CP(u32.shmid_ds32, u.shmid_ds, shm_dtime); CP(u32.shmid_ds32, u.shmid_ds, shm_ctime); } error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&u, &sz); if (error) goto done; /* Cases in which we need to copyout */ switch (uap->cmd) { case IPC_INFO: CP(u.shminfo, u32.shminfo32, shmmax); CP(u.shminfo, u32.shminfo32, shmmin); CP(u.shminfo, u32.shminfo32, shmmni); CP(u.shminfo, u32.shminfo32, shmseg); CP(u.shminfo, u32.shminfo32, shmall); error = copyout(&u32.shminfo32, uap->buf, sizeof(u32.shminfo32)); break; case SHM_INFO: CP(u.shm_info, u32.shm_info32, used_ids); CP(u.shm_info, u32.shm_info32, shm_rss); CP(u.shm_info, u32.shm_info32, shm_tot); CP(u.shm_info, u32.shm_info32, shm_swp); CP(u.shm_info, u32.shm_info32, swap_attempts); CP(u.shm_info, u32.shm_info32, swap_successes); error = copyout(&u32.shm_info32, uap->buf, sizeof(u32.shm_info32)); break; case SHM_STAT: case IPC_STAT: freebsd32_ipcperm_out(&u.shmid_ds.shm_perm, &u32.shmid_ds32.shm_perm); if 
(u.shmid_ds.shm_segsz > INT32_MAX) u32.shmid_ds32.shm_segsz = INT32_MAX; else CP(u.shmid_ds, u32.shmid_ds32, shm_segsz); CP(u.shmid_ds, u32.shmid_ds32, shm_lpid); CP(u.shmid_ds, u32.shmid_ds32, shm_cpid); CP(u.shmid_ds, u32.shmid_ds32, shm_nattch); CP(u.shmid_ds, u32.shmid_ds32, shm_atime); CP(u.shmid_ds, u32.shmid_ds32, shm_dtime); CP(u.shmid_ds, u32.shmid_ds32, shm_ctime); error = copyout(&u32.shmid_ds32, uap->buf, sizeof(u32.shmid_ds32)); break; } done: if (error) { /* Invalidate the return value */ td->td_retval[0] = -1; } return (error); } #endif #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) #ifndef CP #define CP(src, dst, fld) do { (dst).fld = (src).fld; } while (0) #endif #ifndef _SYS_SYSPROTO_H_ struct freebsd7_shmctl_args { int shmid; int cmd; struct shmid_ds_old *buf; }; #endif int freebsd7_shmctl(struct thread *td, struct freebsd7_shmctl_args *uap) { int error; struct shmid_ds_old old; struct shmid_ds buf; size_t bufsz; /* * The only reason IPC_INFO, SHM_INFO, SHM_STAT exist is to support * Linux binaries. If we see the call come through the FreeBSD ABI, * return an error back to the user since we do not support this. */ if (uap->cmd == IPC_INFO || uap->cmd == SHM_INFO || uap->cmd == SHM_STAT) return (EINVAL); /* IPC_SET needs to copyin the buffer before calling kern_shmctl */ if (uap->cmd == IPC_SET) { if ((error = copyin(uap->buf, &old, sizeof(old)))) goto done; ipcperm_old2new(&old.shm_perm, &buf.shm_perm); CP(old, buf, shm_segsz); CP(old, buf, shm_lpid); CP(old, buf, shm_cpid); CP(old, buf, shm_nattch); CP(old, buf, shm_atime); CP(old, buf, shm_dtime); CP(old, buf, shm_ctime); } error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&buf, &bufsz); if (error) goto done; /* Cases in which we need to copyout */ switch (uap->cmd) { case IPC_STAT: ipcperm_new2old(&buf.shm_perm, &old.shm_perm); if (buf.shm_segsz > INT_MAX) old.shm_segsz = INT_MAX; else CP(buf, old, shm_segsz); CP(buf, old, shm_lpid); CP(buf, old, shm_cpid); if (buf.shm_nattch > SHRT_MAX) old.shm_nattch = SHRT_MAX; else CP(buf, old, shm_nattch); CP(buf, old, shm_atime); CP(buf, old, shm_dtime); CP(buf, old, shm_ctime); old.shm_internal = NULL; error = copyout(&old, uap->buf, sizeof(old)); break; } done: if (error) { /* Invalidate the return value */ td->td_retval[0] = -1; } return (error); } #endif /* COMPAT_FREEBSD4 || COMPAT_FREEBSD5 || COMPAT_FREEBSD6 || COMPAT_FREEBSD7 */ static int sysvshm_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: error = shminit(); if (error != 0) shmunload(); break; case MOD_UNLOAD: error = shmunload(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sysvshm_mod = { "sysvshm", &sysvshm_modload, NULL }; DECLARE_MODULE(sysvshm, sysvshm_mod, SI_SUB_SYSV_SHM, SI_ORDER_FIRST); MODULE_VERSION(sysvshm, 1); Index: head/sys/kern/tty.c =================================================================== --- head/sys/kern/tty.c (revision 326270) +++ head/sys/kern/tty.c (revision 326271) @@ -1,2347 +1,2349 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2008 Ed Schouten * All rights reserved. * * Portions of this software were developed under sponsorship from Snow * B.V., the Netherlands. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_compat.h" #include #include #include #include #include #include #include #include #ifdef COMPAT_43TTY #include #endif /* COMPAT_43TTY */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TTYDEFCHARS #include #undef TTYDEFCHARS #include #include #include static MALLOC_DEFINE(M_TTY, "tty", "tty device"); static void tty_rel_free(struct tty *tp); static TAILQ_HEAD(, tty) tty_list = TAILQ_HEAD_INITIALIZER(tty_list); static struct sx tty_list_sx; SX_SYSINIT(tty_list, &tty_list_sx, "tty list"); static unsigned int tty_list_count = 0; /* Character device of /dev/console. */ static struct cdev *dev_console; static const char *dev_console_filename; /* * Flags that are supported and stored by this implementation. */ #define TTYSUP_IFLAG (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK|ISTRIP|\ INLCR|IGNCR|ICRNL|IXON|IXOFF|IXANY|IMAXBEL) #define TTYSUP_OFLAG (OPOST|ONLCR|TAB3|ONOEOT|OCRNL|ONOCR|ONLRET) #define TTYSUP_LFLAG (ECHOKE|ECHOE|ECHOK|ECHO|ECHONL|ECHOPRT|\ ECHOCTL|ISIG|ICANON|ALTWERASE|IEXTEN|TOSTOP|\ FLUSHO|NOKERNINFO|NOFLSH) #define TTYSUP_CFLAG (CIGNORE|CSIZE|CSTOPB|CREAD|PARENB|PARODD|\ HUPCL|CLOCAL|CCTS_OFLOW|CRTS_IFLOW|CDTR_IFLOW|\ CDSR_OFLOW|CCAR_OFLOW) #define TTY_CALLOUT(tp,d) (dev2unit(d) & TTYUNIT_CALLOUT) static int tty_drainwait = 5 * 60; SYSCTL_INT(_kern, OID_AUTO, tty_drainwait, CTLFLAG_RWTUN, &tty_drainwait, 0, "Default output drain timeout in seconds"); /* * Set TTY buffer sizes. */ #define TTYBUF_MAX 65536 /* * Allocate buffer space if necessary, and set low watermarks, based on speed. * Note that the ttyxxxq_setsize() functions may drop and then reacquire the tty * lock during memory allocation. They will return ENXIO if the tty disappears * while unlocked. */ static int tty_watermarks(struct tty *tp) { size_t bs = 0; int error; /* Provide an input buffer for 2 seconds of data. */ if (tp->t_termios.c_cflag & CREAD) bs = MIN(tp->t_termios.c_ispeed / 5, TTYBUF_MAX); error = ttyinq_setsize(&tp->t_inq, tp, bs); if (error != 0) return (error); /* Set low watermark at 10% (when 90% is available). */ tp->t_inlow = (ttyinq_getallocatedsize(&tp->t_inq) * 9) / 10; /* Provide an output buffer for 2 seconds of data. 
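A note on the sizing rule used by tty_watermarks() here: with start and stop bits, N baud moves roughly N / 10 bytes per second, so two seconds of traffic is about N / 5 bytes, clamped at TTYBUF_MAX. A sketch of that arithmetic (speed_t comes from <termios.h>; the literal 65536 mirrors TTYBUF_MAX above):

#include <termios.h>
#include <stddef.h>

static size_t
two_seconds_of_data(speed_t baud)
{
        size_t bs = (size_t)baud / 5;           /* ~2 s at ~10 bits per byte */

        return (bs < 65536 ? bs : 65536);       /* TTYBUF_MAX clamp */
}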
*/ bs = MIN(tp->t_termios.c_ospeed / 5, TTYBUF_MAX); error = ttyoutq_setsize(&tp->t_outq, tp, bs); if (error != 0) return (error); /* Set low watermark at 10% (when 90% is available). */ tp->t_outlow = (ttyoutq_getallocatedsize(&tp->t_outq) * 9) / 10; return (0); } static int tty_drain(struct tty *tp, int leaving) { sbintime_t timeout_at; size_t bytes; int error; if (ttyhook_hashook(tp, getc_inject)) /* buffer is inaccessible */ return (0); /* * For close(), use the recent historic timeout of "1 second without * making progress". For tcdrain(), use t_drainwait as the timeout, * with zero meaning "no timeout" which gives POSIX behavior. */ if (leaving) timeout_at = getsbinuptime() + SBT_1S; else if (tp->t_drainwait != 0) timeout_at = getsbinuptime() + SBT_1S * tp->t_drainwait; else timeout_at = 0; /* * Poll the output buffer and the hardware for completion, at 10 Hz. * Polling is required for devices which are not able to signal an * interrupt when the transmitter becomes idle (most USB serial devs). * The unusual structure of this loop ensures we check for busy one more * time after tty_timedwait() returns EWOULDBLOCK, so that success has * higher priority than timeout if the IO completed in the last 100 ms. */ error = 0; bytes = ttyoutq_bytesused(&tp->t_outq); for (;;) { if (ttyoutq_bytesused(&tp->t_outq) == 0 && !ttydevsw_busy(tp)) return (0); if (error != 0) return (error); ttydevsw_outwakeup(tp); error = tty_timedwait(tp, &tp->t_outwait, hz / 10); if (error != 0 && error != EWOULDBLOCK) return (error); else if (timeout_at == 0 || getsbinuptime() < timeout_at) error = 0; else if (leaving && ttyoutq_bytesused(&tp->t_outq) < bytes) { /* In close, making progress, grant an extra second. */ error = 0; timeout_at += SBT_1S; bytes = ttyoutq_bytesused(&tp->t_outq); } } } /* * Though ttydev_enter() and ttydev_leave() seem to be related, they * don't have to be used together. ttydev_enter() is used by the cdev * operations to prevent an actual operation from being processed when * the TTY has been abandoned. ttydev_leave() is used by ttydev_open() * and ttydev_close() to determine whether per-TTY data should be * deallocated. */ static __inline int ttydev_enter(struct tty *tp) { tty_lock(tp); if (tty_gone(tp) || !tty_opened(tp)) { /* Device is already gone. */ tty_unlock(tp); return (ENXIO); } return (0); } static void ttydev_leave(struct tty *tp) { tty_lock_assert(tp, MA_OWNED); if (tty_opened(tp) || tp->t_flags & TF_OPENCLOSE) { /* Device is still opened somewhere. */ tty_unlock(tp); return; } tp->t_flags |= TF_OPENCLOSE; /* Stop asynchronous I/O. */ funsetown(&tp->t_sigio); /* Remove console TTY. */ if (constty == tp) constty_clear(); /* Drain any output. */ if (!tty_gone(tp)) tty_drain(tp, 1); ttydisc_close(tp); /* Free i/o queues now since they might be large. */ ttyinq_free(&tp->t_inq); tp->t_inlow = 0; ttyoutq_free(&tp->t_outq); tp->t_outlow = 0; knlist_clear(&tp->t_inpoll.si_note, 1); knlist_clear(&tp->t_outpoll.si_note, 1); if (!tty_gone(tp)) ttydevsw_close(tp); tp->t_flags &= ~TF_OPENCLOSE; cv_broadcast(&tp->t_dcdwait); tty_rel_free(tp); } /* * Operations that are exposed through the character device in /dev. */ static int ttydev_open(struct cdev *dev, int oflags, int devtype __unused, struct thread *td) { struct tty *tp; int error; tp = dev->si_drv1; error = 0; tty_lock(tp); if (tty_gone(tp)) { /* Device is already gone. */ tty_unlock(tp); return (ENXIO); } /* * Block when other processes are currently opening or closing * the TTY. 
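From userland, tcdrain() ends up in tty_drain(tp, 0), so the kern.tty_drainwait default of five minutes bounds how long a write-then-drain sequence can hang on a wedged port, and setting it to zero restores the unbounded POSIX wait. A hedged usage sketch (fd is assumed to be an already opened serial device):

#include <termios.h>
#include <unistd.h>
#include <stddef.h>

int
flush_and_wait(int fd, const char *buf, size_t len)
{
        if (write(fd, buf, len) == -1)
                return (-1);
        /* Blocks until the output queue and hardware go idle, or fails
         * once the drain timeout expires without completion. */
        return (tcdrain(fd));
}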
*/ while (tp->t_flags & TF_OPENCLOSE) { error = tty_wait(tp, &tp->t_dcdwait); if (error != 0) { tty_unlock(tp); return (error); } } tp->t_flags |= TF_OPENCLOSE; /* * Make sure the "tty" and "cua" device cannot be opened at the * same time. The console is a "tty" device. */ if (TTY_CALLOUT(tp, dev)) { if (tp->t_flags & (TF_OPENED_CONS | TF_OPENED_IN)) { error = EBUSY; goto done; } } else { if (tp->t_flags & TF_OPENED_OUT) { error = EBUSY; goto done; } } if (tp->t_flags & TF_EXCLUDE && priv_check(td, PRIV_TTY_EXCLUSIVE)) { error = EBUSY; goto done; } if (!tty_opened(tp)) { /* Set proper termios flags. */ if (TTY_CALLOUT(tp, dev)) tp->t_termios = tp->t_termios_init_out; else tp->t_termios = tp->t_termios_init_in; ttydevsw_param(tp, &tp->t_termios); /* Prevent modem control on callout devices and /dev/console. */ if (TTY_CALLOUT(tp, dev) || dev == dev_console) tp->t_termios.c_cflag |= CLOCAL; ttydevsw_modem(tp, SER_DTR|SER_RTS, 0); error = ttydevsw_open(tp); if (error != 0) goto done; ttydisc_open(tp); error = tty_watermarks(tp); if (error != 0) goto done; } /* Wait for Carrier Detect. */ if ((oflags & O_NONBLOCK) == 0 && (tp->t_termios.c_cflag & CLOCAL) == 0) { while ((ttydevsw_modem(tp, 0, 0) & SER_DCD) == 0) { error = tty_wait(tp, &tp->t_dcdwait); if (error != 0) goto done; } } if (dev == dev_console) tp->t_flags |= TF_OPENED_CONS; else if (TTY_CALLOUT(tp, dev)) tp->t_flags |= TF_OPENED_OUT; else tp->t_flags |= TF_OPENED_IN; MPASS((tp->t_flags & (TF_OPENED_CONS | TF_OPENED_IN)) == 0 || (tp->t_flags & TF_OPENED_OUT) == 0); done: tp->t_flags &= ~TF_OPENCLOSE; cv_broadcast(&tp->t_dcdwait); ttydev_leave(tp); return (error); } static int ttydev_close(struct cdev *dev, int fflag, int devtype __unused, struct thread *td __unused) { struct tty *tp = dev->si_drv1; tty_lock(tp); /* * Don't actually close the device if it is being used as the * console. */ MPASS((tp->t_flags & (TF_OPENED_CONS | TF_OPENED_IN)) == 0 || (tp->t_flags & TF_OPENED_OUT) == 0); if (dev == dev_console) tp->t_flags &= ~TF_OPENED_CONS; else tp->t_flags &= ~(TF_OPENED_IN|TF_OPENED_OUT); if (tp->t_flags & TF_OPENED) { tty_unlock(tp); return (0); } /* If revoking, flush output now to avoid draining it later. */ if (fflag & FREVOKE) tty_flush(tp, FWRITE); tp->t_flags &= ~TF_EXCLUDE; /* Properly wake up threads that are stuck - revoke(). */ tp->t_revokecnt++; tty_wakeup(tp, FREAD|FWRITE); cv_broadcast(&tp->t_bgwait); cv_broadcast(&tp->t_dcdwait); ttydev_leave(tp); return (0); } static __inline int tty_is_ctty(struct tty *tp, struct proc *p) { tty_lock_assert(tp, MA_OWNED); return (p->p_session == tp->t_session && p->p_flag & P_CONTROLT); } int tty_wait_background(struct tty *tp, struct thread *td, int sig) { struct proc *p = td->td_proc; struct pgrp *pg; ksiginfo_t ksi; int error; MPASS(sig == SIGTTIN || sig == SIGTTOU); tty_lock_assert(tp, MA_OWNED); for (;;) { PROC_LOCK(p); /* * The process should only sleep, when: * - This terminal is the controlling terminal * - Its process group is not the foreground process * group * - The parent process isn't waiting for the child to * exit * - the signal to send to the process isn't masked */ if (!tty_is_ctty(tp, p) || p->p_pgrp == tp->t_pgrp) { /* Allow the action to happen. */ PROC_UNLOCK(p); return (0); } if (SIGISMEMBER(p->p_sigacts->ps_sigignore, sig) || SIGISMEMBER(td->td_sigmask, sig)) { /* Only allow them in write()/ioctl(). */ PROC_UNLOCK(p); return (sig == SIGTTOU ? 0 : EIO); } pg = p->p_pgrp; if (p->p_flag & P_PPWAIT || pg->pg_jobc == 0) { /* Don't allow the action to happen. 
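tty_wait_background() above is the kernel half of job control stops: a background process touching the terminal gets SIGTTIN or SIGTTOU and sleeps until its group becomes the foreground group. The userland-visible condition is just a process-group comparison, sketched here with POSIX calls (fd is assumed to be the controlling terminal):

#include <unistd.h>

static int
in_foreground(int fd)
{
        /* Mirrors the p_pgrp == tp->t_pgrp check in tty_wait_background(). */
        return (tcgetpgrp(fd) == getpgrp());
}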
*/ PROC_UNLOCK(p); return (EIO); } PROC_UNLOCK(p); /* * Send the signal and sleep until we're the new * foreground process group. */ if (sig != 0) { ksiginfo_init(&ksi); ksi.ksi_code = SI_KERNEL; ksi.ksi_signo = sig; sig = 0; } PGRP_LOCK(pg); pgsignal(pg, ksi.ksi_signo, 1, &ksi); PGRP_UNLOCK(pg); error = tty_wait(tp, &tp->t_bgwait); if (error) return (error); } } static int ttydev_read(struct cdev *dev, struct uio *uio, int ioflag) { struct tty *tp = dev->si_drv1; int error; error = ttydev_enter(tp); if (error) goto done; error = ttydisc_read(tp, uio, ioflag); tty_unlock(tp); /* * The read() call should not throw an error when the device is * being destroyed. Silently convert it to an EOF. */ done: if (error == ENXIO) error = 0; return (error); } static int ttydev_write(struct cdev *dev, struct uio *uio, int ioflag) { struct tty *tp = dev->si_drv1; int error; error = ttydev_enter(tp); if (error) return (error); if (tp->t_termios.c_lflag & TOSTOP) { error = tty_wait_background(tp, curthread, SIGTTOU); if (error) goto done; } if (ioflag & IO_NDELAY && tp->t_flags & TF_BUSY_OUT) { /* Allow non-blocking writes to bypass serialization. */ error = ttydisc_write(tp, uio, ioflag); } else { /* Serialize write() calls. */ while (tp->t_flags & TF_BUSY_OUT) { error = tty_wait(tp, &tp->t_outserwait); if (error) goto done; } tp->t_flags |= TF_BUSY_OUT; error = ttydisc_write(tp, uio, ioflag); tp->t_flags &= ~TF_BUSY_OUT; cv_signal(&tp->t_outserwait); } done: tty_unlock(tp); return (error); } static int ttydev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct tty *tp = dev->si_drv1; int error; error = ttydev_enter(tp); if (error) return (error); switch (cmd) { case TIOCCBRK: case TIOCCONS: case TIOCDRAIN: case TIOCEXCL: case TIOCFLUSH: case TIOCNXCL: case TIOCSBRK: case TIOCSCTTY: case TIOCSETA: case TIOCSETAF: case TIOCSETAW: case TIOCSPGRP: case TIOCSTART: case TIOCSTAT: case TIOCSTI: case TIOCSTOP: case TIOCSWINSZ: #if 0 case TIOCSDRAINWAIT: case TIOCSETD: #endif #ifdef COMPAT_43TTY case TIOCLBIC: case TIOCLBIS: case TIOCLSET: case TIOCSETC: case OTIOCSETD: case TIOCSETN: case TIOCSETP: case TIOCSLTC: #endif /* COMPAT_43TTY */ /* * If the ioctl() causes the TTY to be modified, let it * wait in the background. */ error = tty_wait_background(tp, curthread, SIGTTOU); if (error) goto done; } if (cmd == TIOCSETA || cmd == TIOCSETAW || cmd == TIOCSETAF) { struct termios *old = &tp->t_termios; struct termios *new = (struct termios *)data; struct termios *lock = TTY_CALLOUT(tp, dev) ? &tp->t_termios_lock_out : &tp->t_termios_lock_in; int cc; /* * Lock state devices. Just overwrite the values of the * commands that are currently in use. 
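The lock-state merge applied just below keeps every termios bit that is set in the lock device: new = (old & lock) | (new & ~lock), repeated for each flag word, each control character, and both speeds. Factored out as a helper for clarity (tcflag_t from <termios.h>; the function name is ours):

#include <termios.h>

static tcflag_t
merge_locked(tcflag_t oldf, tcflag_t newf, tcflag_t lock)
{
        /* Locked bits keep their old value; unlocked bits take the new one. */
        return ((oldf & lock) | (newf & ~lock));
}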
*/ new->c_iflag = (old->c_iflag & lock->c_iflag) | (new->c_iflag & ~lock->c_iflag); new->c_oflag = (old->c_oflag & lock->c_oflag) | (new->c_oflag & ~lock->c_oflag); new->c_cflag = (old->c_cflag & lock->c_cflag) | (new->c_cflag & ~lock->c_cflag); new->c_lflag = (old->c_lflag & lock->c_lflag) | (new->c_lflag & ~lock->c_lflag); for (cc = 0; cc < NCCS; ++cc) if (lock->c_cc[cc]) new->c_cc[cc] = old->c_cc[cc]; if (lock->c_ispeed) new->c_ispeed = old->c_ispeed; if (lock->c_ospeed) new->c_ospeed = old->c_ospeed; } error = tty_ioctl(tp, cmd, data, fflag, td); done: tty_unlock(tp); return (error); } static int ttydev_poll(struct cdev *dev, int events, struct thread *td) { struct tty *tp = dev->si_drv1; int error, revents = 0; error = ttydev_enter(tp); if (error) return ((events & (POLLIN|POLLRDNORM)) | POLLHUP); if (events & (POLLIN|POLLRDNORM)) { /* See if we can read something. */ if (ttydisc_read_poll(tp) > 0) revents |= events & (POLLIN|POLLRDNORM); } if (tp->t_flags & TF_ZOMBIE) { /* Hangup flag on zombie state. */ revents |= POLLHUP; } else if (events & (POLLOUT|POLLWRNORM)) { /* See if we can write something. */ if (ttydisc_write_poll(tp) > 0) revents |= events & (POLLOUT|POLLWRNORM); } if (revents == 0) { if (events & (POLLIN|POLLRDNORM)) selrecord(td, &tp->t_inpoll); if (events & (POLLOUT|POLLWRNORM)) selrecord(td, &tp->t_outpoll); } tty_unlock(tp); return (revents); } static int ttydev_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, int nprot, vm_memattr_t *memattr) { struct tty *tp = dev->si_drv1; int error; /* Handle mmap() through the driver. */ error = ttydev_enter(tp); if (error) return (-1); error = ttydevsw_mmap(tp, offset, paddr, nprot, memattr); tty_unlock(tp); return (error); } /* * kqueue support. */ static void tty_kqops_read_detach(struct knote *kn) { struct tty *tp = kn->kn_hook; knlist_remove(&tp->t_inpoll.si_note, kn, 0); } static int tty_kqops_read_event(struct knote *kn, long hint __unused) { struct tty *tp = kn->kn_hook; tty_lock_assert(tp, MA_OWNED); if (tty_gone(tp) || tp->t_flags & TF_ZOMBIE) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_data = ttydisc_read_poll(tp); return (kn->kn_data > 0); } } static void tty_kqops_write_detach(struct knote *kn) { struct tty *tp = kn->kn_hook; knlist_remove(&tp->t_outpoll.si_note, kn, 0); } static int tty_kqops_write_event(struct knote *kn, long hint __unused) { struct tty *tp = kn->kn_hook; tty_lock_assert(tp, MA_OWNED); if (tty_gone(tp)) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_data = ttydisc_write_poll(tp); return (kn->kn_data > 0); } } static struct filterops tty_kqops_read = { .f_isfd = 1, .f_detach = tty_kqops_read_detach, .f_event = tty_kqops_read_event, }; static struct filterops tty_kqops_write = { .f_isfd = 1, .f_detach = tty_kqops_write_detach, .f_event = tty_kqops_write_event, }; static int ttydev_kqfilter(struct cdev *dev, struct knote *kn) { struct tty *tp = dev->si_drv1; int error; error = ttydev_enter(tp); if (error) return (error); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_hook = tp; kn->kn_fop = &tty_kqops_read; knlist_add(&tp->t_inpoll.si_note, kn, 1); break; case EVFILT_WRITE: kn->kn_hook = tp; kn->kn_fop = &tty_kqops_write; knlist_add(&tp->t_outpoll.si_note, kn, 1); break; default: error = EINVAL; break; } tty_unlock(tp); return (error); } static struct cdevsw ttydev_cdevsw = { .d_version = D_VERSION, .d_open = ttydev_open, .d_close = ttydev_close, .d_read = ttydev_read, .d_write = ttydev_write, .d_ioctl = ttydev_ioctl, .d_kqfilter = ttydev_kqfilter, .d_poll = 
ttydev_poll, .d_mmap = ttydev_mmap, .d_name = "ttydev", .d_flags = D_TTY, }; /* * Init/lock-state devices */ static int ttyil_open(struct cdev *dev, int oflags __unused, int devtype __unused, struct thread *td) { struct tty *tp; int error; tp = dev->si_drv1; error = 0; tty_lock(tp); if (tty_gone(tp)) error = ENODEV; tty_unlock(tp); return (error); } static int ttyil_close(struct cdev *dev __unused, int flag __unused, int mode __unused, struct thread *td __unused) { return (0); } static int ttyil_rdwr(struct cdev *dev __unused, struct uio *uio __unused, int ioflag __unused) { return (ENODEV); } static int ttyil_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct tty *tp = dev->si_drv1; int error; tty_lock(tp); if (tty_gone(tp)) { error = ENODEV; goto done; } error = ttydevsw_cioctl(tp, dev2unit(dev), cmd, data, td); if (error != ENOIOCTL) goto done; error = 0; switch (cmd) { case TIOCGETA: /* Obtain terminal flags through tcgetattr(). */ *(struct termios*)data = *(struct termios*)dev->si_drv2; break; case TIOCSETA: /* Set terminal flags through tcsetattr(). */ error = priv_check(td, PRIV_TTY_SETA); if (error) break; *(struct termios*)dev->si_drv2 = *(struct termios*)data; break; case TIOCGETD: *(int *)data = TTYDISC; break; case TIOCGWINSZ: bzero(data, sizeof(struct winsize)); break; default: error = ENOTTY; } done: tty_unlock(tp); return (error); } static struct cdevsw ttyil_cdevsw = { .d_version = D_VERSION, .d_open = ttyil_open, .d_close = ttyil_close, .d_read = ttyil_rdwr, .d_write = ttyil_rdwr, .d_ioctl = ttyil_ioctl, .d_name = "ttyil", .d_flags = D_TTY, }; static void tty_init_termios(struct tty *tp) { struct termios *t = &tp->t_termios_init_in; t->c_cflag = TTYDEF_CFLAG; t->c_iflag = TTYDEF_IFLAG; t->c_lflag = TTYDEF_LFLAG; t->c_oflag = TTYDEF_OFLAG; t->c_ispeed = TTYDEF_SPEED; t->c_ospeed = TTYDEF_SPEED; memcpy(&t->c_cc, ttydefchars, sizeof ttydefchars); tp->t_termios_init_out = *t; } void tty_init_console(struct tty *tp, speed_t s) { struct termios *ti = &tp->t_termios_init_in; struct termios *to = &tp->t_termios_init_out; if (s != 0) { ti->c_ispeed = ti->c_ospeed = s; to->c_ispeed = to->c_ospeed = s; } ti->c_cflag |= CLOCAL; to->c_cflag |= CLOCAL; } /* * Standard device routine implementations, mostly meant for * pseudo-terminal device drivers. When a driver creates a new terminal * device class, missing routines are patched. */ static int ttydevsw_defopen(struct tty *tp __unused) { return (0); } static void ttydevsw_defclose(struct tty *tp __unused) { } static void ttydevsw_defoutwakeup(struct tty *tp __unused) { panic("Terminal device has output, while not implemented"); } static void ttydevsw_definwakeup(struct tty *tp __unused) { } static int ttydevsw_defioctl(struct tty *tp __unused, u_long cmd __unused, caddr_t data __unused, struct thread *td __unused) { return (ENOIOCTL); } static int ttydevsw_defcioctl(struct tty *tp __unused, int unit __unused, u_long cmd __unused, caddr_t data __unused, struct thread *td __unused) { return (ENOIOCTL); } static int ttydevsw_defparam(struct tty *tp __unused, struct termios *t) { /* * Allow the baud rate to be adjusted for pseudo-devices, but at * least restrict it to 115200 to prevent excessive buffer * usage. Also disallow 0, to prevent foot shooting. 
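 */

/*
 * Userspace sketch (not part of this file): waiting for terminal input
 * with kqueue(2) exercises the EVFILT_READ filter installed by
 * ttydev_kqfilter() above; kn_data reports ttydisc_read_poll().
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct kevent kev;
	int kq = kqueue();

	if (kq == -1)
		return (1);
	EV_SET(&kev, STDIN_FILENO, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		return (1);
	if (kevent(kq, NULL, 0, &kev, 1, NULL) == 1)
		printf("%jd bytes readable%s\n", (intmax_t)kev.data,
		    (kev.flags & EV_EOF) ? " (EOF)" : "");
	return (0);
}

/*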
*/ if (t->c_ispeed < B50) t->c_ispeed = B50; else if (t->c_ispeed > B115200) t->c_ispeed = B115200; if (t->c_ospeed < B50) t->c_ospeed = B50; else if (t->c_ospeed > B115200) t->c_ospeed = B115200; t->c_cflag |= CREAD; return (0); } static int ttydevsw_defmodem(struct tty *tp __unused, int sigon __unused, int sigoff __unused) { /* Simulate a carrier to make the TTY layer happy. */ return (SER_DCD); } static int ttydevsw_defmmap(struct tty *tp __unused, vm_ooffset_t offset __unused, vm_paddr_t *paddr __unused, int nprot __unused, vm_memattr_t *memattr __unused) { return (-1); } static void ttydevsw_defpktnotify(struct tty *tp __unused, char event __unused) { } static void ttydevsw_deffree(void *softc __unused) { panic("Terminal device freed without a free-handler"); } static bool ttydevsw_defbusy(struct tty *tp __unused) { return (FALSE); } /* * TTY allocation and deallocation. TTY devices can be deallocated when * the driver doesn't use it anymore, when the TTY isn't a session's * controlling TTY and when the device node isn't opened through devfs. */ struct tty * tty_alloc(struct ttydevsw *tsw, void *sc) { return (tty_alloc_mutex(tsw, sc, NULL)); } struct tty * tty_alloc_mutex(struct ttydevsw *tsw, void *sc, struct mtx *mutex) { struct tty *tp; /* Make sure the driver defines all routines. */ #define PATCH_FUNC(x) do { \ if (tsw->tsw_ ## x == NULL) \ tsw->tsw_ ## x = ttydevsw_def ## x; \ } while (0) PATCH_FUNC(open); PATCH_FUNC(close); PATCH_FUNC(outwakeup); PATCH_FUNC(inwakeup); PATCH_FUNC(ioctl); PATCH_FUNC(cioctl); PATCH_FUNC(param); PATCH_FUNC(modem); PATCH_FUNC(mmap); PATCH_FUNC(pktnotify); PATCH_FUNC(free); PATCH_FUNC(busy); #undef PATCH_FUNC tp = malloc(sizeof(struct tty), M_TTY, M_WAITOK|M_ZERO); tp->t_devsw = tsw; tp->t_devswsoftc = sc; tp->t_flags = tsw->tsw_flags; tp->t_drainwait = tty_drainwait; tty_init_termios(tp); cv_init(&tp->t_inwait, "ttyin"); cv_init(&tp->t_outwait, "ttyout"); cv_init(&tp->t_outserwait, "ttyosr"); cv_init(&tp->t_bgwait, "ttybg"); cv_init(&tp->t_dcdwait, "ttydcd"); /* Allow drivers to use a custom mutex to lock the TTY. */ if (mutex != NULL) { tp->t_mtx = mutex; } else { tp->t_mtx = &tp->t_mtxobj; mtx_init(&tp->t_mtxobj, "ttymtx", NULL, MTX_DEF); } knlist_init_mtx(&tp->t_inpoll.si_note, tp->t_mtx); knlist_init_mtx(&tp->t_outpoll.si_note, tp->t_mtx); return (tp); } static void tty_dealloc(void *arg) { struct tty *tp = arg; /* * ttyydev_leave() usually frees the i/o queues earlier, but it is * not always called between queue allocation and here. The queues * may be allocated by ioctls on a pty control device without the * corresponding pty slave device ever being open, or after it is * closed. */ ttyinq_free(&tp->t_inq); ttyoutq_free(&tp->t_outq); seldrain(&tp->t_inpoll); seldrain(&tp->t_outpoll); knlist_destroy(&tp->t_inpoll.si_note); knlist_destroy(&tp->t_outpoll.si_note); cv_destroy(&tp->t_inwait); cv_destroy(&tp->t_outwait); cv_destroy(&tp->t_bgwait); cv_destroy(&tp->t_dcdwait); cv_destroy(&tp->t_outserwait); if (tp->t_mtx == &tp->t_mtxobj) mtx_destroy(&tp->t_mtxobj); ttydevsw_free(tp); free(tp, M_TTY); } static void tty_rel_free(struct tty *tp) { struct cdev *dev; tty_lock_assert(tp, MA_OWNED); #define TF_ACTIVITY (TF_GONE|TF_OPENED|TF_HOOK|TF_OPENCLOSE) if (tp->t_sessioncnt != 0 || (tp->t_flags & TF_ACTIVITY) != TF_GONE) { /* TTY is still in use. */ tty_unlock(tp); return; } /* TTY can be deallocated. 
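 */

/*
 * Driver-side sketch (not part of this file; all "mydev_" names are
 * hypothetical): a minimal ttydevsw only fills in the hooks it cares
 * about, since tty_alloc_mutex() patches the missing ones with the
 * ttydevsw_def*() routines above.  tsw_free is effectively mandatory,
 * because the default free-handler panics.
 */
#include <sys/param.h>
#include <sys/tty.h>
#include <sys/ttydevsw.h>
#include <sys/ttydisc.h>

static void
mydev_outwakeup(struct tty *tp)
{
	char c;

	/* Drain the output queue; a real driver would transmit here. */
	while (ttydisc_getc(tp, &c, 1) == 1)
		;
}

static void
mydev_free(void *softc __unused)
{
	/* Mandatory: ttydevsw_deffree() panics, as shown above. */
}

static struct ttydevsw mydev_ttydevsw = {
	.tsw_flags	= TF_INITLOCK | TF_CALLOUT,
	.tsw_outwakeup	= mydev_outwakeup,
	.tsw_free	= mydev_free,
};

static void
mydev_attach(void *softc, int unit)
{
	struct tty *tp;

	tp = tty_alloc(&mydev_ttydevsw, softc);
	/* Creates ttymy0{,.init,.lock} and cuamy0{,.init,.lock}. */
	tty_makedev(tp, NULL, "my%d", unit);
}

/*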
*/ dev = tp->t_dev; tp->t_dev = NULL; tty_unlock(tp); if (dev != NULL) { sx_xlock(&tty_list_sx); TAILQ_REMOVE(&tty_list, tp, t_list); tty_list_count--; sx_xunlock(&tty_list_sx); destroy_dev_sched_cb(dev, tty_dealloc, tp); } } void tty_rel_pgrp(struct tty *tp, struct pgrp *pg) { MPASS(tp->t_sessioncnt > 0); tty_lock_assert(tp, MA_OWNED); if (tp->t_pgrp == pg) tp->t_pgrp = NULL; tty_unlock(tp); } void tty_rel_sess(struct tty *tp, struct session *sess) { MPASS(tp->t_sessioncnt > 0); /* Current session has left. */ if (tp->t_session == sess) { tp->t_session = NULL; MPASS(tp->t_pgrp == NULL); } tp->t_sessioncnt--; tty_rel_free(tp); } void tty_rel_gone(struct tty *tp) { MPASS(!tty_gone(tp)); /* Simulate carrier removal. */ ttydisc_modem(tp, 0); /* Wake up all blocked threads. */ tty_wakeup(tp, FREAD|FWRITE); cv_broadcast(&tp->t_bgwait); cv_broadcast(&tp->t_dcdwait); tp->t_flags |= TF_GONE; tty_rel_free(tp); } /* * Exposing information about current TTY's through sysctl */ static void tty_to_xtty(struct tty *tp, struct xtty *xt) { tty_lock_assert(tp, MA_OWNED); xt->xt_size = sizeof(struct xtty); xt->xt_insize = ttyinq_getsize(&tp->t_inq); xt->xt_incc = ttyinq_bytescanonicalized(&tp->t_inq); xt->xt_inlc = ttyinq_bytesline(&tp->t_inq); xt->xt_inlow = tp->t_inlow; xt->xt_outsize = ttyoutq_getsize(&tp->t_outq); xt->xt_outcc = ttyoutq_bytesused(&tp->t_outq); xt->xt_outlow = tp->t_outlow; xt->xt_column = tp->t_column; xt->xt_pgid = tp->t_pgrp ? tp->t_pgrp->pg_id : 0; xt->xt_sid = tp->t_session ? tp->t_session->s_sid : 0; xt->xt_flags = tp->t_flags; xt->xt_dev = tp->t_dev ? dev2udev(tp->t_dev) : (uint32_t)NODEV; } static int sysctl_kern_ttys(SYSCTL_HANDLER_ARGS) { unsigned long lsize; struct xtty *xtlist, *xt; struct tty *tp; int error; sx_slock(&tty_list_sx); lsize = tty_list_count * sizeof(struct xtty); if (lsize == 0) { sx_sunlock(&tty_list_sx); return (0); } xtlist = xt = malloc(lsize, M_TTY, M_WAITOK); TAILQ_FOREACH(tp, &tty_list, t_list) { tty_lock(tp); tty_to_xtty(tp, xt); tty_unlock(tp); xt++; } sx_sunlock(&tty_list_sx); error = SYSCTL_OUT(req, xtlist, lsize); free(xtlist, M_TTY); return (error); } SYSCTL_PROC(_kern, OID_AUTO, ttys, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE, 0, 0, sysctl_kern_ttys, "S,xtty", "List of TTYs"); /* * Device node creation. Device has been set up, now we can expose it to * the user. */ int tty_makedevf(struct tty *tp, struct ucred *cred, int flags, const char *fmt, ...) { va_list ap; struct make_dev_args args; struct cdev *dev, *init, *lock, *cua, *cinit, *clock; const char *prefix = "tty"; char name[SPECNAMELEN - 3]; /* for "tty" and "cua". */ uid_t uid; gid_t gid; mode_t mode; int error; /* Remove "tty" prefix from devices like PTY's. */ if (tp->t_flags & TF_NOPREFIX) prefix = ""; va_start(ap, fmt); vsnrprintf(name, sizeof name, 32, fmt, ap); va_end(ap); if (cred == NULL) { /* System device. */ uid = UID_ROOT; gid = GID_WHEEL; mode = S_IRUSR|S_IWUSR; } else { /* User device. */ uid = cred->cr_ruid; gid = GID_TTY; mode = S_IRUSR|S_IWUSR|S_IWGRP; } flags = flags & TTYMK_CLONING ? MAKEDEV_REF : 0; flags |= MAKEDEV_CHECKNAME; /* Master call-in device. */ make_dev_args_init(&args); args.mda_flags = flags; args.mda_devsw = &ttydev_cdevsw; args.mda_cr = cred; args.mda_uid = uid; args.mda_gid = gid; args.mda_mode = mode; args.mda_si_drv1 = tp; error = make_dev_s(&args, &dev, "%s%s", prefix, name); if (error != 0) return (error); tp->t_dev = dev; init = lock = cua = cinit = clock = NULL; /* Slave call-in devices. 
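 */

/*
 * Userspace sketch (not part of this file): reading the "kern.ttys"
 * list exported by sysctl_kern_ttys() above, assuming struct xtty is
 * visible through <sys/tty.h> the way pstat(8) uses it.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/tty.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	struct xtty *xt;
	size_t i, len;

	if (sysctlbyname("kern.ttys", NULL, &len, NULL, 0) == -1)
		return (1);
	if ((xt = malloc(len)) == NULL)
		return (1);
	if (sysctlbyname("kern.ttys", xt, &len, NULL, 0) == -1)
		return (1);
	for (i = 0; i < len / sizeof(*xt); i++)
		printf("sid %d pgid %d flags %#x\n", (int)xt[i].xt_sid,
		    (int)xt[i].xt_pgid, (unsigned)xt[i].xt_flags);
	free(xt);
	return (0);
}

/*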
*/ if (tp->t_flags & TF_INITLOCK) { args.mda_devsw = &ttyil_cdevsw; args.mda_unit = TTYUNIT_INIT; args.mda_si_drv1 = tp; args.mda_si_drv2 = &tp->t_termios_init_in; error = make_dev_s(&args, &init, "%s%s.init", prefix, name); if (error != 0) goto fail; dev_depends(dev, init); args.mda_unit = TTYUNIT_LOCK; args.mda_si_drv2 = &tp->t_termios_lock_in; error = make_dev_s(&args, &lock, "%s%s.lock", prefix, name); if (error != 0) goto fail; dev_depends(dev, lock); } /* Call-out devices. */ if (tp->t_flags & TF_CALLOUT) { make_dev_args_init(&args); args.mda_flags = flags; args.mda_devsw = &ttydev_cdevsw; args.mda_cr = cred; args.mda_uid = UID_UUCP; args.mda_gid = GID_DIALER; args.mda_mode = 0660; args.mda_unit = TTYUNIT_CALLOUT; args.mda_si_drv1 = tp; error = make_dev_s(&args, &cua, "cua%s", name); if (error != 0) goto fail; dev_depends(dev, cua); /* Slave call-out devices. */ if (tp->t_flags & TF_INITLOCK) { args.mda_devsw = &ttyil_cdevsw; args.mda_unit = TTYUNIT_CALLOUT | TTYUNIT_INIT; args.mda_si_drv2 = &tp->t_termios_init_out; error = make_dev_s(&args, &cinit, "cua%s.init", name); if (error != 0) goto fail; dev_depends(dev, cinit); args.mda_unit = TTYUNIT_CALLOUT | TTYUNIT_LOCK; args.mda_si_drv2 = &tp->t_termios_lock_out; error = make_dev_s(&args, &clock, "cua%s.lock", name); if (error != 0) goto fail; dev_depends(dev, clock); } } sx_xlock(&tty_list_sx); TAILQ_INSERT_TAIL(&tty_list, tp, t_list); tty_list_count++; sx_xunlock(&tty_list_sx); return (0); fail: destroy_dev(dev); if (init) destroy_dev(init); if (lock) destroy_dev(lock); if (cinit) destroy_dev(cinit); if (clock) destroy_dev(clock); return (error); } /* * Signalling processes. */ void tty_signal_sessleader(struct tty *tp, int sig) { struct proc *p; tty_lock_assert(tp, MA_OWNED); MPASS(sig >= 1 && sig < NSIG); /* Make signals start output again. */ tp->t_flags &= ~TF_STOPPED; if (tp->t_session != NULL && tp->t_session->s_leader != NULL) { p = tp->t_session->s_leader; PROC_LOCK(p); kern_psignal(p, sig); PROC_UNLOCK(p); } } void tty_signal_pgrp(struct tty *tp, int sig) { ksiginfo_t ksi; tty_lock_assert(tp, MA_OWNED); MPASS(sig >= 1 && sig < NSIG); /* Make signals start output again. */ tp->t_flags &= ~TF_STOPPED; if (sig == SIGINFO && !(tp->t_termios.c_lflag & NOKERNINFO)) tty_info(tp); if (tp->t_pgrp != NULL) { ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; PGRP_LOCK(tp->t_pgrp); pgsignal(tp->t_pgrp, sig, 1, &ksi); PGRP_UNLOCK(tp->t_pgrp); } } void tty_wakeup(struct tty *tp, int flags) { if (tp->t_flags & TF_ASYNC && tp->t_sigio != NULL) pgsigio(&tp->t_sigio, SIGIO, (tp->t_session != NULL)); if (flags & FWRITE) { cv_broadcast(&tp->t_outwait); selwakeup(&tp->t_outpoll); KNOTE_LOCKED(&tp->t_outpoll.si_note, 0); } if (flags & FREAD) { cv_broadcast(&tp->t_inwait); selwakeup(&tp->t_inpoll); KNOTE_LOCKED(&tp->t_inpoll.si_note, 0); } } int tty_wait(struct tty *tp, struct cv *cv) { int error; int revokecnt = tp->t_revokecnt; tty_lock_assert(tp, MA_OWNED|MA_NOTRECURSED); MPASS(!tty_gone(tp)); error = cv_wait_sig(cv, tp->t_mtx); /* Bail out when the device slipped away. */ if (tty_gone(tp)) return (ENXIO); /* Restart the system call when we may have been revoked. */ if (tp->t_revokecnt != revokecnt) return (ERESTART); return (error); } int tty_timedwait(struct tty *tp, struct cv *cv, int hz) { int error; int revokecnt = tp->t_revokecnt; tty_lock_assert(tp, MA_OWNED|MA_NOTRECURSED); MPASS(!tty_gone(tp)); error = cv_timedwait_sig(cv, tp->t_mtx, hz); /* Bail out when the device slipped away. 
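 */

/*
 * Userspace sketch (not part of this file): programming the ".init"
 * state device created above, so every later open() of the real port
 * starts from the stored termios.  "/dev/ttyu0.init" is an example
 * path, and the TIOCSETA handled by ttyil_ioctl() requires
 * PRIV_TTY_SETA (i.e. root).
 */
#include <fcntl.h>
#include <termios.h>
#include <unistd.h>

int
main(void)
{
	struct termios t;
	int fd = open("/dev/ttyu0.init", O_RDWR);

	if (fd == -1 || tcgetattr(fd, &t) == -1)
		return (1);
	cfsetspeed(&t, B115200);	/* default the port to 115200 bps */
	if (tcsetattr(fd, TCSANOW, &t) == -1)
		return (1);
	return (close(fd) == -1);
}

/*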
*/ if (tty_gone(tp)) return (ENXIO); /* Restart the system call when we may have been revoked. */ if (tp->t_revokecnt != revokecnt) return (ERESTART); return (error); } void tty_flush(struct tty *tp, int flags) { if (flags & FWRITE) { tp->t_flags &= ~TF_HIWAT_OUT; ttyoutq_flush(&tp->t_outq); tty_wakeup(tp, FWRITE); if (!tty_gone(tp)) { ttydevsw_outwakeup(tp); ttydevsw_pktnotify(tp, TIOCPKT_FLUSHWRITE); } } if (flags & FREAD) { tty_hiwat_in_unblock(tp); ttyinq_flush(&tp->t_inq); tty_wakeup(tp, FREAD); if (!tty_gone(tp)) { ttydevsw_inwakeup(tp); ttydevsw_pktnotify(tp, TIOCPKT_FLUSHREAD); } } } void tty_set_winsize(struct tty *tp, const struct winsize *wsz) { if (memcmp(&tp->t_winsize, wsz, sizeof(*wsz)) == 0) return; tp->t_winsize = *wsz; tty_signal_pgrp(tp, SIGWINCH); } static int tty_generic_ioctl(struct tty *tp, u_long cmd, void *data, int fflag, struct thread *td) { int error; switch (cmd) { /* * Modem commands. * The SER_* and TIOCM_* flags are the same, but one bit * shifted. I don't know why. */ case TIOCSDTR: ttydevsw_modem(tp, SER_DTR, 0); return (0); case TIOCCDTR: ttydevsw_modem(tp, 0, SER_DTR); return (0); case TIOCMSET: { int bits = *(int *)data; ttydevsw_modem(tp, (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1, ((~bits) & (TIOCM_DTR | TIOCM_RTS)) >> 1); return (0); } case TIOCMBIS: { int bits = *(int *)data; ttydevsw_modem(tp, (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1, 0); return (0); } case TIOCMBIC: { int bits = *(int *)data; ttydevsw_modem(tp, 0, (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1); return (0); } case TIOCMGET: *(int *)data = TIOCM_LE + (ttydevsw_modem(tp, 0, 0) << 1); return (0); case FIOASYNC: if (*(int *)data) tp->t_flags |= TF_ASYNC; else tp->t_flags &= ~TF_ASYNC; return (0); case FIONBIO: /* This device supports non-blocking operation. */ return (0); case FIONREAD: *(int *)data = ttyinq_bytescanonicalized(&tp->t_inq); return (0); case FIONWRITE: case TIOCOUTQ: *(int *)data = ttyoutq_bytesused(&tp->t_outq); return (0); case FIOSETOWN: if (tp->t_session != NULL && !tty_is_ctty(tp, td->td_proc)) /* Not allowed to set ownership. */ return (ENOTTY); /* Temporarily unlock the TTY to set ownership. */ tty_unlock(tp); error = fsetown(*(int *)data, &tp->t_sigio); tty_lock(tp); return (error); case FIOGETOWN: if (tp->t_session != NULL && !tty_is_ctty(tp, td->td_proc)) /* Not allowed to set ownership. */ return (ENOTTY); /* Get ownership. */ *(int *)data = fgetown(&tp->t_sigio); return (0); case TIOCGETA: /* Obtain terminal flags through tcgetattr(). */ *(struct termios*)data = tp->t_termios; return (0); case TIOCSETA: case TIOCSETAW: case TIOCSETAF: { struct termios *t = data; /* * Who makes up these funny rules? According to POSIX, * input baud rate is set equal to the output baud rate * when zero. */ if (t->c_ispeed == 0) t->c_ispeed = t->c_ospeed; /* Discard any unsupported bits. */ t->c_iflag &= TTYSUP_IFLAG; t->c_oflag &= TTYSUP_OFLAG; t->c_lflag &= TTYSUP_LFLAG; t->c_cflag &= TTYSUP_CFLAG; /* Set terminal flags through tcsetattr(). */ if (cmd == TIOCSETAW || cmd == TIOCSETAF) { error = tty_drain(tp, 0); if (error) return (error); if (cmd == TIOCSETAF) tty_flush(tp, FREAD); } /* * Only call param() when the flags really change. */ if ((t->c_cflag & CIGNORE) == 0 && (tp->t_termios.c_cflag != t->c_cflag || ((tp->t_termios.c_iflag ^ t->c_iflag) & (IXON|IXOFF|IXANY)) || tp->t_termios.c_ispeed != t->c_ispeed || tp->t_termios.c_ospeed != t->c_ospeed)) { error = ttydevsw_param(tp, t); if (error) return (error); /* XXX: CLOCAL? 
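 */

/*
 * Userspace sketch (not part of this file): driving the modem-control
 * cases above.  The port path is an example; as noted, the kernel
 * shifts between the TIOCM_* and SER_* bit layouts internally.
 */
#include <sys/ioctl.h>
#include <fcntl.h>
#include <stdio.h>

int
main(void)
{
	int bits = TIOCM_DTR;
	int fd = open("/dev/cuau0", O_RDWR | O_NONBLOCK);

	if (fd == -1)
		return (1);
	if (ioctl(fd, TIOCMBIS, &bits) == -1)	/* assert DTR */
		return (1);
	if (ioctl(fd, TIOCMGET, &bits) == -1)	/* read all lines back */
		return (1);
	printf("carrier %s\n", (bits & TIOCM_CD) ? "up" : "down");
	return (0);
}

/*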
*/ tp->t_termios.c_cflag = t->c_cflag & ~CIGNORE; tp->t_termios.c_ispeed = t->c_ispeed; tp->t_termios.c_ospeed = t->c_ospeed; /* Baud rate has changed - update watermarks. */ error = tty_watermarks(tp); if (error) return (error); } /* Copy new non-device driver parameters. */ tp->t_termios.c_iflag = t->c_iflag; tp->t_termios.c_oflag = t->c_oflag; tp->t_termios.c_lflag = t->c_lflag; memcpy(&tp->t_termios.c_cc, t->c_cc, sizeof t->c_cc); ttydisc_optimize(tp); if ((t->c_lflag & ICANON) == 0) { /* * When in non-canonical mode, wake up all * readers. Canonicalize any partial input. VMIN * and VTIME could also be adjusted. */ ttyinq_canonicalize(&tp->t_inq); tty_wakeup(tp, FREAD); } /* * For packet mode: notify the PTY consumer that VSTOP * and VSTART may have been changed. */ if (tp->t_termios.c_iflag & IXON && tp->t_termios.c_cc[VSTOP] == CTRL('S') && tp->t_termios.c_cc[VSTART] == CTRL('Q')) ttydevsw_pktnotify(tp, TIOCPKT_DOSTOP); else ttydevsw_pktnotify(tp, TIOCPKT_NOSTOP); return (0); } case TIOCGETD: /* For compatibility - we only support TTYDISC. */ *(int *)data = TTYDISC; return (0); case TIOCGPGRP: if (!tty_is_ctty(tp, td->td_proc)) return (ENOTTY); if (tp->t_pgrp != NULL) *(int *)data = tp->t_pgrp->pg_id; else *(int *)data = NO_PID; return (0); case TIOCGSID: if (!tty_is_ctty(tp, td->td_proc)) return (ENOTTY); MPASS(tp->t_session); *(int *)data = tp->t_session->s_sid; return (0); case TIOCSCTTY: { struct proc *p = td->td_proc; /* XXX: This looks awful. */ tty_unlock(tp); sx_xlock(&proctree_lock); tty_lock(tp); if (!SESS_LEADER(p)) { /* Only the session leader may do this. */ sx_xunlock(&proctree_lock); return (EPERM); } if (tp->t_session != NULL && tp->t_session == p->p_session) { /* This is already our controlling TTY. */ sx_xunlock(&proctree_lock); return (0); } if (p->p_session->s_ttyp != NULL || (tp->t_session != NULL && tp->t_session->s_ttyvp != NULL && tp->t_session->s_ttyvp->v_type != VBAD)) { /* * There is already a relation between a TTY and * a session, or the caller is not the session * leader. * * Allow the TTY to be stolen when the vnode is * invalid, but the reference to the TTY is * still active. This allows immediate reuse of * TTYs of which the session leader has been * killed or the TTY revoked. */ sx_xunlock(&proctree_lock); return (EPERM); } /* Connect the session to the TTY. */ tp->t_session = p->p_session; tp->t_session->s_ttyp = tp; tp->t_sessioncnt++; sx_xunlock(&proctree_lock); /* Assign foreground process group. */ tp->t_pgrp = p->p_pgrp; PROC_LOCK(p); p->p_flag |= P_CONTROLT; PROC_UNLOCK(p); return (0); } case TIOCSPGRP: { struct pgrp *pg; /* * XXX: Temporarily unlock the TTY to locate the process * group. This code would be lot nicer if we would ever * decompose proctree_lock. */ tty_unlock(tp); sx_slock(&proctree_lock); pg = pgfind(*(int *)data); if (pg != NULL) PGRP_UNLOCK(pg); if (pg == NULL || pg->pg_session != td->td_proc->p_session) { sx_sunlock(&proctree_lock); tty_lock(tp); return (EPERM); } tty_lock(tp); /* * Determine if this TTY is the controlling TTY after * relocking the TTY. */ if (!tty_is_ctty(tp, td->td_proc)) { sx_sunlock(&proctree_lock); return (ENOTTY); } tp->t_pgrp = pg; sx_sunlock(&proctree_lock); /* Wake up the background process groups. */ cv_broadcast(&tp->t_bgwait); return (0); } case TIOCFLUSH: { int flags = *(int *)data; if (flags == 0) flags = (FREAD|FWRITE); else flags &= (FREAD|FWRITE); tty_flush(tp, flags); return (0); } case TIOCDRAIN: /* Drain TTY output. 
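 */

/*
 * Userspace sketch (not part of this file): acquiring a controlling
 * terminal through the TIOCSCTTY case above.  The device path is an
 * example; setsid(2) fails if the caller already leads a process
 * group, so real daemons fork(2) first.
 */
#include <sys/ioctl.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	int fd;

	if (setsid() == -1)			/* must be a session leader */
		return (1);
	fd = open("/dev/ttyv0", O_RDWR);
	if (fd == -1)
		return (1);
	return (ioctl(fd, TIOCSCTTY) == -1);	/* connect session and TTY */
}

/*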
*/ return tty_drain(tp, 0); case TIOCGDRAINWAIT: *(int *)data = tp->t_drainwait; return (0); case TIOCSDRAINWAIT: error = priv_check(td, PRIV_TTY_DRAINWAIT); if (error == 0) tp->t_drainwait = *(int *)data; return (error); case TIOCCONS: /* Set terminal as console TTY. */ if (*(int *)data) { error = priv_check(td, PRIV_TTY_CONSOLE); if (error) return (error); /* * XXX: constty should really need to be locked! * XXX: allow disconnected constty's to be stolen! */ if (constty == tp) return (0); if (constty != NULL) return (EBUSY); tty_unlock(tp); constty_set(tp); tty_lock(tp); } else if (constty == tp) { constty_clear(); } return (0); case TIOCGWINSZ: /* Obtain window size. */ *(struct winsize*)data = tp->t_winsize; return (0); case TIOCSWINSZ: /* Set window size. */ tty_set_winsize(tp, data); return (0); case TIOCEXCL: tp->t_flags |= TF_EXCLUDE; return (0); case TIOCNXCL: tp->t_flags &= ~TF_EXCLUDE; return (0); case TIOCSTOP: tp->t_flags |= TF_STOPPED; ttydevsw_pktnotify(tp, TIOCPKT_STOP); return (0); case TIOCSTART: tp->t_flags &= ~TF_STOPPED; ttydevsw_outwakeup(tp); ttydevsw_pktnotify(tp, TIOCPKT_START); return (0); case TIOCSTAT: tty_info(tp); return (0); case TIOCSTI: if ((fflag & FREAD) == 0 && priv_check(td, PRIV_TTY_STI)) return (EPERM); if (!tty_is_ctty(tp, td->td_proc) && priv_check(td, PRIV_TTY_STI)) return (EACCES); ttydisc_rint(tp, *(char *)data, 0); ttydisc_rint_done(tp); return (0); } #ifdef COMPAT_43TTY return tty_ioctl_compat(tp, cmd, data, fflag, td); #else /* !COMPAT_43TTY */ return (ENOIOCTL); #endif /* COMPAT_43TTY */ } int tty_ioctl(struct tty *tp, u_long cmd, void *data, int fflag, struct thread *td) { int error; tty_lock_assert(tp, MA_OWNED); if (tty_gone(tp)) return (ENXIO); error = ttydevsw_ioctl(tp, cmd, data, td); if (error == ENOIOCTL) error = tty_generic_ioctl(tp, cmd, data, fflag, td); return (error); } dev_t tty_udev(struct tty *tp) { if (tp->t_dev) return (dev2udev(tp->t_dev)); else return (NODEV); } int tty_checkoutq(struct tty *tp) { /* 256 bytes should be enough to print a log message. */ return (ttyoutq_bytesleft(&tp->t_outq) >= 256); } void tty_hiwat_in_block(struct tty *tp) { if ((tp->t_flags & TF_HIWAT_IN) == 0 && tp->t_termios.c_iflag & IXOFF && tp->t_termios.c_cc[VSTOP] != _POSIX_VDISABLE) { /* * Input flow control. Only enter the high watermark when we * can successfully store the VSTOP character. */ if (ttyoutq_write_nofrag(&tp->t_outq, &tp->t_termios.c_cc[VSTOP], 1) == 0) tp->t_flags |= TF_HIWAT_IN; } else { /* No input flow control. */ tp->t_flags |= TF_HIWAT_IN; } } void tty_hiwat_in_unblock(struct tty *tp) { if (tp->t_flags & TF_HIWAT_IN && tp->t_termios.c_iflag & IXOFF && tp->t_termios.c_cc[VSTART] != _POSIX_VDISABLE) { /* * Input flow control. Only leave the high watermark when we * can successfully store the VSTART character. */ if (ttyoutq_write_nofrag(&tp->t_outq, &tp->t_termios.c_cc[VSTART], 1) == 0) tp->t_flags &= ~TF_HIWAT_IN; } else { /* No input flow control. */ tp->t_flags &= ~TF_HIWAT_IN; } if (!tty_gone(tp)) ttydevsw_inwakeup(tp); } /* * TTY hooks interface. */ static int ttyhook_defrint(struct tty *tp, char c, int flags) { if (ttyhook_rint_bypass(tp, &c, 1) != 1) return (-1); return (0); } int ttyhook_register(struct tty **rtp, struct proc *p, int fd, struct ttyhook *th, void *softc) { struct tty *tp; struct file *fp; struct cdev *dev; struct cdevsw *cdp; struct filedesc *fdp; cap_rights_t rights; int error, ref; /* Validate the file descriptor. 
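 */

/*
 * Userspace sketch (not part of this file): resizing through the
 * TIOCSWINSZ case above.  tty_set_winsize() only raises SIGWINCH in
 * the foreground process group when the size actually changes.
 */
#include <sys/ioctl.h>
#include <unistd.h>

int
main(void)
{
	struct winsize ws = { .ws_row = 50, .ws_col = 132 };

	return (ioctl(STDIN_FILENO, TIOCSWINSZ, &ws) == -1);
}

/*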
*/ fdp = p->p_fd; error = fget_unlocked(fdp, fd, cap_rights_init(&rights, CAP_TTYHOOK), &fp, NULL); if (error != 0) return (error); if (fp->f_ops == &badfileops) { error = EBADF; goto done1; } /* * Make sure the vnode is bound to a character device. * Unlocked check for the vnode type is ok there, because we * only shall prevent calling devvn_refthread on the file that * never has been opened over a character device. */ if (fp->f_type != DTYPE_VNODE || fp->f_vnode->v_type != VCHR) { error = EINVAL; goto done1; } /* Make sure it is a TTY. */ cdp = devvn_refthread(fp->f_vnode, &dev, &ref); if (cdp == NULL) { error = ENXIO; goto done1; } if (dev != fp->f_data) { error = ENXIO; goto done2; } if (cdp != &ttydev_cdevsw) { error = ENOTTY; goto done2; } tp = dev->si_drv1; /* Try to attach the hook to the TTY. */ error = EBUSY; tty_lock(tp); MPASS((tp->t_hook == NULL) == ((tp->t_flags & TF_HOOK) == 0)); if (tp->t_flags & TF_HOOK) goto done3; tp->t_flags |= TF_HOOK; tp->t_hook = th; tp->t_hooksoftc = softc; *rtp = tp; error = 0; /* Maybe we can switch into bypass mode now. */ ttydisc_optimize(tp); /* Silently convert rint() calls to rint_bypass() when possible. */ if (!ttyhook_hashook(tp, rint) && ttyhook_hashook(tp, rint_bypass)) th->th_rint = ttyhook_defrint; done3: tty_unlock(tp); done2: dev_relthread(dev, ref); done1: fdrop(fp, curthread); return (error); } void ttyhook_unregister(struct tty *tp) { tty_lock_assert(tp, MA_OWNED); MPASS(tp->t_flags & TF_HOOK); /* Disconnect the hook. */ tp->t_flags &= ~TF_HOOK; tp->t_hook = NULL; /* Maybe we need to leave bypass mode. */ ttydisc_optimize(tp); /* Maybe deallocate the TTY as well. */ tty_rel_free(tp); } /* * /dev/console handling. */ static int ttyconsdev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct tty *tp; /* System has no console device. */ if (dev_console_filename == NULL) return (ENXIO); /* Look up corresponding TTY by device name. */ sx_slock(&tty_list_sx); TAILQ_FOREACH(tp, &tty_list, t_list) { if (strcmp(dev_console_filename, tty_devname(tp)) == 0) { dev_console->si_drv1 = tp; break; } } sx_sunlock(&tty_list_sx); /* System console has no TTY associated. */ if (dev_console->si_drv1 == NULL) return (ENXIO); return (ttydev_open(dev, oflags, devtype, td)); } static int ttyconsdev_write(struct cdev *dev, struct uio *uio, int ioflag) { log_console(uio); return (ttydev_write(dev, uio, ioflag)); } /* * /dev/console is a little different than normal TTY's. When opened, * it determines which TTY to use. When data gets written to it, it * will be logged in the kernel message buffer. */ static struct cdevsw ttyconsdev_cdevsw = { .d_version = D_VERSION, .d_open = ttyconsdev_open, .d_close = ttydev_close, .d_read = ttydev_read, .d_write = ttyconsdev_write, .d_ioctl = ttydev_ioctl, .d_kqfilter = ttydev_kqfilter, .d_poll = ttydev_poll, .d_mmap = ttydev_mmap, .d_name = "ttyconsdev", .d_flags = D_TTY, }; static void ttyconsdev_init(void *unused __unused) { dev_console = make_dev_credf(MAKEDEV_ETERNAL, &ttyconsdev_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600, "console"); } SYSINIT(tty, SI_SUB_DRIVERS, SI_ORDER_FIRST, ttyconsdev_init, NULL); void ttyconsdev_select(const char *name) { dev_console_filename = name; } /* * Debugging routines. */ #include "opt_ddb.h" #ifdef DDB #include #include static const struct { int flag; char val; } ttystates[] = { #if 0 { TF_NOPREFIX, 'N' }, #endif { TF_INITLOCK, 'I' }, { TF_CALLOUT, 'C' }, /* Keep these together -> 'Oi' and 'Oo'. 
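 */

/*
 * Kernel-side sketch (not part of this file; the "myhook" names are
 * hypothetical): a consumer in the style of snp(4) watching slave-side
 * input.  Only th_rint_bypass is supplied; as shown above,
 * ttyhook_register() converts plain rint() consumers automatically
 * via ttyhook_defrint.
 */
#include <sys/param.h>
#include <sys/tty.h>
#include <sys/ttyhook.h>

static size_t
myhook_rint_bypass(struct tty *tp __unused, const void *buf __unused,
    size_t len)
{
	/* A real consumer would copy "buf" out; we just accept it. */
	return (len);
}

static struct ttyhook myhook = {
	.th_rint_bypass = myhook_rint_bypass,
};
/* Attached with: ttyhook_register(&tp, p, fd, &myhook, softc). */

/*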
*/ { TF_OPENED, 'O' }, { TF_OPENED_IN, 'i' }, { TF_OPENED_OUT, 'o' }, { TF_OPENED_CONS, 'c' }, { TF_GONE, 'G' }, { TF_OPENCLOSE, 'B' }, { TF_ASYNC, 'Y' }, { TF_LITERAL, 'L' }, /* Keep these together -> 'Hi' and 'Ho'. */ { TF_HIWAT, 'H' }, { TF_HIWAT_IN, 'i' }, { TF_HIWAT_OUT, 'o' }, { TF_STOPPED, 'S' }, { TF_EXCLUDE, 'X' }, { TF_BYPASS, 'l' }, { TF_ZOMBIE, 'Z' }, { TF_HOOK, 's' }, /* Keep these together -> 'bi' and 'bo'. */ { TF_BUSY, 'b' }, { TF_BUSY_IN, 'i' }, { TF_BUSY_OUT, 'o' }, { 0, '\0'}, }; #define TTY_FLAG_BITS \ "\20\1NOPREFIX\2INITLOCK\3CALLOUT\4OPENED_IN" \ "\5OPENED_OUT\6OPENED_CONS\7GONE\10OPENCLOSE" \ "\11ASYNC\12LITERAL\13HIWAT_IN\14HIWAT_OUT" \ "\15STOPPED\16EXCLUDE\17BYPASS\20ZOMBIE" \ "\21HOOK\22BUSY_IN\23BUSY_OUT" #define DB_PRINTSYM(name, addr) \ db_printf("%s " #name ": ", sep); \ db_printsym((db_addr_t) addr, DB_STGY_ANY); \ db_printf("\n"); static void _db_show_devsw(const char *sep, const struct ttydevsw *tsw) { db_printf("%sdevsw: ", sep); db_printsym((db_addr_t)tsw, DB_STGY_ANY); db_printf(" (%p)\n", tsw); DB_PRINTSYM(open, tsw->tsw_open); DB_PRINTSYM(close, tsw->tsw_close); DB_PRINTSYM(outwakeup, tsw->tsw_outwakeup); DB_PRINTSYM(inwakeup, tsw->tsw_inwakeup); DB_PRINTSYM(ioctl, tsw->tsw_ioctl); DB_PRINTSYM(param, tsw->tsw_param); DB_PRINTSYM(modem, tsw->tsw_modem); DB_PRINTSYM(mmap, tsw->tsw_mmap); DB_PRINTSYM(pktnotify, tsw->tsw_pktnotify); DB_PRINTSYM(free, tsw->tsw_free); } static void _db_show_hooks(const char *sep, const struct ttyhook *th) { db_printf("%shook: ", sep); db_printsym((db_addr_t)th, DB_STGY_ANY); db_printf(" (%p)\n", th); if (th == NULL) return; DB_PRINTSYM(rint, th->th_rint); DB_PRINTSYM(rint_bypass, th->th_rint_bypass); DB_PRINTSYM(rint_done, th->th_rint_done); DB_PRINTSYM(rint_poll, th->th_rint_poll); DB_PRINTSYM(getc_inject, th->th_getc_inject); DB_PRINTSYM(getc_capture, th->th_getc_capture); DB_PRINTSYM(getc_poll, th->th_getc_poll); DB_PRINTSYM(close, th->th_close); } static void _db_show_termios(const char *name, const struct termios *t) { db_printf("%s: iflag 0x%x oflag 0x%x cflag 0x%x " "lflag 0x%x ispeed %u ospeed %u\n", name, t->c_iflag, t->c_oflag, t->c_cflag, t->c_lflag, t->c_ispeed, t->c_ospeed); } /* DDB command to show TTY statistics. */ DB_SHOW_COMMAND(tty, db_show_tty) { struct tty *tp; if (!have_addr) { db_printf("usage: show tty \n"); return; } tp = (struct tty *)addr; db_printf("%p: %s\n", tp, tty_devname(tp)); db_printf("\tmtx: %p\n", tp->t_mtx); db_printf("\tflags: 0x%b\n", tp->t_flags, TTY_FLAG_BITS); db_printf("\trevokecnt: %u\n", tp->t_revokecnt); /* Buffering mechanisms. */ db_printf("\tinq: %p begin %u linestart %u reprint %u end %u " "nblocks %u quota %u\n", &tp->t_inq, tp->t_inq.ti_begin, tp->t_inq.ti_linestart, tp->t_inq.ti_reprint, tp->t_inq.ti_end, tp->t_inq.ti_nblocks, tp->t_inq.ti_quota); db_printf("\toutq: %p begin %u end %u nblocks %u quota %u\n", &tp->t_outq, tp->t_outq.to_begin, tp->t_outq.to_end, tp->t_outq.to_nblocks, tp->t_outq.to_quota); db_printf("\tinlow: %zu\n", tp->t_inlow); db_printf("\toutlow: %zu\n", tp->t_outlow); _db_show_termios("\ttermios", &tp->t_termios); db_printf("\twinsize: row %u col %u xpixel %u ypixel %u\n", tp->t_winsize.ws_row, tp->t_winsize.ws_col, tp->t_winsize.ws_xpixel, tp->t_winsize.ws_ypixel); db_printf("\tcolumn: %u\n", tp->t_column); db_printf("\twritepos: %u\n", tp->t_writepos); db_printf("\tcompatflags: 0x%x\n", tp->t_compatflags); /* Init/lock-state devices. 
*/ _db_show_termios("\ttermios_init_in", &tp->t_termios_init_in); _db_show_termios("\ttermios_init_out", &tp->t_termios_init_out); _db_show_termios("\ttermios_lock_in", &tp->t_termios_lock_in); _db_show_termios("\ttermios_lock_out", &tp->t_termios_lock_out); /* Hooks */ _db_show_devsw("\t", tp->t_devsw); _db_show_hooks("\t", tp->t_hook); /* Process info. */ db_printf("\tpgrp: %p gid %d jobc %d\n", tp->t_pgrp, tp->t_pgrp ? tp->t_pgrp->pg_id : 0, tp->t_pgrp ? tp->t_pgrp->pg_jobc : 0); db_printf("\tsession: %p", tp->t_session); if (tp->t_session != NULL) db_printf(" count %u leader %p tty %p sid %d login %s", tp->t_session->s_count, tp->t_session->s_leader, tp->t_session->s_ttyp, tp->t_session->s_sid, tp->t_session->s_login); db_printf("\n"); db_printf("\tsessioncnt: %u\n", tp->t_sessioncnt); db_printf("\tdevswsoftc: %p\n", tp->t_devswsoftc); db_printf("\thooksoftc: %p\n", tp->t_hooksoftc); db_printf("\tdev: %p\n", tp->t_dev); } /* DDB command to list TTYs. */ DB_SHOW_ALL_COMMAND(ttys, db_show_all_ttys) { struct tty *tp; size_t isiz, osiz; int i, j; /* Make the output look like `pstat -t'. */ db_printf("PTR "); #if defined(__LP64__) db_printf(" "); #endif db_printf(" LINE INQ CAN LIN LOW OUTQ USE LOW " "COL SESS PGID STATE\n"); TAILQ_FOREACH(tp, &tty_list, t_list) { isiz = tp->t_inq.ti_nblocks * TTYINQ_DATASIZE; osiz = tp->t_outq.to_nblocks * TTYOUTQ_DATASIZE; db_printf("%p %10s %5zu %4u %4u %4zu %5zu %4u %4zu %5u %5d " "%5d ", tp, tty_devname(tp), isiz, tp->t_inq.ti_linestart - tp->t_inq.ti_begin, tp->t_inq.ti_end - tp->t_inq.ti_linestart, isiz - tp->t_inlow, osiz, tp->t_outq.to_end - tp->t_outq.to_begin, osiz - tp->t_outlow, MIN(tp->t_column, 99999), tp->t_session ? tp->t_session->s_sid : 0, tp->t_pgrp ? tp->t_pgrp->pg_id : 0); /* Flag bits. */ for (i = j = 0; ttystates[i].flag; i++) if (tp->t_flags & ttystates[i].flag) { db_printf("%c", ttystates[i].val); j++; } if (j == 0) db_printf("-"); db_printf("\n"); } } #endif /* DDB */ Index: head/sys/kern/tty_inq.c =================================================================== --- head/sys/kern/tty_inq.c (revision 326270) +++ head/sys/kern/tty_inq.c (revision 326271) @@ -1,495 +1,497 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2008 Ed Schouten * All rights reserved. * * Portions of this software were developed under sponsorship from Snow * B.V., the Netherlands. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include /* * TTY input queue buffering. * * Unlike the output queue, the input queue has more features that are * needed to properly implement various features offered by the TTY * interface: * * - Data can be removed from the tail of the queue, which is used to * implement backspace. * - Once in a while, input has to be `canonicalized'. When ICANON is * turned on, this will be done after a CR has been inserted. * Otherwise, it should be done after any character has been inserted. * - The input queue can store one bit per byte, called the quoting bit. * This bit is used by TTYDISC to make backspace work on quoted * characters. * * In most cases, there is probably less input than output, so unlike * the outq, we'll stick to 128 byte blocks here. */ static int ttyinq_flush_secure = 1; SYSCTL_INT(_kern, OID_AUTO, tty_inq_flush_secure, CTLFLAG_RW, &ttyinq_flush_secure, 0, "Zero buffers while flushing"); #define TTYINQ_QUOTESIZE (TTYINQ_DATASIZE / BMSIZE) #define BMSIZE 32 #define GETBIT(tib,boff) \ ((tib)->tib_quotes[(boff) / BMSIZE] & (1 << ((boff) % BMSIZE))) #define SETBIT(tib,boff) \ ((tib)->tib_quotes[(boff) / BMSIZE] |= (1 << ((boff) % BMSIZE))) #define CLRBIT(tib,boff) \ ((tib)->tib_quotes[(boff) / BMSIZE] &= ~(1 << ((boff) % BMSIZE))) struct ttyinq_block { struct ttyinq_block *tib_prev; struct ttyinq_block *tib_next; uint32_t tib_quotes[TTYINQ_QUOTESIZE]; char tib_data[TTYINQ_DATASIZE]; }; static uma_zone_t ttyinq_zone; #define TTYINQ_INSERT_TAIL(ti, tib) do { \ if (ti->ti_end == 0) { \ tib->tib_prev = NULL; \ tib->tib_next = ti->ti_firstblock; \ ti->ti_firstblock = tib; \ } else { \ tib->tib_prev = ti->ti_lastblock; \ tib->tib_next = ti->ti_lastblock->tib_next; \ ti->ti_lastblock->tib_next = tib; \ } \ if (tib->tib_next != NULL) \ tib->tib_next->tib_prev = tib; \ ti->ti_nblocks++; \ } while (0) #define TTYINQ_REMOVE_HEAD(ti) do { \ ti->ti_firstblock = ti->ti_firstblock->tib_next; \ if (ti->ti_firstblock != NULL) \ ti->ti_firstblock->tib_prev = NULL; \ ti->ti_nblocks--; \ } while (0) #define TTYINQ_RECYCLE(ti, tib) do { \ if (ti->ti_quota <= ti->ti_nblocks) \ uma_zfree(ttyinq_zone, tib); \ else \ TTYINQ_INSERT_TAIL(ti, tib); \ } while (0) int ttyinq_setsize(struct ttyinq *ti, struct tty *tp, size_t size) { struct ttyinq_block *tib; ti->ti_quota = howmany(size, TTYINQ_DATASIZE); while (ti->ti_quota > ti->ti_nblocks) { /* * List is getting bigger. * Add new blocks to the tail of the list. * * We must unlock the TTY temporarily, because we need * to allocate memory. This won't be a problem, because * in the worst case, another thread ends up here, which * may cause us to allocate too many blocks, but this * will be caught by the loop below. 
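 */

/*
 * Standalone sketch (not part of this file): the quoting bitmap keeps
 * one bit per data byte.  These are simplified copies of the GETBIT
 * and SETBIT macros above, operating on a bare array:
 */
#include <stdint.h>
#include <stdio.h>

#define BMSIZE		32
#define GETBIT(q, b)	((q)[(b) / BMSIZE] & (1U << ((b) % BMSIZE)))
#define SETBIT(q, b)	((q)[(b) / BMSIZE] |= (1U << ((b) % BMSIZE)))

int
main(void)
{
	uint32_t quotes[128 / BMSIZE] = { 0 };	/* one 128-byte block */

	SETBIT(quotes, 13);	/* e.g. byte 13 was escaped with ^V */
	printf("byte 13 quoted: %s\n", GETBIT(quotes, 13) ? "yes" : "no");
	return (0);
}

/*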
*/ tty_unlock(tp); tib = uma_zalloc(ttyinq_zone, M_WAITOK); tty_lock(tp); if (tty_gone(tp)) { uma_zfree(ttyinq_zone, tib); return (ENXIO); } TTYINQ_INSERT_TAIL(ti, tib); } return (0); } void ttyinq_free(struct ttyinq *ti) { struct ttyinq_block *tib; ttyinq_flush(ti); ti->ti_quota = 0; while ((tib = ti->ti_firstblock) != NULL) { TTYINQ_REMOVE_HEAD(ti); uma_zfree(ttyinq_zone, tib); } MPASS(ti->ti_nblocks == 0); } int ttyinq_read_uio(struct ttyinq *ti, struct tty *tp, struct uio *uio, size_t rlen, size_t flen) { MPASS(rlen <= uio->uio_resid); while (rlen > 0) { int error; struct ttyinq_block *tib; size_t cbegin, cend, clen; /* See if there still is data. */ if (ti->ti_begin == ti->ti_linestart) return (0); tib = ti->ti_firstblock; if (tib == NULL) return (0); /* * The end address should be the lowest of these three: * - The write pointer * - The blocksize - we can't read beyond the block * - The end address if we could perform the full read */ cbegin = ti->ti_begin; cend = MIN(MIN(ti->ti_linestart, ti->ti_begin + rlen), TTYINQ_DATASIZE); clen = cend - cbegin; MPASS(clen >= flen); rlen -= clen; /* * We can prevent buffering in some cases: * - We need to read the block until the end. * - We don't need to read the block until the end, but * there is no data beyond it, which allows us to move * the write pointer to a new block. */ if (cend == TTYINQ_DATASIZE || cend == ti->ti_end) { /* * Fast path: zero copy. Remove the first block, * so we can unlock the TTY temporarily. */ TTYINQ_REMOVE_HEAD(ti); ti->ti_begin = 0; /* * Because we remove the first block, we must * fix up the block offsets. */ #define CORRECT_BLOCK(t) do { \ if (t <= TTYINQ_DATASIZE) \ t = 0; \ else \ t -= TTYINQ_DATASIZE; \ } while (0) CORRECT_BLOCK(ti->ti_linestart); CORRECT_BLOCK(ti->ti_reprint); CORRECT_BLOCK(ti->ti_end); #undef CORRECT_BLOCK /* * Temporary unlock and copy the data to * userspace. We may need to flush trailing * bytes, like EOF characters. */ tty_unlock(tp); error = uiomove(tib->tib_data + cbegin, clen - flen, uio); tty_lock(tp); /* Block can now be readded to the list. */ TTYINQ_RECYCLE(ti, tib); } else { char ob[TTYINQ_DATASIZE - 1]; /* * Slow path: store data in a temporary buffer. */ memcpy(ob, tib->tib_data + cbegin, clen - flen); ti->ti_begin += clen; MPASS(ti->ti_begin < TTYINQ_DATASIZE); /* Temporary unlock and copy the data to userspace. */ tty_unlock(tp); error = uiomove(ob, clen - flen, uio); tty_lock(tp); } if (error != 0) return (error); if (tty_gone(tp)) return (ENXIO); } return (0); } static __inline void ttyinq_set_quotes(struct ttyinq_block *tib, size_t offset, size_t length, int value) { if (value) { /* Set the bits. */ for (; length > 0; length--, offset++) SETBIT(tib, offset); } else { /* Unset the bits. */ for (; length > 0; length--, offset++) CLRBIT(tib, offset); } } size_t ttyinq_write(struct ttyinq *ti, const void *buf, size_t nbytes, int quote) { const char *cbuf = buf; struct ttyinq_block *tib; unsigned int boff; size_t l; while (nbytes > 0) { boff = ti->ti_end % TTYINQ_DATASIZE; if (ti->ti_end == 0) { /* First time we're being used or drained. */ MPASS(ti->ti_begin == 0); tib = ti->ti_firstblock; if (tib == NULL) { /* Queue has no blocks. */ break; } ti->ti_lastblock = tib; } else if (boff == 0) { /* We reached the end of this block on last write. */ tib = ti->ti_lastblock->tib_next; if (tib == NULL) { /* We've reached the watermark. */ break; } ti->ti_lastblock = tib; } else { tib = ti->ti_lastblock; } /* Don't copy more than was requested. 
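 */

/*
 * Driver-side sketch (not part of this file; "mydev_rxintr" is
 * hypothetical): received bytes normally enter this queue through the
 * line discipline rather than through ttyinq_write() directly.
 */
#include <sys/param.h>
#include <sys/tty.h>
#include <sys/ttydisc.h>

static void
mydev_rxintr(struct tty *tp, const char *buf, size_t len)
{
	size_t i;

	tty_lock(tp);
	for (i = 0; i < len; i++)
		if (ttydisc_rint(tp, buf[i], 0) != 0)
			break;		/* input queue full; byte dropped */
	ttydisc_rint_done(tp);		/* wake up readers once per batch */
	tty_unlock(tp);
}

/*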
*/ l = MIN(nbytes, TTYINQ_DATASIZE - boff); MPASS(l > 0); memcpy(tib->tib_data + boff, cbuf, l); /* Set the quoting bits for the proper region. */ ttyinq_set_quotes(tib, boff, l, quote); cbuf += l; nbytes -= l; ti->ti_end += l; } return (cbuf - (const char *)buf); } int ttyinq_write_nofrag(struct ttyinq *ti, const void *buf, size_t nbytes, int quote) { size_t ret; if (ttyinq_bytesleft(ti) < nbytes) return (-1); /* We should always be able to write it back. */ ret = ttyinq_write(ti, buf, nbytes, quote); MPASS(ret == nbytes); return (0); } void ttyinq_canonicalize(struct ttyinq *ti) { ti->ti_linestart = ti->ti_reprint = ti->ti_end; ti->ti_startblock = ti->ti_reprintblock = ti->ti_lastblock; } size_t ttyinq_findchar(struct ttyinq *ti, const char *breakc, size_t maxlen, char *lastc) { struct ttyinq_block *tib = ti->ti_firstblock; unsigned int boff = ti->ti_begin; unsigned int bend = MIN(MIN(TTYINQ_DATASIZE, ti->ti_linestart), ti->ti_begin + maxlen); MPASS(maxlen > 0); if (tib == NULL) return (0); while (boff < bend) { if (strchr(breakc, tib->tib_data[boff]) && !GETBIT(tib, boff)) { *lastc = tib->tib_data[boff]; return (boff - ti->ti_begin + 1); } boff++; } /* Not found - just process the entire block. */ return (bend - ti->ti_begin); } void ttyinq_flush(struct ttyinq *ti) { struct ttyinq_block *tib; ti->ti_begin = 0; ti->ti_linestart = 0; ti->ti_reprint = 0; ti->ti_end = 0; /* Zero all data in the input queue to get rid of passwords. */ if (ttyinq_flush_secure) { for (tib = ti->ti_firstblock; tib != NULL; tib = tib->tib_next) bzero(&tib->tib_data, sizeof tib->tib_data); } } int ttyinq_peekchar(struct ttyinq *ti, char *c, int *quote) { unsigned int boff; struct ttyinq_block *tib = ti->ti_lastblock; if (ti->ti_linestart == ti->ti_end) return (-1); MPASS(ti->ti_end > 0); boff = (ti->ti_end - 1) % TTYINQ_DATASIZE; *c = tib->tib_data[boff]; *quote = GETBIT(tib, boff); return (0); } void ttyinq_unputchar(struct ttyinq *ti) { MPASS(ti->ti_linestart < ti->ti_end); if (--ti->ti_end % TTYINQ_DATASIZE == 0) { /* Roll back to the previous block. */ ti->ti_lastblock = ti->ti_lastblock->tib_prev; /* * This can only fail if we are unputchar()'ing the * first character in the queue. */ MPASS((ti->ti_lastblock == NULL) == (ti->ti_end == 0)); } } void ttyinq_reprintpos_set(struct ttyinq *ti) { ti->ti_reprint = ti->ti_end; ti->ti_reprintblock = ti->ti_lastblock; } void ttyinq_reprintpos_reset(struct ttyinq *ti) { ti->ti_reprint = ti->ti_linestart; ti->ti_reprintblock = ti->ti_startblock; } static void ttyinq_line_iterate(struct ttyinq *ti, ttyinq_line_iterator_t *iterator, void *data, unsigned int offset, struct ttyinq_block *tib) { unsigned int boff; /* Use the proper block when we're at the queue head. */ if (offset == 0) tib = ti->ti_firstblock; /* Iterate all characters and call the iterator function. */ for (; offset < ti->ti_end; offset++) { boff = offset % TTYINQ_DATASIZE; MPASS(tib != NULL); /* Call back the iterator function. */ iterator(data, tib->tib_data[boff], GETBIT(tib, boff)); /* Last byte iterated - go to the next block. 
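 */

/*
 * Userspace sketch (not part of this file): the password-scrubbing
 * behaviour of ttyinq_flush() above is governed by the writable
 * kern.tty_inq_flush_secure sysctl declared earlier in this file.
 */
#include <sys/types.h>
#include <sys/sysctl.h>

int
main(void)
{
	int on = 1;

	/* Zero flushed input buffers (the default); needs root. */
	return (sysctlbyname("kern.tty_inq_flush_secure", NULL, NULL,
	    &on, sizeof(on)) == -1);
}

/*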
*/ if (boff == TTYINQ_DATASIZE - 1) tib = tib->tib_next; MPASS(tib != NULL); } } void ttyinq_line_iterate_from_linestart(struct ttyinq *ti, ttyinq_line_iterator_t *iterator, void *data) { ttyinq_line_iterate(ti, iterator, data, ti->ti_linestart, ti->ti_startblock); } void ttyinq_line_iterate_from_reprintpos(struct ttyinq *ti, ttyinq_line_iterator_t *iterator, void *data) { ttyinq_line_iterate(ti, iterator, data, ti->ti_reprint, ti->ti_reprintblock); } static void ttyinq_startup(void *dummy) { ttyinq_zone = uma_zcreate("ttyinq", sizeof(struct ttyinq_block), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } SYSINIT(ttyinq, SI_SUB_DRIVERS, SI_ORDER_FIRST, ttyinq_startup, NULL); Index: head/sys/kern/tty_outq.c =================================================================== --- head/sys/kern/tty_outq.c (revision 326270) +++ head/sys/kern/tty_outq.c (revision 326271) @@ -1,345 +1,347 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2008 Ed Schouten * All rights reserved. * * Portions of this software were developed under sponsorship from Snow * B.V., the Netherlands. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include /* * TTY output queue buffering. * * The previous design of the TTY layer offered the so-called clists. * These clists were used for both the input queues and the output * queue. We don't use certain features on the output side, like quoting * bits for parity marking and such. This mechanism is similar to the * old clists, but only contains the features we need to buffer the * output. 
*/ struct ttyoutq_block { struct ttyoutq_block *tob_next; char tob_data[TTYOUTQ_DATASIZE]; }; static uma_zone_t ttyoutq_zone; #define TTYOUTQ_INSERT_TAIL(to, tob) do { \ if (to->to_end == 0) { \ tob->tob_next = to->to_firstblock; \ to->to_firstblock = tob; \ } else { \ tob->tob_next = to->to_lastblock->tob_next; \ to->to_lastblock->tob_next = tob; \ } \ to->to_nblocks++; \ } while (0) #define TTYOUTQ_REMOVE_HEAD(to) do { \ to->to_firstblock = to->to_firstblock->tob_next; \ to->to_nblocks--; \ } while (0) #define TTYOUTQ_RECYCLE(to, tob) do { \ if (to->to_quota <= to->to_nblocks) \ uma_zfree(ttyoutq_zone, tob); \ else \ TTYOUTQ_INSERT_TAIL(to, tob); \ } while(0) void ttyoutq_flush(struct ttyoutq *to) { to->to_begin = 0; to->to_end = 0; } int ttyoutq_setsize(struct ttyoutq *to, struct tty *tp, size_t size) { struct ttyoutq_block *tob; to->to_quota = howmany(size, TTYOUTQ_DATASIZE); while (to->to_quota > to->to_nblocks) { /* * List is getting bigger. * Add new blocks to the tail of the list. * * We must unlock the TTY temporarily, because we need * to allocate memory. This won't be a problem, because * in the worst case, another thread ends up here, which * may cause us to allocate too many blocks, but this * will be caught by the loop below. */ tty_unlock(tp); tob = uma_zalloc(ttyoutq_zone, M_WAITOK); tty_lock(tp); if (tty_gone(tp)) { uma_zfree(ttyoutq_zone, tob); return (ENXIO); } TTYOUTQ_INSERT_TAIL(to, tob); } return (0); } void ttyoutq_free(struct ttyoutq *to) { struct ttyoutq_block *tob; ttyoutq_flush(to); to->to_quota = 0; while ((tob = to->to_firstblock) != NULL) { TTYOUTQ_REMOVE_HEAD(to); uma_zfree(ttyoutq_zone, tob); } MPASS(to->to_nblocks == 0); } size_t ttyoutq_read(struct ttyoutq *to, void *buf, size_t len) { char *cbuf = buf; while (len > 0) { struct ttyoutq_block *tob; size_t cbegin, cend, clen; /* See if there still is data. */ if (to->to_begin == to->to_end) break; tob = to->to_firstblock; if (tob == NULL) break; /* * The end address should be the lowest of these three: * - The write pointer * - The blocksize - we can't read beyond the block * - The end address if we could perform the full read */ cbegin = to->to_begin; cend = MIN(MIN(to->to_end, to->to_begin + len), TTYOUTQ_DATASIZE); clen = cend - cbegin; /* Copy the data out of the buffers. */ memcpy(cbuf, tob->tob_data + cbegin, clen); cbuf += clen; len -= clen; if (cend == to->to_end) { /* Read the complete queue. */ to->to_begin = 0; to->to_end = 0; } else if (cend == TTYOUTQ_DATASIZE) { /* Read the block until the end. */ TTYOUTQ_REMOVE_HEAD(to); to->to_begin = 0; to->to_end -= TTYOUTQ_DATASIZE; TTYOUTQ_RECYCLE(to, tob); } else { /* Read the block partially. */ to->to_begin += clen; } } return (cbuf - (char *)buf); } /* * An optimized version of ttyoutq_read() which can be used in pseudo * TTY drivers to directly copy data from the outq to userspace, instead * of buffering it. * * We can only copy data directly if we need to read the entire block * back to the user, because we temporarily remove the block from the * queue. Otherwise we need to copy it to a temporary buffer first, to * make sure data remains in the correct order. */ int ttyoutq_read_uio(struct ttyoutq *to, struct tty *tp, struct uio *uio) { while (uio->uio_resid > 0) { int error; struct ttyoutq_block *tob; size_t cbegin, cend, clen; /* See if there still is data. 
*/ if (to->to_begin == to->to_end) return (0); tob = to->to_firstblock; if (tob == NULL) return (0); /* * The end address should be the lowest of these three: * - The write pointer * - The blocksize - we can't read beyond the block * - The end address if we could perform the full read */ cbegin = to->to_begin; cend = MIN(MIN(to->to_end, to->to_begin + uio->uio_resid), TTYOUTQ_DATASIZE); clen = cend - cbegin; /* * We can prevent buffering in some cases: * - We need to read the block until the end. * - We don't need to read the block until the end, but * there is no data beyond it, which allows us to move * the write pointer to a new block. */ if (cend == TTYOUTQ_DATASIZE || cend == to->to_end) { /* * Fast path: zero copy. Remove the first block, * so we can unlock the TTY temporarily. */ TTYOUTQ_REMOVE_HEAD(to); to->to_begin = 0; if (to->to_end <= TTYOUTQ_DATASIZE) to->to_end = 0; else to->to_end -= TTYOUTQ_DATASIZE; /* Temporary unlock and copy the data to userspace. */ tty_unlock(tp); error = uiomove(tob->tob_data + cbegin, clen, uio); tty_lock(tp); /* Block can now be readded to the list. */ TTYOUTQ_RECYCLE(to, tob); } else { char ob[TTYOUTQ_DATASIZE - 1]; /* * Slow path: store data in a temporary buffer. */ memcpy(ob, tob->tob_data + cbegin, clen); to->to_begin += clen; MPASS(to->to_begin < TTYOUTQ_DATASIZE); /* Temporary unlock and copy the data to userspace. */ tty_unlock(tp); error = uiomove(ob, clen, uio); tty_lock(tp); } if (error != 0) return (error); } return (0); } size_t ttyoutq_write(struct ttyoutq *to, const void *buf, size_t nbytes) { const char *cbuf = buf; struct ttyoutq_block *tob; unsigned int boff; size_t l; while (nbytes > 0) { boff = to->to_end % TTYOUTQ_DATASIZE; if (to->to_end == 0) { /* First time we're being used or drained. */ MPASS(to->to_begin == 0); tob = to->to_firstblock; if (tob == NULL) { /* Queue has no blocks. */ break; } to->to_lastblock = tob; } else if (boff == 0) { /* We reached the end of this block on last write. */ tob = to->to_lastblock->tob_next; if (tob == NULL) { /* We've reached the watermark. */ break; } to->to_lastblock = tob; } else { tob = to->to_lastblock; } /* Don't copy more than was requested. */ l = MIN(nbytes, TTYOUTQ_DATASIZE - boff); MPASS(l > 0); memcpy(tob->tob_data + boff, cbuf, l); cbuf += l; nbytes -= l; to->to_end += l; } return (cbuf - (const char *)buf); } int ttyoutq_write_nofrag(struct ttyoutq *to, const void *buf, size_t nbytes) { size_t ret; if (ttyoutq_bytesleft(to) < nbytes) return (-1); /* We should always be able to write it back. */ ret = ttyoutq_write(to, buf, nbytes); MPASS(ret == nbytes); return (0); } static void ttyoutq_startup(void *dummy) { ttyoutq_zone = uma_zcreate("ttyoutq", sizeof(struct ttyoutq_block), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } SYSINIT(ttyoutq, SI_SUB_DRIVERS, SI_ORDER_FIRST, ttyoutq_startup, NULL); Index: head/sys/kern/tty_pts.c =================================================================== --- head/sys/kern/tty_pts.c (revision 326270) +++ head/sys/kern/tty_pts.c (revision 326271) @@ -1,869 +1,871 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2008 Ed Schouten * All rights reserved. * * Portions of this software were developed under sponsorship from Snow * B.V., the Netherlands. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* Add compatibility bits for FreeBSD. */ #define PTS_COMPAT /* Add pty(4) compat bits. */ #define PTS_EXTERNAL /* Add bits to make Linux binaries work. */ #define PTS_LINUX #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Our utmp(5) format is limited to 8-byte TTY line names. This means * we can at most allocate 1000 pseudo-terminals ("pts/999"). Allow * users to increase this number, assuming they have manually increased * UT_LINESIZE. */ static struct unrhdr *pts_pool; static MALLOC_DEFINE(M_PTS, "pts", "pseudo tty device"); /* * Per-PTS structure. * * List of locks * (t) locked by tty_lock() * (c) const until freeing */ struct pts_softc { int pts_unit; /* (c) Device unit number. */ unsigned int pts_flags; /* (t) Device flags. */ #define PTS_PKT 0x1 /* Packet mode. */ #define PTS_FINISHED 0x2 /* Return errors on read()/write(). */ char pts_pkt; /* (t) Unread packet mode data. */ struct cv pts_inwait; /* (t) Blocking write() on master. */ struct selinfo pts_inpoll; /* (t) Select queue for write(). */ struct cv pts_outwait; /* (t) Blocking read() on master. */ struct selinfo pts_outpoll; /* (t) Select queue for read(). */ #ifdef PTS_EXTERNAL struct cdev *pts_cdev; /* (c) Master device node. */ #endif /* PTS_EXTERNAL */ struct ucred *pts_cred; /* (c) Resource limit. */ }; /* * Controller-side file operations. */ static int ptsdev_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int error = 0; char pkt; if (uio->uio_resid == 0) return (0); tty_lock(tp); for (;;) { /* * Implement packet mode. When packet mode is turned on, * the first byte contains a bitmask of events that * occurred (start, stop, flush, window size, etc). */ if (psc->pts_flags & PTS_PKT && psc->pts_pkt) { pkt = psc->pts_pkt; psc->pts_pkt = 0; tty_unlock(tp); error = ureadc(pkt, uio); return (error); } /* * Transmit regular data. * * XXX: We shouldn't use ttydisc_getc_poll()! Even * though in this implementation, there is likely going * to be data, we should just call ttydisc_getc_uio() * and use its return value to sleep. 
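For context, a minimal userspace consumer of the packet mode handled above might look like this; enable_pkt() and read_packet() are hypothetical helper names, while TIOCPKT and TIOCPKT_DATA are the real definitions pulled in through <sys/ioctl.h>.

#include <sys/ioctl.h>
#include <stdio.h>
#include <unistd.h>

/* Turn on packet mode on an already-open pseudo-terminal master. */
static int
enable_pkt(int mfd)
{
        int on = 1;

        return (ioctl(mfd, TIOCPKT, &on));
}

/* Classify the status byte that now precedes every read. */
static ssize_t
read_packet(int mfd, char *buf, size_t len)
{
        ssize_t n;

        n = read(mfd, buf, len);
        if (n > 0 && buf[0] == TIOCPKT_DATA)
                printf("%zd data bytes follow\n", n - 1);
        else if (n > 0)
                printf("control event %#x\n", (unsigned char)buf[0]);
        return (n);
}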
*/ if (ttydisc_getc_poll(tp)) { if (psc->pts_flags & PTS_PKT) { /* * XXX: Small race. Fortunately PTY * consumers aren't multithreaded. */ tty_unlock(tp); error = ureadc(TIOCPKT_DATA, uio); if (error) return (error); tty_lock(tp); } error = ttydisc_getc_uio(tp, uio); break; } /* Maybe the device isn't used anyway. */ if (psc->pts_flags & PTS_FINISHED) break; /* Wait for more data. */ if (fp->f_flag & O_NONBLOCK) { error = EWOULDBLOCK; break; } error = cv_wait_sig(&psc->pts_outwait, tp->t_mtx); if (error != 0) break; } tty_unlock(tp); return (error); } static int ptsdev_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); char ib[256], *ibstart; size_t iblen, rintlen; int error = 0; if (uio->uio_resid == 0) return (0); for (;;) { ibstart = ib; iblen = MIN(uio->uio_resid, sizeof ib); error = uiomove(ib, iblen, uio); tty_lock(tp); if (error != 0) { iblen = 0; goto done; } /* * When possible, avoid the slow path. rint_bypass() * copies all input to the input queue at once. */ MPASS(iblen > 0); do { rintlen = ttydisc_rint_simple(tp, ibstart, iblen); ibstart += rintlen; iblen -= rintlen; if (iblen == 0) { /* All data written. */ break; } /* Maybe the device isn't used anyway. */ if (psc->pts_flags & PTS_FINISHED) { error = EIO; goto done; } /* Wait for more data. */ if (fp->f_flag & O_NONBLOCK) { error = EWOULDBLOCK; goto done; } /* Wake up users on the slave side. */ ttydisc_rint_done(tp); error = cv_wait_sig(&psc->pts_inwait, tp->t_mtx); if (error != 0) goto done; } while (iblen > 0); if (uio->uio_resid == 0) break; tty_unlock(tp); } done: ttydisc_rint_done(tp); tty_unlock(tp); /* * Don't account for the part of the buffer that we couldn't * pass to the TTY. */ uio->uio_resid += iblen; return (error); } static int ptsdev_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int error = 0, sig; switch (cmd) { case FIODTYPE: *(int *)data = D_TTY; return (0); case FIONBIO: /* This device supports non-blocking operation. */ return (0); case FIONREAD: tty_lock(tp); if (psc->pts_flags & PTS_FINISHED) { /* Force read() to be called. */ *(int *)data = 1; } else { *(int *)data = ttydisc_getc_poll(tp); } tty_unlock(tp); return (0); case FIODGNAME: { struct fiodgname_arg *fgn; const char *p; int i; /* Reverse device name lookups, for ptsname() and ttyname(). */ fgn = data; p = tty_devname(tp); i = strlen(p) + 1; if (i > fgn->len) return (EINVAL); return copyout(p, fgn->buf, i); } /* * We need to implement TIOCGPGRP and TIOCGSID here again. When * called on the pseudo-terminal master, it should not check if * the terminal is the foreground terminal of the calling * process. * * TIOCGETA is also implemented here. Various Linux PTY routines * often call isatty(), which is implemented by tcgetattr(). */ #ifdef PTS_LINUX case TIOCGETA: /* Obtain terminal flags through tcgetattr(). */ tty_lock(tp); *(struct termios*)data = tp->t_termios; tty_unlock(tp); return (0); #endif /* PTS_LINUX */ case TIOCSETAF: case TIOCSETAW: /* * We must make sure we turn tcsetattr() calls of TCSAFLUSH and * TCSADRAIN into something different. If an application would * call TCSAFLUSH or TCSADRAIN on the master descriptor, it may * deadlock waiting for all data to be read. */ cmd = TIOCSETA; break; #if defined(PTS_COMPAT) || defined(PTS_LINUX) case TIOCGPTN: /* * Get the device unit number. 
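An illustrative userspace counterpart to this compat path, assuming TIOCGPTN is visible through <sys/ioctl.h> as it is for the Linux-emulation consumers it serves:

#include <sys/ioctl.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
        unsigned int unit;
        int mfd;

        mfd = posix_openpt(O_RDWR | O_NOCTTY);
        if (mfd < 0 || ioctl(mfd, TIOCGPTN, &unit) < 0) {
                perror("posix_openpt/TIOCGPTN");
                return (1);
        }
        /* The unit maps onto the "pts/%u" name created by tty_makedev(). */
        printf("slave is /dev/pts/%u\n", unit);
        return (0);
}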
*/ if (psc->pts_unit < 0) return (ENOTTY); *(unsigned int *)data = psc->pts_unit; return (0); #endif /* PTS_COMPAT || PTS_LINUX */ case TIOCGPGRP: /* Get the foreground process group ID. */ tty_lock(tp); if (tp->t_pgrp != NULL) *(int *)data = tp->t_pgrp->pg_id; else *(int *)data = NO_PID; tty_unlock(tp); return (0); case TIOCGSID: /* Get the session leader process ID. */ tty_lock(tp); if (tp->t_session == NULL) error = ENOTTY; else *(int *)data = tp->t_session->s_sid; tty_unlock(tp); return (error); case TIOCPTMASTER: /* Yes, we are a pseudo-terminal master. */ return (0); case TIOCSIG: /* Signal the foreground process group. */ sig = *(int *)data; if (sig < 1 || sig >= NSIG) return (EINVAL); tty_lock(tp); tty_signal_pgrp(tp, sig); tty_unlock(tp); return (0); case TIOCPKT: /* Enable/disable packet mode. */ tty_lock(tp); if (*(int *)data) psc->pts_flags |= PTS_PKT; else psc->pts_flags &= ~PTS_PKT; tty_unlock(tp); return (0); } /* Just redirect this ioctl to the slave device. */ tty_lock(tp); error = tty_ioctl(tp, cmd, data, fp->f_flag, td); tty_unlock(tp); if (error == ENOIOCTL) error = ENOTTY; return (error); } static int ptsdev_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int revents = 0; tty_lock(tp); if (psc->pts_flags & PTS_FINISHED) { /* Slave device is not opened. */ tty_unlock(tp); return ((events & (POLLIN|POLLRDNORM)) | POLLHUP); } if (events & (POLLIN|POLLRDNORM)) { /* See if we can getc something. */ if (ttydisc_getc_poll(tp) || (psc->pts_flags & PTS_PKT && psc->pts_pkt)) revents |= events & (POLLIN|POLLRDNORM); } if (events & (POLLOUT|POLLWRNORM)) { /* See if we can rint something. */ if (ttydisc_rint_poll(tp)) revents |= events & (POLLOUT|POLLWRNORM); } /* * No need to check for POLLHUP here. This device cannot be used * as a callout device, which means we always have a carrier, * because the master is. */ if (revents == 0) { /* * This code might look misleading, but the naming of * poll events on this side is the opposite of the slave * device. */ if (events & (POLLIN|POLLRDNORM)) selrecord(td, &psc->pts_outpoll); if (events & (POLLOUT|POLLWRNORM)) selrecord(td, &psc->pts_inpoll); } tty_unlock(tp); return (revents); } /* * kqueue support. 
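Before the in-kernel filters below, a sketch of how a process would consume them; wait_readable() is a hypothetical helper built on the standard kqueue(2)/kevent(2) interface.

#include <sys/types.h>
#include <sys/event.h>
#include <stdio.h>
#include <unistd.h>

/*
 * Block until the master becomes readable via the EVFILT_READ filter
 * registered by ptsdev_kqfilter(); kev.data then holds the byte count
 * reported by ttydisc_getc_poll().
 */
static int
wait_readable(int mfd)
{
        struct kevent kev;
        int kq, n;

        kq = kqueue();
        if (kq < 0)
                return (-1);
        EV_SET(&kev, mfd, EVFILT_READ, EV_ADD, 0, 0, NULL);
        n = kevent(kq, &kev, 1, &kev, 1, NULL); /* register and wait */
        if (n > 0 && (kev.flags & EV_EOF))
                printf("slave side finished\n");
        close(kq);
        return (n);
}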
*/ static void pts_kqops_read_detach(struct knote *kn) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); knlist_remove(&psc->pts_outpoll.si_note, kn, 0); } static int pts_kqops_read_event(struct knote *kn, long hint) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); if (psc->pts_flags & PTS_FINISHED) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_data = ttydisc_getc_poll(tp); return (kn->kn_data > 0); } } static void pts_kqops_write_detach(struct knote *kn) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); knlist_remove(&psc->pts_inpoll.si_note, kn, 0); } static int pts_kqops_write_event(struct knote *kn, long hint) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); if (psc->pts_flags & PTS_FINISHED) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_data = ttydisc_rint_poll(tp); return (kn->kn_data > 0); } } static struct filterops pts_kqops_read = { .f_isfd = 1, .f_detach = pts_kqops_read_detach, .f_event = pts_kqops_read_event, }; static struct filterops pts_kqops_write = { .f_isfd = 1, .f_detach = pts_kqops_write_detach, .f_event = pts_kqops_write_event, }; static int ptsdev_kqfilter(struct file *fp, struct knote *kn) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int error = 0; tty_lock(tp); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &pts_kqops_read; knlist_add(&psc->pts_outpoll.si_note, kn, 1); break; case EVFILT_WRITE: kn->kn_fop = &pts_kqops_write; knlist_add(&psc->pts_inpoll.si_note, kn, 1); break; default: error = EINVAL; break; } tty_unlock(tp); return (error); } static int ptsdev_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { struct tty *tp = fp->f_data; #ifdef PTS_EXTERNAL struct pts_softc *psc = tty_softc(tp); #endif /* PTS_EXTERNAL */ struct cdev *dev = tp->t_dev; /* * According to POSIX, we must implement an fstat(). This also * makes this implementation compatible with Linux binaries, * because Linux calls fstat() on the pseudo-terminal master to * obtain st_rdev. * * XXX: POSIX also mentions we must fill in st_dev, but how? */ bzero(sb, sizeof *sb); #ifdef PTS_EXTERNAL if (psc->pts_cdev != NULL) sb->st_ino = sb->st_rdev = dev2udev(psc->pts_cdev); else #endif /* PTS_EXTERNAL */ sb->st_ino = sb->st_rdev = tty_udev(tp); sb->st_atim = dev->si_atime; sb->st_ctim = dev->si_ctime; sb->st_mtim = dev->si_mtime; sb->st_uid = dev->si_uid; sb->st_gid = dev->si_gid; sb->st_mode = dev->si_mode | S_IFCHR; return (0); } static int ptsdev_close(struct file *fp, struct thread *td) { struct tty *tp = fp->f_data; /* Deallocate TTY device. */ tty_lock(tp); tty_rel_gone(tp); /* * Open of /dev/ptmx or /dev/ptyXX changes the type of file * from DTYPE_VNODE to DTYPE_PTS. vn_open() increases vnode * use count, we need to decrement it, and possibly do other * required cleanup. 
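A sketch of the userspace view the fstat() implementation above enables; print_master_dev() is a hypothetical helper.

#include <sys/types.h>
#include <sys/stat.h>
#include <stdint.h>
#include <stdio.h>

/* Linux emulation relies on st_rdev being filled in for the master. */
static int
print_master_dev(int mfd)
{
        struct stat sb;

        if (fstat(mfd, &sb) == -1)
                return (-1);
        printf("rdev %#jx, mode %#o\n", (uintmax_t)sb.st_rdev,
            (unsigned int)sb.st_mode);
        return (0);
}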
*/ if (fp->f_vnode != NULL) return (vnops.fo_close(fp, td)); return (0); } static int ptsdev_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct tty *tp; kif->kf_type = KF_TYPE_PTS; tp = fp->f_data; kif->kf_un.kf_pts.kf_pts_dev = tty_udev(tp); kif->kf_un.kf_pts.kf_pts_dev_freebsd11 = kif->kf_un.kf_pts.kf_pts_dev; /* truncate */ strlcpy(kif->kf_path, tty_devname(tp), sizeof(kif->kf_path)); return (0); } static struct fileops ptsdev_ops = { .fo_read = ptsdev_read, .fo_write = ptsdev_write, .fo_truncate = invfo_truncate, .fo_ioctl = ptsdev_ioctl, .fo_poll = ptsdev_poll, .fo_kqfilter = ptsdev_kqfilter, .fo_stat = ptsdev_stat, .fo_close = ptsdev_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = ptsdev_fill_kinfo, .fo_flags = DFLAG_PASSABLE, }; /* * Driver-side hooks. */ static void ptsdrv_outwakeup(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); cv_broadcast(&psc->pts_outwait); selwakeup(&psc->pts_outpoll); KNOTE_LOCKED(&psc->pts_outpoll.si_note, 0); } static void ptsdrv_inwakeup(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); cv_broadcast(&psc->pts_inwait); selwakeup(&psc->pts_inpoll); KNOTE_LOCKED(&psc->pts_inpoll.si_note, 0); } static int ptsdrv_open(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); psc->pts_flags &= ~PTS_FINISHED; return (0); } static void ptsdrv_close(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); /* Wake up any blocked readers/writers. */ psc->pts_flags |= PTS_FINISHED; ptsdrv_outwakeup(tp); ptsdrv_inwakeup(tp); } static void ptsdrv_pktnotify(struct tty *tp, char event) { struct pts_softc *psc = tty_softc(tp); /* * Clear conflicting flags. */ switch (event) { case TIOCPKT_STOP: psc->pts_pkt &= ~TIOCPKT_START; break; case TIOCPKT_START: psc->pts_pkt &= ~TIOCPKT_STOP; break; case TIOCPKT_NOSTOP: psc->pts_pkt &= ~TIOCPKT_DOSTOP; break; case TIOCPKT_DOSTOP: psc->pts_pkt &= ~TIOCPKT_NOSTOP; break; } psc->pts_pkt |= event; ptsdrv_outwakeup(tp); } static void ptsdrv_free(void *softc) { struct pts_softc *psc = softc; /* Make device number available again. */ if (psc->pts_unit >= 0) free_unr(pts_pool, psc->pts_unit); chgptscnt(psc->pts_cred->cr_ruidinfo, -1, 0); racct_sub_cred(psc->pts_cred, RACCT_NPTS, 1); crfree(psc->pts_cred); seldrain(&psc->pts_inpoll); seldrain(&psc->pts_outpoll); knlist_destroy(&psc->pts_inpoll.si_note); knlist_destroy(&psc->pts_outpoll.si_note); #ifdef PTS_EXTERNAL /* Destroy master device as well. */ if (psc->pts_cdev != NULL) destroy_dev_sched(psc->pts_cdev); #endif /* PTS_EXTERNAL */ free(psc, M_PTS); } static struct ttydevsw pts_class = { .tsw_flags = TF_NOPREFIX, .tsw_outwakeup = ptsdrv_outwakeup, .tsw_inwakeup = ptsdrv_inwakeup, .tsw_open = ptsdrv_open, .tsw_close = ptsdrv_close, .tsw_pktnotify = ptsdrv_pktnotify, .tsw_free = ptsdrv_free, }; #ifndef PTS_EXTERNAL static #endif /* !PTS_EXTERNAL */ int pts_alloc(int fflags, struct thread *td, struct file *fp) { int unit, ok, error; struct tty *tp; struct pts_softc *psc; struct proc *p = td->td_proc; struct ucred *cred = td->td_ucred; /* Resource limiting. */ PROC_LOCK(p); error = racct_add(p, RACCT_NPTS, 1); if (error != 0) { PROC_UNLOCK(p); return (EAGAIN); } ok = chgptscnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_NPTS)); if (!ok) { racct_sub(p, RACCT_NPTS, 1); PROC_UNLOCK(p); return (EAGAIN); } PROC_UNLOCK(p); /* Try to allocate a new pts unit number. 
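The EAGAIN paths above enforce RLIMIT_NPTS; a process can inspect its own ceiling with the standard resource-limit calls. show_pts_limit() is a hypothetical helper.

#include <sys/types.h>
#include <sys/resource.h>
#include <stdint.h>
#include <stdio.h>

/* Report how many pseudo-terminals this process may still allocate. */
static int
show_pts_limit(void)
{
        struct rlimit rl;

        if (getrlimit(RLIMIT_NPTS, &rl) == -1)
                return (-1);
        printf("pts limit: cur %jd, max %jd\n",
            (intmax_t)rl.rlim_cur, (intmax_t)rl.rlim_max);
        return (0);
}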
*/ unit = alloc_unr(pts_pool); if (unit < 0) { racct_sub(p, RACCT_NPTS, 1); chgptscnt(cred->cr_ruidinfo, -1, 0); return (EAGAIN); } /* Allocate TTY and softc. */ psc = malloc(sizeof(struct pts_softc), M_PTS, M_WAITOK|M_ZERO); cv_init(&psc->pts_inwait, "ptsin"); cv_init(&psc->pts_outwait, "ptsout"); psc->pts_unit = unit; psc->pts_cred = crhold(cred); tp = tty_alloc(&pts_class, psc); knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx); knlist_init_mtx(&psc->pts_outpoll.si_note, tp->t_mtx); /* Expose the slave device as well. */ tty_makedev(tp, td->td_ucred, "pts/%u", psc->pts_unit); finit(fp, fflags, DTYPE_PTS, tp, &ptsdev_ops); return (0); } #ifdef PTS_EXTERNAL int pts_alloc_external(int fflags, struct thread *td, struct file *fp, struct cdev *dev, const char *name) { int ok, error; struct tty *tp; struct pts_softc *psc; struct proc *p = td->td_proc; struct ucred *cred = td->td_ucred; /* Resource limiting. */ PROC_LOCK(p); error = racct_add(p, RACCT_NPTS, 1); if (error != 0) { PROC_UNLOCK(p); return (EAGAIN); } ok = chgptscnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_NPTS)); if (!ok) { racct_sub(p, RACCT_NPTS, 1); PROC_UNLOCK(p); return (EAGAIN); } PROC_UNLOCK(p); /* Allocate TTY and softc. */ psc = malloc(sizeof(struct pts_softc), M_PTS, M_WAITOK|M_ZERO); cv_init(&psc->pts_inwait, "ptsin"); cv_init(&psc->pts_outwait, "ptsout"); psc->pts_unit = -1; psc->pts_cdev = dev; psc->pts_cred = crhold(cred); tp = tty_alloc(&pts_class, psc); knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx); knlist_init_mtx(&psc->pts_outpoll.si_note, tp->t_mtx); /* Expose the slave device as well. */ tty_makedev(tp, td->td_ucred, "%s", name); finit(fp, fflags, DTYPE_PTS, tp, &ptsdev_ops); return (0); } #endif /* PTS_EXTERNAL */ int sys_posix_openpt(struct thread *td, struct posix_openpt_args *uap) { int error, fd; struct file *fp; /* * POSIX states it's unspecified when other flags are passed. We * don't allow this. */ if (uap->flags & ~(O_RDWR|O_NOCTTY|O_CLOEXEC)) return (EINVAL); error = falloc(td, &fp, &fd, uap->flags); if (error) return (error); /* Allocate the actual pseudo-TTY. */ error = pts_alloc(FFLAGS(uap->flags & O_ACCMODE), td, fp); if (error != 0) { fdclose(td, fp, fd); fdrop(fp, td); return (error); } /* Pass it back to userspace. */ td->td_retval[0] = fd; fdrop(fp, td); return (0); } static void pts_init(void *unused) { pts_pool = new_unrhdr(0, INT_MAX, NULL); } SYSINIT(pts, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, pts_init, NULL); Index: head/sys/kern/tty_tty.c =================================================================== --- head/sys/kern/tty_tty.c (revision 326270) +++ head/sys/kern/tty_tty.c (revision 326271) @@ -1,96 +1,98 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2003 Poul-Henning Kamp. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include static d_open_t cttyopen; static struct cdevsw ctty_cdevsw = { .d_version = D_VERSION, .d_open = cttyopen, .d_name = "ctty", }; static struct cdev *ctty; static int cttyopen(struct cdev *dev, int flag, int mode, struct thread *td) { return (ENXIO); } static void ctty_clone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev) { struct proc *p; if (*dev != NULL) return; if (strcmp(name, "tty")) return; p = curproc; sx_sunlock(&clone_drain_lock); sx_slock(&proctree_lock); sx_slock(&clone_drain_lock); dev_lock(); if (!(p->p_flag & P_CONTROLT)) *dev = ctty; else if (p->p_session->s_ttyvp == NULL) *dev = ctty; else if (p->p_session->s_ttyvp->v_type == VBAD || p->p_session->s_ttyvp->v_rdev == NULL) { /* e.g. s_ttyvp was revoked */ *dev = ctty; } else *dev = p->p_session->s_ttyvp->v_rdev; dev_refl(*dev); dev_unlock(); sx_sunlock(&proctree_lock); } static void ctty_drvinit(void *unused) { EVENTHANDLER_REGISTER(dev_clone, ctty_clone, 0, 1000); ctty = make_dev_credf(MAKEDEV_ETERNAL, &ctty_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0666, "ctty"); } SYSINIT(cttydev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE,ctty_drvinit,NULL); Index: head/sys/kern/tty_ttydisc.c =================================================================== --- head/sys/kern/tty_ttydisc.c (revision 326270) +++ head/sys/kern/tty_ttydisc.c (revision 326271) @@ -1,1265 +1,1267 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2008 Ed Schouten * All rights reserved. * * Portions of this software were developed under sponsorship from Snow * B.V., the Netherlands. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include /* * Standard TTYDISC `termios' line discipline. */ /* Statistics. */ static unsigned long tty_nin = 0; SYSCTL_ULONG(_kern, OID_AUTO, tty_nin, CTLFLAG_RD, &tty_nin, 0, "Total amount of bytes received"); static unsigned long tty_nout = 0; SYSCTL_ULONG(_kern, OID_AUTO, tty_nout, CTLFLAG_RD, &tty_nout, 0, "Total amount of bytes transmitted"); /* termios comparison macro's. */ #define CMP_CC(v,c) (tp->t_termios.c_cc[v] != _POSIX_VDISABLE && \ tp->t_termios.c_cc[v] == (c)) #define CMP_FLAG(field,opt) (tp->t_termios.c_ ## field ## flag & (opt)) /* Characters that cannot be modified through c_cc. */ #define CTAB '\t' #define CNL '\n' #define CCR '\r' /* Character is a control character. */ #define CTL_VALID(c) ((c) == 0x7f || (unsigned char)(c) < 0x20) /* Control character should be processed on echo. */ #define CTL_ECHO(c,q) (!(q) && ((c) == CERASE2 || (c) == CTAB || \ (c) == CNL || (c) == CCR)) /* Control character should be printed using ^X notation. */ #define CTL_PRINT(c,q) ((c) == 0x7f || ((unsigned char)(c) < 0x20 && \ ((q) || ((c) != CTAB && (c) != CNL)))) /* Character is whitespace. */ #define CTL_WHITE(c) ((c) == ' ' || (c) == CTAB) /* Character is alphanumeric. */ #define CTL_ALNUM(c) (((c) >= '0' && (c) <= '9') || \ ((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z')) #define TTY_STACKBUF 256 void ttydisc_open(struct tty *tp) { ttydisc_optimize(tp); } void ttydisc_close(struct tty *tp) { /* Clean up our flags when leaving the discipline. */ tp->t_flags &= ~(TF_STOPPED|TF_HIWAT|TF_ZOMBIE); /* * POSIX states that we must drain output and flush input on * last close. Draining has already been done if possible. */ tty_flush(tp, FREAD | FWRITE); if (ttyhook_hashook(tp, close)) ttyhook_close(tp); } static int ttydisc_read_canonical(struct tty *tp, struct uio *uio, int ioflag) { char breakc[4] = { CNL }; /* enough to hold \n, VEOF and VEOL. */ int error; size_t clen, flen = 0, n = 1; unsigned char lastc = _POSIX_VDISABLE; #define BREAK_ADD(c) do { \ if (tp->t_termios.c_cc[c] != _POSIX_VDISABLE) \ breakc[n++] = tp->t_termios.c_cc[c]; \ } while (0) /* Determine which characters we should trigger on. */ BREAK_ADD(VEOF); BREAK_ADD(VEOL); #undef BREAK_ADD breakc[n] = '\0'; do { error = tty_wait_background(tp, curthread, SIGTTIN); if (error) return (error); /* * Quite a tricky case: unlike the old TTY * implementation, this implementation copies data back * to userspace in large chunks. Unfortunately, we can't * calculate the line length on beforehand if it crosses * ttyinq_block boundaries, because multiple reads could * then make this code read beyond the newline. * * This is why we limit the read to: * - The size the user has requested * - The blocksize (done in tty_inq.c) * - The amount of bytes until the newline * * This causes the line length to be recalculated after * each block has been copied to userspace. This will * cause the TTY layer to return data in chunks using * the blocksize (except the first and last blocks). */ clen = ttyinq_findchar(&tp->t_inq, breakc, uio->uio_resid, &lastc); /* No more data. */ if (clen == 0) { if (tp->t_flags & TF_ZOMBIE) return (0); else if (ioflag & IO_NDELAY) return (EWOULDBLOCK); error = tty_wait(tp, &tp->t_inwait); if (error) return (error); continue; } /* Don't send the EOF char back to userspace. */ if (CMP_CC(VEOF, lastc)) flen = 1; MPASS(flen <= clen); /* Read and throw away the EOF character. 
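The canonical-mode behaviour implemented above is what userspace opts into with ICANON; a minimal configuration sketch follows. set_canonical() is a hypothetical helper, and 0x04 is the conventional ^D.

#include <termios.h>
#include <unistd.h>

/*
 * With ICANON set, read() returns at most one line, and a VEOF byte is
 * consumed by the kernel (the "flen" skip above) rather than delivered.
 */
static int
set_canonical(int fd)
{
        struct termios t;

        if (tcgetattr(fd, &t) == -1)
                return (-1);
        t.c_lflag |= ICANON;
        t.c_cc[VEOF] = 0x04;                    /* ^D */
        t.c_cc[VEOL] = _POSIX_VDISABLE;         /* no extra line delimiter */
        return (tcsetattr(fd, TCSANOW, &t));
}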
*/ error = ttyinq_read_uio(&tp->t_inq, tp, uio, clen, flen); if (error) return (error); } while (uio->uio_resid > 0 && lastc == _POSIX_VDISABLE); return (0); } static int ttydisc_read_raw_no_timer(struct tty *tp, struct uio *uio, int ioflag) { size_t vmin = tp->t_termios.c_cc[VMIN]; ssize_t oresid = uio->uio_resid; int error; MPASS(tp->t_termios.c_cc[VTIME] == 0); /* * This routine implements the easy cases of read()s while in * non-canonical mode, namely case B and D, where we don't have * any timers at all. */ for (;;) { error = tty_wait_background(tp, curthread, SIGTTIN); if (error) return (error); error = ttyinq_read_uio(&tp->t_inq, tp, uio, uio->uio_resid, 0); if (error) return (error); if (uio->uio_resid == 0 || (oresid - uio->uio_resid) >= vmin) return (0); /* We have to wait for more. */ if (tp->t_flags & TF_ZOMBIE) return (0); else if (ioflag & IO_NDELAY) return (EWOULDBLOCK); error = tty_wait(tp, &tp->t_inwait); if (error) return (error); } } static int ttydisc_read_raw_read_timer(struct tty *tp, struct uio *uio, int ioflag, int oresid) { size_t vmin = MAX(tp->t_termios.c_cc[VMIN], 1); unsigned int vtime = tp->t_termios.c_cc[VTIME]; struct timeval end, now, left; int error, hz; MPASS(tp->t_termios.c_cc[VTIME] != 0); /* Determine when the read should be expired. */ end.tv_sec = vtime / 10; end.tv_usec = (vtime % 10) * 100000; getmicrotime(&now); timevaladd(&end, &now); for (;;) { error = tty_wait_background(tp, curthread, SIGTTIN); if (error) return (error); error = ttyinq_read_uio(&tp->t_inq, tp, uio, uio->uio_resid, 0); if (error) return (error); if (uio->uio_resid == 0 || (oresid - uio->uio_resid) >= vmin) return (0); /* Calculate how long we should wait. */ getmicrotime(&now); if (timevalcmp(&now, &end, >)) return (0); left = end; timevalsub(&left, &now); hz = tvtohz(&left); /* * We have to wait for more. If the timer expires, we * should return a 0-byte read. */ if (tp->t_flags & TF_ZOMBIE) return (0); else if (ioflag & IO_NDELAY) return (EWOULDBLOCK); error = tty_timedwait(tp, &tp->t_inwait, hz); if (error) return (error == EWOULDBLOCK ? 0 : error); } return (0); } static int ttydisc_read_raw_interbyte_timer(struct tty *tp, struct uio *uio, int ioflag) { size_t vmin = tp->t_termios.c_cc[VMIN]; ssize_t oresid = uio->uio_resid; int error; MPASS(tp->t_termios.c_cc[VMIN] != 0); MPASS(tp->t_termios.c_cc[VTIME] != 0); /* * When using the interbyte timer, the timer should be started * after the first byte has been received. We just call into the * generic read timer code after we've received the first byte. */ for (;;) { error = tty_wait_background(tp, curthread, SIGTTIN); if (error) return (error); error = ttyinq_read_uio(&tp->t_inq, tp, uio, uio->uio_resid, 0); if (error) return (error); if (uio->uio_resid == 0 || (oresid - uio->uio_resid) >= vmin) return (0); /* * Not enough data, but we did receive some, which means * we'll now start using the interbyte timer. */ if (oresid != uio->uio_resid) break; /* We have to wait for more. 
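The three raw-read helpers correspond to the classic VMIN/VTIME cases. This hypothetical set_interbyte() selects the interbyte-timer case the surrounding function implements: block for the first byte, then allow at most half a second of silence between the rest.

#include <termios.h>

static int
set_interbyte(int fd)
{
        struct termios t;

        if (tcgetattr(fd, &t) == -1)
                return (-1);
        t.c_lflag &= ~ICANON;
        t.c_cc[VMIN] = 32;      /* return once 32 bytes arrive... */
        t.c_cc[VTIME] = 5;      /* ...or after 0.5s of interbyte silence */
        return (tcsetattr(fd, TCSANOW, &t));
}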
*/ if (tp->t_flags & TF_ZOMBIE) return (0); else if (ioflag & IO_NDELAY) return (EWOULDBLOCK); error = tty_wait(tp, &tp->t_inwait); if (error) return (error); } return ttydisc_read_raw_read_timer(tp, uio, ioflag, oresid); } int ttydisc_read(struct tty *tp, struct uio *uio, int ioflag) { int error; tty_lock_assert(tp, MA_OWNED); if (uio->uio_resid == 0) return (0); if (CMP_FLAG(l, ICANON)) error = ttydisc_read_canonical(tp, uio, ioflag); else if (tp->t_termios.c_cc[VTIME] == 0) error = ttydisc_read_raw_no_timer(tp, uio, ioflag); else if (tp->t_termios.c_cc[VMIN] == 0) error = ttydisc_read_raw_read_timer(tp, uio, ioflag, uio->uio_resid); else error = ttydisc_read_raw_interbyte_timer(tp, uio, ioflag); if (ttyinq_bytesleft(&tp->t_inq) >= tp->t_inlow || ttyinq_bytescanonicalized(&tp->t_inq) == 0) { /* Unset the input watermark when we've got enough space. */ tty_hiwat_in_unblock(tp); } return (error); } static __inline unsigned int ttydisc_findchar(const char *obstart, unsigned int oblen) { const char *c = obstart; while (oblen--) { if (CTL_VALID(*c)) break; c++; } return (c - obstart); } static int ttydisc_write_oproc(struct tty *tp, char c) { unsigned int scnt, error; MPASS(CMP_FLAG(o, OPOST)); MPASS(CTL_VALID(c)); #define PRINT_NORMAL() ttyoutq_write_nofrag(&tp->t_outq, &c, 1) switch (c) { case CEOF: /* End-of-text dropping. */ if (CMP_FLAG(o, ONOEOT)) return (0); return PRINT_NORMAL(); case CERASE2: /* Handle backspace to fix tab expansion. */ if (PRINT_NORMAL() != 0) return (-1); if (tp->t_column > 0) tp->t_column--; return (0); case CTAB: /* Tab expansion. */ scnt = 8 - (tp->t_column & 7); if (CMP_FLAG(o, TAB3)) { error = ttyoutq_write_nofrag(&tp->t_outq, " ", scnt); } else { error = PRINT_NORMAL(); } if (error) return (-1); tp->t_column += scnt; MPASS((tp->t_column % 8) == 0); return (0); case CNL: /* Newline conversion. */ if (CMP_FLAG(o, ONLCR)) { /* Convert \n to \r\n. */ error = ttyoutq_write_nofrag(&tp->t_outq, "\r\n", 2); } else { error = PRINT_NORMAL(); } if (error) return (-1); if (CMP_FLAG(o, ONLCR|ONLRET)) { tp->t_column = tp->t_writepos = 0; ttyinq_reprintpos_set(&tp->t_inq); } return (0); case CCR: /* Carriage return to newline conversion. */ if (CMP_FLAG(o, OCRNL)) c = CNL; /* Omit carriage returns on column 0. */ if (CMP_FLAG(o, ONOCR) && tp->t_column == 0) return (0); if (PRINT_NORMAL() != 0) return (-1); tp->t_column = tp->t_writepos = 0; ttyinq_reprintpos_set(&tp->t_inq); return (0); } /* * Invisible control character. Print it, but don't * increase the column count. */ return PRINT_NORMAL(); #undef PRINT_NORMAL } /* * Just like the old TTY implementation, we need to copy data in chunks * into a temporary buffer. One of the reasons why we need to do this, * is because output processing (only TAB3 though) may allow the buffer * to grow eight times. */ int ttydisc_write(struct tty *tp, struct uio *uio, int ioflag) { char ob[TTY_STACKBUF]; char *obstart; int error = 0; unsigned int oblen = 0; tty_lock_assert(tp, MA_OWNED); if (tp->t_flags & TF_ZOMBIE) return (EIO); /* * We don't need to check whether the process is the foreground * process group or if we have a carrier. This is already done * in ttydev_write(). */ while (uio->uio_resid > 0) { unsigned int nlen; MPASS(oblen == 0); /* Step 1: read data. */ obstart = ob; nlen = MIN(uio->uio_resid, sizeof ob); tty_unlock(tp); error = uiomove(ob, nlen, uio); tty_lock(tp); if (error != 0) break; oblen = nlen; if (tty_gone(tp)) { error = ENXIO; break; } MPASS(oblen > 0); /* Step 2: process data. 
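The CTAB case above advances the column to the next multiple of eight; restated as a self-contained check (illustrative only):

#include <assert.h>

/*
 * Column advance for tab expansion; mirrors "scnt = 8 - (t_column & 7)"
 * in ttydisc_write_oproc(). With TAB3, this many spaces replace the tab.
 */
static unsigned int
tab_advance(unsigned int column)
{
        return (8 - (column & 7));
}

int
main(void)
{
        assert(tab_advance(0) == 8);
        assert(tab_advance(5) == 3);
        /* The resulting column is always a multiple of eight. */
        assert((5 + tab_advance(5)) % 8 == 0);
        return (0);
}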
*/ do { unsigned int plen, wlen; /* Search for special characters for post processing. */ if (CMP_FLAG(o, OPOST)) { plen = ttydisc_findchar(obstart, oblen); } else { plen = oblen; } if (plen == 0) { /* * We're going to process a character * that needs processing */ if (ttydisc_write_oproc(tp, *obstart) == 0) { obstart++; oblen--; tp->t_writepos = tp->t_column; ttyinq_reprintpos_set(&tp->t_inq); continue; } } else { /* We're going to write regular data. */ wlen = ttyoutq_write(&tp->t_outq, obstart, plen); obstart += wlen; oblen -= wlen; tp->t_column += wlen; tp->t_writepos = tp->t_column; ttyinq_reprintpos_set(&tp->t_inq); if (wlen == plen) continue; } /* Watermark reached. Try to sleep. */ tp->t_flags |= TF_HIWAT_OUT; if (ioflag & IO_NDELAY) { error = EWOULDBLOCK; goto done; } /* * The driver may write back the data * synchronously. Be sure to check the high * water mark before going to sleep. */ ttydevsw_outwakeup(tp); if ((tp->t_flags & TF_HIWAT_OUT) == 0) continue; error = tty_wait(tp, &tp->t_outwait); if (error) goto done; if (tp->t_flags & TF_ZOMBIE) { error = EIO; goto done; } } while (oblen > 0); } done: if (!tty_gone(tp)) ttydevsw_outwakeup(tp); /* * Add the amount of bytes that we didn't process back to the * uio counters. We need to do this to make sure write() doesn't * count the bytes we didn't store in the queue. */ uio->uio_resid += oblen; return (error); } void ttydisc_optimize(struct tty *tp) { tty_lock_assert(tp, MA_OWNED); if (ttyhook_hashook(tp, rint_bypass)) { tp->t_flags |= TF_BYPASS; } else if (ttyhook_hashook(tp, rint)) { tp->t_flags &= ~TF_BYPASS; } else if (!CMP_FLAG(i, ICRNL|IGNCR|IMAXBEL|INLCR|ISTRIP|IXON) && (!CMP_FLAG(i, BRKINT) || CMP_FLAG(i, IGNBRK)) && (!CMP_FLAG(i, PARMRK) || CMP_FLAG(i, IGNPAR|IGNBRK) == (IGNPAR|IGNBRK)) && !CMP_FLAG(l, ECHO|ICANON|IEXTEN|ISIG|PENDIN)) { tp->t_flags |= TF_BYPASS; } else { tp->t_flags &= ~TF_BYPASS; } } void ttydisc_modem(struct tty *tp, int open) { tty_lock_assert(tp, MA_OWNED); if (open) cv_broadcast(&tp->t_dcdwait); /* * Ignore modem status lines when CLOCAL is turned on, but don't * enter the zombie state when the TTY isn't opened, because * that would cause the TTY to be in zombie state after being * opened. */ if (!tty_opened(tp) || CMP_FLAG(c, CLOCAL)) return; if (open == 0) { /* * Lost carrier. */ tp->t_flags |= TF_ZOMBIE; tty_signal_sessleader(tp, SIGHUP); tty_flush(tp, FREAD|FWRITE); } else { /* * Carrier is back again. */ /* XXX: what should we do here? */ } } static int ttydisc_echo_force(struct tty *tp, char c, int quote) { if (CMP_FLAG(o, OPOST) && CTL_ECHO(c, quote)) { /* * Only perform postprocessing when OPOST is turned on * and the character is an unquoted BS/TB/NL/CR. */ return ttydisc_write_oproc(tp, c); } else if (CMP_FLAG(l, ECHOCTL) && CTL_PRINT(c, quote)) { /* * Only use ^X notation when ECHOCTL is turned on and * we've got an quoted control character. * * Print backspaces when echoing an end-of-file. */ char ob[4] = "^?\b\b"; /* Print ^X notation. */ if (c != 0x7f) ob[1] = c + 'A' - 1; if (!quote && CMP_CC(VEOF, c)) { return ttyoutq_write_nofrag(&tp->t_outq, ob, 4); } else { tp->t_column += 2; return ttyoutq_write_nofrag(&tp->t_outq, ob, 2); } } else { /* Can just be printed. */ tp->t_column++; return ttyoutq_write_nofrag(&tp->t_outq, &c, 1); } } static int ttydisc_echo(struct tty *tp, char c, int quote) { /* * Only echo characters when ECHO is turned on, or ECHONL when * the character is an unquoted newline. 
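The ECHOCTL branch in ttydisc_echo_force() prints control characters in ^X notation; the transform on its own, as an illustrative program:

#include <stdio.h>

/*
 * 0x7f echoes as "^?"; any other control byte c echoes as '^' followed
 * by c + 'A' - 1, matching "ob[1] = c + 'A' - 1" in the code above.
 */
static void
caret(unsigned char c, char ob[2])
{
        ob[0] = '^';
        ob[1] = (c == 0x7f) ? '?' : c + 'A' - 1;
}

int
main(void)
{
        char ob[2];

        caret(0x03, ob);        /* the usual SIGINT character */
        printf("%.2s\n", ob);   /* prints "^C" */
        return (0);
}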
*/ if (!CMP_FLAG(l, ECHO) && (!CMP_FLAG(l, ECHONL) || c != CNL || quote)) return (0); return ttydisc_echo_force(tp, c, quote); } static void ttydisc_reprint_char(void *d, char c, int quote) { struct tty *tp = d; ttydisc_echo(tp, c, quote); } static void ttydisc_reprint(struct tty *tp) { cc_t c; /* Print ^R\n, followed by the line. */ c = tp->t_termios.c_cc[VREPRINT]; if (c != _POSIX_VDISABLE) ttydisc_echo(tp, c, 0); ttydisc_echo(tp, CNL, 0); ttyinq_reprintpos_reset(&tp->t_inq); ttyinq_line_iterate_from_linestart(&tp->t_inq, ttydisc_reprint_char, tp); } struct ttydisc_recalc_length { struct tty *tp; unsigned int curlen; }; static void ttydisc_recalc_charlength(void *d, char c, int quote) { struct ttydisc_recalc_length *data = d; struct tty *tp = data->tp; if (CTL_PRINT(c, quote)) { if (CMP_FLAG(l, ECHOCTL)) data->curlen += 2; } else if (c == CTAB) { data->curlen += 8 - (data->curlen & 7); } else { data->curlen++; } } static unsigned int ttydisc_recalc_linelength(struct tty *tp) { struct ttydisc_recalc_length data = { tp, tp->t_writepos }; ttyinq_line_iterate_from_reprintpos(&tp->t_inq, ttydisc_recalc_charlength, &data); return (data.curlen); } static int ttydisc_rubchar(struct tty *tp) { char c; int quote; unsigned int prevpos, tablen; if (ttyinq_peekchar(&tp->t_inq, &c, "e) != 0) return (-1); ttyinq_unputchar(&tp->t_inq); if (CMP_FLAG(l, ECHO)) { /* * Remove the character from the screen. This is even * safe for characters that span multiple characters * (tabs, quoted, etc). */ if (tp->t_writepos >= tp->t_column) { /* Retype the sentence. */ ttydisc_reprint(tp); } else if (CMP_FLAG(l, ECHOE)) { if (CTL_PRINT(c, quote)) { /* Remove ^X formatted chars. */ if (CMP_FLAG(l, ECHOCTL)) { tp->t_column -= 2; ttyoutq_write_nofrag(&tp->t_outq, "\b\b \b\b", 6); } } else if (c == ' ') { /* Space character needs no rubbing. */ tp->t_column -= 1; ttyoutq_write_nofrag(&tp->t_outq, "\b", 1); } else if (c == CTAB) { /* * Making backspace work with tabs is * quite hard. Recalculate the length of * this character and remove it. * * Because terminal settings could be * changed while the line is being * inserted, the calculations don't have * to be correct. Make sure we keep the * tab length within proper bounds. */ prevpos = ttydisc_recalc_linelength(tp); if (prevpos >= tp->t_column) tablen = 1; else tablen = tp->t_column - prevpos; if (tablen > 8) tablen = 8; tp->t_column = prevpos; ttyoutq_write_nofrag(&tp->t_outq, "\b\b\b\b\b\b\b\b", tablen); return (0); } else { /* * Remove a regular character by * punching a space over it. */ tp->t_column -= 1; ttyoutq_write_nofrag(&tp->t_outq, "\b \b", 3); } } else { /* Don't print spaces. */ ttydisc_echo(tp, tp->t_termios.c_cc[VERASE], 0); } } return (0); } static void ttydisc_rubword(struct tty *tp) { char c; int quote, alnum; /* Strip whitespace first. */ for (;;) { if (ttyinq_peekchar(&tp->t_inq, &c, "e) != 0) return; if (!CTL_WHITE(c)) break; ttydisc_rubchar(tp); } /* * Record whether the last character from the previous iteration * was alphanumeric or not. We need this to implement ALTWERASE. 
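ALTWERASE changes what this loop treats as a word boundary: an alphanumeric/non-alphanumeric transition also stops the erase. A hypothetical configuration helper:

#include <termios.h>

static int
set_altwerase(int fd)
{
        struct termios t;

        if (tcgetattr(fd, &t) == -1)
                return (-1);
        t.c_lflag |= ALTWERASE;
        t.c_cc[VWERASE] = 0x17;         /* ^W triggers ttydisc_rubword() */
        return (tcsetattr(fd, TCSANOW, &t));
}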
*/ alnum = CTL_ALNUM(c); for (;;) { ttydisc_rubchar(tp); if (ttyinq_peekchar(&tp->t_inq, &c, "e) != 0) return; if (CTL_WHITE(c)) return; if (CMP_FLAG(l, ALTWERASE) && CTL_ALNUM(c) != alnum) return; } } int ttydisc_rint(struct tty *tp, char c, int flags) { int signal, quote = 0; char ob[3] = { 0xff, 0x00 }; size_t ol; tty_lock_assert(tp, MA_OWNED); atomic_add_long(&tty_nin, 1); if (ttyhook_hashook(tp, rint)) return ttyhook_rint(tp, c, flags); if (tp->t_flags & TF_BYPASS) goto processed; if (flags) { if (flags & TRE_BREAK) { if (CMP_FLAG(i, IGNBRK)) { /* Ignore break characters. */ return (0); } else if (CMP_FLAG(i, BRKINT)) { /* Generate SIGINT on break. */ tty_flush(tp, FREAD|FWRITE); tty_signal_pgrp(tp, SIGINT); return (0); } else { /* Just print it. */ goto parmrk; } } else if (flags & TRE_FRAMING || (flags & TRE_PARITY && CMP_FLAG(i, INPCK))) { if (CMP_FLAG(i, IGNPAR)) { /* Ignore bad characters. */ return (0); } else { /* Just print it. */ goto parmrk; } } } /* Allow any character to perform a wakeup. */ if (CMP_FLAG(i, IXANY)) tp->t_flags &= ~TF_STOPPED; /* Remove the top bit. */ if (CMP_FLAG(i, ISTRIP)) c &= ~0x80; /* Skip input processing when we want to print it literally. */ if (tp->t_flags & TF_LITERAL) { tp->t_flags &= ~TF_LITERAL; quote = 1; goto processed; } /* Special control characters that are implementation dependent. */ if (CMP_FLAG(l, IEXTEN)) { /* Accept the next character as literal. */ if (CMP_CC(VLNEXT, c)) { if (CMP_FLAG(l, ECHO)) { if (CMP_FLAG(l, ECHOE)) ttyoutq_write_nofrag(&tp->t_outq, "^\b", 2); else ttydisc_echo(tp, c, 0); } tp->t_flags |= TF_LITERAL; return (0); } } /* * Handle signal processing. */ if (CMP_FLAG(l, ISIG)) { if (CMP_FLAG(l, ICANON|IEXTEN) == (ICANON|IEXTEN)) { if (CMP_CC(VSTATUS, c)) { tty_signal_pgrp(tp, SIGINFO); return (0); } } /* * When compared to the old implementation, this * implementation also flushes the output queue. POSIX * is really brief about this, but does makes us assume * we have to do so. */ signal = 0; if (CMP_CC(VINTR, c)) { signal = SIGINT; } else if (CMP_CC(VQUIT, c)) { signal = SIGQUIT; } else if (CMP_CC(VSUSP, c)) { signal = SIGTSTP; } if (signal != 0) { /* * Echo the character before signalling the * processes. */ if (!CMP_FLAG(l, NOFLSH)) tty_flush(tp, FREAD|FWRITE); ttydisc_echo(tp, c, 0); tty_signal_pgrp(tp, signal); return (0); } } /* * Handle start/stop characters. */ if (CMP_FLAG(i, IXON)) { if (CMP_CC(VSTOP, c)) { /* Stop it if we aren't stopped yet. */ if ((tp->t_flags & TF_STOPPED) == 0) { tp->t_flags |= TF_STOPPED; return (0); } /* * Fallthrough: * When VSTART == VSTOP, we should make this key * toggle it. */ if (!CMP_CC(VSTART, c)) return (0); } if (CMP_CC(VSTART, c)) { tp->t_flags &= ~TF_STOPPED; return (0); } } /* Conversion of CR and NL. */ switch (c) { case CCR: if (CMP_FLAG(i, IGNCR)) return (0); if (CMP_FLAG(i, ICRNL)) c = CNL; break; case CNL: if (CMP_FLAG(i, INLCR)) c = CCR; break; } /* Canonical line editing. */ if (CMP_FLAG(l, ICANON)) { if (CMP_CC(VERASE, c) || CMP_CC(VERASE2, c)) { ttydisc_rubchar(tp); return (0); } else if (CMP_CC(VKILL, c)) { while (ttydisc_rubchar(tp) == 0); return (0); } else if (CMP_FLAG(l, IEXTEN)) { if (CMP_CC(VWERASE, c)) { ttydisc_rubword(tp); return (0); } else if (CMP_CC(VREPRINT, c)) { ttydisc_reprint(tp); return (0); } } } processed: if (CMP_FLAG(i, PARMRK) && (unsigned char)c == 0xff) { /* Print 0xff 0xff. */ ob[1] = 0xff; ol = 2; quote = 1; } else { ob[0] = c; ol = 1; } goto print; parmrk: if (CMP_FLAG(i, PARMRK)) { /* Prepend 0xff 0x00 0x.. 
*/ ob[2] = c; ol = 3; quote = 1; } else { ob[0] = c; ol = 1; } print: /* See if we can store this on the input queue. */ if (ttyinq_write_nofrag(&tp->t_inq, ob, ol, quote) != 0) { if (CMP_FLAG(i, IMAXBEL)) ttyoutq_write_nofrag(&tp->t_outq, "\a", 1); /* * Prevent a deadlock here. It may be possible that a * user has entered so much data, there is no data * available to read(), but the buffers are full anyway. * * Only enter the high watermark if the device driver * can actually transmit something. */ if (ttyinq_bytescanonicalized(&tp->t_inq) == 0) return (0); tty_hiwat_in_block(tp); return (-1); } /* * In raw mode, we canonicalize after receiving a single * character. Otherwise, we canonicalize when we receive a * newline, VEOL or VEOF, but only when it isn't quoted. */ if (!CMP_FLAG(l, ICANON) || (!quote && (c == CNL || CMP_CC(VEOL, c) || CMP_CC(VEOF, c)))) { ttyinq_canonicalize(&tp->t_inq); } ttydisc_echo(tp, c, quote); return (0); } size_t ttydisc_rint_simple(struct tty *tp, const void *buf, size_t len) { const char *cbuf; if (ttydisc_can_bypass(tp)) return (ttydisc_rint_bypass(tp, buf, len)); for (cbuf = buf; len-- > 0; cbuf++) { if (ttydisc_rint(tp, *cbuf, 0) != 0) break; } return (cbuf - (const char *)buf); } size_t ttydisc_rint_bypass(struct tty *tp, const void *buf, size_t len) { size_t ret; tty_lock_assert(tp, MA_OWNED); MPASS(tp->t_flags & TF_BYPASS); atomic_add_long(&tty_nin, len); if (ttyhook_hashook(tp, rint_bypass)) return ttyhook_rint_bypass(tp, buf, len); ret = ttyinq_write(&tp->t_inq, buf, len, 0); ttyinq_canonicalize(&tp->t_inq); if (ret < len) tty_hiwat_in_block(tp); return (ret); } void ttydisc_rint_done(struct tty *tp) { tty_lock_assert(tp, MA_OWNED); if (ttyhook_hashook(tp, rint_done)) ttyhook_rint_done(tp); /* Wake up readers. */ tty_wakeup(tp, FREAD); /* Wake up driver for echo. */ ttydevsw_outwakeup(tp); } size_t ttydisc_rint_poll(struct tty *tp) { size_t l; tty_lock_assert(tp, MA_OWNED); if (ttyhook_hashook(tp, rint_poll)) return ttyhook_rint_poll(tp); /* * XXX: Still allow character input when there's no space in the * buffers, but we haven't entered the high watermark. This is * to allow backspace characters to be inserted when in * canonical mode. */ l = ttyinq_bytesleft(&tp->t_inq); if (l == 0 && (tp->t_flags & TF_HIWAT_IN) == 0) return (1); return (l); } static void ttydisc_wakeup_watermark(struct tty *tp) { size_t c; c = ttyoutq_bytesleft(&tp->t_outq); if (tp->t_flags & TF_HIWAT_OUT) { /* Only allow us to run when we're below the watermark. */ if (c < tp->t_outlow) return; /* Reset the watermark. */ tp->t_flags &= ~TF_HIWAT_OUT; } else { /* Only run when we have data at all. */ if (c == 0) return; } tty_wakeup(tp, FWRITE); } size_t ttydisc_getc(struct tty *tp, void *buf, size_t len) { tty_lock_assert(tp, MA_OWNED); if (tp->t_flags & TF_STOPPED) return (0); if (ttyhook_hashook(tp, getc_inject)) return ttyhook_getc_inject(tp, buf, len); len = ttyoutq_read(&tp->t_outq, buf, len); if (ttyhook_hashook(tp, getc_capture)) ttyhook_getc_capture(tp, buf, len); ttydisc_wakeup_watermark(tp); atomic_add_long(&tty_nout, len); return (len); } int ttydisc_getc_uio(struct tty *tp, struct uio *uio) { int error = 0; ssize_t obytes = uio->uio_resid; size_t len; char buf[TTY_STACKBUF]; tty_lock_assert(tp, MA_OWNED); if (tp->t_flags & TF_STOPPED) return (0); /* * When a TTY hook is attached, we cannot perform unbuffered * copying to userspace. Just call ttydisc_getc() and * temporarily store data in a shadow buffer. 
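The tty_nin and tty_nout counters bumped throughout this file are exported read-only; a small observer program, with the sysctl names taken from the SYSCTL_ULONG declarations above:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
        unsigned long nin, nout;
        size_t len;

        len = sizeof(nin);
        if (sysctlbyname("kern.tty_nin", &nin, &len, NULL, 0) == -1)
                return (1);
        len = sizeof(nout);
        if (sysctlbyname("kern.tty_nout", &nout, &len, NULL, 0) == -1)
                return (1);
        printf("received %lu, transmitted %lu\n", nin, nout);
        return (0);
}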
*/ if (ttyhook_hashook(tp, getc_capture) || ttyhook_hashook(tp, getc_inject)) { while (uio->uio_resid > 0) { /* Read to shadow buffer. */ len = ttydisc_getc(tp, buf, MIN(uio->uio_resid, sizeof buf)); if (len == 0) break; /* Copy to userspace. */ tty_unlock(tp); error = uiomove(buf, len, uio); tty_lock(tp); if (error != 0) break; } } else { error = ttyoutq_read_uio(&tp->t_outq, tp, uio); ttydisc_wakeup_watermark(tp); atomic_add_long(&tty_nout, obytes - uio->uio_resid); } return (error); } size_t ttydisc_getc_poll(struct tty *tp) { tty_lock_assert(tp, MA_OWNED); if (tp->t_flags & TF_STOPPED) return (0); if (ttyhook_hashook(tp, getc_poll)) return ttyhook_getc_poll(tp); return ttyoutq_bytesused(&tp->t_outq); } /* * XXX: not really related to the TTYDISC, but we'd better put * tty_putchar() here, because we need to perform proper output * processing. */ int tty_putchar(struct tty *tp, char c) { tty_lock_assert(tp, MA_OWNED); if (tty_gone(tp)) return (-1); ttydisc_echo_force(tp, c, 0); tp->t_writepos = tp->t_column; ttyinq_reprintpos_set(&tp->t_inq); ttydevsw_outwakeup(tp); return (0); } Index: head/sys/kern/uipc_accf.c =================================================================== --- head/sys/kern/uipc_accf.c (revision 326270) +++ head/sys/kern/uipc_accf.c (revision 326271) @@ -1,306 +1,308 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2000 Paycounter, Inc. * Copyright (c) 2005 Robert N. M. Watson * Author: Alfred Perlstein , * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #define ACCEPT_FILTER_MOD #include "opt_param.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct mtx accept_filter_mtx; MTX_SYSINIT(accept_filter, &accept_filter_mtx, "accept_filter_mtx", MTX_DEF); #define ACCEPT_FILTER_LOCK() mtx_lock(&accept_filter_mtx) #define ACCEPT_FILTER_UNLOCK() mtx_unlock(&accept_filter_mtx) static SLIST_HEAD(, accept_filter) accept_filtlsthd = SLIST_HEAD_INITIALIZER(accept_filtlsthd); MALLOC_DEFINE(M_ACCF, "accf", "accept filter data"); static int unloadable = 0; SYSCTL_NODE(_net, OID_AUTO, accf, CTLFLAG_RW, 0, "Accept filters"); SYSCTL_INT(_net_accf, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0, "Allow unload of accept filters (not recommended)"); /* * Must be passed a malloc'd structure so we don't explode if the kld is * unloaded, we leak the struct on deallocation to deal with this, but if a * filter is loaded with the same name as a leaked one we re-use the entry. */ int accept_filt_add(struct accept_filter *filt) { struct accept_filter *p; ACCEPT_FILTER_LOCK(); SLIST_FOREACH(p, &accept_filtlsthd, accf_next) if (strcmp(p->accf_name, filt->accf_name) == 0) { if (p->accf_callback != NULL) { ACCEPT_FILTER_UNLOCK(); return (EEXIST); } else { p->accf_callback = filt->accf_callback; ACCEPT_FILTER_UNLOCK(); free(filt, M_ACCF); return (0); } } if (p == NULL) SLIST_INSERT_HEAD(&accept_filtlsthd, filt, accf_next); ACCEPT_FILTER_UNLOCK(); return (0); } int accept_filt_del(char *name) { struct accept_filter *p; p = accept_filt_get(name); if (p == NULL) return (ENOENT); p->accf_callback = NULL; return (0); } struct accept_filter * accept_filt_get(char *name) { struct accept_filter *p; ACCEPT_FILTER_LOCK(); SLIST_FOREACH(p, &accept_filtlsthd, accf_next) if (strcmp(p->accf_name, name) == 0) break; ACCEPT_FILTER_UNLOCK(); return (p); } int accept_filt_generic_mod_event(module_t mod, int event, void *data) { struct accept_filter *p; struct accept_filter *accfp = (struct accept_filter *) data; int error; switch (event) { case MOD_LOAD: p = malloc(sizeof(*p), M_ACCF, M_WAITOK); bcopy(accfp, p, sizeof(*p)); error = accept_filt_add(p); break; case MOD_UNLOAD: /* * Do not support unloading yet. we don't keep track of * refcounts and unloading an accept filter callback and then * having it called is a bad thing. A simple fix would be to * track the refcount in the struct accept_filter. */ if (unloadable != 0) { error = accept_filt_del(accfp->accf_name); } else error = EOPNOTSUPP; break; case MOD_SHUTDOWN: error = 0; break; default: error = EOPNOTSUPP; break; } return (error); } int accept_filt_getopt(struct socket *so, struct sockopt *sopt) { struct accept_filter_arg *afap; int error; error = 0; afap = malloc(sizeof(*afap), M_TEMP, M_WAITOK | M_ZERO); SOCK_LOCK(so); if ((so->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto out; } if (so->sol_accept_filter == NULL) { error = EINVAL; goto out; } strcpy(afap->af_name, so->sol_accept_filter->accf_name); if (so->sol_accept_filter_str != NULL) strcpy(afap->af_arg, so->sol_accept_filter_str); out: SOCK_UNLOCK(so); if (error == 0) error = sooptcopyout(sopt, afap, sizeof(*afap)); free(afap, M_TEMP); return (error); } int accept_filt_setopt(struct socket *so, struct sockopt *sopt) { struct accept_filter_arg *afap; struct accept_filter *afp; char *accept_filter_str = NULL; void *accept_filter_arg = NULL; int error; /* * Handle the simple delete case first. 
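For reference, the userspace side of this option; attach_httpready() is a hypothetical helper, and struct accept_filter_arg comes from <sys/socket.h>. Passing a NULL argument to setsockopt() instead triggers the delete case handled below.

#include <sys/types.h>
#include <sys/socket.h>
#include <string.h>

/*
 * Attach the accf_http(9) filter to a socket that is already
 * listening; connections are then held until a full request arrives.
 */
static int
attach_httpready(int lfd)
{
        struct accept_filter_arg afa;

        memset(&afa, 0, sizeof(afa));
        strcpy(afa.af_name, "httpready");
        return (setsockopt(lfd, SOL_SOCKET, SO_ACCEPTFILTER,
            &afa, sizeof(afa)));
}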
*/ if (sopt == NULL || sopt->sopt_val == NULL) { struct socket *sp, *sp1; int wakeup; SOCK_LOCK(so); if ((so->so_options & SO_ACCEPTCONN) == 0) { SOCK_UNLOCK(so); return (EINVAL); } if (so->sol_accept_filter == NULL) { SOCK_UNLOCK(so); return (0); } if (so->sol_accept_filter->accf_destroy != NULL) so->sol_accept_filter->accf_destroy(so); if (so->sol_accept_filter_str != NULL) free(so->sol_accept_filter_str, M_ACCF); so->sol_accept_filter = NULL; so->sol_accept_filter_arg = NULL; so->sol_accept_filter_str = NULL; so->so_options &= ~SO_ACCEPTFILTER; /* * Move from incomplete queue to complete only those * connections, that are blocked by us. */ wakeup = 0; TAILQ_FOREACH_SAFE(sp, &so->sol_incomp, so_list, sp1) { SOCK_LOCK(sp); if (sp->so_options & SO_ACCEPTFILTER) { TAILQ_REMOVE(&so->sol_incomp, sp, so_list); TAILQ_INSERT_TAIL(&so->sol_comp, sp, so_list); sp->so_qstate = SQ_COMP; sp->so_options &= ~SO_ACCEPTFILTER; so->sol_incqlen--; so->sol_qlen++; wakeup = 1; } SOCK_UNLOCK(sp); } if (wakeup) solisten_wakeup(so); /* unlocks */ else SOLISTEN_UNLOCK(so); return (0); } /* * Pre-allocate any memory we may need later to avoid blocking at * untimely moments. This does not optimize for invalid arguments. */ afap = malloc(sizeof(*afap), M_TEMP, M_WAITOK); error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap); afap->af_name[sizeof(afap->af_name)-1] = '\0'; afap->af_arg[sizeof(afap->af_arg)-1] = '\0'; if (error) { free(afap, M_TEMP); return (error); } afp = accept_filt_get(afap->af_name); if (afp == NULL) { free(afap, M_TEMP); return (ENOENT); } if (afp->accf_create != NULL && afap->af_name[0] != '\0') { size_t len = strlen(afap->af_name) + 1; accept_filter_str = malloc(len, M_ACCF, M_WAITOK); strcpy(accept_filter_str, afap->af_name); } /* * Require a listen socket; don't try to replace an existing filter * without first removing it. */ SOCK_LOCK(so); if ((so->so_options & SO_ACCEPTCONN) == 0 || so->sol_accept_filter != NULL) { error = EINVAL; goto out; } /* * Invoke the accf_create() method of the filter if required. The * socket mutex is held over this call, so create methods for filters * can't block. */ if (afp->accf_create != NULL) { accept_filter_arg = afp->accf_create(so, afap->af_arg); if (accept_filter_arg == NULL) { error = EINVAL; goto out; } } so->sol_accept_filter = afp; so->sol_accept_filter_arg = accept_filter_arg; so->sol_accept_filter_str = accept_filter_str; so->so_options |= SO_ACCEPTFILTER; out: SOCK_UNLOCK(so); if (accept_filter_str != NULL) free(accept_filter_str, M_ACCF); free(afap, M_TEMP); return (error); } Index: head/sys/kern/uipc_debug.c =================================================================== --- head/sys/kern/uipc_debug.c (revision 326270) +++ head/sys/kern/uipc_debug.c (revision 326271) @@ -1,534 +1,536 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2007 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Debugger routines relating to sockets, protocols, etc, for use in DDB. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #ifdef DDB #include static void db_print_sotype(short so_type) { switch (so_type) { case SOCK_STREAM: db_printf("SOCK_STREAM"); break; case SOCK_DGRAM: db_printf("SOCK_DGRAM"); break; case SOCK_RAW: db_printf("SOCK_RAW"); break; case SOCK_RDM: db_printf("SOCK_RDM"); break; case SOCK_SEQPACKET: db_printf("SOCK_SEQPACKET"); break; default: db_printf("unknown"); break; } } static void db_print_sooptions(short so_options) { int comma; comma = 0; if (so_options & SO_DEBUG) { db_printf("%sSO_DEBUG", comma ? ", " : ""); comma = 1; } if (so_options & SO_ACCEPTCONN) { db_printf("%sSO_ACCEPTCONN", comma ? ", " : ""); comma = 1; } if (so_options & SO_REUSEADDR) { db_printf("%sSO_REUSEADDR", comma ? ", " : ""); comma = 1; } if (so_options & SO_KEEPALIVE) { db_printf("%sSO_KEEPALIVE", comma ? ", " : ""); comma = 1; } if (so_options & SO_DONTROUTE) { db_printf("%sSO_DONTROUTE", comma ? ", " : ""); comma = 1; } if (so_options & SO_BROADCAST) { db_printf("%sSO_BROADCAST", comma ? ", " : ""); comma = 1; } if (so_options & SO_USELOOPBACK) { db_printf("%sSO_USELOOPBACK", comma ? ", " : ""); comma = 1; } if (so_options & SO_LINGER) { db_printf("%sSO_LINGER", comma ? ", " : ""); comma = 1; } if (so_options & SO_OOBINLINE) { db_printf("%sSO_OOBINLINE", comma ? ", " : ""); comma = 1; } if (so_options & SO_REUSEPORT) { db_printf("%sSO_REUSEPORT", comma ? ", " : ""); comma = 1; } if (so_options & SO_TIMESTAMP) { db_printf("%sSO_TIMESTAMP", comma ? ", " : ""); comma = 1; } if (so_options & SO_NOSIGPIPE) { db_printf("%sSO_NOSIGPIPE", comma ? ", " : ""); comma = 1; } if (so_options & SO_ACCEPTFILTER) { db_printf("%sSO_ACCEPTFILTER", comma ? ", " : ""); comma = 1; } if (so_options & SO_BINTIME) { db_printf("%sSO_BINTIME", comma ? ", " : ""); comma = 1; } if (so_options & SO_NO_OFFLOAD) { db_printf("%sSO_NO_OFFLOAD", comma ? ", " : ""); comma = 1; } if (so_options & SO_NO_DDP) { db_printf("%sSO_NO_DDP", comma ? ", " : ""); comma = 1; } } static void db_print_sostate(short so_state) { int comma; comma = 0; if (so_state & SS_NOFDREF) { db_printf("%sSS_NOFDREF", comma ? ", " : ""); comma = 1; } if (so_state & SS_ISCONNECTED) { db_printf("%sSS_ISCONNECTED", comma ? ", " : ""); comma = 1; } if (so_state & SS_ISCONNECTING) { db_printf("%sSS_ISCONNECTING", comma ? ", " : ""); comma = 1; } if (so_state & SS_ISDISCONNECTING) { db_printf("%sSS_ISDISCONNECTING", comma ? ", " : ""); comma = 1; } if (so_state & SS_NBIO) { db_printf("%sSS_NBIO", comma ? ", " : ""); comma = 1; } if (so_state & SS_ASYNC) { db_printf("%sSS_ASYNC", comma ? 
", " : ""); comma = 1; } if (so_state & SS_ISCONFIRMING) { db_printf("%sSS_ISCONFIRMING", comma ? ", " : ""); comma = 1; } if (so_state & SS_PROTOREF) { db_printf("%sSS_PROTOREF", comma ? ", " : ""); comma = 1; } } static void db_print_soqstate(int so_qstate) { int comma; comma = 0; if (so_qstate & SQ_INCOMP) { db_printf("%sSQ_INCOMP", comma ? ", " : ""); comma = 1; } if (so_qstate & SQ_COMP) { db_printf("%sSQ_COMP", comma ? ", " : ""); comma = 1; } } static void db_print_sbstate(short sb_state) { int comma; comma = 0; if (sb_state & SBS_CANTSENDMORE) { db_printf("%sSBS_CANTSENDMORE", comma ? ", " : ""); comma = 1; } if (sb_state & SBS_CANTRCVMORE) { db_printf("%sSBS_CANTRCVMORE", comma ? ", " : ""); comma = 1; } if (sb_state & SBS_RCVATMARK) { db_printf("%sSBS_RCVATMARK", comma ? ", " : ""); comma = 1; } } static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_domain(struct domain *d, const char *domain_name, int indent) { db_print_indent(indent); db_printf("%s at %p\n", domain_name, d); indent += 2; db_print_indent(indent); db_printf("dom_family: %d ", d->dom_family); db_printf("dom_name: %s\n", d->dom_name); db_print_indent(indent); db_printf("dom_init: %p ", d->dom_init); db_printf("dom_externalize: %p ", d->dom_externalize); db_printf("dom_dispose: %p\n", d->dom_dispose); db_print_indent(indent); db_printf("dom_protosw: %p ", d->dom_protosw); db_printf("dom_next: %p\n", d->dom_next); db_print_indent(indent); db_printf("dom_rtattach: %p ", d->dom_rtattach); db_print_indent(indent); db_printf("dom_ifattach: %p ", d->dom_ifattach); db_printf("dom_ifdetach: %p\n", d->dom_ifdetach); } static void db_print_prflags(short pr_flags) { int comma; comma = 0; if (pr_flags & PR_ATOMIC) { db_printf("%sPR_ATOMIC", comma ? ", " : ""); comma = 1; } if (pr_flags & PR_ADDR) { db_printf("%sPR_ADDR", comma ? ", " : ""); comma = 1; } if (pr_flags & PR_CONNREQUIRED) { db_printf("%sPR_CONNREQUIRED", comma ? ", " : ""); comma = 1; } if (pr_flags & PR_WANTRCVD) { db_printf("%sPR_WANTRCVD", comma ? ", " : ""); comma = 1; } if (pr_flags & PR_RIGHTS) { db_printf("%sPR_RIGHTS", comma ? ", " : ""); comma = 1; } if (pr_flags & PR_IMPLOPCL) { db_printf("%sPR_IMPLOPCL", comma ? ", " : ""); comma = 1; } if (pr_flags & PR_LASTHDR) { db_printf("%sPR_LASTHDR", comma ? ", " : ""); comma = 1; } } static void db_print_protosw(struct protosw *pr, const char *prname, int indent) { db_print_indent(indent); db_printf("%s at %p\n", prname, pr); indent += 2; db_print_indent(indent); db_printf("pr_type: %d ", pr->pr_type); db_printf("pr_domain: %p\n", pr->pr_domain); if (pr->pr_domain != NULL) db_print_domain(pr->pr_domain, "pr_domain", indent); db_print_indent(indent); db_printf("pr_protocol: %d\n", pr->pr_protocol); db_print_indent(indent); db_printf("pr_flags: %d (", pr->pr_flags); db_print_prflags(pr->pr_flags); db_printf(")\n"); db_print_indent(indent); db_printf("pr_input: %p ", pr->pr_input); db_printf("pr_output: %p ", pr->pr_output); db_printf("pr_ctlinput: %p\n", pr->pr_ctlinput); db_print_indent(indent); db_printf("pr_ctloutput: %p ", pr->pr_ctloutput); db_printf("pr_init: %p\n", pr->pr_init); db_print_indent(indent); db_printf("pr_fasttimo: %p ", pr->pr_fasttimo); db_printf("pr_slowtimo: %p ", pr->pr_slowtimo); db_printf("pr_drain: %p\n", pr->pr_drain); } static void db_print_sbflags(short sb_flags) { int comma; comma = 0; if (sb_flags & SB_WAIT) { db_printf("%sSB_WAIT", comma ? ", " : ""); comma = 1; } if (sb_flags & SB_SEL) { db_printf("%sSB_SEL", comma ? 
", " : ""); comma = 1; } if (sb_flags & SB_ASYNC) { db_printf("%sSB_ASYNC", comma ? ", " : ""); comma = 1; } if (sb_flags & SB_UPCALL) { db_printf("%sSB_UPCALL", comma ? ", " : ""); comma = 1; } if (sb_flags & SB_NOINTR) { db_printf("%sSB_NOINTR", comma ? ", " : ""); comma = 1; } if (sb_flags & SB_AIO) { db_printf("%sSB_AIO", comma ? ", " : ""); comma = 1; } if (sb_flags & SB_KNOTE) { db_printf("%sSB_KNOTE", comma ? ", " : ""); comma = 1; } if (sb_flags & SB_AUTOSIZE) { db_printf("%sSB_AUTOSIZE", comma ? ", " : ""); comma = 1; } } static void db_print_sockbuf(struct sockbuf *sb, const char *sockbufname, int indent) { db_print_indent(indent); db_printf("%s at %p\n", sockbufname, sb); indent += 2; db_print_indent(indent); db_printf("sb_state: 0x%x (", sb->sb_state); db_print_sbstate(sb->sb_state); db_printf(")\n"); db_print_indent(indent); db_printf("sb_mb: %p ", sb->sb_mb); db_printf("sb_mbtail: %p ", sb->sb_mbtail); db_printf("sb_lastrecord: %p\n", sb->sb_lastrecord); db_print_indent(indent); db_printf("sb_sndptr: %p ", sb->sb_sndptr); db_printf("sb_sndptroff: %u\n", sb->sb_sndptroff); db_print_indent(indent); db_printf("sb_acc: %u ", sb->sb_acc); db_printf("sb_ccc: %u ", sb->sb_ccc); db_printf("sb_hiwat: %u ", sb->sb_hiwat); db_printf("sb_mbcnt: %u ", sb->sb_mbcnt); db_printf("sb_mbmax: %u\n", sb->sb_mbmax); db_print_indent(indent); db_printf("sb_mcnt: %u ", sb->sb_mcnt); db_printf("sb_ccnt: %u ", sb->sb_ccnt); db_printf("sb_ctl: %u ", sb->sb_ctl); db_printf("sb_lowat: %d ", sb->sb_lowat); db_printf("sb_timeo: %jd\n", sb->sb_timeo); db_print_indent(indent); db_printf("sb_flags: 0x%x (", sb->sb_flags); db_print_sbflags(sb->sb_flags); db_printf(")\n"); db_print_indent(indent); db_printf("sb_aiojobq first: %p\n", TAILQ_FIRST(&sb->sb_aiojobq)); } static void db_print_socket(struct socket *so, const char *socketname, int indent) { db_print_indent(indent); db_printf("%s at %p\n", socketname, so); indent += 2; db_print_indent(indent); db_printf("so_count: %d ", so->so_count); db_printf("so_type: %d (", so->so_type); db_print_sotype(so->so_type); db_printf(")\n"); db_print_indent(indent); db_printf("so_options: 0x%x (", so->so_options); db_print_sooptions(so->so_options); db_printf(")\n"); db_print_indent(indent); db_printf("so_linger: %d ", so->so_linger); db_printf("so_state: 0x%x (", so->so_state); db_print_sostate(so->so_state); db_printf(")\n"); db_print_indent(indent); db_printf("so_pcb: %p ", so->so_pcb); db_printf("so_proto: %p\n", so->so_proto); if (so->so_proto != NULL) db_print_protosw(so->so_proto, "so_proto", indent); db_print_indent(indent); if (so->so_options & SO_ACCEPTCONN) { db_printf("sol_incomp first: %p ", TAILQ_FIRST(&so->sol_incomp)); db_printf("sol_comp first: %p\n", TAILQ_FIRST(&so->sol_comp)); db_printf("sol_qlen: %d ", so->sol_qlen); db_printf("sol_incqlen: %d ", so->sol_incqlen); db_printf("sol_qlimit: %d ", so->sol_qlimit); } else { db_printf("so_qstate: 0x%x (", so->so_qstate); db_print_soqstate(so->so_qstate); db_printf(") "); db_printf("so_listen: %p ", so->so_listen); /* so_list skipped */ db_printf("so_timeo: %d ", so->so_timeo); db_printf("so_error: %d\n", so->so_error); db_print_indent(indent); db_printf("so_sigio: %p ", so->so_sigio); db_printf("so_oobmark: %lu\n", so->so_oobmark); db_print_sockbuf(&so->so_rcv, "so_rcv", indent); db_print_sockbuf(&so->so_snd, "so_snd", indent); } } DB_SHOW_COMMAND(socket, db_show_socket) { struct socket *so; if (!have_addr) { db_printf("usage: show socket \n"); return; } so = (struct socket *)addr; db_print_socket(so, 
"socket", 0); } DB_SHOW_COMMAND(sockbuf, db_show_sockbuf) { struct sockbuf *sb; if (!have_addr) { db_printf("usage: show sockbuf \n"); return; } sb = (struct sockbuf *)addr; db_print_sockbuf(sb, "sockbuf", 0); } DB_SHOW_COMMAND(protosw, db_show_protosw) { struct protosw *pr; if (!have_addr) { db_printf("usage: show protosw \n"); return; } pr = (struct protosw *)addr; db_print_protosw(pr, "protosw", 0); } DB_SHOW_COMMAND(domain, db_show_domain) { struct domain *d; if (!have_addr) { db_printf("usage: show protosw \n"); return; } d = (struct domain *)addr; db_print_domain(d, "domain", 0); } #endif Index: head/sys/kern/uipc_mqueue.c =================================================================== --- head/sys/kern/uipc_mqueue.c (revision 326270) +++ head/sys/kern/uipc_mqueue.c (revision 326271) @@ -1,2939 +1,2941 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005 David Xu * Copyright (c) 2016-2017 Robert N. M. Watson * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * POSIX message queue implementation. * * 1) A mqueue filesystem can be mounted, each message queue appears * in mounted directory, user can change queue's permission and * ownership, or remove a queue. Manually creating a file in the * directory causes a message queue to be created in the kernel with * default message queue attributes applied and same name used, this * method is not advocated since mq_open syscall allows user to specify * different attributes. Also the file system can be mounted multiple * times at different mount points but shows same contents. * * 2) Standard POSIX message queue API. The syscalls do not use vfs layer, * but directly operate on internal data structure, this allows user to * use the IPC facility without having to mount mqueue file system. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(p1003_1b_mqueue, "POSIX P1003.1B message queues support"); /* * Limits and constants */ #define MQFS_NAMELEN NAME_MAX #define MQFS_DELEN (8 + MQFS_NAMELEN) /* node types */ typedef enum { mqfstype_none = 0, mqfstype_root, mqfstype_dir, mqfstype_this, mqfstype_parent, mqfstype_file, mqfstype_symlink, } mqfs_type_t; struct mqfs_node; /* * mqfs_info: describes a mqfs instance */ struct mqfs_info { struct sx mi_lock; struct mqfs_node *mi_root; struct unrhdr *mi_unrhdr; }; struct mqfs_vdata { LIST_ENTRY(mqfs_vdata) mv_link; struct mqfs_node *mv_node; struct vnode *mv_vnode; struct task mv_task; }; /* * mqfs_node: describes a node (file or directory) within a mqfs */ struct mqfs_node { char mn_name[MQFS_NAMELEN+1]; struct mqfs_info *mn_info; struct mqfs_node *mn_parent; LIST_HEAD(,mqfs_node) mn_children; LIST_ENTRY(mqfs_node) mn_sibling; LIST_HEAD(,mqfs_vdata) mn_vnodes; const void *mn_pr_root; int mn_refcount; mqfs_type_t mn_type; int mn_deleted; uint32_t mn_fileno; void *mn_data; struct timespec mn_birth; struct timespec mn_ctime; struct timespec mn_atime; struct timespec mn_mtime; uid_t mn_uid; gid_t mn_gid; int mn_mode; }; #define VTON(vp) (((struct mqfs_vdata *)((vp)->v_data))->mv_node) #define VTOMQ(vp) ((struct mqueue *)(VTON(vp)->mn_data)) #define VFSTOMQFS(m) ((struct mqfs_info *)((m)->mnt_data)) #define FPTOMQ(fp) ((struct mqueue *)(((struct mqfs_node *) \ (fp)->f_data)->mn_data)) TAILQ_HEAD(msgq, mqueue_msg); struct mqueue; struct mqueue_notifier { LIST_ENTRY(mqueue_notifier) nt_link; struct sigevent nt_sigev; ksiginfo_t nt_ksi; struct proc *nt_proc; }; struct mqueue { struct mtx mq_mutex; int mq_flags; long mq_maxmsg; long mq_msgsize; long mq_curmsgs; long mq_totalbytes; struct msgq mq_msgq; int mq_receivers; int mq_senders; struct selinfo mq_rsel; struct selinfo mq_wsel; struct mqueue_notifier *mq_notifier; }; #define MQ_RSEL 0x01 #define MQ_WSEL 0x02 struct mqueue_msg { TAILQ_ENTRY(mqueue_msg) msg_link; unsigned int msg_prio; unsigned int msg_size; /* following real data... 
*/ }; static SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW, 0, "POSIX real time message queue"); static int default_maxmsg = 10; static int default_msgsize = 1024; static int maxmsg = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW, &maxmsg, 0, "Default maximum messages in queue"); static int maxmsgsize = 16384; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW, &maxmsgsize, 0, "Default maximum message size"); static int maxmq = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW, &maxmq, 0, "maximum message queues"); static int curmq = 0; SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW, &curmq, 0, "current message queue number"); static int unloadable = 0; static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data"); static eventhandler_tag exit_tag; /* Only one instance per system */ static struct mqfs_info mqfs_data; static uma_zone_t mqnode_zone; static uma_zone_t mqueue_zone; static uma_zone_t mvdata_zone; static uma_zone_t mqnoti_zone; static struct vop_vector mqfs_vnodeops; static struct fileops mqueueops; static unsigned mqfs_osd_jail_slot; /* * Directory structure construction and manipulation */ #ifdef notyet static struct mqfs_node *mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static struct mqfs_node *mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); #endif static struct mqfs_node *mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static int mqfs_destroy(struct mqfs_node *mn); static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn); static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn); static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn); static int mqfs_prison_remove(void *obj, void *data); /* * Message queue construction and manipulation */ static struct mqueue *mqueue_alloc(const struct mq_attr *attr); static void mqueue_free(struct mqueue *mq); static int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout); static int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout); static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo); static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo); static void mqueue_send_notification(struct mqueue *mq); static void mqueue_fdclose(struct thread *td, int fd, struct file *fp); static void mq_proc_exit(void *arg, struct proc *p); /* * kqueue filters */ static void filt_mqdetach(struct knote *kn); static int filt_mqread(struct knote *kn, long hint); static int filt_mqwrite(struct knote *kn, long hint); struct filterops mq_rfiltops = { .f_isfd = 1, .f_detach = filt_mqdetach, .f_event = filt_mqread, }; struct filterops mq_wfiltops = { .f_isfd = 1, .f_detach = filt_mqdetach, .f_event = filt_mqwrite, }; /* * Initialize fileno bitmap */ static void mqfs_fileno_init(struct mqfs_info *mi) { struct unrhdr *up; up = new_unrhdr(1, INT_MAX, NULL); mi->mi_unrhdr = up; } /* * Tear down fileno bitmap */ static void mqfs_fileno_uninit(struct mqfs_info *mi) { struct unrhdr *up; up = mi->mi_unrhdr; mi->mi_unrhdr = NULL; delete_unrhdr(up); } /* * Allocate a file number */ static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn) { /* make sure our parent has a file number */ if
(mn->mn_parent && !mn->mn_parent->mn_fileno) mqfs_fileno_alloc(mi, mn->mn_parent); switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: mn->mn_fileno = alloc_unr(mi->mi_unrhdr); break; case mqfstype_this: KASSERT(mn->mn_parent != NULL, ("mqfstype_this node has no parent")); mn->mn_fileno = mn->mn_parent->mn_fileno; break; case mqfstype_parent: KASSERT(mn->mn_parent != NULL, ("mqfstype_parent node has no parent")); if (mn->mn_parent == mi->mi_root) { mn->mn_fileno = mn->mn_parent->mn_fileno; break; } KASSERT(mn->mn_parent->mn_parent != NULL, ("mqfstype_parent node has no grandparent")); mn->mn_fileno = mn->mn_parent->mn_parent->mn_fileno; break; default: KASSERT(0, ("mqfs_fileno_alloc() called for unknown type node: %d", mn->mn_type)); break; } } /* * Release a file number */ static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn) { switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: free_unr(mi->mi_unrhdr, mn->mn_fileno); break; case mqfstype_this: case mqfstype_parent: /* ignore these, as they don't "own" their file number */ break; default: KASSERT(0, ("mqfs_fileno_free() called for unknown type node: %d", mn->mn_type)); break; } } static __inline struct mqfs_node * mqnode_alloc(void) { return uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO); } static __inline void mqnode_free(struct mqfs_node *node) { uma_zfree(mqnode_zone, node); } static __inline void mqnode_addref(struct mqfs_node *node) { atomic_fetchadd_int(&node->mn_refcount, 1); } static __inline void mqnode_release(struct mqfs_node *node) { struct mqfs_info *mqfs; int old, exp; mqfs = node->mn_info; old = atomic_fetchadd_int(&node->mn_refcount, -1); if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) exp = 3; /* include . and .. */ else exp = 1; if (old == exp) { int locked = sx_xlocked(&mqfs->mi_lock); if (!locked) sx_xlock(&mqfs->mi_lock); mqfs_destroy(node); if (!locked) sx_xunlock(&mqfs->mi_lock); } } /* * Add a node to a directory */ static int mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node) { KASSERT(parent != NULL, ("%s(): parent is NULL", __func__)); KASSERT(parent->mn_info != NULL, ("%s(): parent has no mn_info", __func__)); KASSERT(parent->mn_type == mqfstype_dir || parent->mn_type == mqfstype_root, ("%s(): parent is not a directory", __func__)); node->mn_info = parent->mn_info; node->mn_parent = parent; LIST_INIT(&node->mn_children); LIST_INIT(&node->mn_vnodes); LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling); mqnode_addref(parent); return (0); } static struct mqfs_node * mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode, int nodetype) { struct mqfs_node *node; node = mqnode_alloc(); strncpy(node->mn_name, name, namelen); node->mn_pr_root = cred->cr_prison->pr_root; node->mn_type = nodetype; node->mn_refcount = 1; vfs_timestamp(&node->mn_birth); node->mn_ctime = node->mn_atime = node->mn_mtime = node->mn_birth; node->mn_uid = cred->cr_uid; node->mn_gid = cred->cr_gid; node->mn_mode = mode; return (node); } /* * Create a file */ static struct mqfs_node * mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_file); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } /* * Add . and .. 
to a directory */ static int mqfs_fixup_dir(struct mqfs_node *parent) { struct mqfs_node *dir; dir = mqnode_alloc(); dir->mn_name[0] = '.'; dir->mn_type = mqfstype_this; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } dir = mqnode_alloc(); dir->mn_name[0] = dir->mn_name[1] = '.'; dir->mn_type = mqfstype_parent; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } return (0); } #ifdef notyet /* * Create a directory */ static struct mqfs_node * mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } if (mqfs_fixup_dir(node) != 0) { mqfs_destroy(node); return (NULL); } return (node); } /* * Create a symlink */ static struct mqfs_node * mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } #endif /* * Destroy a node or a tree of nodes */ static int mqfs_destroy(struct mqfs_node *node) { struct mqfs_node *parent; KASSERT(node != NULL, ("%s(): node is NULL", __func__)); KASSERT(node->mn_info != NULL, ("%s(): node has no mn_info", __func__)); /* destroy children */ if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) while (! LIST_EMPTY(&node->mn_children)) mqfs_destroy(LIST_FIRST(&node->mn_children)); /* unlink from parent */ if ((parent = node->mn_parent) != NULL) { KASSERT(parent->mn_info == node->mn_info, ("%s(): parent has different mn_info", __func__)); LIST_REMOVE(node, mn_sibling); } if (node->mn_fileno != 0) mqfs_fileno_free(node->mn_info, node); if (node->mn_data != NULL) mqueue_free(node->mn_data); mqnode_free(node); return (0); } /* * Mount a mqfs instance */ static int mqfs_mount(struct mount *mp) { struct statfs *sbp; if (mp->mnt_flag & MNT_UPDATE) return (EOPNOTSUPP); mp->mnt_data = &mqfs_data; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); vfs_getnewfsid(mp); sbp = &mp->mnt_stat; vfs_mountedfrom(mp, "mqueue"); sbp->f_bsize = PAGE_SIZE; sbp->f_iosize = PAGE_SIZE; sbp->f_blocks = 1; sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_files = 1; sbp->f_ffree = 0; return (0); } /* * Unmount a mqfs instance */ static int mqfs_unmount(struct mount *mp, int mntflags) { int error; error = vflush(mp, 0, (mntflags & MNT_FORCE) ? 
FORCECLOSE : 0, curthread); return (error); } /* * Return a root vnode */ static int mqfs_root(struct mount *mp, int flags, struct vnode **vpp) { struct mqfs_info *mqfs; int ret; mqfs = VFSTOMQFS(mp); ret = mqfs_allocv(mp, vpp, mqfs->mi_root); return (ret); } /* * Return filesystem stats */ static int mqfs_statfs(struct mount *mp, struct statfs *sbp) { /* XXX update statistics */ return (0); } /* * Initialize a mqfs instance */ static int mqfs_init(struct vfsconf *vfc) { struct mqfs_node *root; struct mqfs_info *mi; osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_REMOVE] = mqfs_prison_remove, }; mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mvdata_zone = uma_zcreate("mvdata", sizeof(struct mqfs_vdata), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mi = &mqfs_data; sx_init(&mi->mi_lock, "mqfs lock"); /* set up the root directory */ root = mqfs_create_node("/", 1, curthread->td_ucred, 01777, mqfstype_root); root->mn_info = mi; LIST_INIT(&root->mn_children); LIST_INIT(&root->mn_vnodes); mi->mi_root = root; mqfs_fileno_init(mi); mqfs_fileno_alloc(mi, root); mqfs_fixup_dir(root); exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL, EVENTHANDLER_PRI_ANY); mq_fdclose = mqueue_fdclose; p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING); mqfs_osd_jail_slot = osd_jail_register(NULL, methods); return (0); } /* * Destroy a mqfs instance */ static int mqfs_uninit(struct vfsconf *vfc) { struct mqfs_info *mi; if (!unloadable) return (EOPNOTSUPP); osd_jail_deregister(mqfs_osd_jail_slot); EVENTHANDLER_DEREGISTER(process_exit, exit_tag); mi = &mqfs_data; mqfs_destroy(mi->mi_root); mi->mi_root = NULL; mqfs_fileno_uninit(mi); sx_destroy(&mi->mi_lock); uma_zdestroy(mqnode_zone); uma_zdestroy(mqueue_zone); uma_zdestroy(mvdata_zone); uma_zdestroy(mqnoti_zone); return (0); } /* * task routine */ static void do_recycle(void *context, int pending __unused) { struct vnode *vp = (struct vnode *)context; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vrecycle(vp); VOP_UNLOCK(vp, 0); vdrop(vp); } /* * Allocate a vnode */ static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn) { struct mqfs_vdata *vd; struct mqfs_info *mqfs; struct vnode *newvpp; int error; mqfs = pn->mn_info; *vpp = NULL; sx_xlock(&mqfs->mi_lock); LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { if (vd->mv_vnode->v_mount == mp) { vhold(vd->mv_vnode); break; } } if (vd != NULL) { found: *vpp = vd->mv_vnode; sx_xunlock(&mqfs->mi_lock); error = vget(*vpp, LK_RETRY | LK_EXCLUSIVE, curthread); vdrop(*vpp); return (error); } sx_xunlock(&mqfs->mi_lock); error = getnewvnode("mqueue", mp, &mqfs_vnodeops, &newvpp); if (error) return (error); vn_lock(newvpp, LK_EXCLUSIVE | LK_RETRY); error = insmntque(newvpp, mp); if (error != 0) return (error); sx_xlock(&mqfs->mi_lock); /* * Check if it has already been allocated * while we were blocked.
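 *
 * Illustration, not part of the original source: getnewvnode() and
 * insmntque() may sleep with mi_lock dropped, so another thread can
 * allocate a vnode for the same node in that window.  The recheck below
 * is the classic unlock/allocate/relock/recheck idiom; with lookup()
 * and alloc_may_sleep() as hypothetical stand-ins for the LIST_FOREACH
 * scan and getnewvnode() above, the shape is:
 *
 *	sx_xlock(&lock);
 *	if ((obj = lookup(key)) != NULL)
 *		return (obj);
 *	sx_xunlock(&lock);
 *	new = alloc_may_sleep();
 *	sx_xlock(&lock);
 *	if ((obj = lookup(key)) != NULL)
 *		discard(new);		(recheck: we lost the race)
 *	else
 *		publish(new);
 *	sx_xunlock(&lock);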
*/ LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { if (vd->mv_vnode->v_mount == mp) { vhold(vd->mv_vnode); sx_xunlock(&mqfs->mi_lock); vgone(newvpp); vput(newvpp); goto found; } } *vpp = newvpp; vd = uma_zalloc(mvdata_zone, M_WAITOK); (*vpp)->v_data = vd; vd->mv_vnode = *vpp; vd->mv_node = pn; TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp); LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link); mqnode_addref(pn); switch (pn->mn_type) { case mqfstype_root: (*vpp)->v_vflag = VV_ROOT; /* fall through */ case mqfstype_dir: case mqfstype_this: case mqfstype_parent: (*vpp)->v_type = VDIR; break; case mqfstype_file: (*vpp)->v_type = VREG; break; case mqfstype_symlink: (*vpp)->v_type = VLNK; break; case mqfstype_none: KASSERT(0, ("mqfs_allocf called for null node\n")); default: panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type); } sx_xunlock(&mqfs->mi_lock); return (0); } /* * Search a directory entry */ static struct mqfs_node * mqfs_search(struct mqfs_node *pd, const char *name, int len, struct ucred *cred) { struct mqfs_node *pn; const void *pr_root; sx_assert(&pd->mn_info->mi_lock, SX_LOCKED); pr_root = cred->cr_prison->pr_root; LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { /* Only match names within the same prison root directory */ if ((pn->mn_pr_root == NULL || pn->mn_pr_root == pr_root) && strncmp(pn->mn_name, name, len) == 0 && pn->mn_name[len] == '\0') return (pn); } return (NULL); } /* * Look up a file or directory. */ static int mqfs_lookupx(struct vop_cachedlookup_args *ap) { struct componentname *cnp; struct vnode *dvp, **vpp; struct mqfs_node *pd; struct mqfs_node *pn; struct mqfs_info *mqfs; int nameiop, flags, error, namelen; char *pname; struct thread *td; cnp = ap->a_cnp; vpp = ap->a_vpp; dvp = ap->a_dvp; pname = cnp->cn_nameptr; namelen = cnp->cn_namelen; td = cnp->cn_thread; flags = cnp->cn_flags; nameiop = cnp->cn_nameiop; pd = VTON(dvp); pn = NULL; mqfs = pd->mn_info; *vpp = NULLVP; if (dvp->v_type != VDIR) return (ENOTDIR); error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, cnp->cn_thread); if (error) return (error); /* shortcut: check if the name is too long */ if (cnp->cn_namelen >= MQFS_NAMELEN) return (ENOENT); /* self */ if (namelen == 1 && pname[0] == '.') { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); pn = pd; *vpp = dvp; VREF(dvp); return (0); } /* parent */ if (cnp->cn_flags & ISDOTDOT) { if (dvp->v_vflag & VV_ROOT) return (EIO); if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); VOP_UNLOCK(dvp, 0); KASSERT(pd->mn_parent, ("non-root directory has no parent")); pn = pd->mn_parent; error = mqfs_allocv(dvp->v_mount, vpp, pn); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); return (error); } /* named node */ sx_xlock(&mqfs->mi_lock); pn = mqfs_search(pd, pname, namelen, cnp->cn_cred); if (pn != NULL) mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); /* found */ if (pn != NULL) { /* DELETE */ if (nameiop == DELETE && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) { mqnode_release(pn); return (error); } if (*vpp == dvp) { VREF(dvp); *vpp = dvp; mqnode_release(pn); return (0); } } /* allocate vnode */ error = mqfs_allocv(dvp->v_mount, vpp, pn); mqnode_release(pn); if (error == 0 && cnp->cn_flags & MAKEENTRY) cache_enter(dvp, *vpp, cnp); return (error); } /* not found */ /* will create a new entry in the directory ? 
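 *
 * A note on the contract, not part of the original source: for a
 * CREATE or RENAME lookup of the last component, a missing name is not
 * an error; after checking VWRITE on the directory the routine returns
 * EJUSTRETURN, which tells namei() and the caller to go ahead with the
 * creation, roughly:
 *
 *	error = VOP_LOOKUP(dvp, &vp, cnp);
 *	if (error == EJUSTRETURN)
 *		error = VOP_CREATE(dvp, &vp, cnp, &vattr);
 *
 * (a simplified sketch of what the VFS layer arranges; SAVENAME keeps
 * the component name buffer alive for that later VOP_CREATE() call).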
*/ if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } return (ENOENT); } #if 0 struct vop_lookup_args { struct vop_generic_args a_gen; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; #endif /* * vnode lookup operation */ static int mqfs_lookup(struct vop_cachedlookup_args *ap) { int rc; rc = mqfs_lookupx(ap); return (rc); } #if 0 struct vop_create_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; #endif /* * vnode creation operation */ static int mqfs_create(struct vop_create_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd; struct mqfs_node *pn; struct mqueue *mq; int error; pd = VTON(ap->a_dvp); if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); mq = mqueue_alloc(NULL); if (mq == NULL) return (EAGAIN); sx_xlock(&mqfs->mi_lock); if ((cnp->cn_flags & HASBUF) == 0) panic("%s: no name", __func__); pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, ap->a_vap->va_mode); if (pn == NULL) { sx_xunlock(&mqfs->mi_lock); error = ENOSPC; } else { mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); mqnode_release(pn); if (error) mqfs_destroy(pn); else pn->mn_data = mq; } if (error) mqueue_free(mq); return (error); } /* * Remove an entry */ static int do_unlink(struct mqfs_node *pn, struct ucred *ucred) { struct mqfs_node *parent; struct mqfs_vdata *vd; int error = 0; sx_assert(&pn->mn_info->mi_lock, SX_LOCKED); if (ucred->cr_uid != pn->mn_uid && (error = priv_check_cred(ucred, PRIV_MQ_ADMIN, 0)) != 0) error = EACCES; else if (!pn->mn_deleted) { parent = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { cache_purge(vd->mv_vnode); vhold(vd->mv_vnode); taskqueue_enqueue(taskqueue_thread, &vd->mv_task); } mqnode_release(pn); mqnode_release(parent); } else error = ENOENT; return (error); } #if 0 struct vop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * vnode removal operation */ static int mqfs_remove(struct vop_remove_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn; int error; if (ap->a_vp->v_type == VDIR) return (EPERM); pn = VTON(ap->a_vp); sx_xlock(&mqfs->mi_lock); error = do_unlink(pn, ap->a_cnp->cn_cred); sx_xunlock(&mqfs->mi_lock); return (error); } #if 0 struct vop_inactive_args { struct vnode *a_vp; struct thread *a_td; }; #endif static int mqfs_inactive(struct vop_inactive_args *ap) { struct mqfs_node *pn = VTON(ap->a_vp); if (pn->mn_deleted) vrecycle(ap->a_vp); return (0); } #if 0 struct vop_reclaim_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct thread *a_td; }; #endif static int mqfs_reclaim(struct vop_reclaim_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_vp->v_mount); struct vnode *vp = ap->a_vp; struct mqfs_node *pn; struct mqfs_vdata *vd; vd = vp->v_data; pn = vd->mv_node; sx_xlock(&mqfs->mi_lock); vp->v_data = NULL; LIST_REMOVE(vd, mv_link); uma_zfree(mvdata_zone, vd); mqnode_release(pn); sx_xunlock(&mqfs->mi_lock); return (0); } #if 0 struct vop_open_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; 
struct file *a_fp; }; #endif static int mqfs_open(struct vop_open_args *ap) { return (0); } #if 0 struct vop_close_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; #endif static int mqfs_close(struct vop_close_args *ap) { return (0); } #if 0 struct vop_access_args { struct vop_generic_args a_gen; struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; }; #endif /* * Verify permissions */ static int mqfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct vattr vattr; int error; error = VOP_GETATTR(vp, &vattr, ap->a_cred); if (error) return (error); error = vaccess(vp->v_type, vattr.va_mode, vattr.va_uid, vattr.va_gid, ap->a_accmode, ap->a_cred, NULL); return (error); } #if 0 struct vop_getattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif /* * Get file attributes */ static int mqfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct mqfs_node *pn = VTON(vp); struct vattr *vap = ap->a_vap; int error = 0; vap->va_type = vp->v_type; vap->va_mode = pn->mn_mode; vap->va_nlink = 1; vap->va_uid = pn->mn_uid; vap->va_gid = pn->mn_gid; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; vap->va_fileid = pn->mn_fileno; vap->va_size = 0; vap->va_blocksize = PAGE_SIZE; vap->va_bytes = vap->va_size = 0; vap->va_atime = pn->mn_atime; vap->va_mtime = pn->mn_mtime; vap->va_ctime = pn->mn_ctime; vap->va_birthtime = pn->mn_birth; vap->va_gen = 0; vap->va_flags = 0; vap->va_rdev = NODEV; vap->va_bytes = 0; vap->va_filerev = 0; return (error); } #if 0 struct vop_setattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif /* * Set attributes */ static int mqfs_setattr(struct vop_setattr_args *ap) { struct mqfs_node *pn; struct vattr *vap; struct vnode *vp; struct thread *td; int c, error; uid_t uid; gid_t gid; td = curthread; vap = ap->a_vap; vp = ap->a_vp; if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_flags != VNOVAL && vap->va_flags != 0) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } pn = VTON(vp); error = c = 0; if (vap->va_uid == (uid_t)VNOVAL) uid = pn->mn_uid; else uid = vap->va_uid; if (vap->va_gid == (gid_t)VNOVAL) gid = pn->mn_gid; else gid = vap->va_gid; if (uid != pn->mn_uid || gid != pn->mn_gid) { /* * To modify the ownership of a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td))) return (error); /* * XXXRW: Why is there a privilege check here: shouldn't the * check in VOP_ACCESS() be enough? Also, are the group bits * below definitely right? */ if (((ap->a_cred->cr_uid != pn->mn_uid) || uid != pn->mn_uid || (gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) && (error = priv_check(td, PRIV_MQ_ADMIN)) != 0) return (error); pn->mn_uid = uid; pn->mn_gid = gid; c = 1; } if (vap->va_mode != (mode_t)VNOVAL) { if ((ap->a_cred->cr_uid != pn->mn_uid) && (error = priv_check(td, PRIV_MQ_ADMIN))) return (error); pn->mn_mode = vap->va_mode; c = 1; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { /* See the comment in ufs_vnops::ufs_setattr(). 
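 *
 * Spelled out (not part of the original source), the rule checked below
 * is: changing timestamps requires either VADMIN, or, when
 * VA_UTIMES_NULL is set (the utimes(path, NULL) "set to now" case),
 * mere VWRITE access:
 *
 *	ok = (VOP_ACCESS(vp, VADMIN, cred, td) == 0) ||
 *	    ((vap->va_vaflags & VA_UTIMES_NULL) != 0 &&
 *	     VOP_ACCESS(vp, VWRITE, cred, td) == 0);
 *
 * which is what the short-circuit expression that follows encodes.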
*/ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, td)))) return (error); if (vap->va_atime.tv_sec != VNOVAL) { pn->mn_atime = vap->va_atime; } if (vap->va_mtime.tv_sec != VNOVAL) { pn->mn_mtime = vap->va_mtime; } c = 1; } if (c) { vfs_timestamp(&pn->mn_ctime); } return (0); } #if 0 struct vop_read_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; #endif /* * Read from a file */ static int mqfs_read(struct vop_read_args *ap) { char buf[80]; struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct mqfs_node *pn; struct mqueue *mq; int len, error; if (vp->v_type != VREG) return (EINVAL); pn = VTON(vp); mq = VTOMQ(vp); snprintf(buf, sizeof(buf), "QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n", mq->mq_totalbytes, mq->mq_maxmsg, mq->mq_curmsgs, mq->mq_msgsize); buf[sizeof(buf)-1] = '\0'; len = strlen(buf); error = uiomove_frombuf(buf, len, uio); return (error); } #if 0 struct vop_readdir_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; }; #endif /* * Return directory entries. */ static int mqfs_readdir(struct vop_readdir_args *ap) { struct vnode *vp; struct mqfs_info *mi; struct mqfs_node *pd; struct mqfs_node *pn; struct dirent entry; struct uio *uio; const void *pr_root; int *tmp_ncookies = NULL; off_t offset; int error, i; vp = ap->a_vp; mi = VFSTOMQFS(vp->v_mount); pd = VTON(vp); uio = ap->a_uio; if (vp->v_type != VDIR) return (ENOTDIR); if (uio->uio_offset < 0) return (EINVAL); if (ap->a_ncookies != NULL) { tmp_ncookies = ap->a_ncookies; *ap->a_ncookies = 0; ap->a_ncookies = NULL; } error = 0; offset = 0; pr_root = ap->a_cred->cr_prison->pr_root; sx_xlock(&mi->mi_lock); LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { entry.d_reclen = sizeof(entry); /* * Only show names within the same prison root directory * (or not associated with a prison, e.g. "." and ".."). */ if (pn->mn_pr_root != NULL && pn->mn_pr_root != pr_root) continue; if (!pn->mn_fileno) mqfs_fileno_alloc(mi, pn); entry.d_fileno = pn->mn_fileno; for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i) entry.d_name[i] = pn->mn_name[i]; entry.d_name[i] = 0; entry.d_namlen = i; switch (pn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_this: case mqfstype_parent: entry.d_type = DT_DIR; break; case mqfstype_file: entry.d_type = DT_REG; break; case mqfstype_symlink: entry.d_type = DT_LNK; break; default: panic("%s has unexpected node type: %d", pn->mn_name, pn->mn_type); } if (entry.d_reclen > uio->uio_resid) break; if (offset >= uio->uio_offset) { error = vfs_read_dirent(ap, &entry, offset); if (error) break; } offset += entry.d_reclen; } sx_xunlock(&mi->mi_lock); uio->uio_offset = offset; if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (error); } #ifdef notyet #if 0 struct vop_mkdir_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; #endif /* * Create a directory.
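 *
 * A note not in the original source: mqfs_mkdir() and mqfs_rmdir()
 * below are compiled out under "#ifdef notyet", and the vop_vector at
 * the end of this file maps vop_mkdir and vop_rmdir to VOP_EOPNOTSUPP,
 * so with a mounted mqueuefs (paths here are arbitrary):
 *
 *	mount -t mqueuefs null /mnt/mqueue
 *	mkdir /mnt/mqueue/sub		(fails: Operation not supported)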
*/ static int mqfs_mkdir(struct vop_mkdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd = VTON(ap->a_dvp); struct mqfs_node *pn; int error; if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); if ((cnp->cn_flags & HASBUF) == 0) panic("%s: no name", __func__); pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen, ap->a_vap->cn_cred, ap->a_vap->va_mode); if (pn != NULL) mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); if (pn == NULL) { error = ENOSPC; } else { error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); mqnode_release(pn); } return (error); } #if 0 struct vop_rmdir_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * Remove a directory. */ static int mqfs_rmdir(struct vop_rmdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn = VTON(ap->a_vp); struct mqfs_node *pt; if (pn->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); if (pn->mn_deleted) { sx_xunlock(&mqfs->mi_lock); return (ENOENT); } pt = LIST_FIRST(&pn->mn_children); pt = LIST_NEXT(pt, mn_sibling); pt = LIST_NEXT(pt, mn_sibling); if (pt != NULL) { sx_xunlock(&mqfs->mi_lock); return (ENOTEMPTY); } pt = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); mqnode_release(pn); mqnode_release(pt); sx_xunlock(&mqfs->mi_lock); cache_purge(ap->a_vp); return (0); } #endif /* notyet */ /* * See if this prison root is obsolete, and clean up associated queues if it is. */ static int mqfs_prison_remove(void *obj, void *data __unused) { const struct prison *pr = obj; const struct prison *tpr; struct mqfs_node *pn, *tpn; int found; found = 0; TAILQ_FOREACH(tpr, &allprison, pr_list) { if (tpr->pr_root == pr->pr_root && tpr != pr && tpr->pr_ref > 0) found = 1; } if (!found) { /* * No jails are rooted in this directory anymore, * so no queues should be either. 
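 *
 * (LIST_FOREACH_SAFE is required in the loop below because do_unlink()
 * performs LIST_REMOVE(pn, mn_sibling) on the node being visited; the
 * _SAFE variant caches the successor in tpn before pn is unlinked.)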
*/ sx_xlock(&mqfs_data.mi_lock); LIST_FOREACH_SAFE(pn, &mqfs_data.mi_root->mn_children, mn_sibling, tpn) { if (pn->mn_pr_root == pr->pr_root) (void)do_unlink(pn, curthread->td_ucred); } sx_xunlock(&mqfs_data.mi_lock); } return (0); } /* * Allocate a message queue */ static struct mqueue * mqueue_alloc(const struct mq_attr *attr) { struct mqueue *mq; if (curmq >= maxmq) return (NULL); mq = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO); TAILQ_INIT(&mq->mq_msgq); if (attr != NULL) { mq->mq_maxmsg = attr->mq_maxmsg; mq->mq_msgsize = attr->mq_msgsize; } else { mq->mq_maxmsg = default_maxmsg; mq->mq_msgsize = default_msgsize; } mtx_init(&mq->mq_mutex, "mqueue lock", NULL, MTX_DEF); knlist_init_mtx(&mq->mq_rsel.si_note, &mq->mq_mutex); knlist_init_mtx(&mq->mq_wsel.si_note, &mq->mq_mutex); atomic_add_int(&curmq, 1); return (mq); } /* * Destroy a message queue */ static void mqueue_free(struct mqueue *mq) { struct mqueue_msg *msg; while ((msg = TAILQ_FIRST(&mq->mq_msgq)) != NULL) { TAILQ_REMOVE(&mq->mq_msgq, msg, msg_link); free(msg, M_MQUEUEDATA); } mtx_destroy(&mq->mq_mutex); seldrain(&mq->mq_rsel); seldrain(&mq->mq_wsel); knlist_destroy(&mq->mq_rsel.si_note); knlist_destroy(&mq->mq_wsel.si_note); uma_zfree(mqueue_zone, mq); atomic_add_int(&curmq, -1); } /* * Load a message from user space */ static struct mqueue_msg * mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio) { struct mqueue_msg *msg; size_t len; int error; len = sizeof(struct mqueue_msg) + msg_size; msg = malloc(len, M_MQUEUEDATA, M_WAITOK); error = copyin(msg_ptr, ((char *)msg) + sizeof(struct mqueue_msg), msg_size); if (error) { free(msg, M_MQUEUEDATA); msg = NULL; } else { msg->msg_size = msg_size; msg->msg_prio = msg_prio; } return (msg); } /* * Save a message to user space */ static int mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio) { int error; error = copyout(((char *)msg) + sizeof(*msg), msg_ptr, msg->msg_size); if (error == 0 && msg_prio != NULL) error = copyout(&msg->msg_prio, msg_prio, sizeof(int)); return (error); } /* * Free a message's memory */ static __inline void mqueue_freemsg(struct mqueue_msg *msg) { free(msg, M_MQUEUEDATA); } /* * Send a message. If waitok is false, the thread will not be * blocked when there is no room in the queue; otherwise the * absolute timeout is checked.
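 *
 * A worked example of the loop below (not part of the original
 * source): with abs_timeout = now + 1.5s, each iteration recomputes
 *
 *	ts2 = *abs_timeout;
 *	getnanotime(&ts);		(ts = now)
 *	timespecsub(&ts2, &ts);		(ts2 = time remaining)
 *	TIMESPEC_TO_TIMEVAL(&tv, &ts2);
 *	timo = tvtohz(&tv);		(remaining time in ticks)
 *
 * so after every wakeup, including a spurious one, the deadline is
 * re-derived from the absolute time instead of accumulating drift;
 * once the remainder reaches zero the call fails with ETIMEDOUT.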
*/ int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ts, ts2; struct timeval tv; int error; if (msg_prio >= MQ_PRIO_MAX) return (EINVAL); if (msg_len > mq->mq_msgsize) return (EMSGSIZE); msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio); if (msg == NULL) return (EFAULT); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_send(mq, msg, -1); if (error) goto bad; return (0); } /* we allow a null timeout (wait forever) */ if (abs_timeout == NULL) { error = _mqueue_send(mq, msg, 0); if (error) goto bad; return (0); } /* send it before checking time */ error = _mqueue_send(mq, msg, -1); if (error == 0) return (0); if (error != EAGAIN) goto bad; if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) { error = EINVAL; goto bad; } for (;;) { ts2 = *abs_timeout; getnanotime(&ts); timespecsub(&ts2, &ts); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; break; } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_send(mq, msg, tvtohz(&tv)); if (error != ETIMEDOUT) break; } if (error == 0) return (0); bad: mqueue_freemsg(msg); return (error); } /* * Common routine to send a message */ static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo) { struct mqueue_msg *msg2; int error = 0; mtx_lock(&mq->mq_mutex); while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_senders++; error = msleep(&mq->mq_senders, &mq->mq_mutex, PCATCH, "mqsend", timo); mq->mq_senders--; if (error == EAGAIN) error = ETIMEDOUT; } if (mq->mq_curmsgs >= mq->mq_maxmsg) { mtx_unlock(&mq->mq_mutex); return (error); } error = 0; if (TAILQ_EMPTY(&mq->mq_msgq)) { TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link); } else { if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) { TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link); } else { TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) { if (msg2->msg_prio < msg->msg_prio) break; } TAILQ_INSERT_BEFORE(msg2, msg, msg_link); } } mq->mq_curmsgs++; mq->mq_totalbytes += msg->msg_size; if (mq->mq_receivers) wakeup_one(&mq->mq_receivers); else if (mq->mq_notifier != NULL) mqueue_send_notification(mq); if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } KNOTE_LOCKED(&mq->mq_rsel.si_note, 0); mtx_unlock(&mq->mq_mutex); return (0); } /* * Send a realtime signal to the process which registered itself * successfully via mq_notify. */ static void mqueue_send_notification(struct mqueue *mq) { struct mqueue_notifier *nt; struct thread *td; struct proc *p; int error; mtx_assert(&mq->mq_mutex, MA_OWNED); nt = mq->mq_notifier; if (nt->nt_sigev.sigev_notify != SIGEV_NONE) { p = nt->nt_proc; error = sigev_findtd(p, &nt->nt_sigev, &td); if (error) { mq->mq_notifier = NULL; return; } if (!KSI_ONQ(&nt->nt_ksi)) { ksiginfo_set_sigev(&nt->nt_ksi, &nt->nt_sigev); tdsendsignal(p, td, nt->nt_ksi.ksi_signo, &nt->nt_ksi); } PROC_UNLOCK(p); } mq->mq_notifier = NULL; } /* * Get a message. If waitok is false, the thread will not be * blocked when there is no data in the queue; otherwise the * absolute timeout is checked.
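 *
 * Illustrative userspace sketch (not part of the original source): the
 * receive buffer must be at least mq_msgsize bytes or the call fails
 * with EMSGSIZE, so callers usually size it from mq_getattr(); mqd is
 * assumed to come from a prior mq_open():
 *
 *	#include <mqueue.h>
 *	#include <stdlib.h>
 *	#include <time.h>
 *
 *	struct mq_attr a;
 *	unsigned prio;
 *	struct timespec abs;
 *	mq_getattr(mqd, &a);
 *	char *buf = malloc(a.mq_msgsize);
 *	clock_gettime(CLOCK_REALTIME, &abs);
 *	abs.tv_sec += 2;		(absolute, not relative, deadline)
 *	ssize_t n = mq_timedreceive(mqd, buf, a.mq_msgsize, &prio, &abs);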
*/ int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ts, ts2; struct timeval tv; int error; if (msg_len < mq->mq_msgsize) return (EMSGSIZE); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_recv(mq, &msg, -1); if (error) return (error); goto received; } /* we allow a null timeout (wait forever). */ if (abs_timeout == NULL) { error = _mqueue_recv(mq, &msg, 0); if (error) return (error); goto received; } /* try to get a message before checking time */ error = _mqueue_recv(mq, &msg, -1); if (error == 0) goto received; if (error != EAGAIN) return (error); if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) { error = EINVAL; return (error); } for (;;) { ts2 = *abs_timeout; getnanotime(&ts); timespecsub(&ts2, &ts); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; return (error); } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_recv(mq, &msg, tvtohz(&tv)); if (error == 0) break; if (error != ETIMEDOUT) return (error); } received: error = mqueue_savemsg(msg, msg_ptr, msg_prio); if (error == 0) { curthread->td_retval[0] = msg->msg_size; curthread->td_retval[1] = 0; } mqueue_freemsg(msg); return (error); } /* * Common routine to receive a message */ static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo) { int error = 0; mtx_lock(&mq->mq_mutex); while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_receivers++; error = msleep(&mq->mq_receivers, &mq->mq_mutex, PCATCH, "mqrecv", timo); mq->mq_receivers--; if (error == EAGAIN) error = ETIMEDOUT; } if (*msg != NULL) { error = 0; TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link); mq->mq_curmsgs--; mq->mq_totalbytes -= (*msg)->msg_size; if (mq->mq_senders) wakeup_one(&mq->mq_senders); if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } KNOTE_LOCKED(&mq->mq_wsel.si_note, 0); } if (mq->mq_notifier != NULL && mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) { mqueue_send_notification(mq); } mtx_unlock(&mq->mq_mutex); return (error); } static __inline struct mqueue_notifier * notifier_alloc(void) { return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO)); } static __inline void notifier_free(struct mqueue_notifier *p) { uma_zfree(mqnoti_zone, p); } static struct mqueue_notifier * notifier_search(struct proc *p, int fd) { struct mqueue_notifier *nt; LIST_FOREACH(nt, &p->p_mqnotifier, nt_link) { if (nt->nt_ksi.ksi_mqd == fd) break; } return (nt); } static __inline void notifier_insert(struct proc *p, struct mqueue_notifier *nt) { LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link); } static __inline void notifier_delete(struct proc *p, struct mqueue_notifier *nt) { LIST_REMOVE(nt, nt_link); notifier_free(nt); } static void notifier_remove(struct proc *p, struct mqueue *mq, int fd) { struct mqueue_notifier *nt; mtx_assert(&mq->mq_mutex, MA_OWNED); PROC_LOCK(p); nt = notifier_search(p, fd); if (nt != NULL) { if (mq->mq_notifier == nt) mq->mq_notifier = NULL; sigqueue_take(&nt->nt_ksi); notifier_delete(p, nt); } PROC_UNLOCK(p); } static int kern_kmq_open(struct thread *td, const char *upath, int flags, mode_t mode, const struct mq_attr *attr) { char path[MQFS_NAMELEN + 1]; struct mqfs_node *pn; struct filedesc *fdp; struct file *fp; struct mqueue *mq; int fd, error, len, cmode; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); fdp = td->td_proc->p_fd; cmode = (((mode 
& ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT); mq = NULL; if ((flags & O_CREAT) != 0 && attr != NULL) { if (attr->mq_maxmsg <= 0 || attr->mq_maxmsg > maxmsg) return (EINVAL); if (attr->mq_msgsize <= 0 || attr->mq_msgsize > maxmsgsize) return (EINVAL); } error = copyinstr(upath, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); /* * The first character of name must be a slash (/) character * and the remaining characters of name cannot include any slash * characters. */ len = strlen(path); if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL) return (EINVAL); AUDIT_ARG_UPATH1_CANON(path); error = falloc(td, &fp, &fd, O_CLOEXEC); if (error) return (error); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred); if (pn == NULL) { if (!(flags & O_CREAT)) { error = ENOENT; } else { mq = mqueue_alloc(attr); if (mq == NULL) { error = ENFILE; } else { pn = mqfs_create_file(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred, cmode); if (pn == NULL) { error = ENOSPC; mqueue_free(mq); } } } if (error == 0) { pn->mn_data = mq; } } else { if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) { error = EEXIST; } else { accmode_t accmode = 0; if (flags & FREAD) accmode |= VREAD; if (flags & FWRITE) accmode |= VWRITE; error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, accmode, td->td_ucred, NULL); } } if (error) { sx_xunlock(&mqfs_data.mi_lock); fdclose(td, fp, fd); fdrop(fp, td); return (error); } mqnode_addref(pn); sx_xunlock(&mqfs_data.mi_lock); finit(fp, flags & (FREAD | FWRITE | O_NONBLOCK), DTYPE_MQUEUE, pn, &mqueueops); td->td_retval[0] = fd; fdrop(fp, td); return (0); } /* * Syscall to open a message queue. */ int sys_kmq_open(struct thread *td, struct kmq_open_args *uap) { struct mq_attr attr; int flags, error; if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC) return (EINVAL); flags = FFLAGS(uap->flags); if ((flags & O_CREAT) != 0 && uap->attr != NULL) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error) return (error); } return (kern_kmq_open(td, uap->path, flags, uap->mode, uap->attr != NULL ? &attr : NULL)); } /* * Syscall to unlink a message queue. 
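 *
 * The same name rule as kern_kmq_open() applies: a leading slash and no
 * other slash. For example (queue names hypothetical):
 *
 *	mq_unlink("/myq")	ok, if the queue exists
 *	mq_unlink("myq")	EINVAL, no leading slash
 *	mq_unlink("/a/b")	EINVAL, embedded slash
 *	mq_unlink("/absent")	ENOENT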
*/ int sys_kmq_unlink(struct thread *td, struct kmq_unlink_args *uap) { char path[MQFS_NAMELEN+1]; struct mqfs_node *pn; int error, len; error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); len = strlen(path); if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL) return (EINVAL); AUDIT_ARG_UPATH1_CANON(path); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred); if (pn != NULL) error = do_unlink(pn, td->td_ucred); else error = ENOENT; sx_xunlock(&mqfs_data.mi_lock); return (error); } typedef int (*_fgetf)(struct thread *, int, cap_rights_t *, struct file **); /* * Get message queue by giving file slot */ static int _getmq(struct thread *td, int fd, cap_rights_t *rightsp, _fgetf func, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { struct mqfs_node *pn; int error; error = func(td, fd, rightsp, fpp); if (error) return (error); if (&mqueueops != (*fpp)->f_ops) { fdrop(*fpp, td); return (EBADF); } pn = (*fpp)->f_data; if (ppn) *ppn = pn; if (pmq) *pmq = pn->mn_data; return (0); } static __inline int getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { cap_rights_t rights; return _getmq(td, fd, cap_rights_init(&rights, CAP_EVENT), fget, fpp, ppn, pmq); } static __inline int getmq_read(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { cap_rights_t rights; return _getmq(td, fd, cap_rights_init(&rights, CAP_READ), fget_read, fpp, ppn, pmq); } static __inline int getmq_write(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { cap_rights_t rights; return _getmq(td, fd, cap_rights_init(&rights, CAP_WRITE), fget_write, fpp, ppn, pmq); } static int kern_kmq_setattr(struct thread *td, int mqd, const struct mq_attr *attr, struct mq_attr *oattr) { struct mqueue *mq; struct file *fp; u_int oflag, flag; int error; AUDIT_ARG_FD(mqd); if (attr != NULL && (attr->mq_flags & ~O_NONBLOCK) != 0) return (EINVAL); error = getmq(td, mqd, &fp, NULL, &mq); if (error) return (error); oattr->mq_maxmsg = mq->mq_maxmsg; oattr->mq_msgsize = mq->mq_msgsize; oattr->mq_curmsgs = mq->mq_curmsgs; if (attr != NULL) { do { oflag = flag = fp->f_flag; flag &= ~O_NONBLOCK; flag |= (attr->mq_flags & O_NONBLOCK); } while (atomic_cmpset_int(&fp->f_flag, oflag, flag) == 0); } else oflag = fp->f_flag; oattr->mq_flags = (O_NONBLOCK & oflag); fdrop(fp, td); return (error); } int sys_kmq_setattr(struct thread *td, struct kmq_setattr_args *uap) { struct mq_attr attr, oattr; int error; if (uap->attr != NULL) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error != 0) return (error); } error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? 
&attr : NULL, &oattr); if (error == 0 && uap->oattr != NULL) { bzero(oattr.__reserved, sizeof(oattr.__reserved)); error = copyout(&oattr, uap->oattr, sizeof(oattr)); } return (error); } int sys_kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap) { struct mqueue *mq; struct file *fp; struct timespec *abs_timeout, ets; int error; int waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_read(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets, sizeof(ets)); if (error != 0) return (error); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); fdrop(fp, td); return (error); } int sys_kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap) { struct mqueue *mq; struct file *fp; struct timespec *abs_timeout, ets; int error, waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_write(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets, sizeof(ets)); if (error != 0) return (error); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_send(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); fdrop(fp, td); return (error); } static int kern_kmq_notify(struct thread *td, int mqd, struct sigevent *sigev) { #ifdef CAPABILITIES cap_rights_t rights; #endif struct filedesc *fdp; struct proc *p; struct mqueue *mq; struct file *fp, *fp2; struct mqueue_notifier *nt, *newnt = NULL; int error; AUDIT_ARG_FD(mqd); if (sigev != NULL) { if (sigev->sigev_notify != SIGEV_SIGNAL && sigev->sigev_notify != SIGEV_THREAD_ID && sigev->sigev_notify != SIGEV_NONE) return (EINVAL); if ((sigev->sigev_notify == SIGEV_SIGNAL || sigev->sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(sigev->sigev_signo)) return (EINVAL); } p = td->td_proc; fdp = td->td_proc->p_fd; error = getmq(td, mqd, &fp, NULL, &mq); if (error) return (error); again: FILEDESC_SLOCK(fdp); fp2 = fget_locked(fdp, mqd); if (fp2 == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; goto out; } #ifdef CAPABILITIES error = cap_check(cap_rights(fdp, mqd), cap_rights_init(&rights, CAP_EVENT)); if (error) { FILEDESC_SUNLOCK(fdp); goto out; } #endif if (fp2 != fp) { FILEDESC_SUNLOCK(fdp); error = EBADF; goto out; } mtx_lock(&mq->mq_mutex); FILEDESC_SUNLOCK(fdp); if (sigev != NULL) { if (mq->mq_notifier != NULL) { error = EBUSY; } else { PROC_LOCK(p); nt = notifier_search(p, mqd); if (nt == NULL) { if (newnt == NULL) { PROC_UNLOCK(p); mtx_unlock(&mq->mq_mutex); newnt = notifier_alloc(); goto again; } } if (nt != NULL) { sigqueue_take(&nt->nt_ksi); if (newnt != NULL) { notifier_free(newnt); newnt = NULL; } } else { nt = newnt; newnt = NULL; ksiginfo_init(&nt->nt_ksi); nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT; nt->nt_ksi.ksi_code = SI_MESGQ; nt->nt_proc = p; nt->nt_ksi.ksi_mqd = mqd; notifier_insert(p, nt); } nt->nt_sigev = *sigev; mq->mq_notifier = nt; PROC_UNLOCK(p); /* * If there are no receivers and the message queue * is not empty, we should send the notification * as soon as possible.
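 *
 * Illustrative userspace sketch (not part of the original source; mqd
 * is assumed to come from mq_open()): a process registers with
 * mq_notify(), and only one registration per queue is allowed, so a
 * second registration fails with EBUSY; passing NULL removes this
 * process's registration:
 *
 *	#include <err.h>
 *	#include <mqueue.h>
 *	#include <signal.h>
 *
 *	struct sigevent se = {
 *		.sigev_notify = SIGEV_SIGNAL,
 *		.sigev_signo = SIGUSR1,
 *	};
 *	if (mq_notify(mqd, &se) == -1)
 *		err(1, "mq_notify");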
*/ if (mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) mqueue_send_notification(mq); } } else { notifier_remove(p, mq, mqd); } mtx_unlock(&mq->mq_mutex); out: fdrop(fp, td); if (newnt != NULL) notifier_free(newnt); return (error); } int sys_kmq_notify(struct thread *td, struct kmq_notify_args *uap) { struct sigevent ev, *evp; int error; if (uap->sigev == NULL) { evp = NULL; } else { error = copyin(uap->sigev, &ev, sizeof(ev)); if (error != 0) return (error); evp = &ev; } return (kern_kmq_notify(td, uap->mqd, evp)); } static void mqueue_fdclose(struct thread *td, int fd, struct file *fp) { struct filedesc *fdp; struct mqueue *mq; fdp = td->td_proc->p_fd; FILEDESC_LOCK_ASSERT(fdp); if (fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(td->td_proc, mq, fd); /* have to wakeup thread in same process */ if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } mtx_unlock(&mq->mq_mutex); } } static void mq_proc_exit(void *arg __unused, struct proc *p) { struct filedesc *fdp; struct file *fp; struct mqueue *mq; int i; fdp = p->p_fd; FILEDESC_SLOCK(fdp); for (i = 0; i < fdp->fd_nfiles; ++i) { fp = fget_locked(fdp, i); if (fp != NULL && fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(p, FPTOMQ(fp), i); mtx_unlock(&mq->mq_mutex); } } FILEDESC_SUNLOCK(fdp); KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left")); } static int mqf_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct mqueue *mq = FPTOMQ(fp); int revents = 0; mtx_lock(&mq->mq_mutex); if (events & (POLLIN | POLLRDNORM)) { if (mq->mq_curmsgs) { revents |= events & (POLLIN | POLLRDNORM); } else { mq->mq_flags |= MQ_RSEL; selrecord(td, &mq->mq_rsel); } } if (events & POLLOUT) { if (mq->mq_curmsgs < mq->mq_maxmsg) revents |= POLLOUT; else { mq->mq_flags |= MQ_WSEL; selrecord(td, &mq->mq_wsel); } } mtx_unlock(&mq->mq_mutex); return (revents); } static int mqf_close(struct file *fp, struct thread *td) { struct mqfs_node *pn; fp->f_ops = &badfileops; pn = fp->f_data; fp->f_data = NULL; sx_xlock(&mqfs_data.mi_lock); mqnode_release(pn); sx_xunlock(&mqfs_data.mi_lock); return (0); } static int mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn = fp->f_data; bzero(st, sizeof *st); sx_xlock(&mqfs_data.mi_lock); st->st_atim = pn->mn_atime; st->st_mtim = pn->mn_mtime; st->st_ctim = pn->mn_ctime; st->st_birthtim = pn->mn_birth; st->st_uid = pn->mn_uid; st->st_gid = pn->mn_gid; st->st_mode = S_IFIFO | pn->mn_mode; sx_xunlock(&mqfs_data.mi_lock); return (0); } static int mqf_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn; int error; error = 0; pn = fp->f_data; sx_xlock(&mqfs_data.mi_lock); error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, VADMIN, active_cred, NULL); if (error != 0) goto out; pn->mn_mode = mode & ACCESSPERMS; out: sx_xunlock(&mqfs_data.mi_lock); return (error); } static int mqf_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn; int error; error = 0; pn = fp->f_data; sx_xlock(&mqfs_data.mi_lock); if (uid == (uid_t)-1) uid = pn->mn_uid; if (gid == (gid_t)-1) gid = pn->mn_gid; if (((uid != pn->mn_uid && uid != active_cred->cr_uid) || (gid != pn->mn_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, 
PRIV_VFS_CHOWN, 0))) goto out; pn->mn_uid = uid; pn->mn_gid = gid; out: sx_xunlock(&mqfs_data.mi_lock); return (error); } static int mqf_kqfilter(struct file *fp, struct knote *kn) { struct mqueue *mq = FPTOMQ(fp); int error = 0; if (kn->kn_filter == EVFILT_READ) { kn->kn_fop = &mq_rfiltops; knlist_add(&mq->mq_rsel.si_note, kn, 0); } else if (kn->kn_filter == EVFILT_WRITE) { kn->kn_fop = &mq_wfiltops; knlist_add(&mq->mq_wsel.si_note, kn, 0); } else error = EINVAL; return (error); } static void filt_mqdetach(struct knote *kn) { struct mqueue *mq = FPTOMQ(kn->kn_fp); if (kn->kn_filter == EVFILT_READ) knlist_remove(&mq->mq_rsel.si_note, kn, 0); else if (kn->kn_filter == EVFILT_WRITE) knlist_remove(&mq->mq_wsel.si_note, kn, 0); else panic("filt_mqdetach"); } static int filt_mqread(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs != 0); } static int filt_mqwrite(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs < mq->mq_maxmsg); } static int mqf_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { kif->kf_type = KF_TYPE_MQUEUE; return (0); } static struct fileops mqueueops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = mqf_poll, .fo_kqfilter = mqf_kqfilter, .fo_stat = mqf_stat, .fo_close = mqf_close, .fo_chmod = mqf_chmod, .fo_chown = mqf_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = mqf_fill_kinfo, }; static struct vop_vector mqfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = mqfs_access, .vop_cachedlookup = mqfs_lookup, .vop_lookup = vfs_cache_lookup, .vop_reclaim = mqfs_reclaim, .vop_create = mqfs_create, .vop_remove = mqfs_remove, .vop_inactive = mqfs_inactive, .vop_open = mqfs_open, .vop_close = mqfs_close, .vop_getattr = mqfs_getattr, .vop_setattr = mqfs_setattr, .vop_read = mqfs_read, .vop_write = VOP_EOPNOTSUPP, .vop_readdir = mqfs_readdir, .vop_mkdir = VOP_EOPNOTSUPP, .vop_rmdir = VOP_EOPNOTSUPP }; static struct vfsops mqfs_vfsops = { .vfs_init = mqfs_init, .vfs_uninit = mqfs_uninit, .vfs_mount = mqfs_mount, .vfs_unmount = mqfs_unmount, .vfs_root = mqfs_root, .vfs_statfs = mqfs_statfs, }; static struct vfsconf mqueuefs_vfsconf = { .vfc_version = VFS_VERSION, .vfc_name = "mqueuefs", .vfc_vfsops = &mqfs_vfsops, .vfc_typenum = -1, .vfc_flags = VFCF_SYNTHETIC }; static struct syscall_helper_data mq_syscalls[] = { SYSCALL_INIT_HELPER(kmq_open), SYSCALL_INIT_HELPER_F(kmq_setattr, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_timedsend, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_timedreceive, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_notify, SYF_CAPENABLED), SYSCALL_INIT_HELPER(kmq_unlink), SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include static void mq_attr_from32(const struct mq_attr32 *from, struct mq_attr *to) { to->mq_flags = from->mq_flags; to->mq_maxmsg = from->mq_maxmsg; to->mq_msgsize = from->mq_msgsize; to->mq_curmsgs = from->mq_curmsgs; } static void mq_attr_to32(const struct mq_attr *from, struct mq_attr32 *to) { to->mq_flags = from->mq_flags; to->mq_maxmsg = from->mq_maxmsg; to->mq_msgsize = from->mq_msgsize; to->mq_curmsgs = from->mq_curmsgs; } int freebsd32_kmq_open(struct thread *td, struct freebsd32_kmq_open_args *uap) { struct mq_attr attr; struct mq_attr32 attr32; int flags, error; if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC) return 
(EINVAL); flags = FFLAGS(uap->flags); if ((flags & O_CREAT) != 0 && uap->attr != NULL) { error = copyin(uap->attr, &attr32, sizeof(attr32)); if (error) return (error); mq_attr_from32(&attr32, &attr); } return (kern_kmq_open(td, uap->path, flags, uap->mode, uap->attr != NULL ? &attr : NULL)); } int freebsd32_kmq_setattr(struct thread *td, struct freebsd32_kmq_setattr_args *uap) { struct mq_attr attr, oattr; struct mq_attr32 attr32, oattr32; int error; if (uap->attr != NULL) { error = copyin(uap->attr, &attr32, sizeof(attr32)); if (error != 0) return (error); mq_attr_from32(&attr32, &attr); } error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL, &oattr); if (error == 0 && uap->oattr != NULL) { mq_attr_to32(&oattr, &oattr32); bzero(oattr32.__reserved, sizeof(oattr32.__reserved)); error = copyout(&oattr32, uap->oattr, sizeof(oattr32)); } return (error); } int freebsd32_kmq_timedsend(struct thread *td, struct freebsd32_kmq_timedsend_args *uap) { struct mqueue *mq; struct file *fp; struct timespec32 ets32; struct timespec *abs_timeout, ets; int error; int waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_write(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets32, sizeof(ets32)); if (error != 0) return (error); CP(ets32, ets, tv_sec); CP(ets32, ets, tv_nsec); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_send(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); fdrop(fp, td); return (error); } int freebsd32_kmq_timedreceive(struct thread *td, struct freebsd32_kmq_timedreceive_args *uap) { struct mqueue *mq; struct file *fp; struct timespec32 ets32; struct timespec *abs_timeout, ets; int error, waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_read(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets32, sizeof(ets32)); if (error != 0) return (error); CP(ets32, ets, tv_sec); CP(ets32, ets, tv_nsec); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); fdrop(fp, td); return (error); } int freebsd32_kmq_notify(struct thread *td, struct freebsd32_kmq_notify_args *uap) { struct sigevent ev, *evp; struct sigevent32 ev32; int error; if (uap->sigev == NULL) { evp = NULL; } else { error = copyin(uap->sigev, &ev32, sizeof(ev32)); if (error != 0) return (error); error = convert_sigevent32(&ev32, &ev); if (error != 0) return (error); evp = &ev; } return (kern_kmq_notify(td, uap->mqd, evp)); } static struct syscall_helper_data mq32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_kmq_open), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_setattr, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedsend, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedreceive, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_notify, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_COMPAT(kmq_unlink), SYSCALL_INIT_LAST }; #endif static int mqinit(void) { int error; error = syscall_helper_register(mq_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(mq32_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #endif return (0); } static int mqunload(void) { #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(mq32_syscalls); #endif syscall_helper_unregister(mq_syscalls); return (0); } static int 
mq_modload(struct module *module, int cmd, void *arg) { int error = 0; error = vfs_modevent(module, cmd, arg); if (error != 0) return (error); switch (cmd) { case MOD_LOAD: error = mqinit(); if (error != 0) mqunload(); break; case MOD_UNLOAD: error = mqunload(); break; default: break; } return (error); } static moduledata_t mqueuefs_mod = { "mqueuefs", mq_modload, &mqueuefs_vfsconf }; DECLARE_MODULE(mqueuefs, mqueuefs_mod, SI_SUB_VFS, SI_ORDER_MIDDLE); MODULE_VERSION(mqueuefs, 1); Index: head/sys/kern/uipc_sem.c =================================================================== --- head/sys/kern/uipc_sem.c (revision 326270) +++ head/sys/kern/uipc_sem.c (revision 326271) @@ -1,1111 +1,1113 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2002 Alfred Perlstein * Copyright (c) 2003-2005 SPARTA, Inc. * Copyright (c) 2005, 2016-2017 Robert N. M. Watson * All rights reserved. * * This software was developed for the FreeBSD Project in part by Network * Associates Laboratories, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), * as part of the DARPA CHATS research program. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_posix.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(p1003_1b_semaphores, "POSIX P1003.1B semaphores support"); /* * TODO * * - Resource limits? * - Replace global sem_lock with mtx_pool locks? * - Add a MAC check_create() hook for creating new named semaphores. 
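 *
 * For orientation, a hedged sketch of the userland round trip these
 * syscalls ultimately serve; the name "/example" is an arbitrary
 * illustration, not anything this file defines:
 *
 *	#include <err.h>
 *	#include <fcntl.h>
 *	#include <semaphore.h>
 *
 *	int
 *	main(void)
 *	{
 *		sem_t *sem;
 *
 *		sem = sem_open("/example", O_CREAT, 0600, 1);
 *		if (sem == SEM_FAILED)
 *			err(1, "sem_open");
 *		if (sem_wait(sem) == -1)
 *			err(1, "sem_wait");
 *		if (sem_post(sem) == -1)
 *			err(1, "sem_post");
 *		sem_close(sem);
 *		sem_unlink("/example");
 *		return (0);
 *	}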
*/ #ifndef SEM_MAX #define SEM_MAX 30 #endif #ifdef SEM_DEBUG #define DP(x) printf x #else #define DP(x) #endif struct ksem_mapping { char *km_path; Fnv32_t km_fnv; struct ksem *km_ksem; LIST_ENTRY(ksem_mapping) km_link; }; static MALLOC_DEFINE(M_KSEM, "ksem", "semaphore file descriptor"); static LIST_HEAD(, ksem_mapping) *ksem_dictionary; static struct sx ksem_dict_lock; static struct mtx ksem_count_lock; static struct mtx sem_lock; static u_long ksem_hash; static int ksem_dead; #define KSEM_HASH(fnv) (&ksem_dictionary[(fnv) & ksem_hash]) static int nsems = 0; SYSCTL_DECL(_p1003_1b); SYSCTL_INT(_p1003_1b, OID_AUTO, nsems, CTLFLAG_RD, &nsems, 0, "Number of active kernel POSIX semaphores"); static int kern_sem_wait(struct thread *td, semid_t id, int tryflag, struct timespec *abstime); static int ksem_access(struct ksem *ks, struct ucred *ucred); static struct ksem *ksem_alloc(struct ucred *ucred, mode_t mode, unsigned int value); static int ksem_create(struct thread *td, const char *path, semid_t *semidp, mode_t mode, unsigned int value, int flags, int compat32); static void ksem_drop(struct ksem *ks); static int ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp, struct file **fpp); static struct ksem *ksem_hold(struct ksem *ks); static void ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks); static struct ksem *ksem_lookup(char *path, Fnv32_t fnv); static void ksem_module_destroy(void); static int ksem_module_init(void); static int ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred); static int sem_modload(struct module *module, int cmd, void *arg); static fo_stat_t ksem_stat; static fo_close_t ksem_closef; static fo_chmod_t ksem_chmod; static fo_chown_t ksem_chown; static fo_fill_kinfo_t ksem_fill_kinfo; /* File descriptor operations. */ static struct fileops ksem_ops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = invfo_poll, .fo_kqfilter = invfo_kqfilter, .fo_stat = ksem_stat, .fo_close = ksem_closef, .fo_chmod = ksem_chmod, .fo_chown = ksem_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = ksem_fill_kinfo, .fo_flags = DFLAG_PASSABLE }; FEATURE(posix_sem, "POSIX semaphores"); static int ksem_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { struct ksem *ks; #ifdef MAC int error; #endif ks = fp->f_data; #ifdef MAC error = mac_posixsem_check_stat(active_cred, fp->f_cred, ks); if (error) return (error); #endif /* * Attempt to return sanish values for fstat() on a semaphore * file descriptor. 
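 *
 * Only the fields set below carry meaning; st_size and st_blocks stay
 * zero.  A hedged caller sketch, relying on the fact that in this
 * implementation the semid_t copied out to userland is also a file
 * descriptor (the helper name is illustrative):
 *
 *	#include <sys/stat.h>
 *	#include <stdio.h>
 *
 *	static void
 *	show_sem_owner(int semid)
 *	{
 *		struct stat sb;
 *
 *		if (fstat(semid, &sb) == 0)
 *			printf("uid %d mode %o\n", (int)sb.st_uid,
 *			    (unsigned)(sb.st_mode & 0777));
 *	}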
*/ bzero(sb, sizeof(*sb)); mtx_lock(&sem_lock); sb->st_atim = ks->ks_atime; sb->st_ctim = ks->ks_ctime; sb->st_mtim = ks->ks_mtime; sb->st_birthtim = ks->ks_birthtime; sb->st_uid = ks->ks_uid; sb->st_gid = ks->ks_gid; sb->st_mode = S_IFREG | ks->ks_mode; /* XXX */ mtx_unlock(&sem_lock); return (0); } static int ksem_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct ksem *ks; int error; error = 0; ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_setmode(active_cred, ks, mode); if (error != 0) goto out; #endif error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid, VADMIN, active_cred, NULL); if (error != 0) goto out; ks->ks_mode = mode & ACCESSPERMS; out: mtx_unlock(&sem_lock); return (error); } static int ksem_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct ksem *ks; int error; error = 0; ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_setowner(active_cred, ks, uid, gid); if (error != 0) goto out; #endif if (uid == (uid_t)-1) uid = ks->ks_uid; if (gid == (gid_t)-1) gid = ks->ks_gid; if (((uid != ks->ks_uid && uid != active_cred->cr_uid) || (gid != ks->ks_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0))) goto out; ks->ks_uid = uid; ks->ks_gid = gid; out: mtx_unlock(&sem_lock); return (error); } static int ksem_closef(struct file *fp, struct thread *td) { struct ksem *ks; ks = fp->f_data; fp->f_data = NULL; ksem_drop(ks); return (0); } static int ksem_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { const char *path, *pr_path; struct ksem *ks; size_t pr_pathlen; kif->kf_type = KF_TYPE_SEM; ks = fp->f_data; mtx_lock(&sem_lock); kif->kf_un.kf_sem.kf_sem_value = ks->ks_value; kif->kf_un.kf_sem.kf_sem_mode = S_IFREG | ks->ks_mode; /* XXX */ mtx_unlock(&sem_lock); if (ks->ks_path != NULL) { sx_slock(&ksem_dict_lock); if (ks->ks_path != NULL) { path = ks->ks_path; pr_path = curthread->td_ucred->cr_prison->pr_path; if (strcmp(pr_path, "/") != 0) { /* Return the jail-rooted pathname. */ pr_pathlen = strlen(pr_path); if (strncmp(path, pr_path, pr_pathlen) == 0 && path[pr_pathlen] == '/') path += pr_pathlen; } strlcpy(kif->kf_path, path, sizeof(kif->kf_path)); } sx_sunlock(&ksem_dict_lock); } return (0); } /* * ksem object management including creation and reference counting * routines. */ static struct ksem * ksem_alloc(struct ucred *ucred, mode_t mode, unsigned int value) { struct ksem *ks; mtx_lock(&ksem_count_lock); if (nsems == p31b_getcfg(CTL_P1003_1B_SEM_NSEMS_MAX) || ksem_dead) { mtx_unlock(&ksem_count_lock); return (NULL); } nsems++; mtx_unlock(&ksem_count_lock); ks = malloc(sizeof(*ks), M_KSEM, M_WAITOK | M_ZERO); ks->ks_uid = ucred->cr_uid; ks->ks_gid = ucred->cr_gid; ks->ks_mode = mode; ks->ks_value = value; cv_init(&ks->ks_cv, "ksem"); vfs_timestamp(&ks->ks_birthtime); ks->ks_atime = ks->ks_mtime = ks->ks_ctime = ks->ks_birthtime; refcount_init(&ks->ks_ref, 1); #ifdef MAC mac_posixsem_init(ks); mac_posixsem_create(ucred, ks); #endif return (ks); } static struct ksem * ksem_hold(struct ksem *ks) { refcount_acquire(&ks->ks_ref); return (ks); } static void ksem_drop(struct ksem *ks) { if (refcount_release(&ks->ks_ref)) { #ifdef MAC mac_posixsem_destroy(ks); #endif cv_destroy(&ks->ks_cv); free(ks, M_KSEM); mtx_lock(&ksem_count_lock); nsems--; mtx_unlock(&ksem_count_lock); } } /* * Determine if the credentials have sufficient permissions for read * and write access. 
*/ static int ksem_access(struct ksem *ks, struct ucred *ucred) { int error; error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid, VREAD | VWRITE, ucred, NULL); if (error) error = priv_check_cred(ucred, PRIV_SEM_WRITE, 0); return (error); } /* * Dictionary management. We maintain an in-kernel dictionary to map * paths to semaphore objects. We use the FNV hash on the path to * store the mappings in a hash table. */ static struct ksem * ksem_lookup(char *path, Fnv32_t fnv) { struct ksem_mapping *map; LIST_FOREACH(map, KSEM_HASH(fnv), km_link) { if (map->km_fnv != fnv) continue; if (strcmp(map->km_path, path) == 0) return (map->km_ksem); } return (NULL); } static void ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks) { struct ksem_mapping *map; map = malloc(sizeof(struct ksem_mapping), M_KSEM, M_WAITOK); map->km_path = path; map->km_fnv = fnv; map->km_ksem = ksem_hold(ks); ks->ks_path = path; LIST_INSERT_HEAD(KSEM_HASH(fnv), map, km_link); } static int ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred) { struct ksem_mapping *map; int error; LIST_FOREACH(map, KSEM_HASH(fnv), km_link) { if (map->km_fnv != fnv) continue; if (strcmp(map->km_path, path) == 0) { #ifdef MAC error = mac_posixsem_check_unlink(ucred, map->km_ksem); if (error) return (error); #endif error = ksem_access(map->km_ksem, ucred); if (error) return (error); map->km_ksem->ks_path = NULL; LIST_REMOVE(map, km_link); ksem_drop(map->km_ksem); free(map->km_path, M_KSEM); free(map, M_KSEM); return (0); } } return (ENOENT); } static int ksem_create_copyout_semid(struct thread *td, semid_t *semidp, int fd, int compat32) { semid_t semid; #ifdef COMPAT_FREEBSD32 int32_t semid32; #endif void *ptr; size_t ptrs; #ifdef COMPAT_FREEBSD32 if (compat32) { semid32 = fd; ptr = &semid32; ptrs = sizeof(semid32); } else { #endif semid = fd; ptr = &semid; ptrs = sizeof(semid); compat32 = 0; /* silence gcc */ #ifdef COMPAT_FREEBSD32 } #endif return (copyout(ptr, semidp, ptrs)); } /* Other helper routines. */ static int ksem_create(struct thread *td, const char *name, semid_t *semidp, mode_t mode, unsigned int value, int flags, int compat32) { struct filedesc *fdp; struct ksem *ks; struct file *fp; char *path; const char *pr_path; size_t pr_pathlen; Fnv32_t fnv; int error, fd; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); AUDIT_ARG_VALUE(value); if (value > SEM_VALUE_MAX) return (EINVAL); fdp = td->td_proc->p_fd; mode = (mode & ~fdp->fd_cmask) & ACCESSPERMS; error = falloc(td, &fp, &fd, O_CLOEXEC); if (error) { if (name == NULL) error = ENOSPC; return (error); } /* * Go ahead and copyout the file descriptor now. This is a bit * premature, but it is a lot easier to handle errors as opposed * to later when we've possibly created a new semaphore, etc. */ error = ksem_create_copyout_semid(td, semidp, fd, compat32); if (error) { fdclose(td, fp, fd); fdrop(fp, td); return (error); } if (name == NULL) { /* Create an anonymous semaphore. */ ks = ksem_alloc(td->td_ucred, mode, value); if (ks == NULL) error = ENOSPC; else ks->ks_flags |= KS_ANONYMOUS; } else { path = malloc(MAXPATHLEN, M_KSEM, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; /* Construct a full pathname for jailed callers. */ pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(name, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); /* Require paths to start with a '/' character. 
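 *
 * The check runs after the jail prefix has been prepended, so jailed
 * and unjailed callers see the same rule.  Illustrative consequence
 * (hypothetical names):
 *
 *	sem_open("/worker", O_CREAT, 0600, 0);	succeeds
 *	sem_open("worker", O_CREAT, 0600, 0);	fails with EINVAL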
*/ if (error == 0 && path[pr_pathlen] != '/') error = EINVAL; if (error) { fdclose(td, fp, fd); fdrop(fp, td); free(path, M_KSEM); return (error); } AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&ksem_dict_lock); ks = ksem_lookup(path, fnv); if (ks == NULL) { /* Object does not exist, create it if requested. */ if (flags & O_CREAT) { ks = ksem_alloc(td->td_ucred, mode, value); if (ks == NULL) error = ENFILE; else { ksem_insert(path, fnv, ks); path = NULL; } } else error = ENOENT; } else { /* * Object already exists, obtain a new * reference if requested and permitted. */ if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) error = EEXIST; else { #ifdef MAC error = mac_posixsem_check_open(td->td_ucred, ks); if (error == 0) #endif error = ksem_access(ks, td->td_ucred); } if (error == 0) ksem_hold(ks); #ifdef INVARIANTS else ks = NULL; #endif } sx_xunlock(&ksem_dict_lock); if (path) free(path, M_KSEM); } if (error) { KASSERT(ks == NULL, ("ksem_create error with a ksem")); fdclose(td, fp, fd); fdrop(fp, td); return (error); } KASSERT(ks != NULL, ("ksem_create w/o a ksem")); finit(fp, FREAD | FWRITE, DTYPE_SEM, ks, &ksem_ops); fdrop(fp, td); return (0); } static int ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp, struct file **fpp) { struct ksem *ks; struct file *fp; int error; error = fget(td, id, rightsp, &fp); if (error) return (EINVAL); if (fp->f_type != DTYPE_SEM) { fdrop(fp, td); return (EINVAL); } ks = fp->f_data; if (ks->ks_flags & KS_DEAD) { fdrop(fp, td); return (EINVAL); } *fpp = fp; return (0); } /* System calls. */ #ifndef _SYS_SYSPROTO_H_ struct ksem_init_args { unsigned int value; semid_t *idp; }; #endif int sys_ksem_init(struct thread *td, struct ksem_init_args *uap) { return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value, 0, 0)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_open_args { char *name; int oflag; mode_t mode; unsigned int value; semid_t *idp; }; #endif int sys_ksem_open(struct thread *td, struct ksem_open_args *uap) { DP((">>> ksem_open start, pid=%d\n", (int)td->td_proc->p_pid)); if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0) return (EINVAL); return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value, uap->oflag, 0)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_unlink_args { char *name; }; #endif int sys_ksem_unlink(struct thread *td, struct ksem_unlink_args *uap) { char *path; const char *pr_path; size_t pr_pathlen; Fnv32_t fnv; int error; path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(uap->name, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); if (error) { free(path, M_TEMP); return (error); } AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&ksem_dict_lock); error = ksem_remove(path, fnv, td->td_ucred); sx_xunlock(&ksem_dict_lock); free(path, M_TEMP); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_close_args { semid_t id; }; #endif int sys_ksem_close(struct thread *td, struct ksem_close_args *uap) { cap_rights_t rights; struct ksem *ks; struct file *fp; int error; /* No capability rights required to close a semaphore. 
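 *
 * In Capsicum terms: close(2) works even on a descriptor whose rights
 * set has been emptied, while ksem_post() on the same descriptor would
 * fail for lack of CAP_SEM_POST.  A hedged sketch (the
 * cap_rights_limit(2) usage is illustrative):
 *
 *	cap_rights_t rights;
 *
 *	cap_rights_init(&rights);		empty rights set
 *	cap_rights_limit(semid, &rights);
 *	close(semid);				still succeeds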
*/ AUDIT_ARG_FD(uap->id); error = ksem_get(td, uap->id, cap_rights_init(&rights), &fp); if (error) return (error); ks = fp->f_data; if (ks->ks_flags & KS_ANONYMOUS) { fdrop(fp, td); return (EINVAL); } error = kern_close(td, uap->id); fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_post_args { semid_t id; }; #endif int sys_ksem_post(struct thread *td, struct ksem_post_args *uap) { cap_rights_t rights; struct file *fp; struct ksem *ks; int error; AUDIT_ARG_FD(uap->id); error = ksem_get(td, uap->id, cap_rights_init(&rights, CAP_SEM_POST), &fp); if (error) return (error); ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_post(td->td_ucred, fp->f_cred, ks); if (error) goto err; #endif if (ks->ks_value == SEM_VALUE_MAX) { error = EOVERFLOW; goto err; } ++ks->ks_value; if (ks->ks_waiters > 0) cv_signal(&ks->ks_cv); error = 0; vfs_timestamp(&ks->ks_ctime); err: mtx_unlock(&sem_lock); fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_wait_args { semid_t id; }; #endif int sys_ksem_wait(struct thread *td, struct ksem_wait_args *uap) { return (kern_sem_wait(td, uap->id, 0, NULL)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_timedwait_args { semid_t id; const struct timespec *abstime; }; #endif int sys_ksem_timedwait(struct thread *td, struct ksem_timedwait_args *uap) { struct timespec abstime; struct timespec *ts; int error; /* * We allow a null timespec (wait forever). */ if (uap->abstime == NULL) ts = NULL; else { error = copyin(uap->abstime, &abstime, sizeof(abstime)); if (error != 0) return (error); if (abstime.tv_nsec >= 1000000000 || abstime.tv_nsec < 0) return (EINVAL); ts = &abstime; } return (kern_sem_wait(td, uap->id, 0, ts)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_trywait_args { semid_t id; }; #endif int sys_ksem_trywait(struct thread *td, struct ksem_trywait_args *uap) { return (kern_sem_wait(td, uap->id, 1, NULL)); } static int kern_sem_wait(struct thread *td, semid_t id, int tryflag, struct timespec *abstime) { struct timespec ts1, ts2; struct timeval tv; cap_rights_t rights; struct file *fp; struct ksem *ks; int error; DP((">>> kern_sem_wait entered! pid=%d\n", (int)td->td_proc->p_pid)); AUDIT_ARG_FD(id); error = ksem_get(td, id, cap_rights_init(&rights, CAP_SEM_WAIT), &fp); if (error) return (error); ks = fp->f_data; mtx_lock(&sem_lock); DP((">>> kern_sem_wait critical section entered! 
pid=%d\n", (int)td->td_proc->p_pid)); #ifdef MAC error = mac_posixsem_check_wait(td->td_ucred, fp->f_cred, ks); if (error) { DP(("kern_sem_wait mac failed\n")); goto err; } #endif DP(("kern_sem_wait value = %d, tryflag %d\n", ks->ks_value, tryflag)); vfs_timestamp(&ks->ks_atime); while (ks->ks_value == 0) { ks->ks_waiters++; if (tryflag != 0) error = EAGAIN; else if (abstime == NULL) error = cv_wait_sig(&ks->ks_cv, &sem_lock); else { for (;;) { ts1 = *abstime; getnanotime(&ts2); timespecsub(&ts1, &ts2); TIMESPEC_TO_TIMEVAL(&tv, &ts1); if (tv.tv_sec < 0) { error = ETIMEDOUT; break; } error = cv_timedwait_sig(&ks->ks_cv, &sem_lock, tvtohz(&tv)); if (error != EWOULDBLOCK) break; } } ks->ks_waiters--; if (error) goto err; } ks->ks_value--; DP(("kern_sem_wait value post-decrement = %d\n", ks->ks_value)); error = 0; err: mtx_unlock(&sem_lock); fdrop(fp, td); DP(("<<< kern_sem_wait leaving, pid=%d, error = %d\n", (int)td->td_proc->p_pid, error)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_getvalue_args { semid_t id; int *val; }; #endif int sys_ksem_getvalue(struct thread *td, struct ksem_getvalue_args *uap) { cap_rights_t rights; struct file *fp; struct ksem *ks; int error, val; AUDIT_ARG_FD(uap->id); error = ksem_get(td, uap->id, cap_rights_init(&rights, CAP_SEM_GETVALUE), &fp); if (error) return (error); ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_getvalue(td->td_ucred, fp->f_cred, ks); if (error) { mtx_unlock(&sem_lock); fdrop(fp, td); return (error); } #endif val = ks->ks_value; vfs_timestamp(&ks->ks_atime); mtx_unlock(&sem_lock); fdrop(fp, td); error = copyout(&val, uap->val, sizeof(val)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_destroy_args { semid_t id; }; #endif int sys_ksem_destroy(struct thread *td, struct ksem_destroy_args *uap) { cap_rights_t rights; struct file *fp; struct ksem *ks; int error; /* No capability rights required to close a semaphore. */ AUDIT_ARG_FD(uap->id); error = ksem_get(td, uap->id, cap_rights_init(&rights), &fp); if (error) return (error); ks = fp->f_data; if (!(ks->ks_flags & KS_ANONYMOUS)) { fdrop(fp, td); return (EINVAL); } mtx_lock(&sem_lock); if (ks->ks_waiters != 0) { mtx_unlock(&sem_lock); error = EBUSY; goto err; } ks->ks_flags |= KS_DEAD; mtx_unlock(&sem_lock); error = kern_close(td, uap->id); err: fdrop(fp, td); return (error); } static struct syscall_helper_data ksem_syscalls[] = { SYSCALL_INIT_HELPER(ksem_init), SYSCALL_INIT_HELPER(ksem_open), SYSCALL_INIT_HELPER(ksem_unlink), SYSCALL_INIT_HELPER(ksem_close), SYSCALL_INIT_HELPER(ksem_post), SYSCALL_INIT_HELPER(ksem_wait), SYSCALL_INIT_HELPER(ksem_timedwait), SYSCALL_INIT_HELPER(ksem_trywait), SYSCALL_INIT_HELPER(ksem_getvalue), SYSCALL_INIT_HELPER(ksem_destroy), SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include int freebsd32_ksem_init(struct thread *td, struct freebsd32_ksem_init_args *uap) { return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value, 0, 1)); } int freebsd32_ksem_open(struct thread *td, struct freebsd32_ksem_open_args *uap) { if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0) return (EINVAL); return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value, uap->oflag, 1)); } int freebsd32_ksem_timedwait(struct thread *td, struct freebsd32_ksem_timedwait_args *uap) { struct timespec32 abstime32; struct timespec *ts, abstime; int error; /* * We allow a null timespec (wait forever). 
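 *
 * The timeout is an absolute wall-clock deadline, as with
 * sem_timedwait(3).  A hedged userland sketch of constructing one;
 * the two-second budget is an arbitrary example:
 *
 *	#include <err.h>
 *	#include <errno.h>
 *	#include <semaphore.h>
 *	#include <time.h>
 *
 *	struct timespec abstime;
 *
 *	clock_gettime(CLOCK_REALTIME, &abstime);
 *	abstime.tv_sec += 2;
 *	if (sem_timedwait(sem, &abstime) == -1 && errno == ETIMEDOUT)
 *		warnx("semaphore wait timed out");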
*/ if (uap->abstime == NULL) ts = NULL; else { error = copyin(uap->abstime, &abstime32, sizeof(abstime32)); if (error != 0) return (error); CP(abstime32, abstime, tv_sec); CP(abstime32, abstime, tv_nsec); if (abstime.tv_nsec >= 1000000000 || abstime.tv_nsec < 0) return (EINVAL); ts = &abstime; } return (kern_sem_wait(td, uap->id, 0, ts)); } static struct syscall_helper_data ksem32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_ksem_init), SYSCALL32_INIT_HELPER(freebsd32_ksem_open), SYSCALL32_INIT_HELPER_COMPAT(ksem_unlink), SYSCALL32_INIT_HELPER_COMPAT(ksem_close), SYSCALL32_INIT_HELPER_COMPAT(ksem_post), SYSCALL32_INIT_HELPER_COMPAT(ksem_wait), SYSCALL32_INIT_HELPER(freebsd32_ksem_timedwait), SYSCALL32_INIT_HELPER_COMPAT(ksem_trywait), SYSCALL32_INIT_HELPER_COMPAT(ksem_getvalue), SYSCALL32_INIT_HELPER_COMPAT(ksem_destroy), SYSCALL_INIT_LAST }; #endif static int ksem_module_init(void) { int error; mtx_init(&sem_lock, "sem", NULL, MTX_DEF); mtx_init(&ksem_count_lock, "ksem count", NULL, MTX_DEF); sx_init(&ksem_dict_lock, "ksem dictionary"); ksem_dictionary = hashinit(1024, M_KSEM, &ksem_hash); p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 200112L); p31b_setcfg(CTL_P1003_1B_SEM_NSEMS_MAX, SEM_MAX); p31b_setcfg(CTL_P1003_1B_SEM_VALUE_MAX, SEM_VALUE_MAX); error = syscall_helper_register(ksem_syscalls, SY_THR_STATIC_KLD); if (error) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(ksem32_syscalls, SY_THR_STATIC_KLD); if (error) return (error); #endif return (0); } static void ksem_module_destroy(void) { #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(ksem32_syscalls); #endif syscall_helper_unregister(ksem_syscalls); p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 0); hashdestroy(ksem_dictionary, M_KSEM, ksem_hash); sx_destroy(&ksem_dict_lock); mtx_destroy(&ksem_count_lock); mtx_destroy(&sem_lock); p31b_unsetcfg(CTL_P1003_1B_SEM_VALUE_MAX); p31b_unsetcfg(CTL_P1003_1B_SEM_NSEMS_MAX); } static int sem_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: error = ksem_module_init(); if (error) ksem_module_destroy(); break; case MOD_UNLOAD: mtx_lock(&ksem_count_lock); if (nsems != 0) { error = EOPNOTSUPP; mtx_unlock(&ksem_count_lock); break; } ksem_dead = 1; mtx_unlock(&ksem_count_lock); ksem_module_destroy(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sem_mod = { "sem", &sem_modload, NULL }; DECLARE_MODULE(sem, sem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST); MODULE_VERSION(sem, 1); Index: head/sys/kern/uipc_shm.c =================================================================== --- head/sys/kern/uipc_shm.c (revision 326270) +++ head/sys/kern/uipc_shm.c (revision 326271) @@ -1,1119 +1,1121 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2006, 2011, 2016-2017 Robert N. M. Watson * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Support for shared swap-backed anonymous memory objects via * shm_open(2) and shm_unlink(2). While most of the implementation is * here, vm_mmap.c contains mapping logic changes. * * TODO: * * (1) Need to export data to a userland tool via a sysctl. Should ipcs(1) * and ipcrm(1) be expanded or should new tools to manage both POSIX * kernel semaphores and POSIX shared memory be written? * * (2) Add support for this file type to fstat(1). * * (3) Resource limits? Does this need its own resource limits or are the * existing limits in mmap(2) sufficient? */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct shm_mapping { char *sm_path; Fnv32_t sm_fnv; struct shmfd *sm_shmfd; LIST_ENTRY(shm_mapping) sm_link; }; static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor"); static LIST_HEAD(, shm_mapping) *shm_dictionary; static struct sx shm_dict_lock; static struct mtx shm_timestamp_lock; static u_long shm_hash; static struct unrhdr *shm_ino_unr; static dev_t shm_dev_ino; #define SHM_HASH(fnv) (&shm_dictionary[(fnv) & shm_hash]) static void shm_init(void *arg); static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd); static struct shmfd *shm_lookup(char *path, Fnv32_t fnv); static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred); static fo_rdwr_t shm_read; static fo_rdwr_t shm_write; static fo_truncate_t shm_truncate; static fo_stat_t shm_stat; static fo_close_t shm_close; static fo_chmod_t shm_chmod; static fo_chown_t shm_chown; static fo_seek_t shm_seek; static fo_fill_kinfo_t shm_fill_kinfo; static fo_mmap_t shm_mmap; /* File descriptor operations. 
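 *
 * A hedged userland sketch of the lifecycle these operations serve;
 * the object name and 4096-byte size are arbitrary examples:
 *
 *	#include <sys/mman.h>
 *	#include <err.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		char *p;
 *		int fd;
 *
 *		fd = shm_open("/example", O_RDWR | O_CREAT, 0600);
 *		if (fd == -1)
 *			err(1, "shm_open");
 *		if (ftruncate(fd, 4096) == -1)
 *			err(1, "ftruncate");
 *		p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
 *		    fd, 0);
 *		if (p == MAP_FAILED)
 *			err(1, "mmap");
 *		p[0] = 1;
 *		munmap(p, 4096);
 *		close(fd);
 *		shm_unlink("/example");
 *		return (0);
 *	}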
*/ struct fileops shm_ops = { .fo_read = shm_read, .fo_write = shm_write, .fo_truncate = shm_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = invfo_poll, .fo_kqfilter = invfo_kqfilter, .fo_stat = shm_stat, .fo_close = shm_close, .fo_chmod = shm_chmod, .fo_chown = shm_chown, .fo_sendfile = vn_sendfile, .fo_seek = shm_seek, .fo_fill_kinfo = shm_fill_kinfo, .fo_mmap = shm_mmap, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; FEATURE(posix_shm, "POSIX shared memory"); static int uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio) { vm_page_t m; vm_pindex_t idx; size_t tlen; int error, offset, rv; idx = OFF_TO_IDX(uio->uio_offset); offset = uio->uio_offset & PAGE_MASK; tlen = MIN(PAGE_SIZE - offset, len); VM_OBJECT_WLOCK(obj); /* * Read I/O without either a corresponding resident page or swap * page: use zero_region. This is intended to avoid instantiating * pages on read from a sparse region. */ if (uio->uio_rw == UIO_READ && vm_page_lookup(obj, idx) == NULL && !vm_pager_has_page(obj, idx, NULL, NULL)) { VM_OBJECT_WUNLOCK(obj); return (uiomove(__DECONST(void *, zero_region), tlen, uio)); } /* * Parallel reads of the page content from disk are prevented * by exclusive busy. * * Although the tmpfs vnode lock is held here, it is * nonetheless safe to sleep waiting for a free page. The * pageout daemon does not need to acquire the tmpfs vnode * lock to page out tobj's pages because tobj is a OBJT_SWAP * type object. */ m = vm_page_grab(obj, idx, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY); if (m->valid != VM_PAGE_BITS_ALL) { vm_page_xbusy(m); if (vm_pager_has_page(obj, idx, NULL, NULL)) { rv = vm_pager_get_pages(obj, &m, 1, NULL, NULL); if (rv != VM_PAGER_OK) { printf( "uiomove_object: vm_obj %p idx %jd valid %x pager error %d\n", obj, idx, m->valid, rv); vm_page_lock(m); vm_page_free(m); vm_page_unlock(m); VM_OBJECT_WUNLOCK(obj); return (EIO); } } else vm_page_zero_invalid(m, TRUE); vm_page_xunbusy(m); } vm_page_lock(m); vm_page_hold(m); if (vm_page_active(m)) vm_page_reference(m); else vm_page_activate(m); vm_page_unlock(m); VM_OBJECT_WUNLOCK(obj); error = uiomove_fromphys(&m, offset, tlen, uio); if (uio->uio_rw == UIO_WRITE && error == 0) { VM_OBJECT_WLOCK(obj); vm_page_dirty(m); vm_pager_page_unswapped(m); VM_OBJECT_WUNLOCK(obj); } vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); return (error); } int uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio) { ssize_t resid; size_t len; int error; error = 0; while ((resid = uio->uio_resid) > 0) { if (obj_size <= uio->uio_offset) break; len = MIN(obj_size - uio->uio_offset, resid); if (len == 0) break; error = uiomove_object_page(obj, len, uio); if (error != 0 || resid == uio->uio_resid) break; } return (error); } static int shm_seek(struct file *fp, off_t offset, int whence, struct thread *td) { struct shmfd *shmfd; off_t foffset; int error; shmfd = fp->f_data; foffset = foffset_lock(fp, 0); error = 0; switch (whence) { case L_INCR: if (foffset < 0 || (offset > 0 && foffset > OFF_MAX - offset)) { error = EOVERFLOW; break; } offset += foffset; break; case L_XTND: if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) { error = EOVERFLOW; break; } offset += shmfd->shm_size; break; case L_SET: break; default: error = EINVAL; } if (error == 0) { if (offset < 0 || offset > shmfd->shm_size) error = EINVAL; else td->td_uretoff.tdu_off = offset; } foffset_unlock(fp, offset, error != 0 ? 
FOF_NOUPDATE : 0); return (error); } static int shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct shmfd *shmfd; void *rl_cookie; int error; shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif foffset_lock_uio(fp, uio, flags); rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset, uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx); error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); foffset_unlock_uio(fp, uio, flags); return (error); } static int shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct shmfd *shmfd; void *rl_cookie; int error; shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif foffset_lock_uio(fp, uio, flags); if ((flags & FOF_OFFSET) == 0) { rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, &shmfd->shm_mtx); } else { rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset, uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx); } error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); foffset_unlock_uio(fp, uio, flags); return (error); } static int shm_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; #ifdef MAC int error; #endif shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif return (shm_dotruncate(shmfd, length)); } static int shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; #ifdef MAC int error; #endif shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif /* * Attempt to return sanish values for fstat() on a memory file * descriptor. */ bzero(sb, sizeof(*sb)); sb->st_blksize = PAGE_SIZE; sb->st_size = shmfd->shm_size; sb->st_blocks = howmany(sb->st_size, sb->st_blksize); mtx_lock(&shm_timestamp_lock); sb->st_atim = shmfd->shm_atime; sb->st_ctim = shmfd->shm_ctime; sb->st_mtim = shmfd->shm_mtime; sb->st_birthtim = shmfd->shm_birthtime; sb->st_mode = S_IFREG | shmfd->shm_mode; /* XXX */ sb->st_uid = shmfd->shm_uid; sb->st_gid = shmfd->shm_gid; mtx_unlock(&shm_timestamp_lock); sb->st_dev = shm_dev_ino; sb->st_ino = shmfd->shm_ino; return (0); } static int shm_close(struct file *fp, struct thread *td) { struct shmfd *shmfd; shmfd = fp->f_data; fp->f_data = NULL; shm_drop(shmfd); return (0); } int shm_dotruncate(struct shmfd *shmfd, off_t length) { vm_object_t object; vm_page_t m; vm_pindex_t idx, nobjsize; vm_ooffset_t delta; int base, rv; KASSERT(length >= 0, ("shm_dotruncate: length < 0")); object = shmfd->shm_object; VM_OBJECT_WLOCK(object); if (length == shmfd->shm_size) { VM_OBJECT_WUNLOCK(object); return (0); } nobjsize = OFF_TO_IDX(length + PAGE_MASK); /* Are we shrinking? If so, trim the end. */ if (length < shmfd->shm_size) { /* * Disallow any requests to shrink the size if this * object is mapped into the kernel. */ if (shmfd->shm_kmappings > 0) { VM_OBJECT_WUNLOCK(object); return (EBUSY); } /* * Zero the truncated part of the last page. 
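 *
 * The zeroing matters because a later ftruncate(2) that grows the
 * object again must expose zeros rather than stale data, e.g.
 * (illustrative sizes, assuming 4096-byte pages):
 *
 *	ftruncate(fd, 10);	tail of the first page is zeroed here
 *	ftruncate(fd, 4096);	bytes 10..4095 must now read back as 0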
*/ base = length & PAGE_MASK; if (base != 0) { idx = OFF_TO_IDX(length); retry: m = vm_page_lookup(object, idx); if (m != NULL) { if (vm_page_sleep_if_busy(m, "shmtrc")) goto retry; } else if (vm_pager_has_page(object, idx, NULL, NULL)) { m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL); if (m == NULL) goto retry; rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); vm_page_lock(m); if (rv == VM_PAGER_OK) { /* * Since the page was not resident, * and therefore not recently * accessed, immediately enqueue it * for asynchronous laundering. The * current operation is not regarded * as an access. */ vm_page_launder(m); vm_page_unlock(m); vm_page_xunbusy(m); } else { vm_page_free(m); vm_page_unlock(m); VM_OBJECT_WUNLOCK(object); return (EIO); } } if (m != NULL) { pmap_zero_page_area(m, base, PAGE_SIZE - base); KASSERT(m->valid == VM_PAGE_BITS_ALL, ("shm_dotruncate: page %p is invalid", m)); vm_page_dirty(m); vm_pager_page_unswapped(m); } } delta = IDX_TO_OFF(object->size - nobjsize); /* Toss in memory pages. */ if (nobjsize < object->size) vm_object_page_remove(object, nobjsize, object->size, 0); /* Toss pages from swap. */ if (object->type == OBJT_SWAP) swap_pager_freespace(object, nobjsize, delta); /* Free the swap accounted for shm */ swap_release_by_cred(delta, object->cred); object->charge -= delta; } else { /* Try to reserve additional swap space. */ delta = IDX_TO_OFF(nobjsize - object->size); if (!swap_reserve_by_cred(delta, object->cred)) { VM_OBJECT_WUNLOCK(object); return (ENOMEM); } object->charge += delta; } shmfd->shm_size = length; mtx_lock(&shm_timestamp_lock); vfs_timestamp(&shmfd->shm_ctime); shmfd->shm_mtime = shmfd->shm_ctime; mtx_unlock(&shm_timestamp_lock); object->size = nobjsize; VM_OBJECT_WUNLOCK(object); return (0); } /* * shmfd object management including creation and reference counting * routines. */ struct shmfd * shm_alloc(struct ucred *ucred, mode_t mode) { struct shmfd *shmfd; int ino; shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO); shmfd->shm_size = 0; shmfd->shm_uid = ucred->cr_uid; shmfd->shm_gid = ucred->cr_gid; shmfd->shm_mode = mode; shmfd->shm_object = vm_pager_allocate(OBJT_DEFAULT, NULL, shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred); KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate")); shmfd->shm_object->pg_color = 0; VM_OBJECT_WLOCK(shmfd->shm_object); vm_object_clear_flag(shmfd->shm_object, OBJ_ONEMAPPING); vm_object_set_flag(shmfd->shm_object, OBJ_COLORED | OBJ_NOSPLIT); VM_OBJECT_WUNLOCK(shmfd->shm_object); vfs_timestamp(&shmfd->shm_birthtime); shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime = shmfd->shm_birthtime; ino = alloc_unr(shm_ino_unr); if (ino == -1) shmfd->shm_ino = 0; else shmfd->shm_ino = ino; refcount_init(&shmfd->shm_refs, 1); mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF); rangelock_init(&shmfd->shm_rl); #ifdef MAC mac_posixshm_init(shmfd); mac_posixshm_create(ucred, shmfd); #endif return (shmfd); } struct shmfd * shm_hold(struct shmfd *shmfd) { refcount_acquire(&shmfd->shm_refs); return (shmfd); } void shm_drop(struct shmfd *shmfd) { if (refcount_release(&shmfd->shm_refs)) { #ifdef MAC mac_posixshm_destroy(shmfd); #endif rangelock_destroy(&shmfd->shm_rl); mtx_destroy(&shmfd->shm_mtx); vm_object_deallocate(shmfd->shm_object); if (shmfd->shm_ino != 0) free_unr(shm_ino_unr, shmfd->shm_ino); free(shmfd, M_SHMFD); } } /* * Determine if the credentials have sufficient permissions for a * specified combination of FREAD and FWRITE. 
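 *
 * Illustrative userland mapping of open flags onto this check: an
 * O_RDONLY shm_open(2) requests FREAD only, so read permission on the
 * object suffices, while O_RDWR demands both ("/example" is a
 * placeholder name):
 *
 *	shm_open("/example", O_RDONLY, 0);	needs r-- on the object
 *	shm_open("/example", O_RDWR, 0);	needs rw-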
*/ int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags) { accmode_t accmode; int error; accmode = 0; if (flags & FREAD) accmode |= VREAD; if (flags & FWRITE) accmode |= VWRITE; mtx_lock(&shm_timestamp_lock); error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, accmode, ucred, NULL); mtx_unlock(&shm_timestamp_lock); return (error); } /* * Dictionary management. We maintain an in-kernel dictionary to map * paths to shmfd objects. We use the FNV hash on the path to store * the mappings in a hash table. */ static void shm_init(void *arg) { mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF); sx_init(&shm_dict_lock, "shm dictionary"); shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash); shm_ino_unr = new_unrhdr(1, INT32_MAX, NULL); KASSERT(shm_ino_unr != NULL, ("shm fake inodes not initialized")); shm_dev_ino = devfs_alloc_cdp_inode(); KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized")); } SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL); static struct shmfd * shm_lookup(char *path, Fnv32_t fnv) { struct shm_mapping *map; LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { if (map->sm_fnv != fnv) continue; if (strcmp(map->sm_path, path) == 0) return (map->sm_shmfd); } return (NULL); } static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd) { struct shm_mapping *map; map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK); map->sm_path = path; map->sm_fnv = fnv; map->sm_shmfd = shm_hold(shmfd); shmfd->shm_path = path; LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link); } static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred) { struct shm_mapping *map; int error; LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { if (map->sm_fnv != fnv) continue; if (strcmp(map->sm_path, path) == 0) { #ifdef MAC error = mac_posixshm_check_unlink(ucred, map->sm_shmfd); if (error) return (error); #endif error = shm_access(map->sm_shmfd, ucred, FREAD | FWRITE); if (error) return (error); map->sm_shmfd->shm_path = NULL; LIST_REMOVE(map, sm_link); shm_drop(map->sm_shmfd); free(map->sm_path, M_SHMFD); free(map, M_SHMFD); return (0); } } return (ENOENT); } int kern_shm_open(struct thread *td, const char *userpath, int flags, mode_t mode, struct filecaps *fcaps) { struct filedesc *fdp; struct shmfd *shmfd; struct file *fp; char *path; const char *pr_path; size_t pr_pathlen; Fnv32_t fnv; mode_t cmode; int fd, error; #ifdef CAPABILITY_MODE /* * shm_open(2) is only allowed for anonymous objects. */ if (IN_CAPABILITY_MODE(td) && (userpath != SHM_ANON)) return (ECAPMODE); #endif AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR) return (EINVAL); if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0) return (EINVAL); fdp = td->td_proc->p_fd; cmode = (mode & ~fdp->fd_cmask) & ACCESSPERMS; error = falloc_caps(td, &fp, &fd, O_CLOEXEC, fcaps); if (error) return (error); /* A SHM_ANON path pointer creates an anonymous object. */ if (userpath == SHM_ANON) { /* A read-only anonymous object is pointless. */ if ((flags & O_ACCMODE) == O_RDONLY) { fdclose(td, fp, fd); fdrop(fp, td); return (EINVAL); } shmfd = shm_alloc(td->td_ucred, cmode); } else { path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; /* Construct a full pathname for jailed callers. */ pr_pathlen = strcmp(pr_path, "/") == 0 ? 
0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(userpath, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); #ifdef KTRACE if (error == 0 && KTRPOINT(curthread, KTR_NAMEI)) ktrnamei(path); #endif /* Require paths to start with a '/' character. */ if (error == 0 && path[pr_pathlen] != '/') error = EINVAL; if (error) { fdclose(td, fp, fd); fdrop(fp, td); free(path, M_SHMFD); return (error); } AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&shm_dict_lock); shmfd = shm_lookup(path, fnv); if (shmfd == NULL) { /* Object does not yet exist, create it if requested. */ if (flags & O_CREAT) { #ifdef MAC error = mac_posixshm_check_create(td->td_ucred, path); if (error == 0) { #endif shmfd = shm_alloc(td->td_ucred, cmode); shm_insert(path, fnv, shmfd); #ifdef MAC } #endif } else { free(path, M_SHMFD); error = ENOENT; } } else { /* * Object already exists, obtain a new * reference if requested and permitted. */ free(path, M_SHMFD); if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) error = EEXIST; else { #ifdef MAC error = mac_posixshm_check_open(td->td_ucred, shmfd, FFLAGS(flags & O_ACCMODE)); if (error == 0) #endif error = shm_access(shmfd, td->td_ucred, FFLAGS(flags & O_ACCMODE)); } /* * Truncate the file back to zero length if * O_TRUNC was specified and the object was * opened with read/write. */ if (error == 0 && (flags & (O_ACCMODE | O_TRUNC)) == (O_RDWR | O_TRUNC)) { #ifdef MAC error = mac_posixshm_check_truncate( td->td_ucred, fp->f_cred, shmfd); if (error == 0) #endif shm_dotruncate(shmfd, 0); } if (error == 0) shm_hold(shmfd); } sx_xunlock(&shm_dict_lock); if (error) { fdclose(td, fp, fd); fdrop(fp, td); return (error); } } finit(fp, FFLAGS(flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops); td->td_retval[0] = fd; fdrop(fp, td); return (0); } /* System calls. */ int sys_shm_open(struct thread *td, struct shm_open_args *uap) { return (kern_shm_open(td, uap->path, uap->flags, uap->mode, NULL)); } int sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap) { char *path; const char *pr_path; size_t pr_pathlen; Fnv32_t fnv; int error; path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(uap->path, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); if (error) { free(path, M_TEMP); return (error); } #ifdef KTRACE if (KTRPOINT(curthread, KTR_NAMEI)) ktrnamei(path); #endif AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&shm_dict_lock); error = shm_remove(path, fnv, td->td_ucred); sx_xunlock(&shm_dict_lock); free(path, M_TEMP); return (error); } int shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { struct shmfd *shmfd; vm_prot_t maxprot; int error; shmfd = fp->f_data; maxprot = VM_PROT_NONE; /* FREAD should always be set. */ if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_EXECUTE | VM_PROT_READ; if ((fp->f_flag & FWRITE) != 0) maxprot |= VM_PROT_WRITE; /* Don't permit shared writable mappings on read-only descriptors. */ if ((flags & MAP_SHARED) != 0 && (maxprot & VM_PROT_WRITE) == 0 && (prot & VM_PROT_WRITE) != 0) return (EACCES); maxprot &= cap_maxprot; /* See comment in vn_mmap(). 
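 *
 * The EACCES case above is the one userland most often trips over
 * (hedged example; "/example" and len are placeholders):
 *
 *	fd = shm_open("/example", O_RDONLY, 0);
 *	mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);	succeeds
 *	mmap(NULL, len, PROT_WRITE, MAP_SHARED, fd, 0);	fails, EACCES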
*/ if ( #ifdef _LP64 objsize > OFF_MAX || #endif foff < 0 || foff > OFF_MAX - objsize) return (EINVAL); #ifdef MAC error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, flags); if (error != 0) return (error); #endif mtx_lock(&shm_timestamp_lock); vfs_timestamp(&shmfd->shm_atime); mtx_unlock(&shm_timestamp_lock); vm_object_reference(shmfd->shm_object); error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags, shmfd->shm_object, foff, FALSE, td); if (error != 0) vm_object_deallocate(shmfd->shm_object); return (error); } static int shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; int error; error = 0; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); /* * SUSv4 says that x bits of permission need not be affected. * Be consistent with our shm_open there. */ #ifdef MAC error = mac_posixshm_check_setmode(active_cred, shmfd, mode); if (error != 0) goto out; #endif error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, VADMIN, active_cred, NULL); if (error != 0) goto out; shmfd->shm_mode = mode & ACCESSPERMS; out: mtx_unlock(&shm_timestamp_lock); return (error); } static int shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; int error; error = 0; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); #ifdef MAC error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid); if (error != 0) goto out; #endif if (uid == (uid_t)-1) uid = shmfd->shm_uid; if (gid == (gid_t)-1) gid = shmfd->shm_gid; if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) || (gid != shmfd->shm_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0))) goto out; shmfd->shm_uid = uid; shmfd->shm_gid = gid; out: mtx_unlock(&shm_timestamp_lock); return (error); } /* * Helper routines to allow the backing object of a shared memory file * descriptor to be mapped in the kernel. */ int shm_map(struct file *fp, size_t size, off_t offset, void **memp) { struct shmfd *shmfd; vm_offset_t kva, ofs; vm_object_t obj; int rv; if (fp->f_type != DTYPE_SHM) return (EINVAL); shmfd = fp->f_data; obj = shmfd->shm_object; VM_OBJECT_WLOCK(obj); /* * XXXRW: This validation is probably insufficient, and subject to * sign errors. It should be fixed. */ if (offset >= shmfd->shm_size || offset + size > round_page(shmfd->shm_size)) { VM_OBJECT_WUNLOCK(obj); return (EINVAL); } shmfd->shm_kmappings++; vm_object_reference_locked(obj); VM_OBJECT_WUNLOCK(obj); /* Map the object into the kernel_map and wire it. */ kva = vm_map_min(kernel_map); ofs = offset & PAGE_MASK; offset = trunc_page(offset); size = round_page(size + ofs); rv = vm_map_find(kernel_map, obj, offset, &kva, size, 0, VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0); if (rv == KERN_SUCCESS) { rv = vm_map_wire(kernel_map, kva, kva + size, VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); if (rv == KERN_SUCCESS) { *memp = (void *)(kva + ofs); return (0); } vm_map_remove(kernel_map, kva, kva + size); } else vm_object_deallocate(obj); /* On failure, drop our mapping reference. */ VM_OBJECT_WLOCK(obj); shmfd->shm_kmappings--; VM_OBJECT_WUNLOCK(obj); return (vm_mmap_to_errno(rv)); } /* * We require the caller to unmap the entire entry. This allows us to * safely decrement shm_kmappings when a mapping is removed. 
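 *
 * A hedged sketch of the expected kernel-side pairing; the consumer
 * code is hypothetical:
 *
 *	void *mem;
 *	int error;
 *
 *	error = shm_map(fp, size, offset, &mem);
 *	if (error == 0) {
 *		memset(mem, 0, size);
 *		error = shm_unmap(fp, mem, size);	same mem and size
 *	}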
*/ int shm_unmap(struct file *fp, void *mem, size_t size) { struct shmfd *shmfd; vm_map_entry_t entry; vm_offset_t kva, ofs; vm_object_t obj; vm_pindex_t pindex; vm_prot_t prot; boolean_t wired; vm_map_t map; int rv; if (fp->f_type != DTYPE_SHM) return (EINVAL); shmfd = fp->f_data; kva = (vm_offset_t)mem; ofs = kva & PAGE_MASK; kva = trunc_page(kva); size = round_page(size + ofs); map = kernel_map; rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry, &obj, &pindex, &prot, &wired); if (rv != KERN_SUCCESS) return (EINVAL); if (entry->start != kva || entry->end != kva + size) { vm_map_lookup_done(map, entry); return (EINVAL); } vm_map_lookup_done(map, entry); if (obj != shmfd->shm_object) return (EINVAL); vm_map_remove(map, kva, kva + size); VM_OBJECT_WLOCK(obj); KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped")); shmfd->shm_kmappings--; VM_OBJECT_WUNLOCK(obj); return (0); } static int shm_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { const char *path, *pr_path; struct shmfd *shmfd; size_t pr_pathlen; kif->kf_type = KF_TYPE_SHM; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); kif->kf_un.kf_file.kf_file_mode = S_IFREG | shmfd->shm_mode; /* XXX */ mtx_unlock(&shm_timestamp_lock); kif->kf_un.kf_file.kf_file_size = shmfd->shm_size; if (shmfd->shm_path != NULL) { sx_slock(&shm_dict_lock); if (shmfd->shm_path != NULL) { path = shmfd->shm_path; pr_path = curthread->td_ucred->cr_prison->pr_path; if (strcmp(pr_path, "/") != 0) { /* Return the jail-rooted pathname. */ pr_pathlen = strlen(pr_path); if (strncmp(path, pr_path, pr_pathlen) == 0 && path[pr_pathlen] == '/') path += pr_pathlen; } strlcpy(kif->kf_path, path, sizeof(kif->kf_path)); } sx_sunlock(&shm_dict_lock); } return (0); } Index: head/sys/kern/vfs_acl.c =================================================================== --- head/sys/kern/vfs_acl.c (revision 326270) +++ head/sys/kern/vfs_acl.c (revision 326271) @@ -1,588 +1,590 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1999-2006, 2016-2017 Robert N. M. Watson * All rights reserved. * * This software was developed by Robert Watson for the TrustedBSD Project. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Developed by the TrustedBSD Project. * * ACL system calls and other functions common across different ACL types. * Type-specific routines go into subr_acl_.c. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include CTASSERT(ACL_MAX_ENTRIES >= OLDACL_MAX_ENTRIES); MALLOC_DEFINE(M_ACL, "acl", "Access Control Lists"); static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); int acl_copy_oldacl_into_acl(const struct oldacl *source, struct acl *dest) { int i; if (source->acl_cnt < 0 || source->acl_cnt > OLDACL_MAX_ENTRIES) return (EINVAL); bzero(dest, sizeof(*dest)); dest->acl_cnt = source->acl_cnt; dest->acl_maxcnt = ACL_MAX_ENTRIES; for (i = 0; i < dest->acl_cnt; i++) { dest->acl_entry[i].ae_tag = source->acl_entry[i].ae_tag; dest->acl_entry[i].ae_id = source->acl_entry[i].ae_id; dest->acl_entry[i].ae_perm = source->acl_entry[i].ae_perm; } return (0); } int acl_copy_acl_into_oldacl(const struct acl *source, struct oldacl *dest) { int i; if (source->acl_cnt > OLDACL_MAX_ENTRIES) return (EINVAL); bzero(dest, sizeof(*dest)); dest->acl_cnt = source->acl_cnt; for (i = 0; i < dest->acl_cnt; i++) { dest->acl_entry[i].ae_tag = source->acl_entry[i].ae_tag; dest->acl_entry[i].ae_id = source->acl_entry[i].ae_id; dest->acl_entry[i].ae_perm = source->acl_entry[i].ae_perm; } return (0); } /* * At one time, "struct ACL" was extended in order to add support for NFSv4 * ACLs. Instead of creating compatibility versions of all the ACL-related * syscalls, they were left intact. It's possible to find out what the code * calling these syscalls (libc) expects basing on "type" argument - if it's * either ACL_TYPE_ACCESS_OLD or ACL_TYPE_DEFAULT_OLD (which previously were * known as ACL_TYPE_ACCESS and ACL_TYPE_DEFAULT), then it's the "struct * oldacl". If it's something else, then it's the new "struct acl". In the * latter case, the routines below just copyin/copyout the contents. In the * former case, they copyin the "struct oldacl" and convert it to the new * format. 
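 *
 * For illustration, how the conversion behaves for the old format (a
 * sketch with arbitrary values; the field names are exactly those
 * copied by acl_copy_oldacl_into_acl() above):
 */
#if 0
	struct oldacl old;
	struct acl new;
	int error;

	bzero(&old, sizeof(old));
	old.acl_cnt = 2;
	old.acl_entry[0].ae_tag = ACL_USER_OBJ;
	old.acl_entry[0].ae_perm = ACL_READ | ACL_WRITE;
	old.acl_entry[1].ae_tag = ACL_OTHER;
	old.acl_entry[1].ae_perm = 0;
	error = acl_copy_oldacl_into_acl(&old, &new);
	/* On success, new.acl_maxcnt == ACL_MAX_ENTRIES and both
	 * entries have been copied field by field. */
#endif
/*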
*/ static int acl_copyin(void *user_acl, struct acl *kernel_acl, acl_type_t type) { int error; struct oldacl old; switch (type) { case ACL_TYPE_ACCESS_OLD: case ACL_TYPE_DEFAULT_OLD: error = copyin(user_acl, &old, sizeof(old)); if (error != 0) break; acl_copy_oldacl_into_acl(&old, kernel_acl); break; default: error = copyin(user_acl, kernel_acl, sizeof(*kernel_acl)); if (kernel_acl->acl_maxcnt != ACL_MAX_ENTRIES) return (EINVAL); } return (error); } static int acl_copyout(struct acl *kernel_acl, void *user_acl, acl_type_t type) { uint32_t am; int error; struct oldacl old; switch (type) { case ACL_TYPE_ACCESS_OLD: case ACL_TYPE_DEFAULT_OLD: error = acl_copy_acl_into_oldacl(kernel_acl, &old); if (error != 0) break; error = copyout(&old, user_acl, sizeof(old)); break; default: error = fueword32((char *)user_acl + offsetof(struct acl, acl_maxcnt), &am); if (error == -1) return (EFAULT); if (am != ACL_MAX_ENTRIES) return (EINVAL); error = copyout(kernel_acl, user_acl, sizeof(*kernel_acl)); } return (error); } /* * Convert "old" type - ACL_TYPE_{ACCESS,DEFAULT}_OLD - into its "new" * counterpart. It's required for old (pre-NFSv4 ACLs) libc to work * with new kernel. Fixing 'type' for old binaries with new libc * is being done in lib/libc/posix1e/acl_support.c:_acl_type_unold(). */ static int acl_type_unold(int type) { switch (type) { case ACL_TYPE_ACCESS_OLD: return (ACL_TYPE_ACCESS); case ACL_TYPE_DEFAULT_OLD: return (ACL_TYPE_DEFAULT); default: return (type); } } /* * These calls wrap the real vnode operations, and are called by the syscall * code once the syscall has converted the path or file descriptor to a vnode * (unlocked). The aclp pointer is assumed still to point to userland, so * this should not be consumed within the kernel except by syscall code. * Other code should directly invoke VOP_{SET,GET}ACL. */ /* * Given a vnode, set its ACL. */ static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl *inkernelacl; struct mount *mp; int error; AUDIT_ARG_VALUE(type); inkernelacl = acl_alloc(M_WAITOK); error = acl_copyin(aclp, inkernelacl, type); if (error != 0) goto out; error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) goto out; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(vp); #ifdef MAC error = mac_vnode_check_setacl(td->td_ucred, vp, type, inkernelacl); if (error != 0) goto out_unlock; #endif error = VOP_SETACL(vp, acl_type_unold(type), inkernelacl, td->td_ucred, td); #ifdef MAC out_unlock: #endif VOP_UNLOCK(vp, 0); vn_finished_write(mp); out: acl_free(inkernelacl); return (error); } /* * Given a vnode, get its ACL. */ static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl *inkernelacl; int error; AUDIT_ARG_VALUE(type); inkernelacl = acl_alloc(M_WAITOK | M_ZERO); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(vp); #ifdef MAC error = mac_vnode_check_getacl(td->td_ucred, vp, type); if (error != 0) goto out; #endif error = VOP_GETACL(vp, acl_type_unold(type), inkernelacl, td->td_ucred, td); #ifdef MAC out: #endif VOP_UNLOCK(vp, 0); if (error == 0) error = acl_copyout(inkernelacl, aclp, type); acl_free(inkernelacl); return (error); } /* * Given a vnode, delete its ACL. 
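 *
 * Deletion is expressed as VOP_SETACL() with a NULL ACL pointer, so a
 * filesystem's setacl VOP is expected to distinguish, schematically (a
 * sketch; the examplefs_* helpers are hypothetical):
 */
#if 0
static int
examplefs_setacl(struct vop_setacl_args *ap)
{

	if (ap->a_aclp == NULL)
		return (examplefs_acl_remove(ap->a_vp, ap->a_type));
	return (examplefs_acl_write(ap->a_vp, ap->a_type, ap->a_aclp));
}
#endif
/*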
*/ static int vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type) { struct mount *mp; int error; AUDIT_ARG_VALUE(type); error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(vp); #ifdef MAC error = mac_vnode_check_deleteacl(td->td_ucred, vp, type); if (error != 0) goto out; #endif error = VOP_SETACL(vp, acl_type_unold(type), 0, td->td_ucred, td); #ifdef MAC out: #endif VOP_UNLOCK(vp, 0); vn_finished_write(mp); return (error); } /* * Given a vnode, check whether an ACL is appropriate for it * * XXXRW: No vnode lock held so can't audit vnode state...? */ static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl *inkernelacl; int error; inkernelacl = acl_alloc(M_WAITOK); error = acl_copyin(aclp, inkernelacl, type); if (error != 0) goto out; error = VOP_ACLCHECK(vp, acl_type_unold(type), inkernelacl, td->td_ucred, td); out: acl_free(inkernelacl); return (error); } /* * syscalls -- convert the path/fd to a vnode, and call vacl_whatever. Don't * need to lock, as the vacl_ code will get/release any locks required. */ /* * Given a file path, get an ACL for it */ int sys___acl_get_file(struct thread *td, struct __acl_get_file_args *uap) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error == 0) { error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } return (error); } /* * Given a file path, get an ACL for it; don't follow links. */ int sys___acl_get_link(struct thread *td, struct __acl_get_link_args *uap) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error == 0) { error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } return (error); } /* * Given a file path, set an ACL for it. */ int sys___acl_set_file(struct thread *td, struct __acl_set_file_args *uap) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error == 0) { error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } return (error); } /* * Given a file path, set an ACL for it; don't follow links. */ int sys___acl_set_link(struct thread *td, struct __acl_set_link_args *uap) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error == 0) { error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } return (error); } /* * Given a file descriptor, get an ACL for it. */ int sys___acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap) { struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->filedes); error = getvnode(td, uap->filedes, cap_rights_init(&rights, CAP_ACL_GET), &fp); if (error == 0) { error = vacl_get_acl(td, fp->f_vnode, uap->type, uap->aclp); fdrop(fp, td); } return (error); } /* * Given a file descriptor, set an ACL for it. */ int sys___acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap) { struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->filedes); error = getvnode(td, uap->filedes, cap_rights_init(&rights, CAP_ACL_SET), &fp); if (error == 0) { error = vacl_set_acl(td, fp->f_vnode, uap->type, uap->aclp); fdrop(fp, td); } return (error); } /* * Given a file path, delete an ACL from it. 
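 *
 * Userland normally reaches these syscalls through the POSIX.1e
 * wrappers in libc rather than directly; for illustration (a sketch,
 * error handling abbreviated):
 */
#if 0
#include <sys/types.h>
#include <sys/acl.h>
#include <stdio.h>

/* Print a file's access ACL, then remove its default ACL. */
static int
show_and_strip(const char *path)
{
	acl_t acl;
	char *text;

	acl = acl_get_file(path, ACL_TYPE_ACCESS);	/* __acl_get_file() */
	if (acl == NULL)
		return (-1);
	text = acl_to_text(acl, NULL);
	if (text != NULL) {
		printf("%s\n", text);
		acl_free(text);
	}
	acl_free(acl);
	return (acl_delete_def_file(path));		/* __acl_delete_file() */
}
#endif
/*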
*/ int sys___acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error == 0) { error = vacl_delete(td, nd.ni_vp, uap->type); NDFREE(&nd, 0); } return (error); } /* * Given a file path, delete an ACL from it; don't follow links. */ int sys___acl_delete_link(struct thread *td, struct __acl_delete_link_args *uap) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error == 0) { error = vacl_delete(td, nd.ni_vp, uap->type); NDFREE(&nd, 0); } return (error); } /* * Given a file descriptor, delete an ACL from it. */ int sys___acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap) { struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->filedes); error = getvnode(td, uap->filedes, cap_rights_init(&rights, CAP_ACL_DELETE), &fp); if (error == 0) { error = vacl_delete(td, fp->f_vnode, uap->type); fdrop(fp, td); } return (error); } /* * Given a file path, check an ACL for it. */ int sys___acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error == 0) { error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } return (error); } /* * Given a file path, check an ACL for it; don't follow links. */ int sys___acl_aclcheck_link(struct thread *td, struct __acl_aclcheck_link_args *uap) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error == 0) { error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } return (error); } /* * Given a file descriptor, check an ACL for it. */ int sys___acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap) { struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->filedes); error = getvnode(td, uap->filedes, cap_rights_init(&rights, CAP_ACL_CHECK), &fp); if (error == 0) { error = vacl_aclcheck(td, fp->f_vnode, uap->type, uap->aclp); fdrop(fp, td); } return (error); } struct acl * acl_alloc(int flags) { struct acl *aclp; aclp = malloc(sizeof(*aclp), M_ACL, flags); if (aclp == NULL) return (NULL); aclp->acl_maxcnt = ACL_MAX_ENTRIES; return (aclp); } void acl_free(struct acl *aclp) { free(aclp, M_ACL); } Index: head/sys/kern/vfs_aio.c =================================================================== --- head/sys/kern/vfs_aio.c (revision 326270) +++ head/sys/kern/vfs_aio.c (revision 326271) @@ -1,3003 +1,3005 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1997 John S. Dyson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. John S. Dyson's name may not be used to endorse or promote products * derived from this software without specific prior written permission. * * DISCLAIMER: This code isn't warranted to do anything useful. Anything * bad that happens because of using this software isn't the responsibility * of the author. This software is distributed AS-IS. */ /* * This file contains support for the POSIX 1003.1B AIO/LIO facility.
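 *
 * For orientation, the minimal userland sequence this facility
 * services (a sketch; a real consumer would check every return and
 * might use sigevent notification instead of blocking):
 */
#if 0
#include <aio.h>
#include <string.h>

static char buf[512];

static ssize_t
read_async(int fd)
{
	struct aiocb cb;
	const struct aiocb *list[1];

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = sizeof(buf);
	cb.aio_offset = 0;
	if (aio_read(&cb) == -1)	/* aio_aqueue(..., LIO_READ, ...) */
		return (-1);
	list[0] = &cb;
	aio_suspend(list, 1, NULL);	/* kern_aio_suspend() */
	if (aio_error(&cb) != 0)	/* kern_aio_error() */
		return (-1);
	return (aio_return(&cb));	/* kern_aio_return() */
}
#endif
/*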
*/ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Counter for allocating reference ids to new jobs. Wrapped to 1 on * overflow. (XXX will be removed soon.) */ static u_long jobrefid; /* * Counter for aio_fsync. */ static uint64_t jobseqno; #ifndef MAX_AIO_PER_PROC #define MAX_AIO_PER_PROC 32 #endif #ifndef MAX_AIO_QUEUE_PER_PROC #define MAX_AIO_QUEUE_PER_PROC 256 #endif #ifndef MAX_AIO_QUEUE #define MAX_AIO_QUEUE 1024 /* Bigger than MAX_AIO_QUEUE_PER_PROC */ #endif #ifndef MAX_BUF_AIO #define MAX_BUF_AIO 16 #endif FEATURE(aio, "Asynchronous I/O"); SYSCTL_DECL(_p1003_1b); static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list"); static MALLOC_DEFINE(M_AIOS, "aios", "aio_suspend aio control block list"); static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management"); static int enable_aio_unsafe = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, enable_unsafe, CTLFLAG_RW, &enable_aio_unsafe, 0, "Permit asynchronous IO on all file types, not just known-safe types"); static unsigned int unsafe_warningcnt = 1; SYSCTL_UINT(_vfs_aio, OID_AUTO, unsafe_warningcnt, CTLFLAG_RW, &unsafe_warningcnt, 0, "Warnings that will be triggered upon failed IO requests on unsafe files"); static int max_aio_procs = MAX_AIO_PROCS; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0, "Maximum number of kernel processes to use for handling async IO "); static int num_aio_procs = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0, "Number of presently active kernel processes for async IO"); /* * The code will adjust the actual number of AIO processes towards this * number when it gets a chance. 
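 *
 * These knobs are plain sysctls; for illustration, a userland sketch
 * that inspects two of them by the names declared in this file:
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int target, running;
	size_t len;

	len = sizeof(target);
	sysctlbyname("vfs.aio.target_aio_procs", &target, &len, NULL, 0);
	len = sizeof(running);
	sysctlbyname("vfs.aio.num_aio_procs", &running, &len, NULL, 0);
	printf("target %d, running %d\n", target, running);
	return (0);
}
#endif
/*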
*/ static int target_aio_procs = TARGET_AIO_PROCS; SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs, 0, "Preferred number of ready kernel processes for async IO"); static int max_queue_count = MAX_AIO_QUEUE; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0, "Maximum number of aio requests to queue, globally"); static int num_queue_count = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0, "Number of queued aio requests"); static int num_buf_aio = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0, "Number of aio requests presently handled by the buf subsystem"); /* Number of async I/O processes in the process of being started */ /* XXX This should be local to aio_aqueue() */ static int num_aio_resv_start = 0; static int aiod_lifetime; SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0, "Maximum lifetime for idle aiod"); static int max_aio_per_proc = MAX_AIO_PER_PROC; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc, 0, "Maximum active aio requests per process (stored in the process)"); static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW, &max_aio_queue_per_proc, 0, "Maximum queued aio requests per process (stored in the process)"); static int max_buf_aio = MAX_BUF_AIO; SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0, "Maximum buf aio requests per process (stored in the process)"); /* * Though redundant with vfs.aio.max_aio_queue_per_proc, POSIX requires * sysconf(3) to support AIO_LISTIO_MAX, and we implement that with * vfs.aio.aio_listio_max. */ SYSCTL_INT(_p1003_1b, CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max, CTLFLAG_RD | CTLFLAG_CAPRD, &max_aio_queue_per_proc, 0, "Maximum aio requests for a single lio_listio call"); #ifdef COMPAT_FREEBSD6 typedef struct oaiocb { int aio_fildes; /* File descriptor */ off_t aio_offset; /* File offset for I/O */ volatile void *aio_buf; /* I/O buffer in process space */ size_t aio_nbytes; /* Number of bytes for I/O */ struct osigevent aio_sigevent; /* Signal to deliver */ int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private _aiocb_private; } oaiocb_t; #endif /* * Below is a key of locks used to protect each member of struct kaiocb * aioliojob and kaioinfo and any backends. * * * - need not protected * a - locked by kaioinfo lock * b - locked by backend lock, the backend lock can be null in some cases, * for example, BIO belongs to this type, in this case, proc lock is * reused. * c - locked by aio_job_mtx, the lock for the generic file I/O backend. */ /* * If the routine that services an AIO request blocks while running in an * AIO kernel process it can starve other I/O requests. BIO requests * queued via aio_qphysio() complete in GEOM and do not use AIO kernel * processes at all. Socket I/O requests use a separate pool of * kprocs and also force non-blocking I/O. Other file I/O requests * use the generic fo_read/fo_write operations which can block. The * fsync and mlock operations can also block while executing. Ideally * none of these requests would block while executing. * * Note that the service routines cannot toggle O_NONBLOCK in the file * structure directly while handling a request due to races with * userland threads. 
*/ /* jobflags */ #define KAIOCB_QUEUEING 0x01 #define KAIOCB_CANCELLED 0x02 #define KAIOCB_CANCELLING 0x04 #define KAIOCB_CHECKSYNC 0x08 #define KAIOCB_CLEARED 0x10 #define KAIOCB_FINISHED 0x20 /* * AIO process info */ #define AIOP_FREE 0x1 /* proc on free queue */ struct aioproc { int aioprocflags; /* (c) AIO proc flags */ TAILQ_ENTRY(aioproc) list; /* (c) list of processes */ struct proc *aioproc; /* (*) the AIO proc */ }; /* * data-structure for lio signal management */ struct aioliojob { int lioj_flags; /* (a) listio flags */ int lioj_count; /* (a) listio reference count */ int lioj_finished_count; /* (a) listio finished count */ struct sigevent lioj_signal; /* (a) signal on all I/O done */ TAILQ_ENTRY(aioliojob) lioj_list; /* (a) lio list */ struct knlist klist; /* (a) list of knotes */ ksiginfo_t lioj_ksi; /* (a) Realtime signal info */ }; #define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ #define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ #define LIOJ_KEVENT_POSTED 0x4 /* kevent triggered */ /* * per process aio data structure */ struct kaioinfo { struct mtx kaio_mtx; /* the lock to protect this struct */ int kaio_flags; /* (a) per process kaio flags */ int kaio_maxactive_count; /* (*) maximum number of AIOs */ int kaio_active_count; /* (c) number of currently used AIOs */ int kaio_qallowed_count; /* (*) maximum size of AIO queue */ int kaio_count; /* (a) size of AIO queue */ int kaio_ballowed_count; /* (*) maximum number of buffers */ int kaio_buffer_count; /* (a) number of physio buffers */ TAILQ_HEAD(,kaiocb) kaio_all; /* (a) all AIOs in a process */ TAILQ_HEAD(,kaiocb) kaio_done; /* (a) done queue for process */ TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */ TAILQ_HEAD(,kaiocb) kaio_jobqueue; /* (a) job queue for process */ TAILQ_HEAD(,kaiocb) kaio_syncqueue; /* (a) queue for aio_fsync */ TAILQ_HEAD(,kaiocb) kaio_syncready; /* (a) second q for aio_fsync */ struct task kaio_task; /* (*) task to kick aio processes */ struct task kaio_sync_task; /* (*) task to schedule fsync jobs */ }; #define AIO_LOCK(ki) mtx_lock(&(ki)->kaio_mtx) #define AIO_UNLOCK(ki) mtx_unlock(&(ki)->kaio_mtx) #define AIO_LOCK_ASSERT(ki, f) mtx_assert(&(ki)->kaio_mtx, (f)) #define AIO_MTX(ki) (&(ki)->kaio_mtx) #define KAIO_RUNDOWN 0x1 /* process is being run down */ #define KAIO_WAKEUP 0x2 /* wakeup process when AIO completes */ /* * Operations used to interact with userland aio control blocks. * Different ABIs provide their own operations.
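 *
 * For illustration, the shape of the table a compat ABI supplies (cf.
 * aiocb_ops_osigevent later in this file); the aiocb32_* names here
 * are illustrative, standing in for routines that translate a foreign
 * struct aiocb layout:
 */
#if 0
static struct aiocb_ops aiocb32_ops_example = {
	.copyin = aiocb32_copyin,		/* translates the 32-bit layout */
	.fetch_status = aiocb32_fetch_status,
	.fetch_error = aiocb32_fetch_error,
	.store_status = aiocb32_store_status,
	.store_error = aiocb32_store_error,
	.store_kernelinfo = aiocb32_store_kernelinfo,
	.store_aiocb = aiocb32_store_aiocb,
};
#endif
/*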
*/ struct aiocb_ops { int (*copyin)(struct aiocb *ujob, struct aiocb *kjob); long (*fetch_status)(struct aiocb *ujob); long (*fetch_error)(struct aiocb *ujob); int (*store_status)(struct aiocb *ujob, long status); int (*store_error)(struct aiocb *ujob, long error); int (*store_kernelinfo)(struct aiocb *ujob, long jobref); int (*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob); }; static TAILQ_HEAD(,aioproc) aio_freeproc; /* (c) Idle daemons */ static struct sema aio_newproc_sem; static struct mtx aio_job_mtx; static TAILQ_HEAD(,kaiocb) aio_jobs; /* (c) Async job list */ static struct unrhdr *aiod_unr; void aio_init_aioinfo(struct proc *p); static int aio_onceonly(void); static int aio_free_entry(struct kaiocb *job); static void aio_process_rw(struct kaiocb *job); static void aio_process_sync(struct kaiocb *job); static void aio_process_mlock(struct kaiocb *job); static void aio_schedule_fsync(void *context, int pending); static int aio_newproc(int *); int aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lio, int type, struct aiocb_ops *ops); static int aio_queue_file(struct file *fp, struct kaiocb *job); static void aio_physwakeup(struct bio *bp); static void aio_proc_rundown(void *arg, struct proc *p); static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp); static int aio_qphysio(struct proc *p, struct kaiocb *job); static void aio_daemon(void *param); static void aio_bio_done_notify(struct proc *userp, struct kaiocb *job); static bool aio_clear_cancel_function_locked(struct kaiocb *job); static int aio_kick(struct proc *userp); static void aio_kick_nowait(struct proc *userp); static void aio_kick_helper(void *context, int pending); static int filt_aioattach(struct knote *kn); static void filt_aiodetach(struct knote *kn); static int filt_aio(struct knote *kn, long hint); static int filt_lioattach(struct knote *kn); static void filt_liodetach(struct knote *kn); static int filt_lio(struct knote *kn, long hint); /* * Zones for: * kaio Per process async io info * aiop async io process data * aiocb async io jobs * aiolio list io jobs */ static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiolio_zone; /* kqueue filters for aio */ static struct filterops aio_filtops = { .f_isfd = 0, .f_attach = filt_aioattach, .f_detach = filt_aiodetach, .f_event = filt_aio, }; static struct filterops lio_filtops = { .f_isfd = 0, .f_attach = filt_lioattach, .f_detach = filt_liodetach, .f_event = filt_lio }; static eventhandler_tag exit_tag, exec_tag; TASKQUEUE_DEFINE_THREAD(aiod_kick); /* * Main operations function for use as a kernel module. 
*/ static int aio_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: aio_onceonly(); break; case MOD_SHUTDOWN: break; default: error = EOPNOTSUPP; break; } return (error); } static moduledata_t aio_mod = { "aio", &aio_modload, NULL }; DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY); MODULE_VERSION(aio, 1); /* * Startup initialization */ static int aio_onceonly(void) { exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL, EVENTHANDLER_PRI_ANY); exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, NULL, EVENTHANDLER_PRI_ANY); kqueue_add_filteropts(EVFILT_AIO, &aio_filtops); kqueue_add_filteropts(EVFILT_LIO, &lio_filtops); TAILQ_INIT(&aio_freeproc); sema_init(&aio_newproc_sem, 0, "aio_new_proc"); mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF); TAILQ_INIT(&aio_jobs); aiod_unr = new_unrhdr(1, INT_MAX, NULL); kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiop_zone = uma_zcreate("AIOP", sizeof(struct aioproc), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiocb_zone = uma_zcreate("AIOCB", sizeof(struct kaiocb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiod_lifetime = AIOD_LIFETIME_DEFAULT; jobrefid = 1; p31b_setcfg(CTL_P1003_1B_ASYNCHRONOUS_IO, _POSIX_ASYNCHRONOUS_IO); p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE); p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0); return (0); } /* * Init the per-process aioinfo structure. The aioinfo limits are set * per-process for user limit (resource) management. */ void aio_init_aioinfo(struct proc *p) { struct kaioinfo *ki; ki = uma_zalloc(kaio_zone, M_WAITOK); mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW); ki->kaio_flags = 0; ki->kaio_maxactive_count = max_aio_per_proc; ki->kaio_active_count = 0; ki->kaio_qallowed_count = max_aio_queue_per_proc; ki->kaio_count = 0; ki->kaio_ballowed_count = max_buf_aio; ki->kaio_buffer_count = 0; TAILQ_INIT(&ki->kaio_all); TAILQ_INIT(&ki->kaio_done); TAILQ_INIT(&ki->kaio_jobqueue); TAILQ_INIT(&ki->kaio_liojoblist); TAILQ_INIT(&ki->kaio_syncqueue); TAILQ_INIT(&ki->kaio_syncready); TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p); TASK_INIT(&ki->kaio_sync_task, 0, aio_schedule_fsync, ki); PROC_LOCK(p); if (p->p_aioinfo == NULL) { p->p_aioinfo = ki; PROC_UNLOCK(p); } else { PROC_UNLOCK(p); mtx_destroy(&ki->kaio_mtx); uma_zfree(kaio_zone, ki); } while (num_aio_procs < MIN(target_aio_procs, max_aio_procs)) aio_newproc(NULL); } static int aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi) { struct thread *td; int error; error = sigev_findtd(p, sigev, &td); if (error) return (error); if (!KSI_ONQ(ksi)) { ksiginfo_set_sigev(ksi, sigev); ksi->ksi_code = SI_ASYNCIO; ksi->ksi_flags |= KSI_EXT | KSI_INS; tdsendsignal(p, td, ksi->ksi_signo, ksi); } PROC_UNLOCK(p); return (error); } /* * Free a job entry. Wait for completion if it is currently active, but don't * delay forever. If we delay, we return a flag that says that we have to * restart the queue scan. 
*/ static int aio_free_entry(struct kaiocb *job) { struct kaioinfo *ki; struct aioliojob *lj; struct proc *p; p = job->userproc; MPASS(curproc == p); ki = p->p_aioinfo; MPASS(ki != NULL); AIO_LOCK_ASSERT(ki, MA_OWNED); MPASS(job->jobflags & KAIOCB_FINISHED); atomic_subtract_int(&num_queue_count, 1); ki->kaio_count--; MPASS(ki->kaio_count >= 0); TAILQ_REMOVE(&ki->kaio_done, job, plist); TAILQ_REMOVE(&ki->kaio_all, job, allist); lj = job->lio; if (lj) { lj->lioj_count--; lj->lioj_finished_count--; if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); /* lio is going away, we need to destroy any knotes */ knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); uma_zfree(aiolio_zone, lj); } } /* job is going away, we need to destroy any knotes */ knlist_delete(&job->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&job->ksi); PROC_UNLOCK(p); AIO_UNLOCK(ki); /* * The thread argument here is used to find the owning process * and is also passed to fo_close() which may pass it to various * places such as devsw close() routines. Because of that, we * need a thread pointer from the process owning the job that is * persistent and won't disappear out from under us or move to * another process. * * Currently, all the callers of this function call it to remove * a kaiocb from the current process' job list either via a * syscall or due to the current process calling exit() or * execve(). Thus, we know that p == curproc. We also know that * curthread can't exit since we are curthread. * * Therefore, we use curthread as the thread to pass to * knlist_delete(). This does mean that it is possible for the * thread pointer at close time to differ from the thread pointer * at open time, but this is already true of file descriptors in * a multithreaded process. */ if (job->fd_file) fdrop(job->fd_file, curthread); crfree(job->cred); uma_zfree(aiocb_zone, job); AIO_LOCK(ki); return (0); } static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp __unused) { aio_proc_rundown(arg, p); } static int aio_cancel_job(struct proc *p, struct kaioinfo *ki, struct kaiocb *job) { aio_cancel_fn_t *func; int cancelled; AIO_LOCK_ASSERT(ki, MA_OWNED); if (job->jobflags & (KAIOCB_CANCELLED | KAIOCB_FINISHED)) return (0); MPASS((job->jobflags & KAIOCB_CANCELLING) == 0); job->jobflags |= KAIOCB_CANCELLED; func = job->cancel_fn; /* * If there is no cancel routine, just leave the job marked as * cancelled. The job should be in active use by a caller who * should complete it normally or when it fails to install a * cancel routine. */ if (func == NULL) return (0); /* * Set the CANCELLING flag so that aio_complete() will defer * completions of this job. This prevents the job from being * freed out from under the cancel callback. After the * callback any deferred completion (whether from the callback * or any other source) will be completed. */ job->jobflags |= KAIOCB_CANCELLING; AIO_UNLOCK(ki); func(job); AIO_LOCK(ki); job->jobflags &= ~KAIOCB_CANCELLING; if (job->jobflags & KAIOCB_FINISHED) { cancelled = job->uaiocb._aiocb_private.error == ECANCELED; TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist); aio_bio_done_notify(p, job); } else { /* * The cancel callback might have scheduled an * operation to cancel this request, but it is * only counted as cancelled if the request is * cancelled when the callback returns. */ cancelled = 0; } return (cancelled); } /* * Rundown the jobs for a given process. 
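 *
 * Rundown cancels each pending job through aio_cancel_job(), which in
 * turn invokes the backend's cancel routine.  Such routines follow a
 * fixed pattern (cf. aio_cancel_daemon_job() and aio_cancel_sync()
 * later in this file); schematically, with my_queue/my_queue_lock
 * standing in for a backend's own state:
 */
#if 0
static void
example_cancel(struct kaiocb *job)
{

	mtx_lock(&my_queue_lock);
	if (!aio_cancel_cleared(job))
		TAILQ_REMOVE(&my_queue, job, list);
	mtx_unlock(&my_queue_lock);
	aio_cancel(job);	/* completes the job with ECANCELED */
}
#endif
/*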
*/ static void aio_proc_rundown(void *arg, struct proc *p) { struct kaioinfo *ki; struct aioliojob *lj; struct kaiocb *job, *jobn; KASSERT(curthread->td_proc == p, ("%s: called on non-curproc", __func__)); ki = p->p_aioinfo; if (ki == NULL) return; AIO_LOCK(ki); ki->kaio_flags |= KAIO_RUNDOWN; restart: /* * Try to cancel all pending requests. This code simulates * aio_cancel on all pending I/O requests. */ TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) { aio_cancel_job(p, ki, job); } /* Wait for all running I/O to be finished */ if (TAILQ_FIRST(&ki->kaio_jobqueue) || ki->kaio_active_count != 0) { ki->kaio_flags |= KAIO_WAKEUP; msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz); goto restart; } /* Free all completed I/O requests. */ while ((job = TAILQ_FIRST(&ki->kaio_done)) != NULL) aio_free_entry(job); while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) { if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); uma_zfree(aiolio_zone, lj); } else { panic("LIO job not cleaned up: C:%d, FC:%d\n", lj->lioj_count, lj->lioj_finished_count); } } AIO_UNLOCK(ki); taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_task); taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_sync_task); mtx_destroy(&ki->kaio_mtx); uma_zfree(kaio_zone, ki); p->p_aioinfo = NULL; } /* * Select a job to run (called by an AIO daemon). */ static struct kaiocb * aio_selectjob(struct aioproc *aiop) { struct kaiocb *job; struct kaioinfo *ki; struct proc *userp; mtx_assert(&aio_job_mtx, MA_OWNED); restart: TAILQ_FOREACH(job, &aio_jobs, list) { userp = job->userproc; ki = userp->p_aioinfo; if (ki->kaio_active_count < ki->kaio_maxactive_count) { TAILQ_REMOVE(&aio_jobs, job, list); if (!aio_clear_cancel_function(job)) goto restart; /* Account for currently active jobs. */ ki->kaio_active_count++; break; } } return (job); } /* * Move all data to a permanent storage device. This code * simulates the fsync syscall. */ static int aio_fsync_vnode(struct thread *td, struct vnode *vp) { struct mount *mp; int error; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto drop; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_object != NULL) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } error = VOP_FSYNC(vp, MNT_WAIT, td); VOP_UNLOCK(vp, 0); vn_finished_write(mp); drop: return (error); } /* * The AIO processing activity for LIO_READ/LIO_WRITE. This is the code that * does the I/O request for the non-physio version of the operations. The * normal vn operations are used, and this code should work in all instances * for every type of file, including pipes, sockets, fifos, and regular files. * * XXX I don't think it works well for socket, pipe, and fifo. 
*/ static void aio_process_rw(struct kaiocb *job) { struct ucred *td_savedcred; struct thread *td; struct aiocb *cb; struct file *fp; struct uio auio; struct iovec aiov; ssize_t cnt; long msgsnd_st, msgsnd_end; long msgrcv_st, msgrcv_end; long oublock_st, oublock_end; long inblock_st, inblock_end; int error; KASSERT(job->uaiocb.aio_lio_opcode == LIO_READ || job->uaiocb.aio_lio_opcode == LIO_WRITE, ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); aio_switch_vmspace(job); td = curthread; td_savedcred = td->td_ucred; td->td_ucred = job->cred; cb = &job->uaiocb; fp = job->fd_file; aiov.iov_base = (void *)(uintptr_t)cb->aio_buf; aiov.iov_len = cb->aio_nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = cb->aio_offset; auio.uio_resid = cb->aio_nbytes; cnt = cb->aio_nbytes; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; msgrcv_st = td->td_ru.ru_msgrcv; msgsnd_st = td->td_ru.ru_msgsnd; inblock_st = td->td_ru.ru_inblock; oublock_st = td->td_ru.ru_oublock; /* * aio_aqueue() acquires a reference to the file that is * released in aio_free_entry(). */ if (cb->aio_lio_opcode == LIO_READ) { auio.uio_rw = UIO_READ; if (auio.uio_resid == 0) error = 0; else error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td); } else { if (fp->f_type == DTYPE_VNODE) bwillwrite(); auio.uio_rw = UIO_WRITE; error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td); } msgrcv_end = td->td_ru.ru_msgrcv; msgsnd_end = td->td_ru.ru_msgsnd; inblock_end = td->td_ru.ru_inblock; oublock_end = td->td_ru.ru_oublock; job->msgrcv = msgrcv_end - msgrcv_st; job->msgsnd = msgsnd_end - msgsnd_st; job->inblock = inblock_end - inblock_st; job->outblock = oublock_end - oublock_st; if ((error) && (auio.uio_resid != cnt)) { if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) error = 0; if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) { PROC_LOCK(job->userproc); kern_psignal(job->userproc, SIGPIPE); PROC_UNLOCK(job->userproc); } } cnt -= auio.uio_resid; td->td_ucred = td_savedcred; if (error) aio_complete(job, -1, error); else aio_complete(job, cnt, 0); } static void aio_process_sync(struct kaiocb *job) { struct thread *td = curthread; struct ucred *td_savedcred = td->td_ucred; struct file *fp = job->fd_file; int error = 0; KASSERT(job->uaiocb.aio_lio_opcode == LIO_SYNC, ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); td->td_ucred = job->cred; if (fp->f_vnode != NULL) error = aio_fsync_vnode(td, fp->f_vnode); td->td_ucred = td_savedcred; if (error) aio_complete(job, -1, error); else aio_complete(job, 0, 0); } static void aio_process_mlock(struct kaiocb *job) { struct aiocb *cb = &job->uaiocb; int error; KASSERT(job->uaiocb.aio_lio_opcode == LIO_MLOCK, ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); aio_switch_vmspace(job); error = kern_mlock(job->userproc, job->cred, __DEVOLATILE(uintptr_t, cb->aio_buf), cb->aio_nbytes); aio_complete(job, error != 0 ? 
-1 : 0, error); } static void aio_bio_done_notify(struct proc *userp, struct kaiocb *job) { struct aioliojob *lj; struct kaioinfo *ki; struct kaiocb *sjob, *sjobn; int lj_done; bool schedule_fsync; ki = userp->p_aioinfo; AIO_LOCK_ASSERT(ki, MA_OWNED); lj = job->lio; lj_done = 0; if (lj) { lj->lioj_finished_count++; if (lj->lioj_count == lj->lioj_finished_count) lj_done = 1; } TAILQ_INSERT_TAIL(&ki->kaio_done, job, plist); MPASS(job->jobflags & KAIOCB_FINISHED); if (ki->kaio_flags & KAIO_RUNDOWN) goto notification_done; if (job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) aio_sendsig(userp, &job->uaiocb.aio_sigevent, &job->ksi); KNOTE_LOCKED(&job->klist, 1); if (lj_done) { if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { lj->lioj_flags |= LIOJ_KEVENT_POSTED; KNOTE_LOCKED(&lj->klist, 1); } if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi); lj->lioj_flags |= LIOJ_SIGNAL_POSTED; } } notification_done: if (job->jobflags & KAIOCB_CHECKSYNC) { schedule_fsync = false; TAILQ_FOREACH_SAFE(sjob, &ki->kaio_syncqueue, list, sjobn) { if (job->fd_file != sjob->fd_file || job->seqno >= sjob->seqno) continue; if (--sjob->pending > 0) continue; TAILQ_REMOVE(&ki->kaio_syncqueue, sjob, list); if (!aio_clear_cancel_function_locked(sjob)) continue; TAILQ_INSERT_TAIL(&ki->kaio_syncready, sjob, list); schedule_fsync = true; } if (schedule_fsync) taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_sync_task); } if (ki->kaio_flags & KAIO_WAKEUP) { ki->kaio_flags &= ~KAIO_WAKEUP; wakeup(&userp->p_aioinfo); } } static void aio_schedule_fsync(void *context, int pending) { struct kaioinfo *ki; struct kaiocb *job; ki = context; AIO_LOCK(ki); while (!TAILQ_EMPTY(&ki->kaio_syncready)) { job = TAILQ_FIRST(&ki->kaio_syncready); TAILQ_REMOVE(&ki->kaio_syncready, job, list); AIO_UNLOCK(ki); aio_schedule(job, aio_process_sync); AIO_LOCK(ki); } AIO_UNLOCK(ki); } bool aio_cancel_cleared(struct kaiocb *job) { struct kaioinfo *ki; /* * The caller should hold the same queue lock held when * aio_clear_cancel_function() was called and set this flag * ensuring this check sees an up-to-date value. However, * there is no way to assert that. 
*/ ki = job->userproc->p_aioinfo; return ((job->jobflags & KAIOCB_CLEARED) != 0); } static bool aio_clear_cancel_function_locked(struct kaiocb *job) { AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED); MPASS(job->cancel_fn != NULL); if (job->jobflags & KAIOCB_CANCELLING) { job->jobflags |= KAIOCB_CLEARED; return (false); } job->cancel_fn = NULL; return (true); } bool aio_clear_cancel_function(struct kaiocb *job) { struct kaioinfo *ki; bool ret; ki = job->userproc->p_aioinfo; AIO_LOCK(ki); ret = aio_clear_cancel_function_locked(job); AIO_UNLOCK(ki); return (ret); } static bool aio_set_cancel_function_locked(struct kaiocb *job, aio_cancel_fn_t *func) { AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED); if (job->jobflags & KAIOCB_CANCELLED) return (false); job->cancel_fn = func; return (true); } bool aio_set_cancel_function(struct kaiocb *job, aio_cancel_fn_t *func) { struct kaioinfo *ki; bool ret; ki = job->userproc->p_aioinfo; AIO_LOCK(ki); ret = aio_set_cancel_function_locked(job, func); AIO_UNLOCK(ki); return (ret); } void aio_complete(struct kaiocb *job, long status, int error) { struct kaioinfo *ki; struct proc *userp; job->uaiocb._aiocb_private.error = error; job->uaiocb._aiocb_private.status = status; userp = job->userproc; ki = userp->p_aioinfo; AIO_LOCK(ki); KASSERT(!(job->jobflags & KAIOCB_FINISHED), ("duplicate aio_complete")); job->jobflags |= KAIOCB_FINISHED; if ((job->jobflags & (KAIOCB_QUEUEING | KAIOCB_CANCELLING)) == 0) { TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist); aio_bio_done_notify(userp, job); } AIO_UNLOCK(ki); } void aio_cancel(struct kaiocb *job) { aio_complete(job, -1, ECANCELED); } void aio_switch_vmspace(struct kaiocb *job) { vmspace_switch_aio(job->userproc->p_vmspace); } /* * The AIO daemon, most of the actual work is done in aio_process_*, * but the setup (and address space mgmt) is done in this routine. */ static void aio_daemon(void *_id) { struct kaiocb *job; struct aioproc *aiop; struct kaioinfo *ki; struct proc *p; struct vmspace *myvm; struct thread *td = curthread; int id = (intptr_t)_id; /* * Grab an extra reference on the daemon's vmspace so that it * doesn't get freed by jobs that switch to a different * vmspace. */ p = td->td_proc; myvm = vmspace_acquire_ref(p); KASSERT(p->p_textvp == NULL, ("kthread has a textvp")); /* * Allocate and ready the aio control info. There is one aiop structure * per daemon. */ aiop = uma_zalloc(aiop_zone, M_WAITOK); aiop->aioproc = p; aiop->aioprocflags = 0; /* * Wakeup parent process. (Parent sleeps to keep from blasting away * and creating too many daemons.) */ sema_post(&aio_newproc_sem); mtx_lock(&aio_job_mtx); for (;;) { /* * Take daemon off of free queue */ if (aiop->aioprocflags & AIOP_FREE) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aioprocflags &= ~AIOP_FREE; } /* * Check for jobs. */ while ((job = aio_selectjob(aiop)) != NULL) { mtx_unlock(&aio_job_mtx); ki = job->userproc->p_aioinfo; job->handle_fn(job); mtx_lock(&aio_job_mtx); /* Decrement the active job count. */ ki->kaio_active_count--; } /* * Disconnect from user address space. */ if (p->p_vmspace != myvm) { mtx_unlock(&aio_job_mtx); vmspace_switch_aio(myvm); mtx_lock(&aio_job_mtx); /* * We have to restart to avoid race, we only sleep if * no job can be selected. */ continue; } mtx_assert(&aio_job_mtx, MA_OWNED); TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); aiop->aioprocflags |= AIOP_FREE; /* * If daemon is inactive for a long time, allow it to exit, * thereby freeing resources. 
*/ if (msleep(p, &aio_job_mtx, PRIBIO, "aiordy", aiod_lifetime) == EWOULDBLOCK && TAILQ_EMPTY(&aio_jobs) && (aiop->aioprocflags & AIOP_FREE) && num_aio_procs > target_aio_procs) break; } TAILQ_REMOVE(&aio_freeproc, aiop, list); num_aio_procs--; mtx_unlock(&aio_job_mtx); uma_zfree(aiop_zone, aiop); free_unr(aiod_unr, id); vmspace_free(myvm); KASSERT(p->p_vmspace == myvm, ("AIOD: bad vmspace for exiting daemon")); KASSERT(myvm->vm_refcnt > 1, ("AIOD: bad vm refcnt for exiting daemon: %d", myvm->vm_refcnt)); kproc_exit(0); } /* * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The * AIO daemon modifies its environment itself. */ static int aio_newproc(int *start) { int error; struct proc *p; int id; id = alloc_unr(aiod_unr); error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p, RFNOWAIT, 0, "aiod%d", id); if (error == 0) { /* * Wait until daemon is started. */ sema_wait(&aio_newproc_sem); mtx_lock(&aio_job_mtx); num_aio_procs++; if (start != NULL) (*start)--; mtx_unlock(&aio_job_mtx); } else { free_unr(aiod_unr, id); } return (error); } /* * Try the high-performance, low-overhead physio method for eligible * VCHR devices. This method doesn't use an aio helper thread, and * thus has very low overhead. * * Assumes that the caller, aio_aqueue(), has incremented the file * structure's reference count, preventing its deallocation for the * duration of this call. */ static int aio_qphysio(struct proc *p, struct kaiocb *job) { struct aiocb *cb; struct file *fp; struct bio *bp; struct buf *pbuf; struct vnode *vp; struct cdevsw *csw; struct cdev *dev; struct kaioinfo *ki; int error, ref, poff; vm_prot_t prot; cb = &job->uaiocb; fp = job->fd_file; if (fp == NULL || fp->f_type != DTYPE_VNODE) return (-1); vp = fp->f_vnode; if (vp->v_type != VCHR) return (-1); if (vp->v_bufobj.bo_bsize == 0) return (-1); if (cb->aio_nbytes % vp->v_bufobj.bo_bsize) return (-1); ref = 0; csw = devvn_refthread(vp, &dev, &ref); if (csw == NULL) return (ENXIO); if ((csw->d_flags & D_DISK) == 0) { error = -1; goto unref; } if (cb->aio_nbytes > dev->si_iosize_max) { error = -1; goto unref; } ki = p->p_aioinfo; poff = (vm_offset_t)cb->aio_buf & PAGE_MASK; if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) { if (cb->aio_nbytes > MAXPHYS) { error = -1; goto unref; } pbuf = NULL; } else { if (cb->aio_nbytes > MAXPHYS - poff) { error = -1; goto unref; } if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) { error = -1; goto unref; } job->pbuf = pbuf = (struct buf *)getpbuf(NULL); BUF_KERNPROC(pbuf); AIO_LOCK(ki); ki->kaio_buffer_count++; AIO_UNLOCK(ki); } job->bp = bp = g_alloc_bio(); bp->bio_length = cb->aio_nbytes; bp->bio_bcount = cb->aio_nbytes; bp->bio_done = aio_physwakeup; bp->bio_data = (void *)(uintptr_t)cb->aio_buf; bp->bio_offset = cb->aio_offset; bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? 
BIO_WRITE : BIO_READ; bp->bio_dev = dev; bp->bio_caller1 = (void *)job; prot = VM_PROT_READ; if (cb->aio_lio_opcode == LIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ job->npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)bp->bio_data, bp->bio_length, prot, job->pages, nitems(job->pages)); if (job->npages < 0) { error = EFAULT; goto doerror; } if (pbuf != NULL) { pmap_qenter((vm_offset_t)pbuf->b_data, job->pages, job->npages); bp->bio_data = pbuf->b_data + poff; atomic_add_int(&num_buf_aio, 1); } else { bp->bio_ma = job->pages; bp->bio_ma_n = job->npages; bp->bio_ma_offset = poff; bp->bio_data = unmapped_buf; bp->bio_flags |= BIO_UNMAPPED; } /* Perform transfer. */ csw->d_strategy(bp); dev_relthread(dev, ref); return (0); doerror: if (pbuf != NULL) { AIO_LOCK(ki); ki->kaio_buffer_count--; AIO_UNLOCK(ki); relpbuf(pbuf, NULL); job->pbuf = NULL; } g_destroy_bio(bp); job->bp = NULL; unref: dev_relthread(dev, ref); return (error); } #ifdef COMPAT_FREEBSD6 static int convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig) { /* * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are * supported by AIO with the old sigevent structure. */ nsig->sigev_notify = osig->sigev_notify; switch (nsig->sigev_notify) { case SIGEV_NONE: break; case SIGEV_SIGNAL: nsig->sigev_signo = osig->__sigev_u.__sigev_signo; break; case SIGEV_KEVENT: nsig->sigev_notify_kqueue = osig->__sigev_u.__sigev_notify_kqueue; nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr; break; default: return (EINVAL); } return (0); } static int aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob) { struct oaiocb *ojob; int error; bzero(kjob, sizeof(struct aiocb)); error = copyin(ujob, kjob, sizeof(struct oaiocb)); if (error) return (error); ojob = (struct oaiocb *)kjob; return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent)); } #endif static int aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob) { return (copyin(ujob, kjob, sizeof(struct aiocb))); } static long aiocb_fetch_status(struct aiocb *ujob) { return (fuword(&ujob->_aiocb_private.status)); } static long aiocb_fetch_error(struct aiocb *ujob) { return (fuword(&ujob->_aiocb_private.error)); } static int aiocb_store_status(struct aiocb *ujob, long status) { return (suword(&ujob->_aiocb_private.status, status)); } static int aiocb_store_error(struct aiocb *ujob, long error) { return (suword(&ujob->_aiocb_private.error, error)); } static int aiocb_store_kernelinfo(struct aiocb *ujob, long jobref) { return (suword(&ujob->_aiocb_private.kernelinfo, jobref)); } static int aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob) { return (suword(ujobp, (long)ujob)); } static struct aiocb_ops aiocb_ops = { .copyin = aiocb_copyin, .fetch_status = aiocb_fetch_status, .fetch_error = aiocb_fetch_error, .store_status = aiocb_store_status, .store_error = aiocb_store_error, .store_kernelinfo = aiocb_store_kernelinfo, .store_aiocb = aiocb_store_aiocb, }; #ifdef COMPAT_FREEBSD6 static struct aiocb_ops aiocb_ops_osigevent = { .copyin = aiocb_copyin_old_sigevent, .fetch_status = aiocb_fetch_status, .fetch_error = aiocb_fetch_error, .store_status = aiocb_store_status, .store_error = aiocb_store_error, .store_kernelinfo = aiocb_store_kernelinfo, .store_aiocb = aiocb_store_aiocb, }; #endif /* * Queue a new AIO request. Choosing either the threaded or direct physio VCHR * technique is done in this code. 
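 *
 * For illustration, the SIGEV_KEVENT path that aio_aqueue() registers
 * below, seen from userland (a sketch; error handling abbreviated):
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <aio.h>
#include <string.h>

static ssize_t
read_kq(int kq, int fd, void *buf, size_t len)
{
	struct aiocb cb;
	struct kevent ev;

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = len;
	cb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
	cb.aio_sigevent.sigev_notify_kqueue = kq;
	cb.aio_sigevent.sigev_value.sival_ptr = &cb;
	if (aio_read(&cb) == -1)
		return (-1);
	if (kevent(kq, NULL, 0, &ev, 1, NULL) != 1)	/* EVFILT_AIO */
		return (-1);
	return (aio_return(ev.udata));
}
#endif
/*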
*/ int aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj, int type, struct aiocb_ops *ops) { struct proc *p = td->td_proc; cap_rights_t rights; struct file *fp; struct kaiocb *job; struct kaioinfo *ki; struct kevent kev; int opcode; int error; int fd, kqfd; int jid; u_short evflags; if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; ops->store_status(ujob, -1); ops->store_error(ujob, 0); ops->store_kernelinfo(ujob, -1); if (num_queue_count >= max_queue_count || ki->kaio_count >= ki->kaio_qallowed_count) { ops->store_error(ujob, EAGAIN); return (EAGAIN); } job = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO); knlist_init_mtx(&job->klist, AIO_MTX(ki)); error = ops->copyin(ujob, &job->uaiocb); if (error) { ops->store_error(ujob, error); uma_zfree(aiocb_zone, job); return (error); } if (job->uaiocb.aio_nbytes > IOSIZE_MAX) { uma_zfree(aiocb_zone, job); return (EINVAL); } if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT && job->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL && job->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID && job->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) { ops->store_error(ujob, EINVAL); uma_zfree(aiocb_zone, job); return (EINVAL); } if ((job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(job->uaiocb.aio_sigevent.sigev_signo)) { uma_zfree(aiocb_zone, job); return (EINVAL); } ksiginfo_init(&job->ksi); /* Save userspace address of the job info. */ job->ujob = ujob; /* Get the opcode. */ if (type != LIO_NOP) job->uaiocb.aio_lio_opcode = type; opcode = job->uaiocb.aio_lio_opcode; /* * Validate the opcode and fetch the file object for the specified * file descriptor. * * XXXRW: Moved the opcode validation up here so that we don't * retrieve a file descriptor without knowing what the capability * should be.
*/ fd = job->uaiocb.aio_fildes; switch (opcode) { case LIO_WRITE: error = fget_write(td, fd, cap_rights_init(&rights, CAP_PWRITE), &fp); break; case LIO_READ: error = fget_read(td, fd, cap_rights_init(&rights, CAP_PREAD), &fp); break; case LIO_SYNC: error = fget(td, fd, cap_rights_init(&rights, CAP_FSYNC), &fp); break; case LIO_MLOCK: fp = NULL; break; case LIO_NOP: error = fget(td, fd, cap_rights_init(&rights), &fp); break; default: error = EINVAL; } if (error) { uma_zfree(aiocb_zone, job); ops->store_error(ujob, error); return (error); } if (opcode == LIO_SYNC && fp->f_vnode == NULL) { error = EINVAL; goto aqueue_fail; } if ((opcode == LIO_READ || opcode == LIO_WRITE) && job->uaiocb.aio_offset < 0 && (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) { error = EINVAL; goto aqueue_fail; } job->fd_file = fp; mtx_lock(&aio_job_mtx); jid = jobrefid++; job->seqno = jobseqno++; mtx_unlock(&aio_job_mtx); error = ops->store_kernelinfo(ujob, jid); if (error) { error = EINVAL; goto aqueue_fail; } job->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid; if (opcode == LIO_NOP) { fdrop(fp, td); uma_zfree(aiocb_zone, job); return (0); } if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT) goto no_kqueue; evflags = job->uaiocb.aio_sigevent.sigev_notify_kevent_flags; if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) { error = EINVAL; goto aqueue_fail; } kqfd = job->uaiocb.aio_sigevent.sigev_notify_kqueue; kev.ident = (uintptr_t)job->ujob; kev.filter = EVFILT_AIO; kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags; kev.data = (intptr_t)job; kev.udata = job->uaiocb.aio_sigevent.sigev_value.sival_ptr; error = kqfd_register(kqfd, &kev, td, 1); if (error) goto aqueue_fail; no_kqueue: ops->store_error(ujob, EINPROGRESS); job->uaiocb._aiocb_private.error = EINPROGRESS; job->userproc = p; job->cred = crhold(td->td_ucred); job->jobflags = KAIOCB_QUEUEING; job->lio = lj; if (opcode == LIO_MLOCK) { aio_schedule(job, aio_process_mlock); error = 0; } else if (fp->f_ops->fo_aio_queue == NULL) error = aio_queue_file(fp, job); else error = fo_aio_queue(fp, job); if (error) goto aqueue_fail; AIO_LOCK(ki); job->jobflags &= ~KAIOCB_QUEUEING; TAILQ_INSERT_TAIL(&ki->kaio_all, job, allist); ki->kaio_count++; if (lj) lj->lioj_count++; atomic_add_int(&num_queue_count, 1); if (job->jobflags & KAIOCB_FINISHED) { /* * The queue callback completed the request synchronously. * The bulk of the completion is deferred in that case * until this point. 
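 *
 * For illustration, a backend whose fo_aio_queue() finishes the
 * request before returning (a sketch; example_do_io() is
 * hypothetical).  aio_complete() only marks the job KAIOCB_FINISHED
 * in that case, because KAIOCB_QUEUEING is still set, leaving the
 * notification to the code below:
 */
#if 0
static int
example_fo_aio_queue(struct file *fp, struct kaiocb *job)
{
	long done;

	done = example_do_io(fp, job);
	aio_complete(job, done, 0);
	return (0);
}
#endif
/*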
*/ aio_bio_done_notify(p, job); } else TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, job, plist); AIO_UNLOCK(ki); return (0); aqueue_fail: knlist_delete(&job->klist, curthread, 0); if (fp) fdrop(fp, td); uma_zfree(aiocb_zone, job); ops->store_error(ujob, error); return (error); } static void aio_cancel_daemon_job(struct kaiocb *job) { mtx_lock(&aio_job_mtx); if (!aio_cancel_cleared(job)) TAILQ_REMOVE(&aio_jobs, job, list); mtx_unlock(&aio_job_mtx); aio_cancel(job); } void aio_schedule(struct kaiocb *job, aio_handle_fn_t *func) { mtx_lock(&aio_job_mtx); if (!aio_set_cancel_function(job, aio_cancel_daemon_job)) { mtx_unlock(&aio_job_mtx); aio_cancel(job); return; } job->handle_fn = func; TAILQ_INSERT_TAIL(&aio_jobs, job, list); aio_kick_nowait(job->userproc); mtx_unlock(&aio_job_mtx); } static void aio_cancel_sync(struct kaiocb *job) { struct kaioinfo *ki; ki = job->userproc->p_aioinfo; AIO_LOCK(ki); if (!aio_cancel_cleared(job)) TAILQ_REMOVE(&ki->kaio_syncqueue, job, list); AIO_UNLOCK(ki); aio_cancel(job); } int aio_queue_file(struct file *fp, struct kaiocb *job) { struct aioliojob *lj; struct kaioinfo *ki; struct kaiocb *job2; struct vnode *vp; struct mount *mp; int error, opcode; bool safe; lj = job->lio; ki = job->userproc->p_aioinfo; opcode = job->uaiocb.aio_lio_opcode; if (opcode == LIO_SYNC) goto queueit; if ((error = aio_qphysio(job->userproc, job)) == 0) goto done; #if 0 /* * XXX: This means qphysio() failed with EFAULT. The current * behavior is to retry the operation via fo_read/fo_write. * Wouldn't it be better to just complete the request with an * error here? */ if (error > 0) goto done; #endif queueit: safe = false; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vp->v_type == VREG || vp->v_type == VDIR) { mp = fp->f_vnode->v_mount; if (mp == NULL || (mp->mnt_flag & MNT_LOCAL) != 0) safe = true; } } if (!(safe || enable_aio_unsafe)) { counted_warning(&unsafe_warningcnt, "is attempting to use unsafe AIO requests"); return (EOPNOTSUPP); } if (opcode == LIO_SYNC) { AIO_LOCK(ki); TAILQ_FOREACH(job2, &ki->kaio_jobqueue, plist) { if (job2->fd_file == job->fd_file && job2->uaiocb.aio_lio_opcode != LIO_SYNC && job2->seqno < job->seqno) { job2->jobflags |= KAIOCB_CHECKSYNC; job->pending++; } } if (job->pending != 0) { if (!aio_set_cancel_function_locked(job, aio_cancel_sync)) { AIO_UNLOCK(ki); aio_cancel(job); return (0); } TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, job, list); AIO_UNLOCK(ki); return (0); } AIO_UNLOCK(ki); } switch (opcode) { case LIO_READ: case LIO_WRITE: aio_schedule(job, aio_process_rw); error = 0; break; case LIO_SYNC: aio_schedule(job, aio_process_sync); error = 0; break; default: error = EINVAL; } done: return (error); } static void aio_kick_nowait(struct proc *userp) { struct kaioinfo *ki = userp->p_aioinfo; struct aioproc *aiop; mtx_assert(&aio_job_mtx, MA_OWNED); if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aioprocflags &= ~AIOP_FREE; wakeup(aiop->aioproc); } else if (num_aio_resv_start + num_aio_procs < max_aio_procs && ki->kaio_active_count + num_aio_resv_start < ki->kaio_maxactive_count) { taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_task); } } static int aio_kick(struct proc *userp) { struct kaioinfo *ki = userp->p_aioinfo; struct aioproc *aiop; int error, ret = 0; mtx_assert(&aio_job_mtx, MA_OWNED); retryproc: if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aioprocflags &= ~AIOP_FREE; wakeup(aiop->aioproc); } else if (num_aio_resv_start + num_aio_procs < 
max_aio_procs && ki->kaio_active_count + num_aio_resv_start < ki->kaio_maxactive_count) { num_aio_resv_start++; mtx_unlock(&aio_job_mtx); error = aio_newproc(&num_aio_resv_start); mtx_lock(&aio_job_mtx); if (error) { num_aio_resv_start--; goto retryproc; } } else { ret = -1; } return (ret); } static void aio_kick_helper(void *context, int pending) { struct proc *userp = context; mtx_lock(&aio_job_mtx); while (--pending >= 0) { if (aio_kick(userp)) break; } mtx_unlock(&aio_job_mtx); } /* * Support the aio_return system call, as a side-effect, kernel resources are * released. */ static int kern_aio_return(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct kaiocb *job; struct kaioinfo *ki; long status, error; ki = p->p_aioinfo; if (ki == NULL) return (EINVAL); AIO_LOCK(ki); TAILQ_FOREACH(job, &ki->kaio_done, plist) { if (job->ujob == ujob) break; } if (job != NULL) { MPASS(job->jobflags & KAIOCB_FINISHED); status = job->uaiocb._aiocb_private.status; error = job->uaiocb._aiocb_private.error; td->td_retval[0] = status; td->td_ru.ru_oublock += job->outblock; td->td_ru.ru_inblock += job->inblock; td->td_ru.ru_msgsnd += job->msgsnd; td->td_ru.ru_msgrcv += job->msgrcv; aio_free_entry(job); AIO_UNLOCK(ki); ops->store_error(ujob, error); ops->store_status(ujob, status); } else { error = EINVAL; AIO_UNLOCK(ki); } return (error); } int sys_aio_return(struct thread *td, struct aio_return_args *uap) { return (kern_aio_return(td, uap->aiocbp, &aiocb_ops)); } /* * Allow a process to wakeup when any of the I/O requests are completed. */ static int kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist, struct timespec *ts) { struct proc *p = td->td_proc; struct timeval atv; struct kaioinfo *ki; struct kaiocb *firstjob, *job; int error, i, timo; timo = 0; if (ts) { if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000) return (EINVAL); TIMESPEC_TO_TIMEVAL(&atv, ts); if (itimerfix(&atv)) return (EINVAL); timo = tvtohz(&atv); } ki = p->p_aioinfo; if (ki == NULL) return (EAGAIN); if (njoblist == 0) return (0); AIO_LOCK(ki); for (;;) { firstjob = NULL; error = 0; TAILQ_FOREACH(job, &ki->kaio_all, allist) { for (i = 0; i < njoblist; i++) { if (job->ujob == ujoblist[i]) { if (firstjob == NULL) firstjob = job; if (job->jobflags & KAIOCB_FINISHED) goto RETURN; } } } /* All tasks were finished. */ if (firstjob == NULL) break; ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiospn", timo); if (error == ERESTART) error = EINTR; if (error) break; } RETURN: AIO_UNLOCK(ki); return (error); } int sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap) { struct timespec ts, *tsp; struct aiocb **ujoblist; int error; if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc) return (EINVAL); if (uap->timeout) { /* Get timespec struct. */ if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) return (error); tsp = &ts; } else tsp = NULL; ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK); error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0])); if (error == 0) error = kern_aio_suspend(td, uap->nent, ujoblist, tsp); free(ujoblist, M_AIOS); return (error); } /* * aio_cancel cancels any non-physio aio operations not currently in * progress. 
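
/*
 * [Editor's note]  The kern_aio_suspend()/kern_aio_return() pair above is
 * easiest to see from userland.  A minimal sketch, not part of this file;
 * the path is hypothetical and error handling is trimmed:
 */
#include <aio.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static char rbuf[4096];

int
example_aio_read(void)
{
	struct aiocb cb;
	const struct aiocb *list[1];
	int fd;

	fd = open("/tmp/example.dat", O_RDONLY);	/* hypothetical file */
	if (fd < 0)
		return (-1);
	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = rbuf;
	cb.aio_nbytes = sizeof(rbuf);
	cb.aio_offset = 0;
	if (aio_read(&cb) != 0)
		return (-1);
	list[0] = &cb;
	/* Sleeps in kern_aio_suspend() until the job finishes. */
	if (aio_suspend(list, 1, NULL) != 0)
		return (-1);
	/* aio_return() reaps the job and releases the kernel resources. */
	printf("error %d, returned %zd\n", aio_error(&cb), aio_return(&cb));
	close(fd);
	return (0);
}
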
*/ int sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap) { struct proc *p = td->td_proc; struct kaioinfo *ki; struct kaiocb *job, *jobn; struct file *fp; cap_rights_t rights; int error; int cancelled = 0; int notcancelled = 0; struct vnode *vp; /* Lookup file object. */ error = fget(td, uap->fd, cap_rights_init(&rights), &fp); if (error) return (error); ki = p->p_aioinfo; if (ki == NULL) goto done; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vn_isdisk(vp, &error)) { fdrop(fp, td); td->td_retval[0] = AIO_NOTCANCELED; return (0); } } AIO_LOCK(ki); TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) { if ((uap->fd == job->uaiocb.aio_fildes) && ((uap->aiocbp == NULL) || (uap->aiocbp == job->ujob))) { if (aio_cancel_job(p, ki, job)) { cancelled++; } else { notcancelled++; } if (uap->aiocbp != NULL) break; } } AIO_UNLOCK(ki); done: fdrop(fp, td); if (uap->aiocbp != NULL) { if (cancelled) { td->td_retval[0] = AIO_CANCELED; return (0); } } if (notcancelled) { td->td_retval[0] = AIO_NOTCANCELED; return (0); } if (cancelled) { td->td_retval[0] = AIO_CANCELED; return (0); } td->td_retval[0] = AIO_ALLDONE; return (0); } /* * aio_error is implemented in the kernel level for compatibility purposes * only. For a user mode async implementation, it would be best to do it in * a userland subroutine. */ static int kern_aio_error(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct kaiocb *job; struct kaioinfo *ki; int status; ki = p->p_aioinfo; if (ki == NULL) { td->td_retval[0] = EINVAL; return (0); } AIO_LOCK(ki); TAILQ_FOREACH(job, &ki->kaio_all, allist) { if (job->ujob == ujob) { if (job->jobflags & KAIOCB_FINISHED) td->td_retval[0] = job->uaiocb._aiocb_private.error; else td->td_retval[0] = EINPROGRESS; AIO_UNLOCK(ki); return (0); } } AIO_UNLOCK(ki); /* * Hack for failure of aio_aqueue. 
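
/*
 * [Editor's note]  sys_aio_cancel() above reports one of three outcomes.
 * A hedged sketch of interpreting them from userland; fd/cb come from an
 * earlier aio_read() and are illustrative only:
 */
#include <aio.h>
#include <stdio.h>

void
example_aio_cancel(int fd, struct aiocb *cb)
{
	switch (aio_cancel(fd, cb)) {
	case AIO_CANCELED:
		/* Dequeued before an aio daemon picked it up. */
		printf("canceled\n");
		break;
	case AIO_NOTCANCELED:
		/* In progress (e.g. raw-disk I/O); poll aio_error(). */
		printf("still running\n");
		break;
	case AIO_ALLDONE:
		/* Already finished; reap it with aio_return() as usual. */
		printf("already done\n");
		break;
	default:
		perror("aio_cancel");
	}
}
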
*/ status = ops->fetch_status(ujob); if (status == -1) { td->td_retval[0] = ops->fetch_error(ujob); return (0); } td->td_retval[0] = EINVAL; return (0); } int sys_aio_error(struct thread *td, struct aio_error_args *uap) { return (kern_aio_error(td, uap->aiocbp, &aiocb_ops)); } /* syscall - asynchronous read from a file (REALTIME) */ #ifdef COMPAT_FREEBSD6 int freebsd6_aio_read(struct thread *td, struct freebsd6_aio_read_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, &aiocb_ops_osigevent)); } #endif int sys_aio_read(struct thread *td, struct aio_read_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops)); } /* syscall - asynchronous write to a file (REALTIME) */ #ifdef COMPAT_FREEBSD6 int freebsd6_aio_write(struct thread *td, struct freebsd6_aio_write_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops_osigevent)); } #endif int sys_aio_write(struct thread *td, struct aio_write_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops)); } int sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops)); } static int kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list, struct aiocb **acb_list, int nent, struct sigevent *sig, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct aiocb *job; struct kaioinfo *ki; struct aioliojob *lj; struct kevent kev; int error; int nerror; int i; if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT)) return (EINVAL); if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; lj = uma_zalloc(aiolio_zone, M_WAITOK); lj->lioj_flags = 0; lj->lioj_count = 0; lj->lioj_finished_count = 0; knlist_init_mtx(&lj->klist, AIO_MTX(ki)); ksiginfo_init(&lj->lioj_ksi); /* * Setup signal. */ if (sig && (mode == LIO_NOWAIT)) { bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal)); if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { /* Assume only new style KEVENT */ kev.filter = EVFILT_LIO; kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; kev.ident = (uintptr_t)uacb_list; /* something unique */ kev.data = (intptr_t)lj; /* pass user defined sigval data */ kev.udata = lj->lioj_signal.sigev_value.sival_ptr; error = kqfd_register( lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1); if (error) { uma_zfree(aiolio_zone, lj); return (error); } } else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) { ; } else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) { if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) { uma_zfree(aiolio_zone, lj); return EINVAL; } lj->lioj_flags |= LIOJ_SIGNAL; } else { uma_zfree(aiolio_zone, lj); return EINVAL; } } AIO_LOCK(ki); TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); /* * Add extra aiocb count to avoid the lio to be freed * by other threads doing aio_waitcomplete or aio_return, * and prevent event from being sent until we have queued * all tasks. */ lj->lioj_count = 1; AIO_UNLOCK(ki); /* * Get pointers to the list of I/O requests. 
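
/*
 * [Editor's note]  kern_lio_listio() above backs lio_listio(2).  A minimal
 * batched-submit sketch under LIO_WAIT; two reads from one descriptor,
 * error handling trimmed, all names local to this example:
 */
#include <sys/types.h>
#include <aio.h>
#include <string.h>

int
example_lio_batch(int fd, char *a, char *b, size_t len)
{
	struct aiocb cb0, cb1;
	struct aiocb *list[2] = { &cb0, &cb1 };

	memset(&cb0, 0, sizeof(cb0));
	cb0.aio_fildes = fd;
	cb0.aio_buf = a;
	cb0.aio_nbytes = len;
	cb0.aio_offset = 0;
	cb0.aio_lio_opcode = LIO_READ;
	cb1 = cb0;
	cb1.aio_buf = b;
	cb1.aio_offset = (off_t)len;
	/* LIO_WAIT blocks in kern_lio_listio() until both jobs finish. */
	return (lio_listio(LIO_WAIT, list, 2, NULL));
}
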
*/ nerror = 0; for (i = 0; i < nent; i++) { job = acb_list[i]; if (job != NULL) { error = aio_aqueue(td, job, lj, LIO_NOP, ops); if (error != 0) nerror++; } } error = 0; AIO_LOCK(ki); if (mode == LIO_WAIT) { while (lj->lioj_count - 1 != lj->lioj_finished_count) { ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiospn", 0); if (error == ERESTART) error = EINTR; if (error) break; } } else { if (lj->lioj_count - 1 == lj->lioj_finished_count) { if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { lj->lioj_flags |= LIOJ_KEVENT_POSTED; KNOTE_LOCKED(&lj->klist, 1); } if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { aio_sendsig(p, &lj->lioj_signal, &lj->lioj_ksi); lj->lioj_flags |= LIOJ_SIGNAL_POSTED; } } } lj->lioj_count--; if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); AIO_UNLOCK(ki); uma_zfree(aiolio_zone, lj); } else AIO_UNLOCK(ki); if (nerror) return (EIO); return (error); } /* syscall - list directed I/O (REALTIME) */ #ifdef COMPAT_FREEBSD6 int freebsd6_lio_listio(struct thread *td, struct freebsd6_lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; struct osigevent osig; int error, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &osig, sizeof(osig)); if (error) return (error); error = convert_old_sigevent(&osig, &sig); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); if (error == 0) error = kern_lio_listio(td, uap->mode, (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, &aiocb_ops_osigevent); free(acb_list, M_LIO); return (error); } #endif /* syscall - list directed I/O (REALTIME) */ int sys_lio_listio(struct thread *td, struct lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; int error, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &sig, sizeof(sig)); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); if (error == 0) error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list, nent, sigp, &aiocb_ops); free(acb_list, M_LIO); return (error); } static void aio_physwakeup(struct bio *bp) { struct kaiocb *job = (struct kaiocb *)bp->bio_caller1; struct proc *userp; struct kaioinfo *ki; size_t nbytes; int error, nblks; /* Release mapping into kernel space. 
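
/*
 * [Editor's note]  With LIO_NOWAIT and SIGEV_KEVENT, completion of the
 * whole list is delivered through the EVFILT_LIO knote registered above.
 * A hedged sketch; kq is an existing kqueue and list/nent are prepared as
 * in the previous example:
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <aio.h>
#include <string.h>

int
example_lio_kevent(int kq, struct aiocb **list, int nent)
{
	struct sigevent sev;
	struct kevent ev;

	memset(&sev, 0, sizeof(sev));
	sev.sigev_notify = SIGEV_KEVENT;
	sev.sigev_notify_kqueue = kq;
	sev.sigev_value.sival_ptr = list;	/* arbitrary cookie */
	if (lio_listio(LIO_NOWAIT, list, nent, &sev) != 0)
		return (-1);
	/* A single EVFILT_LIO event fires once the entire list is done. */
	if (kevent(kq, NULL, 0, &ev, 1, NULL) != 1)
		return (-1);
	return (ev.filter == EVFILT_LIO ? 0 : -1);
}
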
*/ userp = job->userproc; ki = userp->p_aioinfo; if (job->pbuf) { pmap_qremove((vm_offset_t)job->pbuf->b_data, job->npages); relpbuf(job->pbuf, NULL); job->pbuf = NULL; atomic_subtract_int(&num_buf_aio, 1); AIO_LOCK(ki); ki->kaio_buffer_count--; AIO_UNLOCK(ki); } vm_page_unhold_pages(job->pages, job->npages); bp = job->bp; job->bp = NULL; nbytes = job->uaiocb.aio_nbytes - bp->bio_resid; error = 0; if (bp->bio_flags & BIO_ERROR) error = bp->bio_error; nblks = btodb(nbytes); if (job->uaiocb.aio_lio_opcode == LIO_WRITE) job->outblock += nblks; else job->inblock += nblks; if (error) aio_complete(job, -1, error); else aio_complete(job, nbytes, 0); g_destroy_bio(bp); } /* syscall - wait for the next completion of an aio request */ static int kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp, struct timespec *ts, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct timeval atv; struct kaioinfo *ki; struct kaiocb *job; struct aiocb *ujob; long error, status; int timo; ops->store_aiocb(ujobp, NULL); if (ts == NULL) { timo = 0; } else if (ts->tv_sec == 0 && ts->tv_nsec == 0) { timo = -1; } else { if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000)) return (EINVAL); TIMESPEC_TO_TIMEVAL(&atv, ts); if (itimerfix(&atv)) return (EINVAL); timo = tvtohz(&atv); } if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; error = 0; job = NULL; AIO_LOCK(ki); while ((job = TAILQ_FIRST(&ki->kaio_done)) == NULL) { if (timo == -1) { error = EWOULDBLOCK; break; } ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiowc", timo); if (timo && error == ERESTART) error = EINTR; if (error) break; } if (job != NULL) { MPASS(job->jobflags & KAIOCB_FINISHED); ujob = job->ujob; status = job->uaiocb._aiocb_private.status; error = job->uaiocb._aiocb_private.error; td->td_retval[0] = status; td->td_ru.ru_oublock += job->outblock; td->td_ru.ru_inblock += job->inblock; td->td_ru.ru_msgsnd += job->msgsnd; td->td_ru.ru_msgrcv += job->msgrcv; aio_free_entry(job); AIO_UNLOCK(ki); ops->store_aiocb(ujobp, ujob); ops->store_error(ujob, error); ops->store_status(ujob, status); } else AIO_UNLOCK(ki); return (error); } int sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap) { struct timespec ts, *tsp; int error; if (uap->timeout) { /* Get timespec struct. */ error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); tsp = &ts; } else tsp = NULL; return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops)); } static int kern_aio_fsync(struct thread *td, int op, struct aiocb *ujob, struct aiocb_ops *ops) { if (op != O_SYNC) /* XXX lack of O_DSYNC */ return (EINVAL); return (aio_aqueue(td, ujob, NULL, LIO_SYNC, ops)); } int sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap) { return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops)); } /* kqueue attach function */ static int filt_aioattach(struct knote *kn) { struct kaiocb *job; job = (struct kaiocb *)(uintptr_t)kn->kn_sdata; /* * The job pointer must be validated before using it, so * registration is restricted to the kernel; the user cannot * set EV_FLAG1. 
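
/*
 * [Editor's note]  aio_waitcomplete(2) is the FreeBSD-specific "reap any
 * finished job" interface implemented by kern_aio_waitcomplete() above.
 * A minimal consumer loop; the submission side is not shown:
 */
#include <sys/types.h>
#include <aio.h>
#include <stdio.h>

void
example_reap_loop(int njobs)
{
	struct aiocb *done;
	ssize_t len;

	while (njobs-- > 0) {
		/* NULL timeout: sleep until some queued job finishes. */
		len = aio_waitcomplete(&done, NULL);
		if (len < 0) {
			perror("aio_waitcomplete");
			break;
		}
		printf("job %p done, %zd bytes\n", (void *)done, len);
	}
}
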
*/ if ((kn->kn_flags & EV_FLAG1) == 0) return (EPERM); kn->kn_ptr.p_aio = job; kn->kn_flags &= ~EV_FLAG1; knlist_add(&job->klist, kn, 0); return (0); } /* kqueue detach function */ static void filt_aiodetach(struct knote *kn) { struct knlist *knl; knl = &kn->kn_ptr.p_aio->klist; knl->kl_lock(knl->kl_lockarg); if (!knlist_empty(knl)) knlist_remove(knl, kn, 1); knl->kl_unlock(knl->kl_lockarg); } /* kqueue filter function */ /*ARGSUSED*/ static int filt_aio(struct knote *kn, long hint) { struct kaiocb *job = kn->kn_ptr.p_aio; kn->kn_data = job->uaiocb._aiocb_private.error; if (!(job->jobflags & KAIOCB_FINISHED)) return (0); kn->kn_flags |= EV_EOF; return (1); } /* kqueue attach function */ static int filt_lioattach(struct knote *kn) { struct aioliojob *lj; lj = (struct aioliojob *)(uintptr_t)kn->kn_sdata; /* * The aioliojob pointer must be validated before using it, so * registration is restricted to the kernel; the user cannot * set EV_FLAG1. */ if ((kn->kn_flags & EV_FLAG1) == 0) return (EPERM); kn->kn_ptr.p_lio = lj; kn->kn_flags &= ~EV_FLAG1; knlist_add(&lj->klist, kn, 0); return (0); } /* kqueue detach function */ static void filt_liodetach(struct knote *kn) { struct knlist *knl; knl = &kn->kn_ptr.p_lio->klist; knl->kl_lock(knl->kl_lockarg); if (!knlist_empty(knl)) knlist_remove(knl, kn, 1); knl->kl_unlock(knl->kl_lockarg); } /* kqueue filter function */ /*ARGSUSED*/ static int filt_lio(struct knote *kn, long hint) { struct aioliojob * lj = kn->kn_ptr.p_lio; return (lj->lioj_flags & LIOJ_KEVENT_POSTED); } #ifdef COMPAT_FREEBSD32 #include #include #include #include #include #include #include struct __aiocb_private32 { int32_t status; int32_t error; uint32_t kernelinfo; }; #ifdef COMPAT_FREEBSD6 typedef struct oaiocb32 { int aio_fildes; /* File descriptor */ uint64_t aio_offset __packed; /* File offset for I/O */ uint32_t aio_buf; /* I/O buffer in process space */ uint32_t aio_nbytes; /* Number of bytes for I/O */ struct osigevent32 aio_sigevent; /* Signal to deliver */ int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private32 _aiocb_private; } oaiocb32_t; #endif typedef struct aiocb32 { int32_t aio_fildes; /* File descriptor */ uint64_t aio_offset __packed; /* File offset for I/O */ uint32_t aio_buf; /* I/O buffer in process space */ uint32_t aio_nbytes; /* Number of bytes for I/O */ int __spare__[2]; uint32_t __spare2__; int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private32 _aiocb_private; struct sigevent32 aio_sigevent; /* Signal to deliver */ } aiocb32_t; #ifdef COMPAT_FREEBSD6 static int convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig) { /* * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are * supported by AIO with the old sigevent structure. 
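
/*
 * [Editor's note]  A single aiocb can likewise deliver completion as an
 * EVFILT_AIO knote (the filt_aio filter above).  Hedged sketch: userland
 * only fills in the sigevent; the kernel registers the knote itself and
 * passes the aiocb pointer back as the event ident:
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <aio.h>
#include <string.h>

ssize_t
example_aio_kevent(int kq, int fd, char *buf, size_t len)
{
	struct aiocb cb;
	struct kevent ev;

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = len;
	cb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
	cb.aio_sigevent.sigev_notify_kqueue = kq;
	if (aio_read(&cb) != 0)
		return (-1);
	if (kevent(kq, NULL, 0, &ev, 1, NULL) != 1)
		return (-1);
	return (aio_return((struct aiocb *)ev.ident));
}
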
*/ CP(*osig, *nsig, sigev_notify); switch (nsig->sigev_notify) { case SIGEV_NONE: break; case SIGEV_SIGNAL: nsig->sigev_signo = osig->__sigev_u.__sigev_signo; break; case SIGEV_KEVENT: nsig->sigev_notify_kqueue = osig->__sigev_u.__sigev_notify_kqueue; PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr); break; default: return (EINVAL); } return (0); } static int aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob) { struct oaiocb32 job32; int error; bzero(kjob, sizeof(struct aiocb)); error = copyin(ujob, &job32, sizeof(job32)); if (error) return (error); CP(job32, *kjob, aio_fildes); CP(job32, *kjob, aio_offset); PTRIN_CP(job32, *kjob, aio_buf); CP(job32, *kjob, aio_nbytes); CP(job32, *kjob, aio_lio_opcode); CP(job32, *kjob, aio_reqprio); CP(job32, *kjob, _aiocb_private.status); CP(job32, *kjob, _aiocb_private.error); PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo); return (convert_old_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent)); } #endif static int aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob) { struct aiocb32 job32; int error; error = copyin(ujob, &job32, sizeof(job32)); if (error) return (error); CP(job32, *kjob, aio_fildes); CP(job32, *kjob, aio_offset); PTRIN_CP(job32, *kjob, aio_buf); CP(job32, *kjob, aio_nbytes); CP(job32, *kjob, aio_lio_opcode); CP(job32, *kjob, aio_reqprio); CP(job32, *kjob, _aiocb_private.status); CP(job32, *kjob, _aiocb_private.error); PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo); return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent)); } static long aiocb32_fetch_status(struct aiocb *ujob) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (fuword32(&ujob32->_aiocb_private.status)); } static long aiocb32_fetch_error(struct aiocb *ujob) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (fuword32(&ujob32->_aiocb_private.error)); } static int aiocb32_store_status(struct aiocb *ujob, long status) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (suword32(&ujob32->_aiocb_private.status, status)); } static int aiocb32_store_error(struct aiocb *ujob, long error) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (suword32(&ujob32->_aiocb_private.error, error)); } static int aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref)); } static int aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob) { return (suword32(ujobp, (long)ujob)); } static struct aiocb_ops aiocb32_ops = { .copyin = aiocb32_copyin, .fetch_status = aiocb32_fetch_status, .fetch_error = aiocb32_fetch_error, .store_status = aiocb32_store_status, .store_error = aiocb32_store_error, .store_kernelinfo = aiocb32_store_kernelinfo, .store_aiocb = aiocb32_store_aiocb, }; #ifdef COMPAT_FREEBSD6 static struct aiocb_ops aiocb32_ops_osigevent = { .copyin = aiocb32_copyin_old_sigevent, .fetch_status = aiocb32_fetch_status, .fetch_error = aiocb32_fetch_error, .store_status = aiocb32_store_status, .store_error = aiocb32_store_error, .store_kernelinfo = aiocb32_store_kernelinfo, .store_aiocb = aiocb32_store_aiocb, }; #endif int freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap) { return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); } int freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap) { struct timespec32 ts32; struct timespec ts, *tsp; struct aiocb **ujoblist; uint32_t *ujoblist32; int error, i; if 
(uap->nent < 0 || uap->nent > max_aio_queue_per_proc) return (EINVAL); if (uap->timeout) { /* Get timespec struct. */ if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0) return (error); CP(ts32, ts, tv_sec); CP(ts32, ts, tv_nsec); tsp = &ts; } else tsp = NULL; ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK); ujoblist32 = (uint32_t *)ujoblist; error = copyin(uap->aiocbp, ujoblist32, uap->nent * sizeof(ujoblist32[0])); if (error == 0) { for (i = uap->nent - 1; i >= 0; i--) ujoblist[i] = PTRIN(ujoblist32[i]); error = kern_aio_suspend(td, uap->nent, ujoblist, tsp); } free(ujoblist, M_AIOS); return (error); } int freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap) { return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); } #ifdef COMPAT_FREEBSD6 int freebsd6_freebsd32_aio_read(struct thread *td, struct freebsd6_freebsd32_aio_read_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, &aiocb32_ops_osigevent)); } #endif int freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, &aiocb32_ops)); } #ifdef COMPAT_FREEBSD6 int freebsd6_freebsd32_aio_write(struct thread *td, struct freebsd6_freebsd32_aio_write_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, &aiocb32_ops_osigevent)); } #endif int freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, &aiocb32_ops)); } int freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK, &aiocb32_ops)); } int freebsd32_aio_waitcomplete(struct thread *td, struct freebsd32_aio_waitcomplete_args *uap) { struct timespec32 ts32; struct timespec ts, *tsp; int error; if (uap->timeout) { /* Get timespec struct. 
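
/*
 * [Editor's note]  The aiocb32 thunks above all follow one pattern: copy in
 * the 32-bit layout, then widen field by field (PTRIN() turns a 32-bit user
 * pointer into a native one).  A self-contained illustration of the idiom
 * with hypothetical structures:
 */
#include <stdint.h>
#include <string.h>

struct cb32 {				/* compat (wire) layout */
	int32_t		fd;
	uint64_t	offset;
	uint32_t	buf;		/* 32-bit user pointer */
	uint32_t	nbytes;
};

struct cb {				/* native layout */
	int		fd;
	uint64_t	offset;
	void		*buf;
	size_t		nbytes;
};

static void
cb_widen(const struct cb32 *c32, struct cb *c)
{
	memset(c, 0, sizeof(*c));
	c->fd = c32->fd;			/* CP() */
	c->offset = c32->offset;		/* CP() */
	c->buf = (void *)(uintptr_t)c32->buf;	/* PTRIN() */
	c->nbytes = c32->nbytes;		/* CP() */
}
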
*/ error = copyin(uap->timeout, &ts32, sizeof(ts32)); if (error) return (error); CP(ts32, ts, tv_sec); CP(ts32, ts, tv_nsec); tsp = &ts; } else tsp = NULL; return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp, &aiocb32_ops)); } int freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap) { return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); } #ifdef COMPAT_FREEBSD6 int freebsd6_freebsd32_lio_listio(struct thread *td, struct freebsd6_freebsd32_lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; struct osigevent32 osig; uint32_t *acb_list32; int error, i, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &osig, sizeof(osig)); if (error) return (error); error = convert_old_sigevent32(&osig, &sig); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); if (error) { free(acb_list32, M_LIO); return (error); } acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); for (i = 0; i < nent; i++) acb_list[i] = PTRIN(acb_list32[i]); free(acb_list32, M_LIO); error = kern_lio_listio(td, uap->mode, (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, &aiocb32_ops_osigevent); free(acb_list, M_LIO); return (error); } #endif int freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; struct sigevent32 sig32; uint32_t *acb_list32; int error, i, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &sig32, sizeof(sig32)); if (error) return (error); error = convert_sigevent32(&sig32, &sig); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); if (error) { free(acb_list32, M_LIO); return (error); } acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); for (i = 0; i < nent; i++) acb_list[i] = PTRIN(acb_list32[i]); free(acb_list32, M_LIO); error = kern_lio_listio(td, uap->mode, (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, &aiocb32_ops); free(acb_list, M_LIO); return (error); } #endif Index: head/sys/kern/vfs_bio.c =================================================================== --- head/sys/kern/vfs_bio.c (revision 326270) +++ head/sys/kern/vfs_bio.c (revision 326271) @@ -1,5053 +1,5055 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2004 Poul-Henning Kamp * Copyright (c) 1994,1997 John S. Dyson * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. * * see man buf(9) for more info. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_compat.h" #include "opt_swap.h" static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ struct buf_ops buf_ops_bio = { .bop_name = "buf_ops_bio", .bop_write = bufwrite, .bop_strategy = bufstrategy, .bop_sync = bufsync, .bop_bdflush = bufbdflush, }; static struct buf *buf; /* buffer header pool */ extern struct buf *swbuf; /* Swap buffer header pool. 
*/ caddr_t unmapped_buf; /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */ struct proc *bufdaemonproc; struct proc *bufspacedaemonproc; static int inmem(struct vnode *vp, daddr_t blkno); static void vm_hold_free_pages(struct buf *bp, int newbsize); static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to); static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_clean_pages_dirty_buf(struct buf *bp); static void vfs_setdirty_locked_object(struct buf *bp); static void vfs_vmio_invalidate(struct buf *bp); static void vfs_vmio_truncate(struct buf *bp, int npages); static void vfs_vmio_extend(struct buf *bp, int npages, int size); static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno); static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *, int, void (*)(struct buf *)); static int buf_flush(struct vnode *vp, int); static int buf_recycle(bool); static int buf_scan(bool); static int flushbufqueues(struct vnode *, int, int); static void buf_daemon(void); static void bremfreel(struct buf *bp); static __inline void bd_wakeup(void); static int sysctl_runningspace(SYSCTL_HANDLER_ARGS); static void bufkva_reclaim(vmem_t *, int); static void bufkva_free(struct buf *); static int buf_import(void *, void **, int, int); static void buf_release(void *, void **, int); static void maxbcachebuf_adjust(void); #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS); #endif int vmiodirenable = TRUE; SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, "Use the VM system for directory writes"); long runningbufspace; SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, "Amount of presently outstanding async buffer io"); static long bufspace; #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD, &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers"); #else SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, "Physical memory used for buffers"); #endif static long bufkvaspace; SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0, "Kernel virtual memory used for buffers"); static long maxbufspace; SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0, "Maximum allowed value of bufspace (including metadata)"); static long bufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, "Amount of malloced memory for buffers"); static long maxbufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, "Maximum amount of malloced memory for buffers"); static long lobufspace; SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RW, &lobufspace, 0, "Minimum amount of buffers we want to have"); long hibufspace; SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &hibufspace, 0, "Maximum allowed value of bufspace (excluding metadata)"); long bufspacethresh; SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh, 0, "Bufspace consumed before waking the daemon to free some"); static int buffreekvacnt; SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0, "Number of times we 
have freed the KVA space from some buffer"); static int bufdefragcnt; SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0, "Number of times we have had to repeat buffer allocation to defragment"); static long lorunningspace; SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L", "Minimum preferred space used for in-progress I/O"); static long hirunningspace; SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L", "Maximum amount of space to use for in-progress I/O"); int dirtybufferflushes; SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes, 0, "Number of bdwrite to bawrite conversions to limit dirty buffers"); int bdwriteskip; SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip, 0, "Number of buffers supplied to bdwrite with snapshot deadlock risk"); int altbufferflushes; SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes, 0, "Number of fsync flushes to limit dirty buffers"); static int recursiveflushes; SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes, 0, "Number of flushes skipped due to being recursive"); static int numdirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0, "Number of buffers that are dirty (has unwritten changes) at the moment"); static int lodirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0, "How many buffers we want to have free before bufdaemon can sleep"); static int hidirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0, "When the number of dirty buffers is considered severe"); int dirtybufthresh; SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh, 0, "Number of bdwrite to bawrite conversions to clear dirty buffers"); static int numfreebuffers; SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, "Number of free buffers"); static int lofreebuffers; SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0, "Target number of free buffers"); static int hifreebuffers; SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, "Threshold for clean buffer recycling"); static int getnewbufcalls; SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, "Number of calls to getnewbuf"); static int getnewbufrestarts; SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, "Number of times getnewbuf has had to restart a buffer acquisition"); static int mappingrestarts; SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0, "Number of times getblk has had to restart a buffer mapping for " "unmapped buffer"); static int numbufallocfails; SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0, "Number of times buffer allocations failed"); static int flushbufqtarget = 100; SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, "Amount of work to do in flushbufqueues when helping bufdaemon"); static long notbufdflushes; SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, ¬bufdflushes, 0, "Number of dirty buffer flushes done by the bufdaemon helpers"); static long barrierwrites; SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0, "Number of barrier writes"); SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, 
CTLFLAG_RD, &unmapped_buf_allowed, 0, "Permit the use of the unmapped i/o"); int maxbcachebuf = MAXBCACHEBUF; SYSCTL_INT(_vfs, OID_AUTO, maxbcachebuf, CTLFLAG_RDTUN, &maxbcachebuf, 0, "Maximum size of a buffer cache block"); /* * This lock synchronizes access to bd_request. */ static struct mtx_padalign __exclusive_cache_line bdlock; /* * This lock protects the runningbufreq and synchronizes runningbufwakeup and * waitrunningbufspace(). */ static struct mtx_padalign __exclusive_cache_line rbreqlock; /* * Lock that protects needsbuffer and the sleeps/wakeups surrounding it. */ static struct rwlock_padalign __exclusive_cache_line nblock; /* * Lock that protects bdirtywait. */ static struct mtx_padalign __exclusive_cache_line bdirtylock; /* * Wakeup point for bufdaemon, as well as indicator of whether it is already * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it * is idling. */ static int bd_request; /* * Request/wakeup point for the bufspace daemon. */ static int bufspace_request; /* * Request for the buf daemon to write more buffers than is indicated by * lodirtybuf. This may be necessary to push out excess dependencies or * defragment the address space where a simple count of the number of dirty * buffers is insufficient to characterize the demand for flushing them. */ static int bd_speedupreq; /* * Synchronization (sleep/wakeup) variable for active buffer space requests. * Set when wait starts, cleared prior to wakeup(). * Used in runningbufwakeup() and waitrunningbufspace(). */ static int runningbufreq; /* * Synchronization (sleep/wakeup) variable for buffer requests. * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done * by and/or. * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(), * getnewbuf(), and getblk(). */ static volatile int needsbuffer; /* * Synchronization for bwillwrite() waiters. */ static int bdirtywait; /* * Definitions for the buffer free lists. */ #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_EMPTY 1 /* empty buffer headers */ #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ #define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */ #define QUEUE_SENTINEL 1024 /* not an queue index, but mark for sentinel */ /* Maximum number of clean buffer queues. */ #define CLEAN_QUEUES 16 /* Configured number of clean queues. */ static int clean_queues; /* Maximum number of buffer queues. */ #define BUFFER_QUEUES (QUEUE_CLEAN + CLEAN_QUEUES) /* Queues for free buffers with various properties */ static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; #ifdef INVARIANTS static int bq_len[BUFFER_QUEUES]; #endif /* * Lock for each bufqueue */ static struct mtx_padalign __exclusive_cache_line bqlocks[BUFFER_QUEUES]; /* * per-cpu empty buffer cache. */ uma_zone_t buf_zone; /* * Single global constant for BUF_WMESG, to avoid getting multiple references. * buf_wmesg is referred from macros. 
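
/*
 * [Editor's note]  Every knob declared above is visible through sysctl(3).
 * A small userland observer for two of the counters; the names are taken
 * straight from the SYSCTL_LONG() declarations:
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
example_bufstats(void)
{
	long bufspace, runningbufspace;
	size_t len;

	len = sizeof(bufspace);
	if (sysctlbyname("vfs.bufspace", &bufspace, &len, NULL, 0) != 0)
		return (-1);
	len = sizeof(runningbufspace);
	if (sysctlbyname("vfs.runningbufspace", &runningbufspace, &len,
	    NULL, 0) != 0)
		return (-1);
	printf("bufspace %ld, runningbufspace %ld\n", bufspace,
	    runningbufspace);
	return (0);
}
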
*/ const char *buf_wmesg = BUF_WMESG; static int sysctl_runningspace(SYSCTL_HANDLER_ARGS) { long value; int error; value = *(long *)arg1; error = sysctl_handle_long(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); mtx_lock(&rbreqlock); if (arg1 == &hirunningspace) { if (value < lorunningspace) error = EINVAL; else hirunningspace = value; } else { KASSERT(arg1 == &lorunningspace, ("%s: unknown arg1", __func__)); if (value > hirunningspace) error = EINVAL; else lorunningspace = value; } mtx_unlock(&rbreqlock); return (error); } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS) { long lvalue; int ivalue; if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long)) return (sysctl_handle_long(oidp, arg1, arg2, req)); lvalue = *(long *)arg1; if (lvalue > INT_MAX) /* On overflow, still write out a long to trigger ENOMEM. */ return (sysctl_handle_long(oidp, &lvalue, 0, req)); ivalue = lvalue; return (sysctl_handle_int(oidp, &ivalue, 0, req)); } #endif static int bqcleanq(void) { static int nextq; return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN); } static int bqisclean(int qindex) { return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES); } /* * bqlock: * * Return the appropriate queue lock based on the index. */ static inline struct mtx * bqlock(int qindex) { return (struct mtx *)&bqlocks[qindex]; } /* * bdirtywakeup: * * Wakeup any bwillwrite() waiters. */ static void bdirtywakeup(void) { mtx_lock(&bdirtylock); if (bdirtywait) { bdirtywait = 0; wakeup(&bdirtywait); } mtx_unlock(&bdirtylock); } /* * bdirtysub: * * Decrement the numdirtybuffers count by one and wakeup any * threads blocked in bwillwrite(). */ static void bdirtysub(void) { if (atomic_fetchadd_int(&numdirtybuffers, -1) == (lodirtybuffers + hidirtybuffers) / 2) bdirtywakeup(); } /* * bdirtyadd: * * Increment the numdirtybuffers count by one and wakeup the buf * daemon if needed. */ static void bdirtyadd(void) { /* * Only do the wakeup once as we cross the boundary. The * buf daemon will keep running until the condition clears. */ if (atomic_fetchadd_int(&numdirtybuffers, 1) == (lodirtybuffers + hidirtybuffers) / 2) bd_wakeup(); } /* * bufspace_wakeup: * * Called when buffer space is potentially available for recovery. * getnewbuf() will block on this flag when it is unable to free * sufficient buffer space. Buffer space becomes recoverable when * bp's get placed back in the queues. */ static void bufspace_wakeup(void) { /* * If someone is waiting for bufspace, wake them up. * * Since needsbuffer is set prior to doing an additional queue * scan it is safe to check for the flag prior to acquiring the * lock. The thread that is preparing to scan again before * blocking would discover the buf we released. */ if (needsbuffer) { rw_rlock(&nblock); if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1) wakeup(__DEVOLATILE(void *, &needsbuffer)); rw_runlock(&nblock); } } /* * bufspace_daemonwakeup: * * Wakeup the daemon responsible for freeing clean bufs. */ static void bufspace_daemonwakeup(void) { rw_rlock(&nblock); if (bufspace_request == 0) { bufspace_request = 1; wakeup(&bufspace_request); } rw_runlock(&nblock); } /* * bufspace_adjust: * * Adjust the reported bufspace for a KVA managed buffer, possibly * waking any waiters. 
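
/*
 * [Editor's note]  bdirtyadd()/bdirtysub() above wake sleepers only when
 * the counter crosses (lodirtybuffers + hidirtybuffers) / 2, not on every
 * change.  The same idiom in portable C11; the thresholds and the wakeup
 * are stand-ins:
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int dirty_count;
static int lo_thresh = 10, hi_thresh = 30;

static void
wake_waiters(void)
{
	printf("wakeup at midpoint %d\n", (lo_thresh + hi_thresh) / 2);
}

static void
dirty_add(void)
{
	/*
	 * fetch-add returns the OLD value, so the comparison is true for
	 * exactly one caller per crossing of the midpoint.
	 */
	if (atomic_fetch_add(&dirty_count, 1) == (lo_thresh + hi_thresh) / 2)
		wake_waiters();
}

static void
dirty_sub(void)
{
	if (atomic_fetch_add(&dirty_count, -1) == (lo_thresh + hi_thresh) / 2)
		wake_waiters();
}
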
*/ static void bufspace_adjust(struct buf *bp, int bufsize) { long space; int diff; KASSERT((bp->b_flags & B_MALLOC) == 0, ("bufspace_adjust: malloc buf %p", bp)); diff = bufsize - bp->b_bufsize; if (diff < 0) { atomic_subtract_long(&bufspace, -diff); bufspace_wakeup(); } else { space = atomic_fetchadd_long(&bufspace, diff); /* Wake up the daemon on the transition. */ if (space < bufspacethresh && space + diff >= bufspacethresh) bufspace_daemonwakeup(); } bp->b_bufsize = bufsize; } /* * bufspace_reserve: * * Reserve bufspace before calling allocbuf(). metadata has a * different space limit than data. */ static int bufspace_reserve(int size, bool metadata) { long limit; long space; if (metadata) limit = maxbufspace; else limit = hibufspace; do { space = bufspace; if (space + size > limit) return (ENOSPC); } while (atomic_cmpset_long(&bufspace, space, space + size) == 0); /* Wake up the daemon on the transition. */ if (space < bufspacethresh && space + size >= bufspacethresh) bufspace_daemonwakeup(); return (0); } /* * bufspace_release: * * Release reserved bufspace after bufspace_adjust() has consumed it. */ static void bufspace_release(int size) { atomic_subtract_long(&bufspace, size); bufspace_wakeup(); } /* * bufspace_wait: * * Wait for bufspace, acting as the buf daemon if a locked vnode is * supplied. needsbuffer must be set in a safe fashion prior to * polling for space. The operation must be re-tried on return. */ static void bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo) { struct thread *td; int error, fl, norunbuf; if ((gbflags & GB_NOWAIT_BD) != 0) return; td = curthread; rw_wlock(&nblock); while (needsbuffer != 0) { if (vp != NULL && vp->v_type != VCHR && (td->td_pflags & TDP_BUFNEED) == 0) { rw_wunlock(&nblock); /* * getblk() is called with a vnode locked, and * some majority of the dirty buffers may as * well belong to the vnode. Flushing the * buffers there would make a progress that * cannot be achieved by the buf_daemon, that * cannot lock the vnode. */ norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | (td->td_pflags & TDP_NORUNNINGBUF); /* * Play bufdaemon. The getnewbuf() function * may be called while the thread owns lock * for another dirty buffer for the same * vnode, which makes it impossible to use * VOP_FSYNC() there, due to the buffer lock * recursion. */ td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; fl = buf_flush(vp, flushbufqtarget); td->td_pflags &= norunbuf; rw_wlock(&nblock); if (fl != 0) continue; if (needsbuffer == 0) break; } error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock, (PRIBIO + 4) | slpflag, "newbuf", slptimeo); if (error != 0) break; } rw_wunlock(&nblock); } /* * bufspace_daemon: * * buffer space management daemon. Tries to maintain some marginal * amount of free buffer space so that requesting processes neither * block nor work to reclaim buffers. */ static void bufspace_daemon(void) { for (;;) { kproc_suspend_check(bufspacedaemonproc); /* * Free buffers from the clean queue until we meet our * targets. * * Theory of operation: The buffer cache is most efficient * when some free buffer headers and space are always * available to getnewbuf(). This daemon attempts to prevent * the excessive blocking and synchronization associated * with shortfall. It goes through three phases according * demand: * * 1) The daemon wakes up voluntarily once per-second * during idle periods when the counters are below * the wakeup thresholds (bufspacethresh, lofreebuffers). 
* * 2) The daemon wakes up as we cross the thresholds * ahead of any potential blocking. This may bounce * slightly according to the rate of consumption and * release. * * 3) The daemon and consumers are starved for working * clean buffers. This is the 'bufspace' sleep below * which will inefficiently trade bufs with bqrelse * until we return to condition 2. */ while (bufspace > lobufspace || numfreebuffers < hifreebuffers) { if (buf_recycle(false) != 0) { atomic_set_int(&needsbuffer, 1); if (buf_recycle(false) != 0) { rw_wlock(&nblock); if (needsbuffer) rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock, PRIBIO|PDROP, "bufspace", hz/10); else rw_wunlock(&nblock); } } maybe_yield(); } /* * Re-check our limits under the exclusive nblock. */ rw_wlock(&nblock); if (bufspace < bufspacethresh && numfreebuffers > lofreebuffers) { bufspace_request = 0; rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP, "-", hz); } else rw_wunlock(&nblock); } } static struct kproc_desc bufspace_kp = { "bufspacedaemon", bufspace_daemon, &bufspacedaemonproc }; SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &bufspace_kp); /* * bufmallocadjust: * * Adjust the reported bufspace for a malloc managed buffer, possibly * waking any waiters. */ static void bufmallocadjust(struct buf *bp, int bufsize) { int diff; KASSERT((bp->b_flags & B_MALLOC) != 0, ("bufmallocadjust: non-malloc buf %p", bp)); diff = bufsize - bp->b_bufsize; if (diff < 0) atomic_subtract_long(&bufmallocspace, -diff); else atomic_add_long(&bufmallocspace, diff); bp->b_bufsize = bufsize; } /* * runningwakeup: * * Wake up processes that are waiting on asynchronous writes to fall * below lorunningspace. */ static void runningwakeup(void) { mtx_lock(&rbreqlock); if (runningbufreq) { runningbufreq = 0; wakeup(&runningbufreq); } mtx_unlock(&rbreqlock); } /* * runningbufwakeup: * * Decrement the outstanding write count according. */ void runningbufwakeup(struct buf *bp) { long space, bspace; bspace = bp->b_runningbufspace; if (bspace == 0) return; space = atomic_fetchadd_long(&runningbufspace, -bspace); KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld", space, bspace)); bp->b_runningbufspace = 0; /* * Only acquire the lock and wakeup on the transition from exceeding * the threshold to falling below it. */ if (space < lorunningspace) return; if (space - bspace > lorunningspace) return; runningwakeup(); } /* * waitrunningbufspace() * * runningbufspace is a measure of the amount of I/O currently * running. This routine is used in async-write situations to * prevent creating huge backups of pending writes to a device. * Only asynchronous writes are governed by this function. * * This does NOT turn an async write into a sync write. It waits * for earlier writes to complete and generally returns before the * caller's write has reached the device. */ void waitrunningbufspace(void) { mtx_lock(&rbreqlock); while (runningbufspace > hirunningspace) { runningbufreq = 1; msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0); } mtx_unlock(&rbreqlock); } /* * vfs_buf_test_cache: * * Called when a buffer is extended. This function clears the B_CACHE * bit if the newly extended portion of the buffer does not contain * valid data. 
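
/*
 * [Editor's note]  bufspace_reserve() above claims space with a lock-free
 * compare-and-set loop.  The same pattern in portable C11; the limit is a
 * stand-in value:
 */
#include <stdatomic.h>
#include <errno.h>

static atomic_long space_used;
static long space_limit = 1L << 20;

static int
example_space_reserve(long size)
{
	long cur;

	do {
		cur = atomic_load(&space_used);
		if (cur + size > space_limit)
			return (ENOSPC);	/* would exceed the limit */
	} while (!atomic_compare_exchange_weak(&space_used, &cur,
	    cur + size));
	return (0);
}
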
 */
static __inline void
vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off,
    vm_offset_t size, vm_page_t m)
{

	VM_OBJECT_ASSERT_LOCKED(m->object);
	if (bp->b_flags & B_CACHE) {
		int base = (foff + off) & PAGE_MASK;
		if (vm_page_is_valid(m, base, size) == 0)
			bp->b_flags &= ~B_CACHE;
	}
}

/* Wake up the buffer daemon if necessary */
static __inline void
bd_wakeup(void)
{

	mtx_lock(&bdlock);
	if (bd_request == 0) {
		bd_request = 1;
		wakeup(&bd_request);
	}
	mtx_unlock(&bdlock);
}

/*
 * Adjust the maxbcachebuf tunable.
 */
static void
maxbcachebuf_adjust(void)
{
	int i;

	/*
	 * maxbcachebuf must be a power of 2 >= MAXBSIZE.
	 */
	i = 2;
	while (i * 2 <= maxbcachebuf)
		i *= 2;
	maxbcachebuf = i;
	if (maxbcachebuf < MAXBSIZE)
		maxbcachebuf = MAXBSIZE;
	if (maxbcachebuf > MAXPHYS)
		maxbcachebuf = MAXPHYS;
	if (bootverbose != 0 && maxbcachebuf != MAXBCACHEBUF)
		printf("maxbcachebuf=%d\n", maxbcachebuf);
}

/*
 * bd_speedup - speedup the buffer cache flushing code
 */
void
bd_speedup(void)
{
	int needwake;

	mtx_lock(&bdlock);
	needwake = 0;
	if (bd_speedupreq == 0 || bd_request == 0)
		needwake = 1;
	bd_speedupreq = 1;
	bd_request = 1;
	if (needwake)
		wakeup(&bd_request);
	mtx_unlock(&bdlock);
}

#ifndef NSWBUF_MIN
#define	NSWBUF_MIN	16
#endif

#ifdef __i386__
#define	TRANSIENT_DENOM	5
#else
#define	TRANSIENT_DENOM	10
#endif

/*
 * Calculate buffer cache scaling values and reserve space for buffer
 * headers.  This is called during low-level kernel initialization and
 * may be called more than once.  We CANNOT write to the memory area
 * being reserved at this time.
 */
caddr_t
kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
{
	int tuned_nbuf;
	long maxbuf, maxbuf_sz, buf_sz, biotmap_sz;

	/*
	 * physmem_est is in pages.  Convert it to kilobytes (assumes
	 * PAGE_SIZE is >= 1K)
	 */
	physmem_est = physmem_est * (PAGE_SIZE / 1024);

	maxbcachebuf_adjust();

	/*
	 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
	 * For the first 64MB of ram nominally allocate sufficient buffers to
	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
	 * buffers to cover 1/10 of our ram over 64MB.  When auto-sizing
	 * the buffer cache we limit the eventual kva reservation to
	 * maxbcache bytes.
	 *
	 * factor represents the 1/4 x ram conversion.
	 */
	if (nbuf == 0) {
		int factor = 4 * BKVASIZE / 1024;

		nbuf = 50;
		if (physmem_est > 4096)
			nbuf += min((physmem_est - 4096) / factor,
			    65536 / factor);
		if (physmem_est > 65536)
			nbuf += min((physmem_est - 65536) * 2 / (factor * 5),
			    32 * 1024 * 1024 / (factor * 5));

		if (maxbcache && nbuf > maxbcache / BKVASIZE)
			nbuf = maxbcache / BKVASIZE;
		tuned_nbuf = 1;
	} else
		tuned_nbuf = 0;

	/* XXX Avoid unsigned long overflows later on with maxbufspace. */
	maxbuf = (LONG_MAX / 3) / BKVASIZE;
	if (nbuf > maxbuf) {
		if (!tuned_nbuf)
			printf("Warning: nbufs lowered from %d to %ld\n", nbuf,
			    maxbuf);
		nbuf = maxbuf;
	}

	/*
	 * Ideal allocation size for the transient bio submap is 10%
	 * of the maximal space buffer map.  This roughly corresponds
	 * to the amount of the buffer mapped for typical UFS load.
	 *
	 * Clip the buffer map to reserve space for the transient
	 * BIOs, if its extent is bigger than 90% (80% on i386) of the
	 * maximum buffer map extent on the platform.
	 *
	 * Falling back to maxbuf when maxbcache is unset allows us to
	 * leave the buffer KVA untrimmed on architectures with ample
	 * KVA space.
	 */
	if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) {
		maxbuf_sz = maxbcache != 0 ?
maxbcache : maxbuf * BKVASIZE; buf_sz = (long)nbuf * BKVASIZE; if (buf_sz < maxbuf_sz / TRANSIENT_DENOM * (TRANSIENT_DENOM - 1)) { /* * There is more KVA than memory. Do not * adjust buffer map size, and assign the rest * of maxbuf to transient map. */ biotmap_sz = maxbuf_sz - buf_sz; } else { /* * Buffer map spans all KVA we could afford on * this platform. Give 10% (20% on i386) of * the buffer map to the transient bio map. */ biotmap_sz = buf_sz / TRANSIENT_DENOM; buf_sz -= biotmap_sz; } if (biotmap_sz / INT_MAX > MAXPHYS) bio_transient_maxcnt = INT_MAX; else bio_transient_maxcnt = biotmap_sz / MAXPHYS; /* * Artificially limit to 1024 simultaneous in-flight I/Os * using the transient mapping. */ if (bio_transient_maxcnt > 1024) bio_transient_maxcnt = 1024; if (tuned_nbuf) nbuf = buf_sz / BKVASIZE; } /* * swbufs are used as temporary holders for I/O, such as paging I/O. * We have no less then 16 and no more then 256. */ nswbuf = min(nbuf / 4, 256); TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf); if (nswbuf < NSWBUF_MIN) nswbuf = NSWBUF_MIN; /* * Reserve space for the buffer cache buffers */ swbuf = (void *)v; v = (caddr_t)(swbuf + nswbuf); buf = (void *)v; v = (caddr_t)(buf + nbuf); return(v); } /* Initialize the buffer subsystem. Called before use of any buffers. */ void bufinit(void) { struct buf *bp; int i; KASSERT(maxbcachebuf >= MAXBSIZE, ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf, MAXBSIZE)); mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF); mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF); for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++) mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF); mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); rw_init(&nblock, "needsbuffer lock"); mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF); /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) TAILQ_INIT(&bufqueues[i]); unmapped_buf = (caddr_t)kva_alloc(MAXPHYS); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); bp->b_flags = B_INVAL; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_EMPTY; bp->b_xflags = 0; bp->b_data = bp->b_kvabase = unmapped_buf; LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); #ifdef INVARIANTS bq_len[QUEUE_EMPTY]++; #endif } /* * maxbufspace is the absolute maximum amount of buffer space we are * allowed to reserve in KVM and in real terms. The absolute maximum * is nominally used by metadata. hibufspace is the nominal maximum * used by most other requests. The differential is required to * ensure that metadata deadlocks don't occur. * * maxbufspace is based on BKVASIZE. Allocating buffers larger then * this may result in KVM fragmentation which is not handled optimally * by the system. XXX This is less true with vmem. We could use * PAGE_SIZE. */ maxbufspace = (long)nbuf * BKVASIZE; hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - maxbcachebuf * 10); lobufspace = (hibufspace / 20) * 19; /* 95% */ bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2; /* * Note: The 16 MiB upper limit for hirunningspace was chosen * arbitrarily and may need further tuning. It corresponds to * 128 outstanding write IO requests (if IO size is 128 KiB), * which fits with many RAID controllers' tagged queuing limits. 
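
/*
 * [Editor's note]  The nbuf auto-tuning in kern_vfs_bio_buffer_alloc()
 * above is plain integer arithmetic.  This stand-alone copy reproduces it
 * so the scaling is easy to check; BKVASIZE is assumed to be 16 KiB here:
 */
#include <stdio.h>

#define	BKVASIZE_ASSUMED	16384

static long
lmin_(long a, long b)
{
	return (a < b ? a : b);
}

static long
tune_nbuf(long physmem_kb)
{
	long factor = 4 * BKVASIZE_ASSUMED / 1024;
	long nbuf = 50;

	if (physmem_kb > 4096)		/* 1/4 of the first 64 MB of RAM */
		nbuf += lmin_((physmem_kb - 4096) / factor, 65536 / factor);
	if (physmem_kb > 65536)		/* 1/10 of RAM beyond 64 MB */
		nbuf += lmin_((physmem_kb - 65536) * 2 / (factor * 5),
		    32 * 1024 * 1024 / (factor * 5));
	return (nbuf);
}

int
main(void)
{
	printf("1 GiB of RAM -> nbuf = %ld\n", tune_nbuf(1024 * 1024));
	return (0);
}
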
* The lower 1 MiB limit is the historical upper limit for * hirunningspace. */ hirunningspace = lmax(lmin(roundup(hibufspace / 64, maxbcachebuf), 16 * 1024 * 1024), 1024 * 1024); lorunningspace = roundup((hirunningspace * 2) / 3, maxbcachebuf); /* * Limit the amount of malloc memory since it is wired permanently into * the kernel space. Even though this is accounted for in the buffer * allocation, we don't want the malloced region to grow uncontrolled. * The malloc scheme improves memory utilization significantly on * average (small) directories. */ maxbufmallocspace = hibufspace / 20; /* * Reduce the chance of a deadlock occurring by limiting the number * of delayed-write dirty buffers we allow to stack up. */ hidirtybuffers = nbuf / 4 + 20; dirtybufthresh = hidirtybuffers * 9 / 10; numdirtybuffers = 0; /* * To support extreme low-memory systems, make sure hidirtybuffers * cannot eat up all available buffer space. This occurs when our * minimum cannot be met. We try to size hidirtybuffers to 3/4 our * buffer space assuming BKVASIZE'd buffers. */ while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { hidirtybuffers >>= 1; } lodirtybuffers = hidirtybuffers / 2; /* * lofreebuffers should be sufficient to avoid stalling waiting on * buf headers under heavy utilization. The bufs in per-cpu caches * are counted as free but will be unavailable to threads executing * on other cpus. * * hifreebuffers is the free target for the bufspace daemon. This * should be set appropriately to limit work per-iteration. */ lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus); hifreebuffers = (3 * lofreebuffers) / 2; numfreebuffers = nbuf; /* Setup the kva and free list allocators. */ vmem_set_reclaim(buffer_arena, bufkva_reclaim); buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf), NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0); /* * Size the clean queue according to the amount of buffer space. * One queue per-256mb up to the max. More queues gives better * concurrency but less accurate LRU. */ clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES); } #ifdef INVARIANTS static inline void vfs_buf_check_mapped(struct buf *bp) { KASSERT(bp->b_kvabase != unmapped_buf, ("mapped buf: b_kvabase was not updated %p", bp)); KASSERT(bp->b_data != unmapped_buf, ("mapped buf: b_data was not updated %p", bp)); KASSERT(bp->b_data < unmapped_buf || bp->b_data >= unmapped_buf + MAXPHYS, ("b_data + b_offset unmapped %p", bp)); } static inline void vfs_buf_check_unmapped(struct buf *bp) { KASSERT(bp->b_data == unmapped_buf, ("unmapped buf: corrupted b_data %p", bp)); } #define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp) #define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp) #else #define BUF_CHECK_MAPPED(bp) do {} while (0) #define BUF_CHECK_UNMAPPED(bp) do {} while (0) #endif static int isbufbusy(struct buf *bp) { if (((bp->b_flags & B_INVAL) == 0 && BUF_ISLOCKED(bp)) || ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI)) return (1); return (0); } /* * Shutdown the system cleanly to prepare for reboot, halt, or power off. */ void bufshutdown(int show_busybufs) { static int first_buf_printf = 1; struct buf *bp; int iter, nbusy, pbusy; #ifndef PREEMPTION int subiter; #endif /* * Sync filesystems for shutdown */ wdog_kern_pat(WD_LASTVAL); sys_sync(curthread, NULL); /* * With soft updates, some buffers that are * written will be remarked as dirty until other * buffers are written. 
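
/*
 * [Editor's note]  The watermarks bufinit() derives above are likewise
 * simple arithmetic.  A stand-alone rendition for a sample nbuf; the
 * constants are assumptions matching the previous sketch:
 */
#include <stdio.h>

#define	BKVASIZE_ASSUMED	16384
#define	MAXBCACHEBUF_ASSUMED	65536

int
main(void)
{
	long nbuf = 7218;		/* sample value */
	long maxbufspace, hibufspace, lobufspace, bufspacethresh;
	long hidirtybuffers;

	maxbufspace = nbuf * BKVASIZE_ASSUMED;
	hibufspace = 3 * maxbufspace / 4;	/* lmax() of the two terms */
	if (maxbufspace - MAXBCACHEBUF_ASSUMED * 10L > hibufspace)
		hibufspace = maxbufspace - MAXBCACHEBUF_ASSUMED * 10L;
	lobufspace = (hibufspace / 20) * 19;	/* 95% */
	bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2;
	hidirtybuffers = nbuf / 4 + 20;
	/* Keep dirty buffers from consuming all buffer space. */
	while (hidirtybuffers * BKVASIZE_ASSUMED > 3 * hibufspace / 4)
		hidirtybuffers >>= 1;
	printf("hi %ld lo %ld thresh %ld hidirty %ld\n",
	    hibufspace, lobufspace, bufspacethresh, hidirtybuffers);
	return (0);
}
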
*/ for (iter = pbusy = 0; iter < 20; iter++) { nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) if (isbufbusy(bp)) nbusy++; if (nbusy == 0) { if (first_buf_printf) printf("All buffers synced."); break; } if (first_buf_printf) { printf("Syncing disks, buffers remaining... "); first_buf_printf = 0; } printf("%d ", nbusy); if (nbusy < pbusy) iter = 0; pbusy = nbusy; wdog_kern_pat(WD_LASTVAL); sys_sync(curthread, NULL); #ifdef PREEMPTION /* * Drop Giant and spin for a while to allow * interrupt threads to run. */ DROP_GIANT(); DELAY(50000 * iter); PICKUP_GIANT(); #else /* * Drop Giant and context switch several times to * allow interrupt threads to run. */ DROP_GIANT(); for (subiter = 0; subiter < 50 * iter; subiter++) { thread_lock(curthread); mi_switch(SW_VOL, NULL); thread_unlock(curthread); DELAY(1000); } PICKUP_GIANT(); #endif } printf("\n"); /* * Count only busy local buffers to prevent forcing * a fsck if we're just a client of a wedged NFS server */ nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) { if (isbufbusy(bp)) { #if 0 /* XXX: This is bogus. We should probably have a BO_REMOTE flag instead */ if (bp->b_dev == NULL) { TAILQ_REMOVE(&mountlist, bp->b_vp->v_mount, mnt_list); continue; } #endif nbusy++; if (show_busybufs > 0) { printf( "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:", nbusy, bp, bp->b_vp, bp->b_flags, (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno); BUF_LOCKPRINTINFO(bp); if (show_busybufs > 1) vn_printf(bp->b_vp, "vnode content: "); } } } if (nbusy) { /* * Failed to sync all blocks. Indicate this and don't * unmount filesystems (thus forcing an fsck on reboot). */ printf("Giving up on %d buffers\n", nbusy); DELAY(5000000); /* 5 seconds */ } else { if (!first_buf_printf) printf("Final sync complete\n"); /* * Unmount filesystems */ if (panicstr == NULL) vfs_unmountall(); } swapoff_all(); DELAY(100000); /* wait for console output to finish */ } static void bpmap_qenter(struct buf *bp) { BUF_CHECK_MAPPED(bp); /* * bp->b_data is relative to bp->b_offset, but * bp->b_offset may be offset into the first page. */ bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data); pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | (vm_offset_t)(bp->b_offset & PAGE_MASK)); } /* * binsfree: * * Insert the buffer into the appropriate free list. */ static void binsfree(struct buf *bp, int qindex) { struct mtx *olock, *nlock; if (qindex != QUEUE_EMPTY) { BUF_ASSERT_XLOCKED(bp); } /* * Stick to the same clean queue for the lifetime of the buf to * limit locking below. Otherwise pick one sequentially. */ if (qindex == QUEUE_CLEAN) { if (bqisclean(bp->b_qindex)) qindex = bp->b_qindex; else qindex = bqcleanq(); } /* * Handle delayed bremfree() processing. */ nlock = bqlock(qindex); if (bp->b_flags & B_REMFREE) { olock = bqlock(bp->b_qindex); mtx_lock(olock); bremfreel(bp); if (olock != nlock) { mtx_unlock(olock); mtx_lock(nlock); } } else mtx_lock(nlock); if (bp->b_qindex != QUEUE_NONE) panic("binsfree: free buffer onto another queue???"); bp->b_qindex = qindex; if (bp->b_flags & B_AGE) TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); else TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); #ifdef INVARIANTS bq_len[bp->b_qindex]++; #endif mtx_unlock(nlock); } /* * buf_free: * * Free a buffer to the buf zone once it no longer has valid contents.
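 * (The sketch below restates the two-lock handoff performed by
 * binsfree() above.)
 */

/*
 * Minimal sketch of binsfree()'s lock handoff for a delayed
 * bremfree() that targets a different queue: lock the old queue,
 * unlink, then swap to the new queue's lock before inserting.
 * queue_lock(), unlink_item() and insert_item() are hypothetical
 * helpers for illustration only, not kernel interfaces.
 */
#if 0	/* illustrative sketch, not compiled */
static void
requeue_item(struct item *it, int newq)
{
	struct mtx *olock, *nlock;

	nlock = queue_lock(newq);
	if (it->onqueue) {
		olock = queue_lock(it->qindex);
		mtx_lock(olock);
		unlink_item(it);
		if (olock != nlock) {
			mtx_unlock(olock);
			mtx_lock(nlock);
		}
	} else
		mtx_lock(nlock);
	insert_item(newq, it);
	mtx_unlock(nlock);
}
#endif

/*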
*/ static void buf_free(struct buf *bp) { if (bp->b_flags & B_REMFREE) bremfreef(bp); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 1"); if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); bufkva_free(bp); BUF_UNLOCK(bp); uma_zfree(buf_zone, bp); atomic_add_int(&numfreebuffers, 1); bufspace_wakeup(); } /* * buf_import: * * Import bufs into the uma cache from the buf list. The system still * expects a static array of bufs and much of the synchronization * around bufs assumes type stable storage. As a result, UMA is used * only as a per-cpu cache of bufs still maintained on a global list. */ static int buf_import(void *arg, void **store, int cnt, int flags) { struct buf *bp; int i; mtx_lock(&bqlocks[QUEUE_EMPTY]); for (i = 0; i < cnt; i++) { bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); if (bp == NULL) break; bremfreel(bp); store[i] = bp; } mtx_unlock(&bqlocks[QUEUE_EMPTY]); return (i); } /* * buf_release: * * Release bufs from the uma cache back to the buffer queues. */ static void buf_release(void *arg, void **store, int cnt) { int i; for (i = 0; i < cnt; i++) binsfree(store[i], QUEUE_EMPTY); } /* * buf_alloc: * * Allocate an empty buffer header. */ static struct buf * buf_alloc(void) { struct buf *bp; bp = uma_zalloc(buf_zone, M_NOWAIT); if (bp == NULL) { bufspace_daemonwakeup(); atomic_add_int(&numbufallocfails, 1); return (NULL); } /* * Wake-up the bufspace daemon on transition. */ if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers) bufspace_daemonwakeup(); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) panic("getnewbuf_empty: Locked buf %p on free queue.", bp); KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p.", bp, bp->b_vp)); KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0, ("invalid buffer %p flags %#x", bp, bp->b_flags)); KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags)); KASSERT(bp->b_npages == 0, ("bp: %p still has %d vm pages\n", bp, bp->b_npages)); KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp)); KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp)); bp->b_flags = 0; bp->b_ioflags = 0; bp->b_xflags = 0; bp->b_vflags = 0; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_offset = NOOFFSET; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_bufobj = NULL; bp->b_data = bp->b_kvabase = unmapped_buf; bp->b_fsprivate1 = NULL; bp->b_fsprivate2 = NULL; bp->b_fsprivate3 = NULL; LIST_INIT(&bp->b_dep); return (bp); } /* * buf_qrecycle: * * Free a buffer from the given bufqueue. kva controls whether the * freed buf must own some kva resources. This is used for * defragmenting. */ static int buf_qrecycle(int qindex, bool kva) { struct buf *bp, *nbp; if (kva) atomic_add_int(&bufdefragcnt, 1); nbp = NULL; mtx_lock(&bqlocks[qindex]); nbp = TAILQ_FIRST(&bufqueues[qindex]); /* * Run scan, possibly freeing data and/or kva mappings on the fly * depending. */ while ((bp = nbp) != NULL) { /* * Calculate next bp (we can only use it if we do not * release the bqlock). */ nbp = TAILQ_NEXT(bp, b_freelist); /* * If we are defragging then we need a buffer with * some kva to reclaim. */ if (kva && bp->b_kvasize == 0) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) continue; /* * Skip buffers with background writes in progress. 
*/ if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { BUF_UNLOCK(bp); continue; } KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistent queue %d bp %p", qindex, bp)); /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. */ bremfreel(bp); mtx_unlock(&bqlocks[qindex]); /* * Requeue the background write buffer with error and * restart the scan. */ if ((bp->b_vflags & BV_BKGRDERR) != 0) { bqrelse(bp); mtx_lock(&bqlocks[qindex]); nbp = TAILQ_FIRST(&bufqueues[qindex]); continue; } bp->b_flags |= B_INVAL; brelse(bp); return (0); } mtx_unlock(&bqlocks[qindex]); return (ENOBUFS); } /* * buf_recycle: * * Iterate through all clean queues until we find a buf to recycle or * exhaust the search. */ static int buf_recycle(bool kva) { int qindex, first_qindex; qindex = first_qindex = bqcleanq(); do { if (buf_qrecycle(qindex, kva) == 0) return (0); if (++qindex == QUEUE_CLEAN + clean_queues) qindex = QUEUE_CLEAN; } while (qindex != first_qindex); return (ENOBUFS); } /* * buf_scan: * * Scan the clean queues looking for a buffer to recycle. needsbuffer * is set on failure so that the caller may optionally bufspace_wait() * in a race-free fashion. */ static int buf_scan(bool defrag) { int error; /* * To avoid heavy synchronization and wakeup races we set * needsbuffer and re-poll before failing. This ensures that * no frees can be missed between an unsuccessful poll and * going to sleep in a synchronized fashion. */ if ((error = buf_recycle(defrag)) != 0) { atomic_set_int(&needsbuffer, 1); bufspace_daemonwakeup(); error = buf_recycle(defrag); } if (error == 0) atomic_add_int(&getnewbufrestarts, 1); return (error); } /* * bremfree: * * Mark the buffer for removal from the appropriate free list. * */ void bremfree(struct buf *bp) { CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT((bp->b_flags & B_REMFREE) == 0, ("bremfree: buffer %p already marked for delayed removal.", bp)); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfree: buffer %p not on a queue.", bp)); BUF_ASSERT_XLOCKED(bp); bp->b_flags |= B_REMFREE; } /* * bremfreef: * * Force an immediate removal from a free list. Used only in nfs when * it abuses the b_freelist pointer. */ void bremfreef(struct buf *bp) { struct mtx *qlock; qlock = bqlock(bp->b_qindex); mtx_lock(qlock); bremfreel(bp); mtx_unlock(qlock); } /* * bremfreel: * * Removes a buffer from the free list, must be called with the * correct qlock held. */ static void bremfreel(struct buf *bp) { CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfreel: buffer %p not on a queue.", bp)); if (bp->b_qindex != QUEUE_EMPTY) { BUF_ASSERT_XLOCKED(bp); } mtx_assert(bqlock(bp->b_qindex), MA_OWNED); TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); #ifdef INVARIANTS KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow", bp->b_qindex)); bq_len[bp->b_qindex]--; #endif bp->b_qindex = QUEUE_NONE; bp->b_flags &= ~B_REMFREE; } /* * bufkva_free: * * Free the kva allocation for a buffer. 
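 * (The round-robin walk used by buf_recycle() above is restated in
 * the sketch below.)
 */

/*
 * Minimal sketch of buf_recycle()'s round-robin scan: start from an
 * arbitrary queue and wrap around until a victim is found or every
 * queue has been tried. try_queue() is a hypothetical callback
 * standing in for buf_qrecycle(); illustration only.
 */
#if 0	/* illustrative sketch, not compiled */
static int
round_robin_scan(int nqueues, int start, int (*try_queue)(int))
{
	int q;

	q = start;
	do {
		if (try_queue(q) == 0)
			return (0);		/* found a victim */
		if (++q == nqueues)
			q = 0;			/* wrap around */
	} while (q != start);
	return (ENOBUFS);			/* every queue exhausted */
}
#endif

/*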
* */ static void bufkva_free(struct buf *bp) { #ifdef INVARIANTS if (bp->b_kvasize == 0) { KASSERT(bp->b_kvabase == unmapped_buf && bp->b_data == unmapped_buf, ("Leaked KVA space on %p", bp)); } else if (buf_mapped(bp)) BUF_CHECK_MAPPED(bp); else BUF_CHECK_UNMAPPED(bp); #endif if (bp->b_kvasize == 0) return; vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize); atomic_subtract_long(&bufkvaspace, bp->b_kvasize); atomic_add_int(&buffreekvacnt, 1); bp->b_data = bp->b_kvabase = unmapped_buf; bp->b_kvasize = 0; } /* * bufkva_alloc: * * Allocate the buffer KVA and set b_kvasize and b_kvabase. */ static int bufkva_alloc(struct buf *bp, int maxsize, int gbflags) { vm_offset_t addr; int error; KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0, ("Invalid gbflags 0x%x in %s", gbflags, __func__)); bufkva_free(bp); addr = 0; error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr); if (error != 0) { /* * Buffer map is too fragmented. Request the caller * to defragment the map. */ return (error); } bp->b_kvabase = (caddr_t)addr; bp->b_kvasize = maxsize; atomic_add_long(&bufkvaspace, bp->b_kvasize); if ((gbflags & GB_UNMAPPED) != 0) { bp->b_data = unmapped_buf; BUF_CHECK_UNMAPPED(bp); } else { bp->b_data = bp->b_kvabase; BUF_CHECK_MAPPED(bp); } return (0); } /* * bufkva_reclaim: * * Reclaim buffer kva by freeing buffers holding kva. This is a vmem * callback that fires to avoid returning failure. */ static void bufkva_reclaim(vmem_t *vmem, int flags) { int i; for (i = 0; i < 5; i++) if (buf_scan(true) != 0) break; return; } /* * Attempt to initiate asynchronous I/O on read-ahead blocks. We must * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set, * the buffer is valid and we do not have to do anything. */ static void breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, int cnt, struct ucred * cred, int flags, void (*ckhashfunc)(struct buf *)) { struct buf *rabp; int i; for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); if ((rabp->b_flags & B_CACHE) != 0) { brelse(rabp); continue; } if (!TD_IS_IDLETHREAD(curthread)) { #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, rabp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_inblock++; } rabp->b_flags |= B_ASYNC; rabp->b_flags &= ~B_INVAL; if ((flags & GB_CKHASH) != 0) { rabp->b_flags |= B_CKHASH; rabp->b_ckhashcalc = ckhashfunc; } rabp->b_ioflags &= ~BIO_ERROR; rabp->b_iocmd = BIO_READ; if (rabp->b_rcred == NOCRED && cred != NOCRED) rabp->b_rcred = crhold(cred); vfs_busy_pages(rabp, 0); BUF_KERNPROC(rabp); rabp->b_iooffset = dbtob(rabp->b_blkno); bstrategy(rabp); } } /* * Entry point for bread() and breadn() via #defines in sys/buf.h. * * Get a buffer with the specified data. Look in the cache first. We * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE * is set, the buffer is valid and we do not have to do anything, see * getblk(). Also starts asynchronous I/O on read-ahead blocks. * * Always return a NULL buffer pointer (in bpp) when returning an error. */ int breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno, int *rabsize, int cnt, struct ucred *cred, int flags, void (*ckhashfunc)(struct buf *), struct buf **bpp) { struct buf *bp; int readwait, rv; CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size); /* * Can only return NULL if GB_LOCK_NOWAIT flag is specified. 
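 *
 * For reference, a typical bread() caller looks schematically like
 * the following (illustrative only, error handling elided):
 *
 *	error = bread(vp, lblkno, bsize, NOCRED, &bp);
 *	if (error == 0) {
 *		... examine bp->b_data ...
 *		brelse(bp);
 *	}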
*/ *bpp = bp = getblk(vp, blkno, size, 0, 0, flags); if (bp == NULL) return (EBUSY); /* * If not found in cache, do some I/O */ readwait = 0; if ((bp->b_flags & B_CACHE) == 0) { if (!TD_IS_IDLETHREAD(curthread)) { #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_inblock++; } bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; if ((flags & GB_CKHASH) != 0) { bp->b_flags |= B_CKHASH; bp->b_ckhashcalc = ckhashfunc; } bp->b_ioflags &= ~BIO_ERROR; if (bp->b_rcred == NOCRED && cred != NOCRED) bp->b_rcred = crhold(cred); vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); ++readwait; } /* * Attempt to initiate asynchronous I/O on read-ahead blocks. */ breada(vp, rablkno, rabsize, cnt, cred, flags, ckhashfunc); rv = 0; if (readwait) { rv = bufwait(bp); if (rv != 0) { brelse(bp); *bpp = NULL; } } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ int bufwrite(struct buf *bp) { int oldflags; struct vnode *vp; long space; int vp_md; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) { bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_CACHE; brelse(bp); return (ENXIO); } if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } if (bp->b_flags & B_BARRIER) barrierwrites++; oldflags = bp->b_flags; BUF_ASSERT_HELD(bp); KASSERT(!(bp->b_vflags & BV_BKGRDINPROG), ("FFS background buffer should not get here %p", bp)); vp = bp->b_vp; if (vp) vp_md = vp->v_vflag & VV_MD; else vp_md = 0; /* * Mark the buffer clean. Increment the bufobj write count * before bundirty() call, to prevent other thread from seeing * empty dirty list and zero counter for writes in progress, * falsely indicating that the bufobj is clean. */ bufobj_wref(bp->b_bufobj); bundirty(bp); bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_flags |= B_CACHE; bp->b_iocmd = BIO_WRITE; vfs_busy_pages(bp, 1); /* * Normal bwrites pipeline writes */ bp->b_runningbufspace = bp->b_bufsize; space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace); if (!TD_IS_IDLETHREAD(curthread)) { #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 1); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_oublock++; } if (oldflags & B_ASYNC) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); buf_track(bp, __func__); bstrategy(bp); if ((oldflags & B_ASYNC) == 0) { int rtval = bufwait(bp); brelse(bp); return (rtval); } else if (space > hirunningspace) { /* * don't allow the async write to saturate the I/O * system. We will not deadlock here because * we are blocking waiting for I/O that is already in-progress * to complete. We do not block here if it is the update * or syncer daemon trying to clean up as that can lead * to deadlock. 
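 *
 * As an illustrative example (hypothetical numbers): with
 * hirunningspace at 16 MiB and 128 KiB writes, roughly 128
 * asynchronous writes can be in flight before the initiating
 * thread is made to wait here.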
*/ if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md) waitrunningbufspace(); } return (0); } void bufbdflush(struct bufobj *bo, struct buf *bp) { struct buf *nbp; if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) { (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread); altbufferflushes++; } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) { BO_LOCK(bo); /* * Try to find a buffer to flush. */ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { if ((nbp->b_vflags & BV_BKGRDINPROG) || BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if (bp == nbp) panic("bdwrite: found ourselves"); BO_UNLOCK(bo); /* Don't call buf_countdeps() with the bo lock held. */ if (buf_countdeps(nbp, 0)) { BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } if (nbp->b_flags & B_CLUSTEROK) { vfs_bio_awrite(nbp); } else { bremfree(nbp); bawrite(nbp); } dirtybufferflushes++; break; } if (nbp == NULL) BO_UNLOCK(bo); } } /* * Delayed write. (Buffer is marked dirty). Do not bother writing * anything if the buffer is marked invalid. * * Note that since the buffer must be completely valid, we can safely * set B_CACHE. In fact, we have to set B_CACHE here rather than in * biodone() in order to prevent getblk from writing the buffer * out synchronously. */ void bdwrite(struct buf *bp) { struct thread *td = curthread; struct vnode *vp; struct bufobj *bo; CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT((bp->b_flags & B_BARRIER) == 0, ("Barrier request in delayed write %p", bp)); BUF_ASSERT_HELD(bp); if (bp->b_flags & B_INVAL) { brelse(bp); return; } /* * If we have too many dirty buffers, don't create any more. * If we are wildly over our limit, then force a complete * cleanup. Otherwise, just keep the situation from getting * out of control. Note that we have to avoid a recursive * disaster and not try to clean up after our own cleanup! */ vp = bp->b_vp; bo = bp->b_bufobj; if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) { td->td_pflags |= TDP_INBDFLUSH; BO_BDFLUSH(bo, bp); td->td_pflags &= ~TDP_INBDFLUSH; } else recursiveflushes++; bdirty(bp); /* * Set B_CACHE, indicating that the buffer is fully valid. This is * true even of NFS now. */ bp->b_flags |= B_CACHE; /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it * is likely that the indirect block -- or whatever other datastructure * that the filesystem needs is still in memory now, it is a good * thing to do this. Note also, that if the pageout daemon is * requesting a sync -- there might not be enough memory to do * the bmap then... So, this is important to do. */ if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) { VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } buf_track(bp, __func__); /* * Set the *dirty* buffer range based upon the VM system dirty * pages. * * Mark the buffer pages as clean. We need to do this here to * satisfy the vnode_pager and the pageout daemon, so that it * thinks that the pages have been "cleaned". Note that since * the pages are in a delayed write buffer -- the VFS layer * "will" see that the pages get written out on the next sync, * or perhaps the cluster will be completed. */ vfs_clean_pages_dirty_buf(bp); bqrelse(bp); /* * note: we cannot initiate I/O from a bdwrite even if we wanted to, * due to the softdep code. */ } /* * bdirty: * * Turn buffer into delayed write request. We must clear BIO_READ and * B_RELBUF, and we must set B_DELWRI.
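 * (A schematic caller of the delayed-write path follows.)
 */

/*
 * Sketch of how a filesystem typically uses bdwrite(): read the
 * block, modify it in place, then mark it for delayed write. This
 * is illustrative only; lblkno, bsize and off are assumed inputs
 * and error handling is minimal.
 */
#if 0	/* illustrative sketch, not compiled */
static int
update_block_byte(struct vnode *vp, daddr_t lblkno, int bsize, int off,
    char val)
{
	struct buf *bp;
	int error;

	error = bread(vp, lblkno, bsize, NOCRED, &bp);
	if (error != 0)
		return (error);
	((char *)bp->b_data)[off] = val;	/* modify in place */
	bdwrite(bp);		/* mark dirty; the write happens later */
	return (0);
}
#endif

/*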
We reassign the buffer to * itself to properly update it in the dirty/clean lists. We mark it * B_DONE to ensure that any asynchronization of the buffer properly * clears B_DONE ( else a panic will occur later ). * * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() * should only be called if the buffer is known-good. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bdirty(struct buf *bp) { CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); BUF_ASSERT_HELD(bp); bp->b_flags &= ~(B_RELBUF); bp->b_iocmd = BIO_WRITE; if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= /* XXX B_DONE | */ B_DELWRI; reassignbuf(bp); bdirtyadd(); } } /* * bundirty: * * Clear B_DELWRI for buffer. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bundirty(struct buf *bp) { CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); BUF_ASSERT_HELD(bp); if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; reassignbuf(bp); bdirtysub(); } /* * Since it is now being written, we can clear its deferred write flag. */ bp->b_flags &= ~B_DEFERRED; } /* * bawrite: * * Asynchronous write. Start output on a buffer, but do not wait for * it to complete. The buffer is released when the output completes. * * bwrite() ( or the VOP routine anyway ) is responsible for handling * B_INVAL buffers. Not us. */ void bawrite(struct buf *bp) { bp->b_flags |= B_ASYNC; (void) bwrite(bp); } /* * babarrierwrite: * * Asynchronous barrier write. Start output on a buffer, but do not * wait for it to complete. Place a write barrier after this write so * that this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ void babarrierwrite(struct buf *bp) { bp->b_flags |= B_ASYNC | B_BARRIER; (void) bwrite(bp); } /* * bbarrierwrite: * * Synchronous barrier write. Start output on a buffer and wait for * it to complete. Place a write barrier after this write so that * this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ int bbarrierwrite(struct buf *bp) { bp->b_flags |= B_BARRIER; return (bwrite(bp)); } /* * bwillwrite: * * Called prior to the locking of any vnodes when we are expecting to * write. We do not want to starve the buffer cache with too many * dirty buffers so we block here. By blocking prior to the locking * of any vnodes we attempt to avoid the situation where a locked vnode * prevents the various system daemons from flushing related buffers. 
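 * (A schematic caller follows.)
 */

/*
 * Sketch of bwillwrite() usage by a write path: throttle against the
 * dirty buffer high water mark before any vnode lock is acquired.
 * The body of the write is elided; illustration only.
 */
#if 0	/* illustrative sketch, not compiled */
static int
write_path(struct vnode *vp)
{

	bwillwrite();		/* may sleep; no locks held yet */
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	/* ... dirty buffers via bdwrite()/bawrite() ... */
	VOP_UNLOCK(vp, 0);
	return (0);
}
#endif

/*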
*/ void bwillwrite(void) { if (numdirtybuffers >= hidirtybuffers) { mtx_lock(&bdirtylock); while (numdirtybuffers >= hidirtybuffers) { bdirtywait = 1; msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4), "flswai", 0); } mtx_unlock(&bdirtylock); } } /* * Return true if we have too many dirty buffers. */ int buf_dirty_count_severe(void) { return(numdirtybuffers >= hidirtybuffers); } /* * brelse: * * Release a busy buffer and, if requested, free its resources. The * buffer will be stashed in the appropriate bufqueue[] allowing it * to be accessed later as a cache entity or reused for other purposes. */ void brelse(struct buf *bp) { int qindex; /* * Many functions erroneously call brelse with a NULL bp under rare * error conditions. Simply return when called with a NULL bp. */ if (bp == NULL) return; CTR3(KTR_BUF, "brelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0, ("brelse: non-VMIO buffer marked NOREUSE")); if (BUF_LOCKRECURSED(bp)) { /* * Do not process, in particular, do not handle the * B_INVAL/B_RELBUF and do not release to free list. */ BUF_UNLOCK(bp); return; } if (bp->b_flags & B_MANAGED) { bqrelse(bp); return; } if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { BO_LOCK(bp->b_bufobj); bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); bdirty(bp); } if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && (bp->b_error != ENXIO || !LIST_EMPTY(&bp->b_dep)) && !(bp->b_flags & B_INVAL)) { /* * Failed write, redirty. All errors except ENXIO (which * means the device is gone) are treated as being * transient. * * XXX Treating EIO as transient is not correct; the * contract with the local storage device drivers is that * they will only return EIO once the I/O is no longer * retriable. Network I/O also respects this through the * guarantees of TCP and/or the internal retries of NFS. * ENOMEM might be transient, but we also have no way of * knowing when it's ok to retry/reschedule. In general, * this entire case should be made obsolete through better * error handling/recovery and resource scheduling. * * Do this also for buffers that failed with ENXIO, but have * non-empty dependencies - the soft updates code might need * to access the buffer to untangle them. * * Must clear BIO_ERROR to prevent pages from being scrapped. */ bp->b_ioflags &= ~BIO_ERROR; bdirty(bp); } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) { /* * Either a failed read I/O, or we were asked to free or not * cache the buffer, or we failed to write to a device that's * no longer present. */ bp->b_flags |= B_INVAL; if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); if (bp->b_flags & B_DELWRI) bdirtysub(); bp->b_flags &= ~(B_DELWRI | B_CACHE); if ((bp->b_flags & B_VMIO) == 0) { allocbuf(bp, 0); if (bp->b_vp) brelvp(bp); } } /* * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_truncate() * is called with B_DELWRI set, the underlying pages may wind up * getting freed causing a previous write (bdwrite()) to get 'lost' * because pages associated with a B_DELWRI bp are marked clean. * * We still allow the B_INVAL case to call vfs_vmio_truncate(), even * if B_DELWRI is set. */ if (bp->b_flags & B_DELWRI) bp->b_flags &= ~B_RELBUF; /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer * constituted, not even NFS buffers now. Two flags affect this.
If * B_INVAL, the struct buf is invalidated but the VM object is kept * around ( i.e. so it is trivial to reconstitute the buffer later ). * * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be * invalidated. BIO_ERROR cannot be set for a failed write unless the * buffer is also B_INVAL because it hits the re-dirtying code above. * * Normally we can do this whether a buffer is B_DELWRI or not. If * the buffer is an NFS buffer, it is tracking piecemeal writes or * the commit state and we cannot afford to lose the buffer. If the * buffer has a background write in progress, we need to keep it * around to prevent it from being reconstituted and starting a second * background write. */ if ((bp->b_flags & B_VMIO) && (bp->b_flags & B_NOCACHE || (bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) && !(bp->b_vp->v_mount != NULL && (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && !vn_isdisk(bp->b_vp, NULL) && (bp->b_flags & B_DELWRI))) { vfs_vmio_invalidate(bp); allocbuf(bp, 0); } if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 || (bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) { allocbuf(bp, 0); bp->b_flags &= ~B_NOREUSE; if (bp->b_vp != NULL) brelvp(bp); } /* * If the buffer has junk contents, signal it and eventually * clean up B_DELWRI and disassociate the vnode so that gbincore() * doesn't find it. */ if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 || (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0) bp->b_flags |= B_INVAL; if (bp->b_flags & B_INVAL) { if (bp->b_flags & B_DELWRI) bundirty(bp); if (bp->b_vp) brelvp(bp); } buf_track(bp, __func__); /* buffers with no memory */ if (bp->b_bufsize == 0) { buf_free(bp); return; } /* buffers with junk contents */ if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || (bp->b_ioflags & BIO_ERROR)) { bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 2"); qindex = QUEUE_CLEAN; bp->b_flags |= B_AGE; /* remaining buffers */ } else if (bp->b_flags & B_DELWRI) qindex = QUEUE_DIRTY; else qindex = QUEUE_CLEAN; binsfree(bp, qindex); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT); if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("brelse: not dirty"); /* unlock */ BUF_UNLOCK(bp); if (qindex == QUEUE_CLEAN) bufspace_wakeup(); } /* * Release a buffer back to the appropriate queue but do not try to free * it. The buffer is expected to be used again soon. * * bqrelse() is used by bdwrite() to requeue a delayed write, and used by * biodone() to requeue an async I/O on completion. It is also used when * known good buffers need to be requeued but we think we may need the data * again soon. * * XXX we should be able to leave the B_RELBUF hint set on completion.
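 * (The sketch below contrasts the two release primitives.)
 */

/*
 * Illustrative contrast of brelse() and bqrelse(): keep the contents
 * queued for reuse when they are likely to be needed again, otherwise
 * hint that the backing store may be released. reuse_likely is a
 * hypothetical predicate, not a kernel flag.
 */
#if 0	/* illustrative sketch, not compiled */
static void
release_buf(struct buf *bp, bool reuse_likely)
{

	if (reuse_likely)
		bqrelse(bp);	/* keep contents, expect reuse soon */
	else {
		bp->b_flags |= B_RELBUF;
		brelse(bp);	/* allow backing store to be released */
	}
}
#endif

/*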
*/ void bqrelse(struct buf *bp) { int qindex; CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); qindex = QUEUE_NONE; if (BUF_LOCKRECURSED(bp)) { /* do not release to free list */ BUF_UNLOCK(bp); return; } bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); if (bp->b_flags & B_MANAGED) { if (bp->b_flags & B_REMFREE) bremfreef(bp); goto out; } /* buffers with stale but valid contents */ if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { BO_LOCK(bp->b_bufobj); bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); qindex = QUEUE_DIRTY; } else { if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("bqrelse: not dirty"); if ((bp->b_flags & B_NOREUSE) != 0) { brelse(bp); return; } qindex = QUEUE_CLEAN; } binsfree(bp, qindex); out: buf_track(bp, __func__); /* unlock */ BUF_UNLOCK(bp); if (qindex == QUEUE_CLEAN) bufspace_wakeup(); } /* * Complete I/O to a VMIO backed page. Validate the pages as appropriate, * restore bogus pages. */ static void vfs_vmio_iodone(struct buf *bp) { vm_ooffset_t foff; vm_page_t m; vm_object_t obj; struct vnode *vp; int i, iosize, resid; bool bogus; obj = bp->b_bufobj->bo_object; KASSERT(obj->paging_in_progress >= bp->b_npages, ("vfs_vmio_iodone: paging in progress(%d) < b_npages(%d)", obj->paging_in_progress, bp->b_npages)); vp = bp->b_vp; KASSERT(vp->v_holdcnt > 0, ("vfs_vmio_iodone: vnode %p has zero hold count", vp)); KASSERT(vp->v_object != NULL, ("vfs_vmio_iodone: vnode %p has no vm_object", vp)); foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_vmio_iodone: bp %p has no buffer offset", bp)); bogus = false; iosize = bp->b_bcount - bp->b_resid; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; if (resid > iosize) resid = iosize; /* * cleanup bogus pages, restoring the originals */ m = bp->b_pages[i]; if (m == bogus_page) { bogus = true; m = vm_page_lookup(obj, OFF_TO_IDX(foff)); if (m == NULL) panic("biodone: page disappeared!"); bp->b_pages[i] = m; } else if ((bp->b_iocmd == BIO_READ) && resid > 0) { /* * In the write case, the valid and clean bits are * already changed correctly ( see bdwrite() ), so we * only need to do this here in the read case. */ KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK, resid)) == 0, ("vfs_vmio_iodone: page %p " "has unexpected dirty bits", m)); vfs_page_set_valid(bp, foff, m); } KASSERT(OFF_TO_IDX(foff) == m->pindex, ("vfs_vmio_iodone: foff(%jd)/pindex(%ju) mismatch", (intmax_t)foff, (uintmax_t)m->pindex)); vm_page_sunbusy(m); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; iosize -= resid; } vm_object_pip_wakeupn(obj, bp->b_npages); VM_OBJECT_WUNLOCK(obj); if (bogus && buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * Unwire a page held by a buf and place it on the appropriate vm queue. */ static void vfs_vmio_unwire(struct buf *bp, vm_page_t m) { bool freed; vm_page_lock(m); if (vm_page_unwire(m, PQ_NONE)) { /* * Determine if the page should be freed before adding * it to the inactive queue. */ if (m->valid == 0) { freed = !vm_page_busied(m); if (freed) vm_page_free(m); } else if ((bp->b_flags & B_DIRECT) != 0) freed = vm_page_try_to_free(m); else freed = false; if (!freed) { /* * If the page is unlikely to be reused, let the * VM know. 
Otherwise, maintain LRU page * ordering and put the page at the tail of the * inactive queue. */ if ((bp->b_flags & B_NOREUSE) != 0) vm_page_deactivate_noreuse(m); else vm_page_deactivate(m); } } vm_page_unlock(m); } /* * Perform page invalidation when a buffer is released. The fully invalid * pages will be reclaimed later in vfs_vmio_truncate(). */ static void vfs_vmio_invalidate(struct buf *bp) { vm_object_t obj; vm_page_t m; int i, resid, poffset, presid; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); /* * Get the base offset and length of the buffer. Note that * in the VMIO case if the buffer block size is not * page-aligned then the b_data pointer may not be page-aligned. * But our b_pages[] array *IS* page aligned. * * block sizes less than DEV_BSIZE (usually 512) are not * supported due to the page granularity bits (m->valid, * m->dirty, etc...). * * See man buf(9) for more information */ obj = bp->b_bufobj->bo_object; resid = bp->b_bufsize; poffset = bp->b_offset & PAGE_MASK; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) panic("vfs_vmio_invalidate: Unexpected bogus page."); bp->b_pages[i] = NULL; presid = resid > (PAGE_SIZE - poffset) ? (PAGE_SIZE - poffset) : resid; KASSERT(presid >= 0, ("brelse: extra page")); while (vm_page_xbusied(m)) { vm_page_lock(m); VM_OBJECT_WUNLOCK(obj); vm_page_busy_sleep(m, "mbncsh", true); VM_OBJECT_WLOCK(obj); } if (pmap_page_wired_mappings(m) == 0) vm_page_set_invalid(m, poffset, presid); vfs_vmio_unwire(bp, m); resid -= presid; poffset = 0; } VM_OBJECT_WUNLOCK(obj); bp->b_npages = 0; } /* * Page-granular truncation of an existing VMIO buffer. */ static void vfs_vmio_truncate(struct buf *bp, int desiredpages) { vm_object_t obj; vm_page_t m; int i; if (bp->b_npages == desiredpages) return; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qremove((vm_offset_t)trunc_page((vm_offset_t)bp->b_data) + (desiredpages << PAGE_SHIFT), bp->b_npages - desiredpages); } else BUF_CHECK_UNMAPPED(bp); obj = bp->b_bufobj->bo_object; if (obj != NULL) VM_OBJECT_WLOCK(obj); for (i = desiredpages; i < bp->b_npages; i++) { m = bp->b_pages[i]; KASSERT(m != bogus_page, ("allocbuf: bogus page found")); bp->b_pages[i] = NULL; vfs_vmio_unwire(bp, m); } if (obj != NULL) VM_OBJECT_WUNLOCK(obj); bp->b_npages = desiredpages; } /* * Byte granular extension of VMIO buffers. */ static void vfs_vmio_extend(struct buf *bp, int desiredpages, int size) { /* * We are growing the buffer, possibly in a * byte-granular fashion. */ vm_object_t obj; vm_offset_t toff; vm_offset_t tinc; vm_page_t m; /* * Step 1, bring in the VM pages from the object, allocating * them if necessary. We must clear B_CACHE if these pages * are not valid for the range covered by the buffer. */ obj = bp->b_bufobj->bo_object; VM_OBJECT_WLOCK(obj); if (bp->b_npages < desiredpages) { /* * We must allocate system pages since blocking * here could interfere with paging I/O, no * matter which process we are. * * Only exclusive busy can be tested here. * Blocking on shared busy might lead to * deadlocks once allocbuf() is called after * pages are vfs_busy_pages(). */ (void)vm_page_grab_pages(obj, OFF_TO_IDX(bp->b_offset) + bp->b_npages, VM_ALLOC_SYSTEM | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED, &bp->b_pages[bp->b_npages], desiredpages - bp->b_npages); bp->b_npages = desiredpages; } /* * Step 2.
We've loaded the pages into the buffer, * we have to figure out if we can still have B_CACHE * set. Note that B_CACHE is set according to the * byte-granular range ( bcount and size ), not the * aligned range ( newbsize ). * * The VM test is against m->valid, which is DEV_BSIZE * aligned. Needless to say, the validity of the data * needs to also be DEV_BSIZE aligned. Note that this * fails with NFS if the server or some other client * extends the file's EOF. If our buffer is resized, * B_CACHE may remain set! XXX */ toff = bp->b_bcount; tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); while ((bp->b_flags & B_CACHE) && toff < size) { vm_pindex_t pi; if (tinc > (size - toff)) tinc = size - toff; pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT; m = bp->b_pages[pi]; vfs_buf_test_cache(bp, bp->b_offset, toff, tinc, m); toff += tinc; tinc = PAGE_SIZE; } VM_OBJECT_WUNLOCK(obj); /* * Step 3, fixup the KVA pmap. */ if (buf_mapped(bp)) bpmap_qenter(bp); else BUF_CHECK_UNMAPPED(bp); } /* * Check to see if a block at a particular lbn is available for a clustered * write. */ static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno) { struct buf *bpa; int match; match = 0; /* If the buf isn't in core skip it */ if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL) return (0); /* If the buf is busy we don't want to wait for it */ if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) return (0); /* Only cluster with valid clusterable delayed write buffers */ if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) != (B_DELWRI | B_CLUSTEROK)) goto done; if (bpa->b_bufsize != size) goto done; /* * Check to see if it is in the expected place on disk and that the * block has been mapped. */ if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno)) match = 1; done: BUF_UNLOCK(bpa); return (match); } /* * vfs_bio_awrite: * * Implement clustered async writes for clearing out B_DELWRI buffers. * This is much better than the old way of writing only one buffer at * a time. Note that we may not be presented with the buffers in the * correct order, so we search for the cluster in both directions. */ int vfs_bio_awrite(struct buf *bp) { struct bufobj *bo; int i; int j; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int ncl; int nwritten; int size; int maxcl; int gbflags; bo = &vp->v_bufobj; gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0; /* * right now we support clustered writing only to regular files. If * we find a clusterable block we could be in the middle of a cluster * rather than at the beginning. */ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { size = vp->v_mount->mnt_stat.f_iosize; maxcl = MAXPHYS / size; BO_RLOCK(bo); for (i = 1; i < maxcl; i++) if (vfs_bio_clcheck(vp, size, lblkno + i, bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0) break; for (j = 1; i + j <= maxcl && j <= lblkno; j++) if (vfs_bio_clcheck(vp, size, lblkno - j, bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0) break; BO_RUNLOCK(bo); --j; ncl = i + j; /* * this is a possible cluster write */ if (ncl != 1) { BUF_UNLOCK(bp); nwritten = cluster_wbuild(vp, size, lblkno - j, ncl, gbflags); return (nwritten); } } bremfree(bp); bp->b_flags |= B_ASYNC; /* * default (old) behavior, writing out only one block * * XXX returns b_bufsize instead of b_bcount for nwritten?
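 *
 * As an illustrative example (hypothetical numbers): with MAXPHYS at
 * 128 KiB and f_iosize at 16 KiB, maxcl above would be 8, so at most
 * 8 logically contiguous delayed-write blocks can be gathered into a
 * single cluster write.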
*/ nwritten = bp->b_bufsize; (void) bwrite(bp); return (nwritten); } /* * getnewbuf_kva: * * Allocate KVA for an empty buf header according to gbflags. */ static int getnewbuf_kva(struct buf *bp, int gbflags, int maxsize) { if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_UNMAPPED) { /* * In order to keep fragmentation sane we only allocate kva * in BKVASIZE chunks. XXX with vmem we can do page size. */ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; if (maxsize != bp->b_kvasize && bufkva_alloc(bp, maxsize, gbflags)) return (ENOSPC); } return (0); } /* * getnewbuf: * * Find and initialize a new buffer header, freeing up existing buffers * in the bufqueues as necessary. The new buffer is returned locked. * * We block if: * We have insufficient buffer headers * We have insufficient buffer space * buffer_arena is too fragmented ( space reservation fails ) * If we have to flush dirty buffers ( but we try to avoid this ) * * The caller is responsible for releasing the reserved bufspace after * allocbuf() is called. */ static struct buf * getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags) { struct buf *bp; bool metadata, reserved; bp = NULL; KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); if (!unmapped_buf_allowed) gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC); if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 || vp->v_type == VCHR) metadata = true; else metadata = false; atomic_add_int(&getnewbufcalls, 1); reserved = false; do { if (reserved == false && bufspace_reserve(maxsize, metadata) != 0) continue; reserved = true; if ((bp = buf_alloc()) == NULL) continue; if (getnewbuf_kva(bp, gbflags, maxsize) == 0) return (bp); break; } while(buf_scan(false) == 0); if (reserved) atomic_subtract_long(&bufspace, maxsize); if (bp != NULL) { bp->b_flags |= B_INVAL; brelse(bp); } bufspace_wait(vp, gbflags, slpflag, slptimeo); return (NULL); } /* * buf_daemon: * * buffer flushing daemon. Buffers are normally flushed by the * update daemon but if it cannot keep up this process starts to * take the load in an attempt to prevent getnewbuf() from blocking. */ static struct kproc_desc buf_kp = { "bufdaemon", buf_daemon, &bufdaemonproc }; SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp); static int buf_flush(struct vnode *vp, int target) { int flushed; flushed = flushbufqueues(vp, target, 0); if (flushed == 0) { /* * Could not find any buffers without rollback * dependencies, so just write the first one * in the hopes of eventually making progress. */ if (vp != NULL && target > 2) target /= 2; flushbufqueues(vp, target, 1); } return (flushed); } static void buf_daemon() { int lodirty; /* * This process needs to be suspended prior to shutdown sync. */ EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc, SHUTDOWN_PRI_LAST); /* * This process is allowed to take the buffer cache to the limit */ curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED; mtx_lock(&bdlock); for (;;) { bd_request = 0; mtx_unlock(&bdlock); kproc_suspend_check(bufdaemonproc); lodirty = lodirtybuffers; if (bd_speedupreq) { lodirty = numdirtybuffers / 2; bd_speedupreq = 0; } /* * Do the flush. Limit the amount of in-transit I/O we * allow to build up, otherwise we would completely saturate * the I/O system. */ while (numdirtybuffers > lodirty) { if (buf_flush(NULL, numdirtybuffers - lodirty) == 0) break; kern_yield(PRI_USER); } /* * Only clear bd_request if we have reached our low water * mark. 
The buf_daemon normally waits 1 second and * then incrementally flushes any dirty buffers that have * built up, within reason. * * If we were unable to hit our low water mark and couldn't * find any flushable buffers, we sleep for a short period * to avoid endless loops on unlockable buffers. */ mtx_lock(&bdlock); if (numdirtybuffers <= lodirtybuffers) { /* * We reached our low water mark, reset the * request and sleep until we are needed again. * The sleep is just so the suspend code works. */ bd_request = 0; /* * Do an extra wakeup in case dirty threshold * changed via sysctl and the explicit transition * out of shortfall was missed. */ bdirtywakeup(); if (runningbufspace <= lorunningspace) runningwakeup(); msleep(&bd_request, &bdlock, PVM, "psleep", hz); } else { /* * We couldn't find any flushable dirty buffers but * still have too many dirty buffers, we * have to sleep and try again. (rare) */ msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10); } } } /* * flushbufqueues: * * Try to flush a buffer in the dirty queue. We must be careful to * free up B_INVAL buffers instead of writing them, which NFS is * particularly sensitive to. */ static int flushwithdeps = 0; SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps, 0, "Number of buffers flushed with dependencies that require rollbacks"); static int flushbufqueues(struct vnode *lvp, int target, int flushdeps) { struct buf *sentinel; struct vnode *vp; struct mount *mp; struct buf *bp; int hasdeps; int flushed; int queue; int error; bool unlock; flushed = 0; queue = QUEUE_DIRTY; bp = NULL; sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO); sentinel->b_qindex = QUEUE_SENTINEL; mtx_lock(&bqlocks[queue]); TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist); mtx_unlock(&bqlocks[queue]); while (flushed != target) { maybe_yield(); mtx_lock(&bqlocks[queue]); bp = TAILQ_NEXT(sentinel, b_freelist); if (bp != NULL) { TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist); TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel, b_freelist); } else { mtx_unlock(&bqlocks[queue]); break; } /* * Skip sentinels inserted by other invocations of the * flushbufqueues(), taking care to not reorder them. * * Only flush the buffers that belong to the * vnode locked by the curthread. */ if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL && bp->b_vp != lvp)) { mtx_unlock(&bqlocks[queue]); continue; } error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL); mtx_unlock(&bqlocks[queue]); if (error != 0) continue; /* * BKGRDINPROG can only be set with the buf and bufobj * locks both held. We tolerate a race to clear it here. */ if ((bp->b_vflags & BV_BKGRDINPROG) != 0 || (bp->b_flags & B_DELWRI) == 0) { BUF_UNLOCK(bp); continue; } if (bp->b_flags & B_INVAL) { bremfreef(bp); brelse(bp); flushed++; continue; } if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) { if (flushdeps == 0) { BUF_UNLOCK(bp); continue; } hasdeps = 1; } else hasdeps = 0; /* * We must hold the lock on a vnode before writing * one of its buffers. Otherwise we may confuse, or * in the case of a snapshot vnode, deadlock the * system. * * The lock order here is the reverse of the normal * order of vnode followed by buf lock. This is ok because * the NOWAIT will prevent deadlock. */ vp = bp->b_vp; if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { BUF_UNLOCK(bp); continue; } if (lvp == NULL) { unlock = true; error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT); } else { ASSERT_VOP_LOCKED(vp, "getbuf"); unlock = false; error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ?
0 : vn_lock(vp, LK_TRYUPGRADE); } if (error == 0) { CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (curproc == bufdaemonproc) { vfs_bio_awrite(bp); } else { bremfree(bp); bwrite(bp); notbufdflushes++; } vn_finished_write(mp); if (unlock) VOP_UNLOCK(vp, 0); flushwithdeps += hasdeps; flushed++; /* * Sleeping on runningbufspace while holding * vnode lock leads to deadlock. */ if (curproc == bufdaemonproc && runningbufspace > hirunningspace) waitrunningbufspace(); continue; } vn_finished_write(mp); BUF_UNLOCK(bp); } mtx_lock(&bqlocks[queue]); TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist); mtx_unlock(&bqlocks[queue]); free(sentinel, M_TEMP); return (flushed); } /* * Check to see if a block is currently memory resident. */ struct buf * incore(struct bufobj *bo, daddr_t blkno) { struct buf *bp; BO_RLOCK(bo); bp = gbincore(bo, blkno); BO_RUNLOCK(bo); return (bp); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. */ static int inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t toff, tinc, size; vm_page_t m; vm_ooffset_t off; ASSERT_VOP_LOCKED(vp, "inmem"); if (incore(&vp->v_bufobj, blkno)) return 1; if (vp->v_mount == NULL) return 0; obj = vp->v_object; if (obj == NULL) return (0); size = PAGE_SIZE; if (size > vp->v_mount->mnt_stat.f_iosize) size = vp->v_mount->mnt_stat.f_iosize; off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; VM_OBJECT_RLOCK(obj); for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); if (!m) goto notinmem; tinc = size; if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); if (vm_page_is_valid(m, (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) goto notinmem; } VM_OBJECT_RUNLOCK(obj); return 1; notinmem: VM_OBJECT_RUNLOCK(obj); return (0); } /* * Set the dirty range for a buffer based on the status of the dirty * bits in the pages comprising the buffer. The range is limited * to the size of the buffer. * * Tell the VM system that the pages associated with this buffer * are clean. This is used for delayed writes where the data is * going to go to disk eventually without additional VM intervention. * * Note that while we only really need to clean through to b_bcount, we * just go ahead and clean through to b_bufsize. */ static void vfs_clean_pages_dirty_buf(struct buf *bp) { vm_ooffset_t foff, noff, eoff; vm_page_t m; int i; if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0) return; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_clean_pages_dirty_buf: no buffer offset")); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); vfs_drain_busy_pages(bp); vfs_setdirty_locked_object(bp); for (i = 0; i < bp->b_npages; i++) { noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; eoff = noff; if (eoff > bp->b_offset + bp->b_bufsize) eoff = bp->b_offset + bp->b_bufsize; m = bp->b_pages[i]; vfs_page_set_validclean(bp, foff, m); /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ foff = noff; } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); } static void vfs_setdirty_locked_object(struct buf *bp) { vm_object_t object; int i; object = bp->b_bufobj->bo_object; VM_OBJECT_ASSERT_WLOCKED(object); /* * We qualify the scan for modified pages on whether the * object has been flushed yet.
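 *
 * The arithmetic below is in byte offsets relative to the start of
 * the buffer. For example (hypothetical values): with
 * (b_offset & PAGE_MASK) == 512 and the first dirty page at index 1,
 * boffset works out to (1 << PAGE_SHIFT) - 512, i.e. 3584 bytes with
 * 4 KiB pages.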
*/ if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) { vm_offset_t boffset; vm_offset_t eoffset; /* * test the pages to see if they have been modified directly * by users through the VM system. */ for (i = 0; i < bp->b_npages; i++) vm_page_test_dirty(bp->b_pages[i]); /* * Calculate the encompassing dirty range, boffset and eoffset, * (eoffset - boffset) bytes. */ for (i = 0; i < bp->b_npages; i++) { if (bp->b_pages[i]->dirty) break; } boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); /* * Fit it to the buffer. */ if (eoffset > bp->b_bcount) eoffset = bp->b_bcount; /* * If we have a good dirty range, merge with the existing * dirty range. */ if (boffset < eoffset) { if (bp->b_dirtyoff > boffset) bp->b_dirtyoff = boffset; if (bp->b_dirtyend < eoffset) bp->b_dirtyend = eoffset; } } } /* * Allocate the KVA mapping for an existing buffer. * If an unmapped buffer is provided but a mapped buffer is requested, also * take care to properly set up mappings between pages and KVA. */ static void bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags) { int bsize, maxsize, need_mapping, need_kva; off_t offset; need_mapping = bp->b_data == unmapped_buf && (gbflags & GB_UNMAPPED) == 0; need_kva = bp->b_kvabase == unmapped_buf && bp->b_data == unmapped_buf && (gbflags & GB_KVAALLOC) != 0; if (!need_mapping && !need_kva) return; BUF_CHECK_UNMAPPED(bp); if (need_mapping && bp->b_kvabase != unmapped_buf) { /* * Buffer is not mapped, but the KVA was already * reserved at the time of the instantiation. Use the * allocated space. */ goto has_addr; } /* * Calculate the amount of the address space we would reserve * if the buffer was mapped. */ bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize; KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); offset = blkno * bsize; maxsize = size + (offset & PAGE_MASK); maxsize = imax(maxsize, bsize); while (bufkva_alloc(bp, maxsize, gbflags) != 0) { if ((gbflags & GB_NOWAIT_BD) != 0) { /* * XXXKIB: defragmentation cannot * succeed, not sure what else to do. */ panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp); } atomic_add_int(&mappingrestarts, 1); bufspace_wait(bp->b_vp, gbflags, 0, 0); } has_addr: if (need_mapping) { /* b_offset is handled by bpmap_qenter. */ bp->b_data = bp->b_kvabase; BUF_CHECK_MAPPED(bp); bpmap_qenter(bp); } } /* * getblk: * * Get a block given a specified block and offset into a file/device. * The buffer's B_DONE bit will be cleared on return, making it almost * ready for an I/O initiation. B_INVAL may or may not be set on * return. The caller should clear B_INVAL prior to initiating a * READ. * * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for * an existing buffer. * * For a VMIO buffer, B_CACHE is modified according to the backing VM. * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set * and then cleared based on the backing VM. If the previous buffer is * non-0-sized but invalid, B_CACHE will be cleared. * * If getblk() must create a new buffer, the new buffer is returned with * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which * case it is returned with B_INVAL clear and B_CACHE set based on the * backing VM. * * getblk() also forces a bwrite() for any B_DELWRI buffer whose * B_CACHE bit is clear.
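 * (A schematic caller implementing the protocol described here
 * follows.)
 */

/*
 * Sketch of the canonical getblk()/B_CACHE read protocol: get the
 * buffer and only initiate I/O when the contents are not already
 * valid. Illustrative only; error handling is elided and the logic
 * mirrors what breadn_flags() does above.
 */
#if 0	/* illustrative sketch, not compiled */
static int
read_via_getblk(struct vnode *vp, daddr_t blkno, int size, struct buf **bpp)
{
	struct buf *bp;

	*bpp = bp = getblk(vp, blkno, size, 0, 0, 0);
	if ((bp->b_flags & B_CACHE) != 0)
		return (0);		/* cache hit, data already valid */
	bp->b_flags &= ~B_INVAL;
	bp->b_ioflags &= ~BIO_ERROR;
	bp->b_iocmd = BIO_READ;
	vfs_busy_pages(bp, 0);
	bp->b_iooffset = dbtob(bp->b_blkno);
	bstrategy(bp);
	return (bufwait(bp));	/* caller brelse()s on error */
}
#endif

/*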
* * What this means, basically, is that the caller should use B_CACHE to * determine whether the buffer is fully valid or not and should clear * B_INVAL prior to issuing a read. If the caller intends to validate * the buffer by loading its data area with something, the caller needs * to clear B_INVAL. If the caller does this without issuing an I/O, * the caller should set B_CACHE ( as an optimization ), else the caller * should issue the I/O and biodone() will set B_CACHE if the I/O was * a write attempt or if it was a successful read. If the caller * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR * prior to issuing the READ. biodone() will *not* clear B_INVAL. */ struct buf * getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, int flags) { struct buf *bp; struct bufobj *bo; int bsize, error, maxsize, vmio; off_t offset; CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size); KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); ASSERT_VOP_LOCKED(vp, "getblk"); if (size > maxbcachebuf) panic("getblk: size(%d) > maxbcachebuf(%d)\n", size, maxbcachebuf); if (!unmapped_buf_allowed) flags &= ~(GB_UNMAPPED | GB_KVAALLOC); bo = &vp->v_bufobj; loop: BO_RLOCK(bo); bp = gbincore(bo, blkno); if (bp != NULL) { int lockflags; /* * Buffer is in-core. If the buffer is not busy nor managed, * it must be on a queue. */ lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK; if (flags & GB_LOCK_NOWAIT) lockflags |= LK_NOWAIT; error = BUF_TIMELOCK(bp, lockflags, BO_LOCKPTR(bo), "getblk", slpflag, slptimeo); /* * If we slept and got the lock we have to restart in case * the buffer changed identities. */ if (error == ENOLCK) goto loop; /* We timed out or were interrupted. */ else if (error) return (NULL); /* If recursed, assume caller knows the rules. */ else if (BUF_LOCKRECURSED(bp)) goto end; /* * The buffer is locked. B_CACHE is cleared if the buffer is * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set * and for a VMIO buffer B_CACHE is adjusted according to the * backing VM cache. */ if (bp->b_flags & B_INVAL) bp->b_flags &= ~B_CACHE; else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) bp->b_flags |= B_CACHE; if (bp->b_flags & B_MANAGED) MPASS(bp->b_qindex == QUEUE_NONE); else bremfree(bp); /* * check for size inconsistencies for non-VMIO case. */ if (bp->b_bcount != size) { if ((bp->b_flags & B_VMIO) == 0 || (size > bp->b_kvasize)) { if (bp->b_flags & B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); } else { if (LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bp->b_flags |= B_NOCACHE; bwrite(bp); } } goto loop; } } /* * Handle the case of unmapped buffer which should * become mapped, or the buffer for which KVA * reservation is requested. */ bp_unmapped_get_kva(bp, blkno, size, flags); /* * If the size is inconsistent in the VMIO case, we can resize * the buffer. This might lead to B_CACHE getting set or * cleared. If the size has not changed, B_CACHE remains * unchanged from its previous state. */ allocbuf(bp, size); KASSERT(bp->b_offset != NOOFFSET, ("getblk: no buffer offset")); /* * A buffer with B_DELWRI set and B_CACHE clear must * be committed before we can return the buffer in * order to prevent the caller from issuing a read * ( due to B_CACHE not being set ) and overwriting * it. 
		 *
		 * Most callers, including NFS and FFS, need this to
		 * operate properly either because they assume they
		 * can issue a read if B_CACHE is not set, or because
		 * ( for example ) an uncached B_DELWRI might loop due
		 * to softupdates re-dirtying the buffer.  In the latter
		 * case, B_CACHE is set after the first write completes,
		 * preventing further loops.
		 * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
		 * above while extending the buffer, we cannot allow the
		 * buffer to remain with B_CACHE set after the write
		 * completes or it will represent a corrupt state.  To
		 * deal with this we set B_NOCACHE to scrap the buffer
		 * after the write.
		 *
		 * We might be able to do something fancy, like setting
		 * B_CACHE in bwrite() except if B_DELWRI is already set,
		 * so the below call doesn't set B_CACHE, but that gets real
		 * confusing.  This is much easier.
		 */
		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
			bp->b_flags |= B_NOCACHE;
			bwrite(bp);
			goto loop;
		}
		bp->b_flags &= ~B_DONE;
	} else {
		/*
		 * Buffer is not in-core, create new buffer.  The buffer
		 * returned by getnewbuf() is locked.  Note that the returned
		 * buffer is also considered valid (not marked B_INVAL).
		 */
		BO_RUNLOCK(bo);
		/*
		 * If the user does not want us to create the buffer, bail out
		 * here.
		 */
		if (flags & GB_NOCREAT)
			return NULL;
		if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread))
			return NULL;

		bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
		KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize"));
		offset = blkno * bsize;
		vmio = vp->v_object != NULL;
		if (vmio) {
			maxsize = size + (offset & PAGE_MASK);
		} else {
			maxsize = size;
			/* Do not allow non-VMIO unmapped buffers. */
			flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
		}
		maxsize = imax(maxsize, bsize);

		bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags);
		if (bp == NULL) {
			if (slpflag || slptimeo)
				return NULL;
			/*
			 * XXX This is here until the sleep path is diagnosed
			 * enough to work under very low memory conditions.
			 *
			 * There's an issue on low memory, 4BSD+non-preempt
			 * systems (eg MIPS routers with 32MB RAM) where buffer
			 * exhaustion occurs without sleeping for buffer
			 * reclamation.  This just sticks in a loop and
			 * constantly attempts to allocate a buffer, which
			 * hits exhaustion and tries to wakeup bufdaemon.
			 * This never happens because we never yield.
			 *
			 * The real solution is to identify and fix these cases
			 * so we aren't effectively busy-waiting in a loop
			 * until the reclamation path has cycles to run.
			 */
			kern_yield(PRI_USER);
			goto loop;
		}

		/*
		 * This code is used to make sure that a buffer is not
		 * created while the getnewbuf routine is blocked.
		 * This can be a problem whether the vnode is locked or not.
		 * If the buffer is created out from under us, we have to
		 * throw away the one we just created.
		 *
		 * Note: this must occur before we associate the buffer
		 * with the vp especially considering limitations in
		 * the splay tree implementation when dealing with duplicate
		 * lblkno's.
		 */
		BO_LOCK(bo);
		if (gbincore(bo, blkno)) {
			BO_UNLOCK(bo);
			bp->b_flags |= B_INVAL;
			brelse(bp);
			bufspace_release(maxsize);
			goto loop;
		}

		/*
		 * Insert the buffer into the hash, so that it can
		 * be found by incore.
		 */
		bp->b_blkno = bp->b_lblkno = blkno;
		bp->b_offset = offset;
		bgetvp(vp, bp);
		BO_UNLOCK(bo);

		/*
		 * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
		 * buffer size starts out as 0, B_CACHE will be set by
		 * allocbuf() for the VMIO case prior to it testing the
		 * backing store for validity.
		 */
		if (vmio) {
			bp->b_flags |= B_VMIO;
			KASSERT(vp->v_object == bp->b_bufobj->bo_object,
			    ("ARGH!
different b_bufobj->bo_object %p %p %p\n", bp, vp->v_object, bp->b_bufobj->bo_object)); } else { bp->b_flags &= ~B_VMIO; KASSERT(bp->b_bufobj->bo_object == NULL, ("ARGH! has b_bufobj->bo_object %p %p\n", bp, bp->b_bufobj->bo_object)); BUF_CHECK_MAPPED(bp); } allocbuf(bp, size); bufspace_release(maxsize); bp->b_flags &= ~B_DONE; } CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp); BUF_ASSERT_HELD(bp); end: buf_track(bp, __func__); KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); return (bp); } /* * Get an empty, disassociated buffer of given size. The buffer is initially * set to B_INVAL. */ struct buf * geteblk(int size, int flags) { struct buf *bp; int maxsize; maxsize = (size + BKVAMASK) & ~BKVAMASK; while ((bp = getnewbuf(NULL, 0, 0, maxsize, flags)) == NULL) { if ((flags & GB_NOWAIT_BD) && (curthread->td_pflags & TDP_BUFNEED) != 0) return (NULL); } allocbuf(bp, size); bufspace_release(maxsize); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ BUF_ASSERT_HELD(bp); return (bp); } /* * Truncate the backing store for a non-vmio buffer. */ static void vfs_nonvmio_truncate(struct buf *bp, int newbsize) { if (bp->b_flags & B_MALLOC) { /* * malloced buffers are not shrunk */ if (newbsize == 0) { bufmallocadjust(bp, 0); free(bp->b_data, M_BIOBUF); bp->b_data = bp->b_kvabase; bp->b_flags &= ~B_MALLOC; } return; } vm_hold_free_pages(bp, newbsize); bufspace_adjust(bp, newbsize); } /* * Extend the backing for a non-VMIO buffer. */ static void vfs_nonvmio_extend(struct buf *bp, int newbsize) { caddr_t origbuf; int origbufsize; /* * We only use malloced memory on the first allocation. * and revert to page-allocated memory when the buffer * grows. * * There is a potential smp race here that could lead * to bufmallocspace slightly passing the max. It * is probably extremely rare and not worth worrying * over. */ if (bp->b_bufsize == 0 && newbsize <= PAGE_SIZE/2 && bufmallocspace < maxbufmallocspace) { bp->b_data = malloc(newbsize, M_BIOBUF, M_WAITOK); bp->b_flags |= B_MALLOC; bufmallocadjust(bp, newbsize); return; } /* * If the buffer is growing on its other-than-first * allocation then we revert to the page-allocation * scheme. */ origbuf = NULL; origbufsize = 0; if (bp->b_flags & B_MALLOC) { origbuf = bp->b_data; origbufsize = bp->b_bufsize; bp->b_data = bp->b_kvabase; bufmallocadjust(bp, 0); bp->b_flags &= ~B_MALLOC; newbsize = round_page(newbsize); } vm_hold_load_pages(bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); if (origbuf != NULL) { bcopy(origbuf, bp->b_data, origbufsize); free(origbuf, M_BIOBUF); } bufspace_adjust(bp, newbsize); } /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). This code is able to * resize a buffer up or down. * * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistent data situations. Tread lightly!!! * There are B_CACHE and B_DELWRI interactions that must be dealt with by * the caller. Calling this code willy nilly can result in the loss of data. * * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with * B_CACHE for the non-VMIO case. 
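 *
 * A hedged illustration (hypothetical sizes): allocbuf(bp, 8192) on a
 * 4096-byte VMIO buffer rounds the request to a DEV_BSIZE multiple,
 * recomputes the desired page count from b_offset, and extends the
 * backing pages; whether B_CACHE remains set then depends on the
 * validity of the newly covered backing store.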
*/ int allocbuf(struct buf *bp, int size) { int newbsize; BUF_ASSERT_HELD(bp); if (bp->b_bcount == size) return (1); if (bp->b_kvasize != 0 && bp->b_kvasize < size) panic("allocbuf: buffer too small"); newbsize = roundup2(size, DEV_BSIZE); if ((bp->b_flags & B_VMIO) == 0) { if ((bp->b_flags & B_MALLOC) == 0) newbsize = round_page(newbsize); /* * Just get anonymous memory from the kernel. Don't * mess with B_CACHE. */ if (newbsize < bp->b_bufsize) vfs_nonvmio_truncate(bp, newbsize); else if (newbsize > bp->b_bufsize) vfs_nonvmio_extend(bp, newbsize); } else { int desiredpages; desiredpages = (size == 0) ? 0 : num_pages((bp->b_offset & PAGE_MASK) + newbsize); if (bp->b_flags & B_MALLOC) panic("allocbuf: VMIO buffer can't be malloced"); /* * Set B_CACHE initially if buffer is 0 length or will become * 0-length. */ if (size == 0 || bp->b_bufsize == 0) bp->b_flags |= B_CACHE; if (newbsize < bp->b_bufsize) vfs_vmio_truncate(bp, desiredpages); /* XXX This looks as if it should be newbsize > b_bufsize */ else if (size > bp->b_bcount) vfs_vmio_extend(bp, desiredpages, size); bufspace_adjust(bp, newbsize); } bp->b_bcount = size; /* requested buffer size. */ return (1); } extern int inflight_transient_maps; void biodone(struct bio *bp) { struct mtx *mtxp; void (*done)(struct bio *); vm_offset_t start, end; biotrack(bp, __func__); if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) { bp->bio_flags &= ~BIO_TRANSIENT_MAPPING; bp->bio_flags |= BIO_UNMAPPED; start = trunc_page((vm_offset_t)bp->bio_data); end = round_page((vm_offset_t)bp->bio_data + bp->bio_length); bp->bio_data = unmapped_buf; pmap_qremove(start, atop(end - start)); vmem_free(transient_arena, start, end - start); atomic_add_int(&inflight_transient_maps, -1); } done = bp->bio_done; if (done == NULL) { mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->bio_flags |= BIO_DONE; wakeup(bp); mtx_unlock(mtxp); } else done(bp); } /* * Wait for a BIO to finish. */ int biowait(struct bio *bp, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->bio_flags & BIO_DONE) == 0) msleep(bp, mtxp, PRIBIO, wchan, 0); mtx_unlock(mtxp); if (bp->bio_error != 0) return (bp->bio_error); if (!(bp->bio_flags & BIO_ERROR)) return (0); return (EIO); } void biofinish(struct bio *bp, struct devstat *stat, int error) { if (error) { bp->bio_error = error; bp->bio_flags |= BIO_ERROR; } if (stat != NULL) devstat_end_transaction_bio(stat, bp); biodone(bp); } #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) void biotrack_buf(struct bio *bp, const char *location) { buf_track(bp->bio_track_bp, location); } #endif /* * bufwait: * * Wait for buffer I/O completion, returning error status. The buffer * is left locked and B_DONE on return. B_EINTR is converted into an EINTR * error and cleared. */ int bufwait(struct buf *bp) { if (bp->b_iocmd == BIO_READ) bwait(bp, PRIBIO, "biord"); else bwait(bp, PRIBIO, "biowr"); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_ioflags & BIO_ERROR) { return (bp->b_error ? bp->b_error : EIO); } else { return (0); } } /* * bufdone: * * Finish I/O on a buffer, optionally calling a completion function. * This is usually called from an interrupt so process blocking is * not allowed. * * biodone is also responsible for setting B_CACHE in a B_VMIO bp. * In a non-VMIO bp, B_CACHE will be set on the next getblk() * assuming B_INVAL is clear. * * For the VMIO case, we set B_CACHE if the op was a read and no * read error occurred, or if the op was a write. 
B_CACHE is never * set if the buffer is invalid or otherwise uncacheable. * * biodone does not mess with B_INVAL, allowing the I/O routine or the * initiator to leave B_INVAL set to brelse the buffer out of existence * in the biodone routine. */ void bufdone(struct buf *bp) { struct bufobj *dropobj; void (*biodone)(struct buf *); buf_track(bp, __func__); CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); dropobj = NULL; KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); BUF_ASSERT_HELD(bp); runningbufwakeup(bp); if (bp->b_iocmd == BIO_WRITE) dropobj = bp->b_bufobj; else if ((bp->b_flags & B_CKHASH) != 0) { KASSERT(buf_mapped(bp), ("biodone: bp %p not mapped", bp)); (*bp->b_ckhashcalc)(bp); } /* call optional completion function if requested */ if (bp->b_iodone != NULL) { biodone = bp->b_iodone; bp->b_iodone = NULL; (*biodone) (bp); if (dropobj) bufobj_wdrop(dropobj); return; } bufdone_finish(bp); if (dropobj) bufobj_wdrop(dropobj); } void bufdone_finish(struct buf *bp) { BUF_ASSERT_HELD(bp); if (!LIST_EMPTY(&bp->b_dep)) buf_complete(bp); if (bp->b_flags & B_VMIO) { /* * Set B_CACHE if the op was a normal read and no error * occurred. B_CACHE is set for writes in the b*write() * routines. */ if (bp->b_iocmd == BIO_READ && !(bp->b_flags & (B_INVAL|B_NOCACHE)) && !(bp->b_ioflags & BIO_ERROR)) bp->b_flags |= B_CACHE; vfs_vmio_iodone(bp); } /* * For asynchronous completions, release the buffer now. The brelse * will do a wakeup there if necessary - so no need to do a wakeup * here in the async case. The sync case always needs to do a wakeup. */ if (bp->b_flags & B_ASYNC) { if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR)) brelse(bp); else bqrelse(bp); } else bdone(bp); } /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistent. */ void vfs_unbusy_pages(struct buf *bp) { int i; vm_object_t obj; vm_page_t m; runningbufwakeup(bp); if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); if (!m) panic("vfs_unbusy_pages: page missing\n"); bp->b_pages[i] = m; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); } vm_page_sunbusy(m); } vm_object_pip_wakeupn(obj, bp->b_npages); VM_OBJECT_WUNLOCK(obj); } /* * vfs_page_set_valid: * * Set the valid bits in a page based on the supplied offset. The * range is restricted to the buffer's size. * * This routine is typically called after a read completes. */ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t eoff; /* * Compute the end offset, eoff, such that [off, eoff) does not span a * page boundary and eoff is not greater than the end of the buffer. * The end of the buffer, in this case, is our file EOF, not the * allocation size of the buffer. */ eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > off) vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off); } /* * vfs_page_set_validclean: * * Set the valid bits and clear the dirty bits in a page based on the * supplied offset. The range is restricted to the buffer's size. 
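 *
 * Worked example (illustrative values, 4 KB pages): for off = 0x1a00
 * with the buffer extending past this page, eoff rounds up to 0x2000,
 * so the code below reduces to vm_page_set_validclean(m, 0xa00, 0x600).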
*/ static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t soff, eoff; /* * Start and end offsets in buffer. eoff - soff may not cross a * page boundary or cross the end of the buffer. The end of the * buffer, in this case, is our file EOF, not the allocation size * of the buffer. */ soff = off; eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > soff) { vm_page_set_validclean( m, (vm_offset_t) (soff & PAGE_MASK), (vm_offset_t) (eoff - soff) ); } } /* * Ensure that all buffer pages are not exclusive busied. If any page is * exclusive busy, drain it. */ void vfs_drain_busy_pages(struct buf *bp) { vm_page_t m; int i, last_busied; VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object); last_busied = 0; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (vm_page_xbusied(m)) { for (; last_busied < i; last_busied++) vm_page_sbusy(bp->b_pages[last_busied]); while (vm_page_xbusied(m)) { vm_page_lock(m); VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); vm_page_busy_sleep(m, "vbpage", true); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); } } } for (i = 0; i < last_busied; i++) vm_page_sunbusy(bp->b_pages[i]); } /* * This routine is called before a device strategy routine. * It is used to tell the VM system that paging I/O is in * progress, and treat the pages associated with the buffer * almost as being exclusive busy. Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistent. * * Since I/O has not been initiated yet, certain buffer flags * such as BIO_ERROR or B_INVAL may be in an inconsistent state * and should be ignored. */ void vfs_busy_pages(struct buf *bp, int clear_modify) { vm_object_t obj; vm_ooffset_t foff; vm_page_t m; int i; bool bogus; if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_busy_pages: no buffer offset")); VM_OBJECT_WLOCK(obj); vfs_drain_busy_pages(bp); if (bp->b_bufsize != 0) vfs_setdirty_locked_object(bp); bogus = false; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if ((bp->b_flags & B_CLUSTER) == 0) { vm_object_pip_add(obj, 1); vm_page_sbusy(m); } /* * When readying a buffer for a read ( i.e * clear_modify == 0 ), it is important to do * bogus_page replacement for valid pages in * partially instantiated buffers. Partially * instantiated buffers can, in turn, occur when * reconstituting a buffer from its VM backing store * base. We only have to do this if B_CACHE is * clear ( which causes the I/O to occur in the * first place ). The replacement prevents the read * I/O from overwriting potentially dirty VM-backed * pages. XXX bogus page replacement is, uh, bogus. * It may not work properly with small-block devices. * We need to find a better way. */ if (clear_modify) { pmap_remove_write(m); vfs_page_set_validclean(bp, foff, m); } else if (m->valid == VM_PAGE_BITS_ALL && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; bogus = true; } foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } VM_OBJECT_WUNLOCK(obj); if (bogus && buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * vfs_bio_set_valid: * * Set the range within the buffer to valid. The range is * relative to the beginning of the buffer, b_offset. 
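 *
 * Worked example (illustrative values): if (b_offset & PAGE_MASK) is
 * 0x200 and the caller passes base = 0x100, the fixup below starts at
 * page 0 with an in-page offset of 0x300 and an initial span of
 * n = PAGE_SIZE - 0x300 bytes.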
Note that * b_offset itself may be offset from the beginning of the first * page. */ void vfs_bio_set_valid(struct buf *bp, int base, int size) { int i, n; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; /* * Fixup base to be relative to beginning of first page. * Set initial n to be the maximum number of bytes in the * first page that can be validated. */ base += (bp->b_offset & PAGE_MASK); n = PAGE_SIZE - (base & PAGE_MASK); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; vm_page_set_valid_range(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); } /* * vfs_bio_clrbuf: * * If the specified buffer is a non-VMIO buffer, clear the entire * buffer. If the specified buffer is a VMIO buffer, clear and * validate only the previously invalid portions of the buffer. * This routine essentially fakes an I/O, so we need to clear * BIO_ERROR and B_INVAL. * * Note that while we only theoretically need to clear through b_bcount, * we go ahead and clear through b_bufsize. */ void vfs_bio_clrbuf(struct buf *bp) { int i, j, mask, sa, ea, slide; if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) { clrbuf(bp); return; } bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && (bp->b_offset & PAGE_MASK) == 0) { if (bp->b_pages[0] == bogus_page) goto unlock; mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object); if ((bp->b_pages[0]->valid & mask) == mask) goto unlock; if ((bp->b_pages[0]->valid & mask) == 0) { pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize); bp->b_pages[0]->valid |= mask; goto unlock; } } sa = bp->b_offset & PAGE_MASK; slide = 0; for (i = 0; i < bp->b_npages; i++, sa = 0) { slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize); ea = slide & PAGE_MASK; if (ea == 0) ea = PAGE_SIZE; if (bp->b_pages[i] == bogus_page) continue; j = sa / DEV_BSIZE; mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object); if ((bp->b_pages[i]->valid & mask) == mask) continue; if ((bp->b_pages[i]->valid & mask) == 0) pmap_zero_page_area(bp->b_pages[i], sa, ea - sa); else { for (; sa < ea; sa += DEV_BSIZE, j++) { if ((bp->b_pages[i]->valid & (1 << j)) == 0) { pmap_zero_page_area(bp->b_pages[i], sa, DEV_BSIZE); } } } bp->b_pages[i]->valid |= mask; } unlock: VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); bp->b_resid = 0; } void vfs_bio_bzero_buf(struct buf *bp, int base, int size) { vm_page_t m; int i, n; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); bzero(bp->b_data + base, size); } else { BUF_CHECK_UNMAPPED(bp); n = PAGE_SIZE - (base & PAGE_MASK); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; pmap_zero_page_area(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } } } /* * Update buffer flags based on I/O request parameters, optionally releasing the * buffer. If it's VMIO or direct I/O, the buffer pages are released to the VM, * where they may be placed on a page queue (VMIO) or freed immediately (direct * I/O). Otherwise the buffer is released to the cache. 
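 *
 * One illustrative case (not exhaustive): a read issued with IO_DIRECT
 * marks the buffer B_DIRECT and, having no dependencies, B_RELBUF as
 * well, so a release through this path discards the buffer rather than
 * caching it.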
*/ static void b_io_dismiss(struct buf *bp, int ioflag, bool release) { KASSERT((ioflag & IO_NOREUSE) == 0 || (ioflag & IO_VMIO) != 0, ("buf %p non-VMIO noreuse", bp)); if ((ioflag & IO_DIRECT) != 0) bp->b_flags |= B_DIRECT; if ((ioflag & (IO_VMIO | IO_DIRECT)) != 0 && LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; if ((ioflag & IO_NOREUSE) != 0) bp->b_flags |= B_NOREUSE; if (release) brelse(bp); } else if (release) bqrelse(bp); } void vfs_bio_brelse(struct buf *bp, int ioflag) { b_io_dismiss(bp, ioflag, true); } void vfs_bio_set_flags(struct buf *bp, int ioflag) { b_io_dismiss(bp, ioflag, false); } /* * vm_hold_load_pages and vm_hold_free_pages get pages into * a buffers address space. The pages are anonymous and are * not associated with a file object. */ static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index; BUF_CHECK_MAPPED(bp); to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; for (pg = from; pg < to; pg += PAGE_SIZE, index++) { /* * note: must allocate system pages since blocking here * could interfere with paging I/O, no matter which * process we are. */ p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT) | VM_ALLOC_WAITOK); pmap_qenter(pg, &p, 1); bp->b_pages[index] = p; } bp->b_npages = index; } /* Return pages associated with this buf to the vm system */ static void vm_hold_free_pages(struct buf *bp, int newbsize) { vm_offset_t from; vm_page_t p; int index, newnpages; BUF_CHECK_MAPPED(bp); from = round_page((vm_offset_t)bp->b_data + newbsize); newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; if (bp->b_npages > newnpages) pmap_qremove(from, bp->b_npages - newnpages); for (index = newnpages; index < bp->b_npages; index++) { p = bp->b_pages[index]; bp->b_pages[index] = NULL; p->wire_count--; vm_page_free(p); } atomic_subtract_int(&vm_cnt.v_wire_count, bp->b_npages - newnpages); bp->b_npages = newnpages; } /* * Map an IO request into kernel virtual address space. * * All requests are (re)mapped into kernel VA space. * Notice that we use b_bufsize for the size of the buffer * to be mapped. b_bcount might be modified by the driver. * * Note that even if the caller determines that the address space should * be valid, a race or a smaller-file mapped into a larger space may * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST * check the return value. * * This function only works with pager buffers. */ int vmapbuf(struct buf *bp, int mapbuf) { vm_prot_t prot; int pidx; if (bp->b_bufsize < 0) return (-1); prot = VM_PROT_READ; if (bp->b_iocmd == BIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages, btoc(MAXPHYS))) < 0) return (-1); bp->b_npages = pidx; bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK; if (mapbuf || !unmapped_buf_allowed) { pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, pidx); bp->b_data = bp->b_kvabase + bp->b_offset; } else bp->b_data = unmapped_buf; return(0); } /* * Free the io map PTEs associated with this IO operation. * We also invalidate the TLB entries and restore the original b_addr. * * This function only works with pager buffers. 
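 *
 * Sketch of the usual physio-style pairing (illustrative only):
 *
 *	if (vmapbuf(bp, 1) < 0)
 *		return (EFAULT);
 *	... start the transfer and wait for completion ...
 *	vunmapbuf(bp);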
*/ void vunmapbuf(struct buf *bp) { int npages; npages = bp->b_npages; if (buf_mapped(bp)) pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); vm_page_unhold_pages(bp->b_pages, npages); bp->b_data = unmapped_buf; } void bdone(struct buf *bp) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->b_flags |= B_DONE; wakeup(bp); mtx_unlock(mtxp); } void bwait(struct buf *bp, u_char pri, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->b_flags & B_DONE) == 0) msleep(bp, mtxp, pri, wchan, 0); mtx_unlock(mtxp); } int bufsync(struct bufobj *bo, int waitfor) { return (VOP_FSYNC(bo2vnode(bo), waitfor, curthread)); } void bufstrategy(struct bufobj *bo, struct buf *bp) { int i = 0; struct vnode *vp; vp = bp->b_vp; KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy")); KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp)); i = VOP_STRATEGY(vp, bp); KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); } void bufobj_wrefl(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); ASSERT_BO_WLOCKED(bo); bo->bo_numoutput++; } void bufobj_wref(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); BO_LOCK(bo); bo->bo_numoutput++; BO_UNLOCK(bo); } void bufobj_wdrop(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop")); BO_LOCK(bo); KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count")); if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) { bo->bo_flag &= ~BO_WWAIT; wakeup(&bo->bo_numoutput); } BO_UNLOCK(bo); } int bufobj_wwait(struct bufobj *bo, int slpflag, int timeo) { int error; KASSERT(bo != NULL, ("NULL bo in bufobj_wwait")); ASSERT_BO_WLOCKED(bo); error = 0; while (bo->bo_numoutput) { bo->bo_flag |= BO_WWAIT; error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo), slpflag | (PRIBIO + 1), "bo_wwait", timeo); if (error) break; } return (error); } /* * Set bio_data or bio_ma for struct bio from the struct buf. */ void bdata2bio(struct buf *bp, struct bio *bip) { if (!buf_mapped(bp)) { KASSERT(unmapped_buf_allowed, ("unmapped")); bip->bio_ma = bp->b_pages; bip->bio_ma_n = bp->b_npages; bip->bio_data = unmapped_buf; bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; bip->bio_flags |= BIO_UNMAPPED; KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) / PAGE_SIZE == bp->b_npages, ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset, (long long)bip->bio_length, bip->bio_ma_n)); } else { bip->bio_data = bp->b_data; bip->bio_ma = NULL; } } /* * The MIPS pmap code currently doesn't handle aliased pages. * The VIPT caches may not handle page aliasing themselves, leading * to data corruption. * * As such, this code makes a system extremely unhappy if said * system doesn't support unaliasing the above situation in hardware. * Some "recent" systems (eg some mips24k/mips74k cores) don't enable * this feature at build time, so it has to be handled in software. * * Once the MIPS pmap/cache code grows to support this function on * earlier chips, it should be flipped back off. */ #ifdef __mips__ static int buf_pager_relbuf = 1; #else static int buf_pager_relbuf = 0; #endif SYSCTL_INT(_vfs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN, &buf_pager_relbuf, 0, "Make buffer pager release buffers after reading"); /* * The buffer pager. It uses buffer reads to validate pages. 
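 *
 * A filesystem typically reaches this function from its VOP_GETPAGES
 * implementation; a minimal sketch, with hypothetical foo_* callbacks
 * supplying the logical block number and block size, is:
 *
 *	static int
 *	foo_getpages(struct vop_getpages_args *ap)
 *	{
 *		return (vfs_bio_getpages(ap->a_vp, ap->a_m, ap->a_count,
 *		    ap->a_rbehind, ap->a_rahead, foo_gbp_getblkno,
 *		    foo_gbp_getblksz));
 *	}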
* * In contrast to the generic local pager from vm/vnode_pager.c, this * pager correctly and easily handles volumes where the underlying * device block size is greater than the machine page size. The * buffer cache transparently extends the requested page run to be * aligned at the block boundary, and does the necessary bogus page * replacements in the addends to avoid obliterating already valid * pages. * * The only non-trivial issue is that the exclusive busy state for * pages, which is assumed by the vm_pager_getpages() interface, is * incompatible with the VMIO buffer cache's desire to share-busy the * pages. This function performs a trivial downgrade of the pages' * state before reading buffers, and a less trivial upgrade from the * shared-busy to excl-busy state after the read. */ int vfs_bio_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno, vbg_get_blksize_t get_blksize) { vm_page_t m; vm_object_t object; struct buf *bp; struct mount *mp; daddr_t lbn, lbnp; vm_ooffset_t la, lb, poff, poffe; long bsize; int bo_bs, br_flags, error, i, pgsin, pgsin_a, pgsin_b; bool redo, lpart; object = vp->v_object; mp = vp->v_mount; la = IDX_TO_OFF(ma[count - 1]->pindex); if (la >= object->un_pager.vnp.vnp_size) return (VM_PAGER_BAD); lpart = la + PAGE_SIZE > object->un_pager.vnp.vnp_size; bo_bs = get_blksize(vp, get_lblkno(vp, IDX_TO_OFF(ma[0]->pindex))); /* * Calculate read-ahead, behind and total pages. */ pgsin = count; lb = IDX_TO_OFF(ma[0]->pindex); pgsin_b = OFF_TO_IDX(lb - rounddown2(lb, bo_bs)); pgsin += pgsin_b; if (rbehind != NULL) *rbehind = pgsin_b; pgsin_a = OFF_TO_IDX(roundup2(la, bo_bs) - la); if (la + IDX_TO_OFF(pgsin_a) >= object->un_pager.vnp.vnp_size) pgsin_a = OFF_TO_IDX(roundup2(object->un_pager.vnp.vnp_size, PAGE_SIZE) - la); pgsin += pgsin_a; if (rahead != NULL) *rahead = pgsin_a; VM_CNT_INC(v_vnodein); VM_CNT_ADD(v_vnodepgsin, pgsin); br_flags = (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0) ? GB_UNMAPPED : 0; VM_OBJECT_WLOCK(object); again: for (i = 0; i < count; i++) vm_page_busy_downgrade(ma[i]); VM_OBJECT_WUNLOCK(object); lbnp = -1; for (i = 0; i < count; i++) { m = ma[i]; /* * Pages are shared busy and the object lock is not * owned, which together allow for the pages' * invalidation. The racy test for validity avoids * useless creation of the buffer for the most typical * case when invalidation is not used in redo or for * parallel read. The shared->excl upgrade loop at * the end of the function catches the race in a * reliable way (protected by the object lock). */ if (m->valid == VM_PAGE_BITS_ALL) continue; poff = IDX_TO_OFF(m->pindex); poffe = MIN(poff + PAGE_SIZE, object->un_pager.vnp.vnp_size); for (; poff < poffe; poff += bsize) { lbn = get_lblkno(vp, poff); if (lbn == lbnp) goto next_page; lbnp = lbn; bsize = get_blksize(vp, lbn); error = bread_gb(vp, lbn, bsize, curthread->td_ucred, br_flags, &bp); if (error != 0) goto end_pages; if (LIST_EMPTY(&bp->b_dep)) { /* * Invalidation clears m->valid, but * may leave B_CACHE flag if the * buffer existed at the invalidation * time. In this case, recycle the * buffer to do real read on next * bread() after redo. * * Otherwise B_RELBUF is not strictly * necessary, enable to reduce buf * cache pressure. 
*/ if (buf_pager_relbuf || m->valid != VM_PAGE_BITS_ALL) bp->b_flags |= B_RELBUF; bp->b_flags &= ~B_NOCACHE; brelse(bp); } else { bqrelse(bp); } } KASSERT(1 /* racy, enable for debugging */ || m->valid == VM_PAGE_BITS_ALL || i == count - 1, ("buf %d %p invalid", i, m)); if (i == count - 1 && lpart) { VM_OBJECT_WLOCK(object); if (m->valid != 0 && m->valid != VM_PAGE_BITS_ALL) vm_page_zero_invalid(m, TRUE); VM_OBJECT_WUNLOCK(object); } next_page:; } end_pages: VM_OBJECT_WLOCK(object); redo = false; for (i = 0; i < count; i++) { vm_page_sunbusy(ma[i]); ma[i] = vm_page_grab(object, ma[i]->pindex, VM_ALLOC_NORMAL); /* * Since the pages were only sbusy while neither the * buffer nor the object lock was held by us, or * reallocated while vm_page_grab() slept for busy * relinguish, they could have been invalidated. * Recheck the valid bits and re-read as needed. * * Note that the last page is made fully valid in the * read loop, and partial validity for the page at * index count - 1 could mean that the page was * invalidated or removed, so we must restart for * safety as well. */ if (ma[i]->valid != VM_PAGE_BITS_ALL) redo = true; } if (redo && error == 0) goto again; VM_OBJECT_WUNLOCK(object); return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK); } #include "opt_ddb.h" #ifdef DDB #include /* DDB command to show buffer data */ DB_SHOW_COMMAND(buffer, db_show_buffer) { /* get args */ struct buf *bp = (struct buf *)addr; #ifdef FULL_BUF_TRACKING uint32_t i, j; #endif if (!have_addr) { db_printf("usage: show buffer \n"); return; } db_printf("buf at %p\n", bp); db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags, PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS); db_printf( "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, " "b_dep = %p\n", bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno, bp->b_dep.lh_first); db_printf("b_kvabase = %p, b_kvasize = %d\n", bp->b_kvabase, bp->b_kvasize); if (bp->b_npages) { int i; db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); for (i = 0; i < bp->b_npages; i++) { vm_page_t m; m = bp->b_pages[i]; if (m != NULL) db_printf("(%p, 0x%lx, 0x%lx)", m->object, (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); else db_printf("( ??? 
)"); if ((i + 1) < bp->b_npages) db_printf(","); } db_printf("\n"); } #if defined(FULL_BUF_TRACKING) db_printf("b_io_tracking: b_io_tcnt = %u\n", bp->b_io_tcnt); i = bp->b_io_tcnt % BUF_TRACKING_SIZE; for (j = 1; j <= BUF_TRACKING_SIZE; j++) { if (bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)] == NULL) continue; db_printf(" %2u: %s\n", j, bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)]); } #elif defined(BUF_TRACKING) db_printf("b_io_tracking: %s\n", bp->b_io_tracking); #endif db_printf(" "); BUF_LOCKPRINTINFO(bp); } DB_SHOW_COMMAND(lockedbufs, lockedbufs) { struct buf *bp; int i; for (i = 0; i < nbuf; i++) { bp = &buf[i]; if (BUF_ISLOCKED(bp)) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); if (db_pager_quit) break; } } } DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs) { struct vnode *vp; struct buf *bp; if (!have_addr) { db_printf("usage: show vnodebufs \n"); return; } vp = (struct vnode *)addr; db_printf("Clean buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } db_printf("Dirty buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } } DB_COMMAND(countfreebufs, db_coundfreebufs) { struct buf *bp; int i, used = 0, nfree = 0; if (have_addr) { db_printf("usage: countfreebufs\n"); return; } for (i = 0; i < nbuf; i++) { bp = &buf[i]; if (bp->b_qindex == QUEUE_EMPTY) nfree++; else used++; } db_printf("Counted %d free, %d used (%d tot)\n", nfree, used, nfree + used); db_printf("numfreebuffers is %d\n", numfreebuffers); } #endif /* DDB */ Index: head/sys/kern/vfs_extattr.c =================================================================== --- head/sys/kern/vfs_extattr.c (revision 326270) +++ head/sys/kern/vfs_extattr.c (revision 326271) @@ -1,765 +1,767 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1999-2001 Robert N. M. Watson * All rights reserved. * * This software was developed by Robert Watson for the TrustedBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Syscall to push extended attribute configuration information into the VFS. * Accepts a path, which it converts to a mountpoint, as well as a command * (int cmd), and attribute name and misc data. * * Currently this is used only by UFS1 extended attributes. */ int sys_extattrctl(td, uap) struct thread *td; struct extattrctl_args /* { const char *path; int cmd; const char *filename; int attrnamespace; const char *attrname; } */ *uap; { struct vnode *filename_vp; struct nameidata nd; struct mount *mp, *mp_writable; char attrname[EXTATTR_MAXNAMELEN]; int error; AUDIT_ARG_CMD(uap->cmd); AUDIT_ARG_VALUE(uap->attrnamespace); /* * uap->attrname is not always defined. We check again later when we * invoke the VFS call so as to pass in NULL there if needed. */ if (uap->attrname != NULL) { error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); } AUDIT_ARG_TEXT(attrname); mp = NULL; filename_vp = NULL; if (uap->filename != NULL) { NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE2, UIO_USERSPACE, uap->filename, td); error = namei(&nd); if (error) return (error); filename_vp = nd.ni_vp; NDFREE(&nd, NDF_NO_VP_RELE); } /* uap->path is always defined. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) goto out; mp = nd.ni_vp->v_mount; error = vfs_busy(mp, 0); if (error) { NDFREE(&nd, 0); mp = NULL; goto out; } VOP_UNLOCK(nd.ni_vp, 0); error = vn_start_write(nd.ni_vp, &mp_writable, V_WAIT | PCATCH); NDFREE(&nd, NDF_NO_VP_UNLOCK); if (error) goto out; if (filename_vp != NULL) { /* * uap->filename is not always defined. If it is, * grab a vnode lock, which VFS_EXTATTRCTL() will * later release. */ error = vn_lock(filename_vp, LK_EXCLUSIVE); if (error) { vn_finished_write(mp_writable); goto out; } } error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp, uap->attrnamespace, uap->attrname != NULL ? attrname : NULL); vn_finished_write(mp_writable); out: if (mp != NULL) vfs_unbusy(mp); /* * VFS_EXTATTRCTL will have unlocked, but not de-ref'd, filename_vp, * so vrele it if it is defined. */ if (filename_vp != NULL) vrele(filename_vp); return (error); } /*- * Set a named extended attribute on a file or directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", userspace buffer * pointer "data", buffer length "nbytes", thread "td". 
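 * Example:	(illustrative only) reached from userspace via, e.g.,
 *		extattr_set_file(path, EXTATTR_NAMESPACE_USER, "tag",
 *		buf, buflen)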
* Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname, void *data, size_t nbytes, struct thread *td) { struct mount *mp; struct uio auio; struct iovec aiov; ssize_t cnt; int error; error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); aiov.iov_base = data; aiov.iov_len = nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; if (nbytes > IOSIZE_MAX) { error = EINVAL; goto done; } auio.uio_resid = nbytes; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; cnt = nbytes; #ifdef MAC error = mac_vnode_check_setextattr(td->td_ucred, vp, attrnamespace, attrname); if (error) goto done; #endif error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, td->td_ucred, td); cnt -= auio.uio_resid; td->td_retval[0] = cnt; done: VOP_UNLOCK(vp, 0); vn_finished_write(mp); return (error); } int sys_extattr_set_fd(td, uap) struct thread *td; struct extattr_set_fd_args /* { int fd; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct file *fp; char attrname[EXTATTR_MAXNAMELEN]; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_VALUE(uap->attrnamespace); error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); AUDIT_ARG_TEXT(attrname); error = getvnode(td, uap->fd, cap_rights_init(&rights, CAP_EXTATTR_SET), &fp); if (error) return (error); error = extattr_set_vp(fp->f_vnode, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); fdrop(fp, td); return (error); } int sys_extattr_set_file(td, uap) struct thread *td; struct extattr_set_file_args /* { const char *path; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; AUDIT_ARG_VALUE(uap->attrnamespace); error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); AUDIT_ARG_TEXT(attrname); NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); vrele(nd.ni_vp); return (error); } int sys_extattr_set_link(td, uap) struct thread *td; struct extattr_set_link_args /* { const char *path; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; AUDIT_ARG_VALUE(uap->attrnamespace); error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); AUDIT_ARG_TEXT(attrname); NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); vrele(nd.ni_vp); return (error); } /*- * Get a named extended attribute on a file or directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", userspace buffer * pointer "data", buffer length "nbytes", thread "td". 
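 * Example:	(illustrative only) extattr_get_file() and relatives land
 *		here; passing a NULL "data" pointer returns only the
 *		attribute's size in td_retval[0]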
* Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname, void *data, size_t nbytes, struct thread *td) { struct uio auio, *auiop; struct iovec aiov; ssize_t cnt; size_t size, *sizep; int error; vn_lock(vp, LK_SHARED | LK_RETRY); /* * Slightly unusual semantics: if the user provides a NULL data * pointer, they don't want to receive the data, just the maximum * read length. */ auiop = NULL; sizep = NULL; cnt = 0; if (data != NULL) { aiov.iov_base = data; aiov.iov_len = nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; if (nbytes > IOSIZE_MAX) { error = EINVAL; goto done; } auio.uio_resid = nbytes; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auiop = &auio; cnt = nbytes; } else sizep = &size; #ifdef MAC error = mac_vnode_check_getextattr(td->td_ucred, vp, attrnamespace, attrname); if (error) goto done; #endif error = VOP_GETEXTATTR(vp, attrnamespace, attrname, auiop, sizep, td->td_ucred, td); if (auiop != NULL) { cnt -= auio.uio_resid; td->td_retval[0] = cnt; } else td->td_retval[0] = size; done: VOP_UNLOCK(vp, 0); return (error); } int sys_extattr_get_fd(td, uap) struct thread *td; struct extattr_get_fd_args /* { int fd; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct file *fp; char attrname[EXTATTR_MAXNAMELEN]; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_VALUE(uap->attrnamespace); error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); AUDIT_ARG_TEXT(attrname); error = getvnode(td, uap->fd, cap_rights_init(&rights, CAP_EXTATTR_GET), &fp); if (error) return (error); error = extattr_get_vp(fp->f_vnode, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); fdrop(fp, td); return (error); } int sys_extattr_get_file(td, uap) struct thread *td; struct extattr_get_file_args /* { const char *path; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; AUDIT_ARG_VALUE(uap->attrnamespace); error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); AUDIT_ARG_TEXT(attrname); NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); vrele(nd.ni_vp); return (error); } int sys_extattr_get_link(td, uap) struct thread *td; struct extattr_get_link_args /* { const char *path; int attrnamespace; const char *attrname; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; AUDIT_ARG_VALUE(uap->attrnamespace); error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); AUDIT_ARG_TEXT(attrname); NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); vrele(nd.ni_vp); return (error); } /* * extattr_delete_vp(): Delete a named extended attribute on a file or * directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", proc "p" * Returns: 0 on success, an 
error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname, struct thread *td) { struct mount *mp; int error; error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef MAC error = mac_vnode_check_deleteextattr(td->td_ucred, vp, attrnamespace, attrname); if (error) goto done; #endif error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, td->td_ucred, td); if (error == EOPNOTSUPP) error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, td->td_ucred, td); #ifdef MAC done: #endif VOP_UNLOCK(vp, 0); vn_finished_write(mp); return (error); } int sys_extattr_delete_fd(td, uap) struct thread *td; struct extattr_delete_fd_args /* { int fd; int attrnamespace; const char *attrname; } */ *uap; { struct file *fp; char attrname[EXTATTR_MAXNAMELEN]; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_VALUE(uap->attrnamespace); error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); AUDIT_ARG_TEXT(attrname); error = getvnode(td, uap->fd, cap_rights_init(&rights, CAP_EXTATTR_DELETE), &fp); if (error) return (error); error = extattr_delete_vp(fp->f_vnode, uap->attrnamespace, attrname, td); fdrop(fp, td); return (error); } int sys_extattr_delete_file(td, uap) struct thread *td; struct extattr_delete_file_args /* { const char *path; int attrnamespace; const char *attrname; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; AUDIT_ARG_VALUE(uap->attrnamespace); error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return(error); AUDIT_ARG_TEXT(attrname); NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return(error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td); vrele(nd.ni_vp); return(error); } int sys_extattr_delete_link(td, uap) struct thread *td; struct extattr_delete_link_args /* { const char *path; int attrnamespace; const char *attrname; } */ *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; AUDIT_ARG_VALUE(uap->attrnamespace); error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return(error); AUDIT_ARG_TEXT(attrname); NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return(error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td); vrele(nd.ni_vp); return(error); } /*- * Retrieve a list of extended attributes on a file or directory. * * Arguments: unlocked vnode "vp", attribute namespace 'attrnamespace", * userspace buffer pointer "data", buffer length "nbytes", * thread "td". 
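 * Example:	(illustrative only) extattr_list_file() lands here; the
 *		returned buffer holds length-prefixed entries, one length
 *		byte followed by that many name bytes per attribute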
* Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_list_vp(struct vnode *vp, int attrnamespace, void *data, size_t nbytes, struct thread *td) { struct uio auio, *auiop; size_t size, *sizep; struct iovec aiov; ssize_t cnt; int error; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); auiop = NULL; sizep = NULL; cnt = 0; if (data != NULL) { aiov.iov_base = data; aiov.iov_len = nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; if (nbytes > IOSIZE_MAX) { error = EINVAL; goto done; } auio.uio_resid = nbytes; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auiop = &auio; cnt = nbytes; } else sizep = &size; #ifdef MAC error = mac_vnode_check_listextattr(td->td_ucred, vp, attrnamespace); if (error) goto done; #endif error = VOP_LISTEXTATTR(vp, attrnamespace, auiop, sizep, td->td_ucred, td); if (auiop != NULL) { cnt -= auio.uio_resid; td->td_retval[0] = cnt; } else td->td_retval[0] = size; done: VOP_UNLOCK(vp, 0); return (error); } int sys_extattr_list_fd(td, uap) struct thread *td; struct extattr_list_fd_args /* { int fd; int attrnamespace; void *data; size_t nbytes; } */ *uap; { struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_VALUE(uap->attrnamespace); error = getvnode(td, uap->fd, cap_rights_init(&rights, CAP_EXTATTR_LIST), &fp); if (error) return (error); error = extattr_list_vp(fp->f_vnode, uap->attrnamespace, uap->data, uap->nbytes, td); fdrop(fp, td); return (error); } int sys_extattr_list_file(td, uap) struct thread*td; struct extattr_list_file_args /* { const char *path; int attrnamespace; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; int error; AUDIT_ARG_VALUE(uap->attrnamespace); NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data, uap->nbytes, td); vrele(nd.ni_vp); return (error); } int sys_extattr_list_link(td, uap) struct thread*td; struct extattr_list_link_args /* { const char *path; int attrnamespace; void *data; size_t nbytes; } */ *uap; { struct nameidata nd; int error; AUDIT_ARG_VALUE(uap->attrnamespace); NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data, uap->nbytes, td); vrele(nd.ni_vp); return (error); } Index: head/sys/kern/vfs_hash.c =================================================================== --- head/sys/kern/vfs_hash.c (revision 326270) +++ head/sys/kern/vfs_hash.c (revision 326271) @@ -1,232 +1,234 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include static MALLOC_DEFINE(M_VFS_HASH, "vfs_hash", "VFS hash table"); static LIST_HEAD(vfs_hash_head, vnode) *vfs_hash_tbl; static LIST_HEAD(,vnode) vfs_hash_side; static u_long vfs_hash_mask; static struct rwlock vfs_hash_lock; static void vfs_hashinit(void *dummy __unused) { vfs_hash_tbl = hashinit(desiredvnodes, M_VFS_HASH, &vfs_hash_mask); rw_init(&vfs_hash_lock, "vfs hash"); LIST_INIT(&vfs_hash_side); } /* Must be SI_ORDER_SECOND so desiredvnodes is available */ SYSINIT(vfs_hash, SI_SUB_VFS, SI_ORDER_SECOND, vfs_hashinit, NULL); u_int vfs_hash_index(struct vnode *vp) { return (vp->v_hash + vp->v_mount->mnt_hashseed); } static struct vfs_hash_head * vfs_hash_bucket(const struct mount *mp, u_int hash) { return (&vfs_hash_tbl[(hash + mp->mnt_hashseed) & vfs_hash_mask]); } int vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg) { struct vnode *vp; int error; while (1) { rw_rlock(&vfs_hash_lock); LIST_FOREACH(vp, vfs_hash_bucket(mp, hash), v_hashlist) { if (vp->v_hash != hash) continue; if (vp->v_mount != mp) continue; if (fn != NULL && fn(vp, arg)) continue; vhold(vp); rw_runlock(&vfs_hash_lock); error = vget(vp, flags | LK_VNHELD, td); if (error == ENOENT && (flags & LK_NOWAIT) == 0) break; if (error) return (error); *vpp = vp; return (0); } if (vp == NULL) { rw_runlock(&vfs_hash_lock); *vpp = NULL; return (0); } } } void vfs_hash_ref(const struct mount *mp, u_int hash, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg) { struct vnode *vp; while (1) { rw_rlock(&vfs_hash_lock); LIST_FOREACH(vp, vfs_hash_bucket(mp, hash), v_hashlist) { if (vp->v_hash != hash) continue; if (vp->v_mount != mp) continue; if (fn != NULL && fn(vp, arg)) continue; vhold(vp); rw_runlock(&vfs_hash_lock); vref(vp); vdrop(vp); *vpp = vp; return; } if (vp == NULL) { rw_runlock(&vfs_hash_lock); *vpp = NULL; return; } } } void vfs_hash_remove(struct vnode *vp) { rw_wlock(&vfs_hash_lock); LIST_REMOVE(vp, v_hashlist); rw_wunlock(&vfs_hash_lock); } int vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg) { struct vnode *vp2; int error; *vpp = NULL; while (1) { rw_wlock(&vfs_hash_lock); LIST_FOREACH(vp2, vfs_hash_bucket(vp->v_mount, hash), v_hashlist) { if (vp2->v_hash != hash) continue; if (vp2->v_mount != vp->v_mount) continue; if (fn != NULL && fn(vp2, arg)) continue; vhold(vp2); rw_wunlock(&vfs_hash_lock); error = vget(vp2, flags | LK_VNHELD, td); if (error == ENOENT && (flags & LK_NOWAIT) == 0) break; rw_wlock(&vfs_hash_lock); LIST_INSERT_HEAD(&vfs_hash_side, vp, v_hashlist); rw_wunlock(&vfs_hash_lock); 
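			/*
			 * A racing thread hashed a vnode with the same
			 * identity while we slept in vget().  Our new vnode
			 * is now redundant; parking it on the side list
			 * keeps it on a valid hash list, so that a later
			 * vfs_hash_remove() from the reclaim path can
			 * LIST_REMOVE() it once it is released below.
			 */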
vput(vp); if (!error) *vpp = vp2; return (error); } if (vp2 == NULL) break; } vp->v_hash = hash; LIST_INSERT_HEAD(vfs_hash_bucket(vp->v_mount, hash), vp, v_hashlist); rw_wunlock(&vfs_hash_lock); return (0); } void vfs_hash_rehash(struct vnode *vp, u_int hash) { rw_wlock(&vfs_hash_lock); LIST_REMOVE(vp, v_hashlist); LIST_INSERT_HEAD(vfs_hash_bucket(vp->v_mount, hash), vp, v_hashlist); vp->v_hash = hash; rw_wunlock(&vfs_hash_lock); } void vfs_hash_changesize(int newmaxvnodes) { struct vfs_hash_head *vfs_hash_newtbl, *vfs_hash_oldtbl; u_long vfs_hash_newmask, vfs_hash_oldmask; struct vnode *vp; int i; vfs_hash_newtbl = hashinit(newmaxvnodes, M_VFS_HASH, &vfs_hash_newmask); /* If same hash table size, nothing to do */ if (vfs_hash_mask == vfs_hash_newmask) { free(vfs_hash_newtbl, M_VFS_HASH); return; } /* * Move everything from the old hash table to the new table. * None of the vnodes in the table can be recycled because to * do so, they have to be removed from the hash table. */ rw_wlock(&vfs_hash_lock); vfs_hash_oldtbl = vfs_hash_tbl; vfs_hash_oldmask = vfs_hash_mask; vfs_hash_tbl = vfs_hash_newtbl; vfs_hash_mask = vfs_hash_newmask; for (i = 0; i <= vfs_hash_oldmask; i++) { while ((vp = LIST_FIRST(&vfs_hash_oldtbl[i])) != NULL) { LIST_REMOVE(vp, v_hashlist); LIST_INSERT_HEAD( vfs_hash_bucket(vp->v_mount, vp->v_hash), vp, v_hashlist); } } rw_wunlock(&vfs_hash_lock); free(vfs_hash_oldtbl, M_VFS_HASH); } Index: head/sys/libkern/arm/aeabi_unwind.c =================================================================== --- head/sys/libkern/arm/aeabi_unwind.c (revision 326270) +++ head/sys/libkern/arm/aeabi_unwind.c (revision 326271) @@ -1,59 +1,61 @@ -/* +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (C) 2013 Andrew Turner * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* */ #include __FBSDID("$FreeBSD$"); #include #ifdef _KERNEL #include #else #define panic(x) (void)0 #endif /* We need to provide these functions never call them */ void __aeabi_unwind_cpp_pr0(void); void __aeabi_unwind_cpp_pr1(void); void __aeabi_unwind_cpp_pr2(void); void __aeabi_unwind_cpp_pr0(void) { panic("__aeabi_unwind_cpp_pr0"); } void __aeabi_unwind_cpp_pr1(void) { panic("__aeabi_unwind_cpp_pr1"); } void __aeabi_unwind_cpp_pr2(void) { panic("__aeabi_unwind_cpp_pr2"); } Index: head/sys/libkern/arm/ldivmod_helper.c =================================================================== --- head/sys/libkern/arm/ldivmod_helper.c (revision 326270) +++ head/sys/libkern/arm/ldivmod_helper.c (revision 326271) @@ -1,49 +1,51 @@ /* + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (C) 2012 Andrew Turner * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include /* * Helper for __aeabi_ldivmod. * TODO: __divdi3 calls __qdivrem. We should do the same and use the * remainder value rather than re-calculating it. */ long long __kern_ldivmod(long long, long long, long long *); long long __kern_ldivmod(long long n, long long m, long long *rem) { long long q; q = __divdi3(n, m); /* q = n / m */ *rem = n - m * q; return q; } Index: head/sys/libkern/iconv.c =================================================================== --- head/sys/libkern/iconv.c (revision 326270) +++ head/sys/libkern/iconv.c (revision 326271) @@ -1,576 +1,578 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2000-2001 Boris Popov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
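
[Editorial aside, not part of the commit: the identity __kern_ldivmod() relies on, shown standalone. Plain C division stands in for the compiler's __divdi3 helper; since C99 division truncates toward zero, the remainder n - m*q carries the sign of n.]

#include <assert.h>

static long long
ldivmod_sketch(long long n, long long m, long long *rem)
{
	long long q;

	q = n / m;		/* what __divdi3 computes */
	*rem = n - m * q;	/* recovered without a second division */
	return (q);
}

int
main(void)
{
	long long r;

	assert(ldivmod_sketch(7, 3, &r) == 2 && r == 1);
	assert(ldivmod_sketch(-7, 3, &r) == -2 && r == -1);
	return (0);
}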
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include "iconv_converter_if.h" SYSCTL_DECL(_kern_iconv); SYSCTL_NODE(_kern, OID_AUTO, iconv, CTLFLAG_RW, NULL, "kernel iconv interface"); MALLOC_DEFINE(M_ICONV, "iconv", "ICONV structures"); static MALLOC_DEFINE(M_ICONVDATA, "iconv_data", "ICONV data"); MODULE_VERSION(libiconv, 2); static struct sx iconv_lock; #ifdef notnow /* * iconv converter instance */ struct iconv_converter { KOBJ_FIELDS; void * c_data; }; #endif struct sysctl_oid *iconv_oid_hook = &sysctl___kern_iconv; /* * List of loaded converters */ static TAILQ_HEAD(iconv_converter_list, iconv_converter_class) iconv_converters = TAILQ_HEAD_INITIALIZER(iconv_converters); /* * List of supported/loaded charsets pairs */ static TAILQ_HEAD(, iconv_cspair) iconv_cslist = TAILQ_HEAD_INITIALIZER(iconv_cslist); static int iconv_csid = 1; static char iconv_unicode_string[] = "unicode"; /* save eight bytes when possible */ static void iconv_unregister_cspair(struct iconv_cspair *csp); static int iconv_mod_unload(void) { struct iconv_cspair *csp; sx_xlock(&iconv_lock); TAILQ_FOREACH(csp, &iconv_cslist, cp_link) { if (csp->cp_refcount) { sx_xunlock(&iconv_lock); return EBUSY; } } while ((csp = TAILQ_FIRST(&iconv_cslist)) != NULL) iconv_unregister_cspair(csp); sx_xunlock(&iconv_lock); sx_destroy(&iconv_lock); return 0; } static int iconv_mod_handler(module_t mod, int type, void *data) { int error; switch (type) { case MOD_LOAD: error = 0; sx_init(&iconv_lock, "iconv"); break; case MOD_UNLOAD: error = iconv_mod_unload(); break; default: error = EINVAL; } return error; } static moduledata_t iconv_mod = { "iconv", iconv_mod_handler, NULL }; DECLARE_MODULE(iconv, iconv_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND); static int iconv_register_converter(struct iconv_converter_class *dcp) { kobj_class_compile((struct kobj_class*)dcp); dcp->refs++; TAILQ_INSERT_TAIL(&iconv_converters, dcp, cc_link); return 0; } static int iconv_unregister_converter(struct iconv_converter_class *dcp) { dcp->refs--; if (dcp->refs > 1) { ICDEBUG("converter has %d references left\n", dcp->refs); return EBUSY; } TAILQ_REMOVE(&iconv_converters, dcp, cc_link); kobj_class_free((struct kobj_class*)dcp); return 0; } static int iconv_lookupconv(const char *name, struct iconv_converter_class **dcpp) { struct iconv_converter_class *dcp; TAILQ_FOREACH(dcp, &iconv_converters, cc_link) { if (name == NULL) continue; if (strcmp(name, ICONV_CONVERTER_NAME(dcp)) == 0) { if (dcpp) *dcpp = dcp; return 0; } } return ENOENT; } static int iconv_lookupcs(const char *to, const char *from, struct iconv_cspair **cspp) { struct iconv_cspair *csp; TAILQ_FOREACH(csp, &iconv_cslist, cp_link) { if (strcasecmp(csp->cp_to, to) == 0 && strcasecmp(csp->cp_from, 
from) == 0) { if (cspp) *cspp = csp; return 0; } } return ENOENT; } static int iconv_register_cspair(const char *to, const char *from, struct iconv_converter_class *dcp, void *data, struct iconv_cspair **cspp) { struct iconv_cspair *csp; char *cp; int csize, ucsto, ucsfrom; if (iconv_lookupcs(to, from, NULL) == 0) return EEXIST; csize = sizeof(*csp); ucsto = strcmp(to, iconv_unicode_string) == 0; if (!ucsto) csize += strlen(to) + 1; ucsfrom = strcmp(from, iconv_unicode_string) == 0; if (!ucsfrom) csize += strlen(from) + 1; csp = malloc(csize, M_ICONV, M_WAITOK); bzero(csp, csize); csp->cp_id = iconv_csid++; csp->cp_dcp = dcp; cp = (char*)(csp + 1); if (!ucsto) { strcpy(cp, to); csp->cp_to = cp; cp += strlen(cp) + 1; } else csp->cp_to = iconv_unicode_string; if (!ucsfrom) { strcpy(cp, from); csp->cp_from = cp; } else csp->cp_from = iconv_unicode_string; csp->cp_data = data; TAILQ_INSERT_TAIL(&iconv_cslist, csp, cp_link); *cspp = csp; return 0; } static void iconv_unregister_cspair(struct iconv_cspair *csp) { TAILQ_REMOVE(&iconv_cslist, csp, cp_link); if (csp->cp_data) free(csp->cp_data, M_ICONVDATA); free(csp, M_ICONV); } /* * Lookup and create an instance of converter. * Currently this layer didn't have associated 'instance' structure * to avoid unnesessary memory allocation. */ int iconv_open(const char *to, const char *from, void **handle) { struct iconv_cspair *csp, *cspfrom, *cspto; struct iconv_converter_class *dcp; const char *cnvname; int error; /* * First, lookup fully qualified cspairs */ error = iconv_lookupcs(to, from, &csp); if (error == 0) return ICONV_CONVERTER_OPEN(csp->cp_dcp, csp, NULL, handle); /* * Well, nothing found. Now try to construct a composite conversion * ToDo: add a 'capability' field to converter */ TAILQ_FOREACH(dcp, &iconv_converters, cc_link) { cnvname = ICONV_CONVERTER_NAME(dcp); if (cnvname == NULL) continue; error = iconv_lookupcs(cnvname, from, &cspfrom); if (error) continue; error = iconv_lookupcs(to, cnvname, &cspto); if (error) continue; /* * Fine, we're found a pair which can be combined together */ return ICONV_CONVERTER_OPEN(dcp, cspto, cspfrom, handle); } return ENOENT; } int iconv_close(void *handle) { return ICONV_CONVERTER_CLOSE(handle); } int iconv_conv(void *handle, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) { return ICONV_CONVERTER_CONV(handle, inbuf, inbytesleft, outbuf, outbytesleft, 0, 0); } int iconv_conv_case(void *handle, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, int casetype) { return ICONV_CONVERTER_CONV(handle, inbuf, inbytesleft, outbuf, outbytesleft, 0, casetype); } int iconv_convchr(void *handle, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) { return ICONV_CONVERTER_CONV(handle, inbuf, inbytesleft, outbuf, outbytesleft, 1, 0); } int iconv_convchr_case(void *handle, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, int casetype) { return ICONV_CONVERTER_CONV(handle, inbuf, inbytesleft, outbuf, outbytesleft, 1, casetype); } int towlower(int c, void *handle) { return ICONV_CONVERTER_TOLOWER(handle, c); } int towupper(int c, void *handle) { return ICONV_CONVERTER_TOUPPER(handle, c); } /* * Give a list of loaded converters. Each name terminated with 0. * An empty string terminates the list. 
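
[Editorial aside, not part of the commit: iconv_open() above first looks for a directly registered cspair and otherwise chains two pairs through an intermediate converter. A hedged sketch of a kernel-side caller, assuming both charsets have already been registered (via iconv_add() or the sysctl interface below):]

static int
convert_name_sketch(const char *in, size_t inlen, char *out, size_t outlen)
{
	void *handle;
	int error;

	error = iconv_open("ISO8859-1", "UTF-8", &handle);
	if (error != 0)
		return (error);	/* no direct or composite pair found */
	error = iconv_conv(handle, &in, &inlen, &out, &outlen);
	iconv_close(handle);
	return (error);
}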
*/ static int iconv_sysctl_drvlist(SYSCTL_HANDLER_ARGS) { struct iconv_converter_class *dcp; const char *name; char spc; int error; error = 0; sx_slock(&iconv_lock); TAILQ_FOREACH(dcp, &iconv_converters, cc_link) { name = ICONV_CONVERTER_NAME(dcp); if (name == NULL) continue; error = SYSCTL_OUT(req, name, strlen(name) + 1); if (error) break; } sx_sunlock(&iconv_lock); if (error) return error; spc = 0; error = SYSCTL_OUT(req, &spc, sizeof(spc)); return error; } SYSCTL_PROC(_kern_iconv, OID_AUTO, drvlist, CTLFLAG_RD | CTLTYPE_OPAQUE, NULL, 0, iconv_sysctl_drvlist, "S,xlat", "registered converters"); /* * List all available charset pairs. */ static int iconv_sysctl_cslist(SYSCTL_HANDLER_ARGS) { struct iconv_cspair *csp; struct iconv_cspair_info csi; int error; error = 0; bzero(&csi, sizeof(csi)); csi.cs_version = ICONV_CSPAIR_INFO_VER; sx_slock(&iconv_lock); TAILQ_FOREACH(csp, &iconv_cslist, cp_link) { csi.cs_id = csp->cp_id; csi.cs_refcount = csp->cp_refcount; csi.cs_base = csp->cp_base ? csp->cp_base->cp_id : 0; strcpy(csi.cs_to, csp->cp_to); strcpy(csi.cs_from, csp->cp_from); error = SYSCTL_OUT(req, &csi, sizeof(csi)); if (error) break; } sx_sunlock(&iconv_lock); return error; } SYSCTL_PROC(_kern_iconv, OID_AUTO, cslist, CTLFLAG_RD | CTLTYPE_OPAQUE, NULL, 0, iconv_sysctl_cslist, "S,xlat", "registered charset pairs"); int iconv_add(const char *converter, const char *to, const char *from) { struct iconv_converter_class *dcp; struct iconv_cspair *csp; if (iconv_lookupconv(converter, &dcp) != 0) return EINVAL; return iconv_register_cspair(to, from, dcp, NULL, &csp); } /* * Add new charset pair */ static int iconv_sysctl_add(SYSCTL_HANDLER_ARGS) { struct iconv_converter_class *dcp; struct iconv_cspair *csp; struct iconv_add_in din; struct iconv_add_out dout; int error; error = SYSCTL_IN(req, &din, sizeof(din)); if (error) return error; if (din.ia_version != ICONV_ADD_VER) return EINVAL; if (din.ia_datalen > ICONV_CSMAXDATALEN) return EINVAL; if (strlen(din.ia_from) >= ICONV_CSNMAXLEN) return EINVAL; if (strlen(din.ia_to) >= ICONV_CSNMAXLEN) return EINVAL; if (strlen(din.ia_converter) >= ICONV_CNVNMAXLEN) return EINVAL; if (iconv_lookupconv(din.ia_converter, &dcp) != 0) return EINVAL; sx_xlock(&iconv_lock); error = iconv_register_cspair(din.ia_to, din.ia_from, dcp, NULL, &csp); if (error) { sx_xunlock(&iconv_lock); return error; } if (din.ia_datalen) { csp->cp_data = malloc(din.ia_datalen, M_ICONVDATA, M_WAITOK); error = copyin(din.ia_data, csp->cp_data, din.ia_datalen); if (error) goto bad; } dout.ia_csid = csp->cp_id; error = SYSCTL_OUT(req, &dout, sizeof(dout)); if (error) goto bad; sx_xunlock(&iconv_lock); ICDEBUG("%s => %s, %d bytes\n",din.ia_from, din.ia_to, din.ia_datalen); return 0; bad: iconv_unregister_cspair(csp); sx_xunlock(&iconv_lock); return error; } SYSCTL_PROC(_kern_iconv, OID_AUTO, add, CTLFLAG_RW | CTLTYPE_OPAQUE, NULL, 0, iconv_sysctl_add, "S,xlat", "register charset pair"); /* * Default stubs for converters */ int iconv_converter_initstub(struct iconv_converter_class *dp) { return 0; } int iconv_converter_donestub(struct iconv_converter_class *dp) { return 0; } int iconv_converter_tolowerstub(int c, void *handle) { return (c); } int iconv_converter_handler(module_t mod, int type, void *data) { struct iconv_converter_class *dcp = data; int error; switch (type) { case MOD_LOAD: sx_xlock(&iconv_lock); error = iconv_register_converter(dcp); if (error) { sx_xunlock(&iconv_lock); break; } error = ICONV_CONVERTER_INIT(dcp); if (error) iconv_unregister_converter(dcp); 
sx_xunlock(&iconv_lock); break; case MOD_UNLOAD: sx_xlock(&iconv_lock); ICONV_CONVERTER_DONE(dcp); error = iconv_unregister_converter(dcp); sx_xunlock(&iconv_lock); break; default: error = EINVAL; } return error; } /* * Common used functions (don't use with unicode) */ char * iconv_convstr(void *handle, char *dst, const char *src) { char *p = dst; size_t inlen, outlen; int error; if (handle == NULL) { strcpy(dst, src); return dst; } inlen = outlen = strlen(src); error = iconv_conv(handle, NULL, NULL, &p, &outlen); if (error) return NULL; error = iconv_conv(handle, &src, &inlen, &p, &outlen); if (error) return NULL; *p = 0; return dst; } void * iconv_convmem(void *handle, void *dst, const void *src, int size) { const char *s = src; char *d = dst; size_t inlen, outlen; int error; if (size == 0) return dst; if (handle == NULL) { memcpy(dst, src, size); return dst; } inlen = outlen = size; error = iconv_conv(handle, NULL, NULL, &d, &outlen); if (error) return NULL; error = iconv_conv(handle, &s, &inlen, &d, &outlen); if (error) return NULL; return dst; } int iconv_lookupcp(char **cpp, const char *s) { if (cpp == NULL) { ICDEBUG("warning a NULL list passed\n", ""); return ENOENT; } for (; *cpp; cpp++) if (strcmp(*cpp, s) == 0) return 0; return ENOENT; } /* * Return if fsname is in use of not */ int iconv_vfs_refcount(const char *fsname) { struct vfsconf *vfsp; vfsp = vfs_byname(fsname); if (vfsp != NULL && vfsp->vfc_refcount > 0) return (EBUSY); return (0); } Index: head/sys/libkern/iconv_ucs.c =================================================================== --- head/sys/libkern/iconv_ucs.c (revision 326270) +++ head/sys/libkern/iconv_ucs.c (revision 326271) @@ -1,538 +1,540 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2003, 2005 Ryuichiro Imura * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
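
[Editorial aside, not part of the commit: a hedged userland sketch of feeding iconv_sysctl_add() through the kern.iconv.add sysctl it is attached to. The struct iconv_add_in/iconv_add_out definitions are assumed to come from <sys/iconv.h>; the table-less xlat16 registration is illustrative only — real registrations normally pass a translation table via ia_data/ia_datalen.]

#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/iconv.h>
#include <string.h>

static int
register_pair_sketch(const char *to, const char *from)
{
	struct iconv_add_in din;
	struct iconv_add_out dout;
	size_t olen = sizeof(dout);

	memset(&din, 0, sizeof(din));
	din.ia_version = ICONV_ADD_VER;
	strlcpy(din.ia_converter, "xlat16", sizeof(din.ia_converter));
	strlcpy(din.ia_to, to, sizeof(din.ia_to));
	strlcpy(din.ia_from, from, sizeof(din.ia_from));
	din.ia_datalen = 0;	/* illustrative: no translation table */
	return (sysctlbyname("kern.iconv.add", &dout, &olen,
	    &din, sizeof(din)));
}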
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include "iconv_converter_if.h" /* * "UCS" converter */ #define KICONV_UCS_COMBINE 0x1 #define KICONV_UCS_FROM_UTF8 0x2 #define KICONV_UCS_TO_UTF8 0x4 #define KICONV_UCS_FROM_LE 0x8 #define KICONV_UCS_TO_LE 0x10 #define KICONV_UCS_FROM_UTF16 0x20 #define KICONV_UCS_TO_UTF16 0x40 #define KICONV_UCS_UCS4 0x80 #define ENCODING_UTF16 "UTF-16BE" #define ENCODING_UTF8 "UTF-8" static struct { const char *name; int from_flag, to_flag; } unicode_family[] = { { "UTF-8", KICONV_UCS_FROM_UTF8, KICONV_UCS_TO_UTF8 }, { "UCS-2LE", KICONV_UCS_FROM_LE, KICONV_UCS_TO_LE }, { "UTF-16BE", KICONV_UCS_FROM_UTF16, KICONV_UCS_TO_UTF16 }, { "UTF-16LE", KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE, KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE }, { NULL, 0, 0 } }; static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen); static u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, size_t dstlen); static uint32_t encode_surrogate(uint32_t code); static uint32_t decode_surrogate(const u_char *ucs); #ifdef MODULE_DEPEND MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2); #endif /* * UCS converter instance */ struct iconv_ucs { KOBJ_FIELDS; int convtype; struct iconv_cspair * d_csp; struct iconv_cspair * d_cspf; void * f_ctp; void * t_ctp; void * ctype; }; static int iconv_ucs_open(struct iconv_converter_class *dcp, struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp) { struct iconv_ucs *dp; int i; const char *from, *to; dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK); to = csp->cp_to; from = cspf ? cspf->cp_from : csp->cp_from; dp->convtype = 0; if (cspf) dp->convtype |= KICONV_UCS_COMBINE; for (i = 0; unicode_family[i].name; i++) { if (strcasecmp(from, unicode_family[i].name) == 0) dp->convtype |= unicode_family[i].from_flag; if (strcasecmp(to, unicode_family[i].name) == 0) dp->convtype |= unicode_family[i].to_flag; } if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0) dp->convtype |= KICONV_UCS_UCS4; else dp->convtype &= ~KICONV_UCS_UCS4; dp->f_ctp = dp->t_ctp = NULL; if (dp->convtype & KICONV_UCS_COMBINE) { if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 && (dp->convtype & KICONV_UCS_FROM_LE) == 0) { iconv_open(ENCODING_UNICODE, from, &dp->f_ctp); } if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 && (dp->convtype & KICONV_UCS_TO_LE) == 0) { iconv_open(to, ENCODING_UNICODE, &dp->t_ctp); } } dp->ctype = NULL; if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8)) iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype); dp->d_csp = csp; if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) { if (cspf) { dp->d_cspf = cspf; cspf->cp_refcount++; } else csp->cp_refcount++; } if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE)) csp->cp_refcount++; *dpp = (void*)dp; return 0; } static int iconv_ucs_close(void *data) { struct iconv_ucs *dp = data; if (dp->f_ctp) iconv_close(dp->f_ctp); if (dp->t_ctp) iconv_close(dp->t_ctp); if (dp->ctype) iconv_close(dp->ctype); if (dp->d_cspf) dp->d_cspf->cp_refcount--; else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) dp->d_csp->cp_refcount--; if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE)) dp->d_csp->cp_refcount--; kobj_delete((struct kobj*)data, M_ICONV); return 0; } static int iconv_ucs_conv(void *d2p, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, int convchar, int casetype) { struct iconv_ucs *dp = (struct iconv_ucs*)d2p; int ret = 0, i; size_t in, on, ir, or, 
inlen, outlen, ucslen; const char *src, *p; char *dst; u_char ucs[4], *q; uint32_t code; if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL) return 0; ir = in = *inbytesleft; or = on = *outbytesleft; src = *inbuf; dst = *outbuf; while (ir > 0 && or > 0) { /* * The first half of conversion. * (convert any code into ENCODING_UNICODE) */ code = 0; p = src; if (dp->convtype & KICONV_UCS_FROM_UTF8) { /* convert UTF-8 to ENCODING_UNICODE */ inlen = 0; code = utf8_to_ucs4(p, &inlen, ir); if (code == 0) { ret = -1; break; } if (casetype == KICONV_FROM_LOWER && dp->ctype) { code = towlower(code, dp->ctype); } else if (casetype == KICONV_FROM_UPPER && dp->ctype) { code = towupper(code, dp->ctype); } if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) { /* reserved for utf-16 surrogate pair */ /* invalid unicode */ ret = -1; break; } if (inlen == 4) { if (dp->convtype & KICONV_UCS_UCS4) { ucslen = 4; code = encode_surrogate(code); } else { /* can't handle with ucs-2 */ ret = -1; break; } } else { ucslen = 2; } /* save UCS-4 into ucs[] */ for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--) *q++ = (code >> (i << 3)) & 0xff; } else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) { /* convert local code to ENCODING_UNICODE */ ucslen = 4; inlen = ir; q = ucs; ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q, &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER)); if (ret) break; inlen = ir - inlen; ucslen = 4 - ucslen; } else { /* src code is a proper subset of ENCODING_UNICODE */ q = ucs; if (dp->convtype & KICONV_UCS_FROM_LE) { *q = *(p + 1); *(q + 1) = *p; p += 2; } else { *q = *p++; *(q + 1) = *p++; } if ((*q & 0xfc) == 0xd8) { if (dp->convtype & KICONV_UCS_UCS4 && dp->convtype & KICONV_UCS_FROM_UTF16) { inlen = ucslen = 4; } else { /* invalid unicode */ ret = -1; break; } } else { inlen = ucslen = 2; } if (ir < inlen) { ret = -1; break; } if (ucslen == 4) { q += 2; if (dp->convtype & KICONV_UCS_FROM_LE) { *q = *(p + 1); *(q + 1) = *p; } else { *q = *p++; *(q + 1) = *p; } if ((*q & 0xfc) != 0xdc) { /* invalid unicode */ ret = -1; break; } } } /* * The second half of conversion. 
* (convert ENCODING_UNICODE into any code) */ p = ucs; if (dp->convtype & KICONV_UCS_TO_UTF8) { q = (u_char *)dst; if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) { /* decode surrogate pair */ code = decode_surrogate(p); } else { code = (ucs[0] << 8) | ucs[1]; } if (casetype == KICONV_LOWER && dp->ctype) { code = towlower(code, dp->ctype); } else if (casetype == KICONV_UPPER && dp->ctype) { code = towupper(code, dp->ctype); } outlen = 0; if (ucs4_to_utf8(code, q, &outlen, or) == NULL) { ret = -1; break; } src += inlen; ir -= inlen; dst += outlen; or -= outlen; } else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) { ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst, &or, casetype & (KICONV_LOWER | KICONV_UPPER)); if (ret) break; src += inlen; ir -= inlen; } else { /* dst code is a proper subset of ENCODING_UNICODE */ if (or < ucslen) { ret = -1; break; } src += inlen; ir -= inlen; or -= ucslen; if (dp->convtype & KICONV_UCS_TO_LE) { *dst++ = *(p + 1); *dst++ = *p; p += 2; } else { *dst++ = *p++; *dst++ = *p++; } if (ucslen == 4) { if ((dp->convtype & KICONV_UCS_UCS4) == 0 || (dp->convtype & KICONV_UCS_TO_UTF16) == 0) { ret = -1; break; } if (dp->convtype & KICONV_UCS_TO_LE) { *dst++ = *(p + 1); *dst++ = *p; } else { *dst++ = *p++; *dst++ = *p; } } } if (convchar == 1) break; } *inbuf += in - ir; *outbuf += on - or; *inbytesleft -= in - ir; *outbytesleft -= on - or; return (ret); } static int iconv_ucs_init(struct iconv_converter_class *dcp) { int error; error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8); if (error) return (error); error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE); if (error) return (error); return (0); } static int iconv_ucs_done(struct iconv_converter_class *dcp) { return (0); } static const char * iconv_ucs_name(struct iconv_converter_class *dcp) { return (ENCODING_UNICODE); } static kobj_method_t iconv_ucs_methods[] = { KOBJMETHOD(iconv_converter_open, iconv_ucs_open), KOBJMETHOD(iconv_converter_close, iconv_ucs_close), KOBJMETHOD(iconv_converter_conv, iconv_ucs_conv), KOBJMETHOD(iconv_converter_init, iconv_ucs_init), KOBJMETHOD(iconv_converter_done, iconv_ucs_done), KOBJMETHOD(iconv_converter_name, iconv_ucs_name), {0, 0} }; KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs)); static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen) { size_t i, w = 0; uint32_t ucs4 = 0; /* * get leading 1 byte from utf-8 */ if ((*src & 0x80) == 0) { /* * leading 1 bit is "0" * utf-8: 0xxxxxxx * ucs-4: 00000000 00000000 00000000 0xxxxxxx */ w = 1; /* get trailing 7 bits */ ucs4 = *src & 0x7f; } else if ((*src & 0xe0) == 0xc0) { /* * leading 3 bits are "110" * utf-8: 110xxxxx 10yyyyyy * ucs-4: 00000000 00000000 00000xxx xxyyyyyy */ w = 2; /* get trailing 5 bits */ ucs4 = *src & 0x1f; } else if ((*src & 0xf0) == 0xe0) { /* * leading 4 bits are "1110" * utf-8: 1110xxxx 10yyyyyy 10zzzzzz * ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz */ w = 3; /* get trailing 4 bits */ ucs4 = *src & 0x0f; } else if ((*src & 0xf8) == 0xf0) { /* * leading 5 bits are "11110" * utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz * ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz */ w = 4; /* get trailing 3 bits */ ucs4 = *src & 0x07; } else { /* out of utf-16 range or having illegal bits */ return (0); } if (srclen < w) return (0); /* * get left parts from utf-8 */ for (i = 1 ; i < w ; i++) { if ((*(src + i) & 0xc0) != 0x80) { /* invalid: leading 2 bits are not "10" */ return (0); } /* concatenate trailing 6 bits into ucs4 */ ucs4 <<= 6; ucs4 |= *(src + i) & 
0x3f; } *utf8width = w; return (ucs4); } static u_char * ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen) { u_char lead, *p; size_t i, w; /* * determine utf-8 width and leading bits */ if (ucs4 < 0x80) { w = 1; lead = 0; /* "0" */ } else if (ucs4 < 0x800) { w = 2; lead = 0xc0; /* "11" */ } else if (ucs4 < 0x10000) { w = 3; lead = 0xe0; /* "111" */ } else if (ucs4 < 0x200000) { w = 4; lead = 0xf0; /* "1111" */ } else { return (NULL); } if (dstlen < w) return (NULL); /* * construct utf-8 */ p = dst; for (i = w - 1 ; i >= 1 ; i--) { /* get trailing 6 bits and put it with leading bit as "1" */ *(p + i) = (ucs4 & 0x3f) | 0x80; ucs4 >>= 6; } *p = ucs4 | lead; *utf8width = w; return (p); } static uint32_t encode_surrogate(uint32_t code) { return ((((code - 0x10000) << 6) & 0x3ff0000) | ((code - 0x10000) & 0x3ff) | 0xd800dc00); } static uint32_t decode_surrogate(const u_char *ucs) { return ((((ucs[0] & 0x3) << 18) | (ucs[1] << 10) | ((ucs[2] & 0x3) << 8) | ucs[3]) + 0x10000); } Index: head/sys/libkern/iconv_xlat.c =================================================================== --- head/sys/libkern/iconv_xlat.c (revision 326270) +++ head/sys/libkern/iconv_xlat.c (revision 326271) @@ -1,126 +1,128 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2000-2001 Boris Popov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
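
[Editorial aside, not part of the commit: the surrogate arithmetic used by encode_surrogate()/decode_surrogate() above, shown in isolation. A code point beyond the BMP loses its 0x10000 bias, the upper and lower 10 bits become the high (0xD800-based) and low (0xDC00-based) UTF-16 units, and the pair is packed big-endian into one 32-bit word.]

#include <assert.h>
#include <stdint.h>

int
main(void)
{
	uint32_t code = 0x1F600;		/* U+1F600, outside the BMP */
	uint32_t v = code - 0x10000;		/* 20 significant bits */
	uint16_t hi = 0xD800 | (v >> 10);	/* upper 10 bits */
	uint16_t lo = 0xDC00 | (v & 0x3FF);	/* lower 10 bits */

	assert(hi == 0xD83D && lo == 0xDE00);
	/* Same packing encode_surrogate() produces in one word: */
	assert((((uint32_t)hi << 16) | lo) ==
	    ((((code - 0x10000) << 6) & 0x3ff0000) |
	    ((code - 0x10000) & 0x3ff) | 0xd800dc00));
	return (0);
}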
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include "iconv_converter_if.h" /* * "XLAT" converter */ #ifdef MODULE_DEPEND MODULE_DEPEND(iconv_xlat, libiconv, 2, 2, 2); #endif /* * XLAT converter instance */ struct iconv_xlat { KOBJ_FIELDS; u_char * d_table; struct iconv_cspair * d_csp; }; static int iconv_xlat_open(struct iconv_converter_class *dcp, struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp) { struct iconv_xlat *dp; dp = (struct iconv_xlat *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK); dp->d_table = csp->cp_data; dp->d_csp = csp; csp->cp_refcount++; *dpp = (void*)dp; return 0; } static int iconv_xlat_close(void *data) { struct iconv_xlat *dp = data; dp->d_csp->cp_refcount--; kobj_delete((struct kobj*)data, M_ICONV); return 0; } static int iconv_xlat_conv(void *d2p, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, int convchar, int casetype) { struct iconv_xlat *dp = (struct iconv_xlat*)d2p; const char *src; char *dst; int n, r; if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL) return 0; if (casetype != 0) return -1; if (convchar == 1) r = n = 1; else r = n = min(*inbytesleft, *outbytesleft); src = *inbuf; dst = *outbuf; while(r--) *dst++ = dp->d_table[(u_char)*src++]; *inbuf += n; *outbuf += n; *inbytesleft -= n; *outbytesleft -= n; return 0; } static const char * iconv_xlat_name(struct iconv_converter_class *dcp) { return "xlat"; } static kobj_method_t iconv_xlat_methods[] = { KOBJMETHOD(iconv_converter_open, iconv_xlat_open), KOBJMETHOD(iconv_converter_close, iconv_xlat_close), KOBJMETHOD(iconv_converter_conv, iconv_xlat_conv), #if 0 KOBJMETHOD(iconv_converter_init, iconv_xlat_init), KOBJMETHOD(iconv_converter_done, iconv_xlat_done), #endif KOBJMETHOD(iconv_converter_name, iconv_xlat_name), {0, 0} }; KICONV_CONVERTER(xlat, sizeof(struct iconv_xlat)); Index: head/sys/libkern/iconv_xlat16.c =================================================================== --- head/sys/libkern/iconv_xlat16.c (revision 326270) +++ head/sys/libkern/iconv_xlat16.c (revision 326271) @@ -1,363 +1,365 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2003, 2005 Ryuichiro Imura * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
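
[Editorial aside, not part of the commit: a minimal userland model of the "xlat" converter's inner loop above — a 256-entry byte table applied one byte at a time. The uppercasing payload is just an example; real tables arrive through the cspair's cp_data.]

#include <stdio.h>

int
main(void)
{
	unsigned char table[256], buf[] = "hello, xlat";
	int i;

	for (i = 0; i < 256; i++)	/* identity ... */
		table[i] = i;
	for (i = 'a'; i <= 'z'; i++)	/* ... except a-z -> A-Z */
		table[i] = i - 'a' + 'A';

	for (i = 0; buf[i] != '\0'; i++)	/* iconv_xlat_conv()'s loop */
		buf[i] = table[buf[i]];
	printf("%s\n", buf);	/* prints: HELLO, XLAT */
	return (0);
}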
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include "iconv_converter_if.h" /* * "XLAT16" converter */ #ifdef MODULE_DEPEND MODULE_DEPEND(iconv_xlat16, libiconv, 2, 2, 2); #endif #define C2I1(c) ((c) & 0x8000 ? ((c) & 0xff) | 0x100 : (c) & 0xff) #define C2I2(c) ((c) & 0x8000 ? ((c) >> 8) & 0x7f : ((c) >> 8) & 0xff) /* * XLAT16 converter instance */ struct iconv_xlat16 { KOBJ_FIELDS; uint32_t * d_table[0x200]; void * f_ctp; void * t_ctp; struct iconv_cspair * d_csp; }; static int iconv_xlat16_open(struct iconv_converter_class *dcp, struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp) { struct iconv_xlat16 *dp; uint32_t *headp, **idxp; int i; dp = (struct iconv_xlat16 *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK); headp = (uint32_t *)((caddr_t)csp->cp_data + sizeof(dp->d_table)); idxp = (uint32_t **)csp->cp_data; for (i = 0 ; i < 0x200 ; i++) { if (*idxp) { dp->d_table[i] = headp; headp += 0x80; } else { dp->d_table[i] = NULL; } idxp++; } if (strcmp(csp->cp_to, KICONV_WCTYPE_NAME) != 0) { if (iconv_open(KICONV_WCTYPE_NAME, csp->cp_from, &dp->f_ctp) != 0) dp->f_ctp = NULL; if (iconv_open(KICONV_WCTYPE_NAME, csp->cp_to, &dp->t_ctp) != 0) dp->t_ctp = NULL; } else { dp->f_ctp = dp->t_ctp = dp; } dp->d_csp = csp; csp->cp_refcount++; *dpp = (void*)dp; return (0); } static int iconv_xlat16_close(void *data) { struct iconv_xlat16 *dp = data; if (dp->f_ctp && dp->f_ctp != data) iconv_close(dp->f_ctp); if (dp->t_ctp && dp->t_ctp != data) iconv_close(dp->t_ctp); dp->d_csp->cp_refcount--; kobj_delete((struct kobj*)data, M_ICONV); return (0); } static int iconv_xlat16_conv(void *d2p, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, int convchar, int casetype) { struct iconv_xlat16 *dp = (struct iconv_xlat16*)d2p; const char *src; char *dst; int nullin, ret = 0; size_t in, on, ir, or, inlen; uint32_t code; u_char u, l; uint16_t c1, c2, ctmp; if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL) return (0); ir = in = *inbytesleft; or = on = *outbytesleft; src = *inbuf; dst = *outbuf; while(ir > 0 && or > 0) { inlen = 0; code = 0; c1 = ir > 1 ? *(src+1) & 0xff : 0; c2 = *src & 0xff; ctmp = 0; c1 = c2 & 0x80 ? c1 | 0x100 : c1; c2 = c2 & 0x80 ? c2 & 0x7f : c2; if (ir > 1 && dp->d_table[c1] && dp->d_table[c1][c2]) { /* * inbuf char is a double byte char */ inlen = 2; /* toupper,tolower */ if (casetype == KICONV_FROM_LOWER && dp->f_ctp) ctmp = towlower(((u_char)*src << 8) | (u_char)*(src + 1), dp->f_ctp); else if (casetype == KICONV_FROM_UPPER && dp->f_ctp) ctmp = towupper(((u_char)*src << 8) | (u_char)*(src + 1), dp->f_ctp); if (ctmp) { c1 = C2I1(ctmp); c2 = C2I2(ctmp); } } if (inlen == 0) { c1 &= 0xff00; if (!dp->d_table[c1]) { ret = -1; break; } /* * inbuf char is a single byte char */ inlen = 1; if (casetype & (KICONV_FROM_LOWER|KICONV_FROM_UPPER)) code = dp->d_table[c1][c2]; if (casetype == KICONV_FROM_LOWER) { if (dp->f_ctp) ctmp = towlower((u_char)*src, dp->f_ctp); else if (code & XLAT16_HAS_FROM_LOWER_CASE) ctmp = (u_char)(code >> 16); } else if (casetype == KICONV_FROM_UPPER) { if (dp->f_ctp) ctmp = towupper((u_char)*src, dp->f_ctp); else if (code & XLAT16_HAS_FROM_UPPER_CASE) ctmp = (u_char)(code >> 16); } if (ctmp) { c1 = C2I1(ctmp << 8); c2 = C2I2(ctmp << 8); } } code = dp->d_table[c1][c2]; if (!code) { ret = -1; break; } nullin = (code & XLAT16_ACCEPT_NULL_IN) ? 
1 : 0; if (inlen == 1 && nullin) { /* * XLAT16_ACCEPT_NULL_IN requires inbuf has 2byte */ ret = -1; break; } /* * now start translation */ u = (u_char)(code >> 8); l = (u_char)code; #ifdef XLAT16_ACCEPT_3BYTE_CHR if (code & XLAT16_IS_3BYTE_CHR) { if (or < 3) { ret = -1; break; } *dst++ = u; *dst++ = l; *dst++ = (u_char)(code >> 16); or -= 3; } else #endif if (u || code & XLAT16_ACCEPT_NULL_OUT) { if (or < 2) { ret = -1; break; } /* toupper,tolower */ if (casetype == KICONV_LOWER && dp->t_ctp) { code = towlower((uint16_t)code, dp->t_ctp); u = (u_char)(code >> 8); l = (u_char)code; } if (casetype == KICONV_UPPER && dp->t_ctp) { code = towupper((uint16_t)code, dp->t_ctp); u = (u_char)(code >> 8); l = (u_char)code; } *dst++ = u; *dst++ = l; or -= 2; } else { /* toupper,tolower */ if (casetype == KICONV_LOWER) { if (dp->t_ctp) l = (u_char)towlower(l, dp->t_ctp); else if (code & XLAT16_HAS_LOWER_CASE) l = (u_char)(code >> 16); } if (casetype == KICONV_UPPER) { if (dp->t_ctp) l = (u_char)towupper(l, dp->t_ctp); else if (code & XLAT16_HAS_UPPER_CASE) l = (u_char)(code >> 16); } *dst++ = l; or--; } if (inlen == 2) { /* * there is a case that inbuf char is a single * byte char while inlen == 2 */ if ((u_char)*(src+1) == '\0' && !nullin ) { src++; ir--; } else { src += 2; ir -= 2; } } else { src++; ir--; } if (convchar == 1) break; } *inbuf += in - ir; *outbuf += on - or; *inbytesleft -= in - ir; *outbytesleft -= on - or; return (ret); } static const char * iconv_xlat16_name(struct iconv_converter_class *dcp) { return ("xlat16"); } static int iconv_xlat16_tolower(void *d2p, int c) { struct iconv_xlat16 *dp = (struct iconv_xlat16*)d2p; int c1, c2, out; if (c < 0x100) { c1 = C2I1(c << 8); c2 = C2I2(c << 8); } else if (c < 0x10000) { c1 = C2I1(c); c2 = C2I2(c); } else return (c); if (dp->d_table[c1] && dp->d_table[c1][c2] & XLAT16_HAS_LOWER_CASE) { /*return (int)(dp->d_table[c1][c2] & 0xffff);*/ out = dp->d_table[c1][c2] & 0xffff; if ((out & 0xff) == 0) out = (out >> 8) & 0xff; return (out); } else return (c); } static int iconv_xlat16_toupper(void *d2p, int c) { struct iconv_xlat16 *dp = (struct iconv_xlat16*)d2p; int c1, c2, out; if (c < 0x100) { c1 = C2I1(c << 8); c2 = C2I2(c << 8); } else if (c < 0x10000) { c1 = C2I1(c); c2 = C2I2(c); } else return (c); if (dp->d_table[c1] && dp->d_table[c1][c2] & XLAT16_HAS_UPPER_CASE) { out = dp->d_table[c1][c2] & 0xffff; if ((out & 0xff) == 0) out = (out >> 8) & 0xff; return (out); } else return (c); } static kobj_method_t iconv_xlat16_methods[] = { KOBJMETHOD(iconv_converter_open, iconv_xlat16_open), KOBJMETHOD(iconv_converter_close, iconv_xlat16_close), KOBJMETHOD(iconv_converter_conv, iconv_xlat16_conv), #if 0 KOBJMETHOD(iconv_converter_init, iconv_xlat16_init), KOBJMETHOD(iconv_converter_done, iconv_xlat16_done), #endif KOBJMETHOD(iconv_converter_name, iconv_xlat16_name), KOBJMETHOD(iconv_converter_tolower, iconv_xlat16_tolower), KOBJMETHOD(iconv_converter_toupper, iconv_xlat16_toupper), {0, 0} }; KICONV_CONVERTER(xlat16, sizeof(struct iconv_xlat16)); Index: head/sys/libkern/inet_aton.c =================================================================== --- head/sys/libkern/inet_aton.c (revision 326270) +++ head/sys/libkern/inet_aton.c (revision 326271) @@ -1,136 +1,138 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2001 Charles Mott * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include int inet_aton(const char *cp, struct in_addr *addr) { u_long parts[4]; in_addr_t val; const char *c; char *endptr; int gotend, n; c = (const char *)cp; n = 0; /* * Run through the string, grabbing numbers until * the end of the string, or some error */ gotend = 0; while (!gotend) { unsigned long l; l = strtoul(c, &endptr, 0); if (l == ULONG_MAX || (l == 0 && endptr == c)) return (0); val = (in_addr_t)l; /* * If the whole string is invalid, endptr will equal * c.. this way we can make sure someone hasn't * gone '.12' or something which would get past * the next check. */ if (endptr == c) return (0); parts[n] = val; c = endptr; /* Check the next character past the previous number's end */ switch (*c) { case '.' : /* Make sure we only do 3 dots .. */ if (n == 3) /* Whoops. Quit. */ return (0); n++; c++; break; case '\0': gotend = 1; break; default: if (isspace((unsigned char)*c)) { gotend = 1; break; } else { /* Invalid character, then fail. */ return (0); } } } /* Concoct the address according to the number of parts specified. */ switch (n) { case 0: /* a -- 32 bits */ /* * Nothing is necessary here. Overflow checking was * already done in strtoul(). */ break; case 1: /* a.b -- 8.24 bits */ if (val > 0xffffff || parts[0] > 0xff) return (0); val |= parts[0] << 24; break; case 2: /* a.b.c -- 8.8.16 bits */ if (val > 0xffff || parts[0] > 0xff || parts[1] > 0xff) return (0); val |= (parts[0] << 24) | (parts[1] << 16); break; case 3: /* a.b.c.d -- 8.8.8.8 bits */ if (val > 0xff || parts[0] > 0xff || parts[1] > 0xff || parts[2] > 0xff) return (0); val |= (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8); break; } if (addr != NULL) addr->s_addr = htonl(val); return (1); } Index: head/sys/libkern/memcchr.c =================================================================== --- head/sys/libkern/memcchr.c (revision 326270) +++ head/sys/libkern/memcchr.c (revision 326271) @@ -1,115 +1,117 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 Ed Schouten * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include /* * memcchr(): find first character in buffer not matching `c'. * * This function performs the complement of memchr(). To provide decent * performance, this function compares data from the buffer one word at * a time. * * This code is inspired by libc's strlen(), written by Xin Li. */ #if LONG_BIT != 32 && LONG_BIT != 64 #error Unsupported word size #endif #define LONGPTR_MASK (sizeof(long) - 1) #define TESTBYTE \ do { \ if (*p != (unsigned char)c) \ goto done; \ p++; \ } while (0) void * memcchr(const void *begin, int c, size_t n) { const unsigned long *lp; const unsigned char *p, *end; unsigned long word; /* Four or eight repetitions of `c'. */ word = (unsigned char)c; word |= word << 8; word |= word << 16; #if LONG_BIT >= 64 word |= word << 32; #endif /* Don't perform memory I/O when passing a zero-length buffer. */ if (n == 0) return (NULL); /* * First determine whether there is a character unequal to `c' * in the first word. As this word may contain bytes before * `begin', we may execute this loop spuriously. */ lp = (const unsigned long *)((uintptr_t)begin & ~LONGPTR_MASK); end = (const unsigned char *)begin + n; if (*lp++ != word) for (p = begin; p < (const unsigned char *)lp;) TESTBYTE; /* Now compare the data one word at a time. */ for (; (const unsigned char *)lp < end; lp++) { if (*lp != word) { p = (const unsigned char *)lp; TESTBYTE; TESTBYTE; TESTBYTE; #if LONG_BIT >= 64 TESTBYTE; TESTBYTE; TESTBYTE; TESTBYTE; #endif goto done; } } return (NULL); done: /* * If the end of the buffer is not word aligned, the previous * loops may obtain an address that's beyond the end of the * buffer. */ if (p < end) return (__DECONST(void *, p)); return (NULL); } Index: head/sys/libkern/memmove.c =================================================================== --- head/sys/libkern/memmove.c (revision 326270) +++ head/sys/libkern/memmove.c (revision 326271) @@ -1,38 +1,40 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2009 Roman Divacky * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
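
[Editorial aside, not part of the commit: the byte-broadcast comparison memcchr() builds on, in a deliberately simplified userland sketch — the search byte repeated across a word lets whole words equal to it be skipped with one comparison. The careful unaligned-head/tail handling of the real function is reduced to an alignment check here; LONG_BIT is assumed to come from <limits.h> as on FreeBSD.]

#include <assert.h>
#include <limits.h>
#include <stdint.h>
#include <string.h>

static const void *
memcchr_sketch(const void *begin, int c, size_t n)
{
	unsigned long word = (unsigned char)c;
	const unsigned char *p = begin;

	word |= word << 8;
	word |= word << 16;
#if LONG_BIT >= 64
	word |= word << 32;
#endif
	/* Whole aligned words equal to `word' contain only `c'. */
	while (n >= sizeof(long) && ((uintptr_t)p & (sizeof(long) - 1)) == 0 &&
	    *(const unsigned long *)p == word) {
		p += sizeof(long);
		n -= sizeof(long);
	}
	for (; n > 0; p++, n--)		/* finish byte by byte */
		if (*p != (unsigned char)c)
			return (p);
	return (NULL);
}

int
main(void)
{
	char buf[] = "aaaaaaaaaaaaaaaab";

	assert(memcchr_sketch(buf, 'a', strlen(buf)) == buf + 16);
	assert(memcchr_sketch(buf, 'a', 16) == NULL);
	return (0);
}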
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include void * memmove(void *dest, const void *src, size_t n) { bcopy(src, dest, n); return (dest); } Index: head/sys/libkern/memset.c =================================================================== --- head/sys/libkern/memset.c (revision 326270) +++ head/sys/libkern/memset.c (revision 326271) @@ -1,44 +1,46 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (C) 1992-2007 The FreeBSD Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #define LIBKERN_INLINE #include #include void * memset(void *b, int c, size_t len) { char *bb; if (c == 0) bzero(b, len); else for (bb = (char *)b; len--; ) *bb++ = c; return (b); } Index: head/sys/libkern/strcspn.c =================================================================== --- head/sys/libkern/strcspn.c (revision 326270) +++ head/sys/libkern/strcspn.c (revision 326271) @@ -1,72 +1,74 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005 David Schultz * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #define IDX(c) ((u_char)(c) / LONG_BIT) #define BIT(c) ((u_long)1 << ((u_char)(c) % LONG_BIT)) size_t strcspn(const char * __restrict s, const char * __restrict charset) { /* * NB: idx and bit are temporaries whose use causes gcc 3.4.2 to * generate better code. Without them, gcc gets a little confused. */ const char *s1; u_long bit; u_long tbl[(UCHAR_MAX + 1) / LONG_BIT]; int idx; if(*s == '\0') return (0); #if LONG_BIT == 64 /* always better to unroll on 64-bit architectures */ tbl[0] = 1; tbl[3] = tbl[2] = tbl[1] = 0; #else for (tbl[0] = idx = 1; idx < sizeof(tbl) / sizeof(tbl[0]); idx++) tbl[idx] = 0; #endif for (; *charset != '\0'; charset++) { idx = IDX(*charset); bit = BIT(*charset); tbl[idx] |= bit; } for(s1 = s; ; s1++) { idx = IDX(*s1); bit = BIT(*s1); if ((tbl[idx] & bit) != 0) break; } return (s1 - s); } Index: head/sys/libkern/strdup.c =================================================================== --- head/sys/libkern/strdup.c (revision 326270) +++ head/sys/libkern/strdup.c (revision 326271) @@ -1,50 +1,52 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Network * Associates Laboratories, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 * ("CBOSS"), as part of the DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
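
[Editorial aside, not part of the commit: the bitmap technique behind strcspn()/strspn() shown on its own — one bit per possible byte value, so membership in the charset costs a shift and a mask per input byte instead of a nested scan. Unlike the kernel version, this sketch checks for NUL explicitly rather than pre-setting bit zero.]

#include <assert.h>
#include <limits.h>
#include <string.h>

#define	IDX(c)	((unsigned char)(c) / LONG_BIT)
#define	BIT(c)	((unsigned long)1 << ((unsigned char)(c) % LONG_BIT))

int
main(void)
{
	unsigned long tbl[(UCHAR_MAX + 1) / LONG_BIT];
	const char *charset = ":/?#";
	const char *s = "host/path?q";
	const char *p;

	memset(tbl, 0, sizeof(tbl));
	for (; *charset != '\0'; charset++)
		tbl[IDX(*charset)] |= BIT(*charset);

	for (p = s; *p != '\0'; p++)	/* strcspn()'s inner loop */
		if (tbl[IDX(*p)] & BIT(*p))
			break;
	assert(p - s == 4 && p - s == (long)strcspn(s, ":/?#"));
	return (0);
}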
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include char * strdup(const char *string, struct malloc_type *type) { size_t len; char *copy; len = strlen(string) + 1; copy = malloc(len, type, M_WAITOK); bcopy(string, copy, len); return (copy); } Index: head/sys/libkern/strlcat.c =================================================================== --- head/sys/libkern/strlcat.c (revision 326270) +++ head/sys/libkern/strlcat.c (revision 326271) @@ -1,72 +1,74 @@ /* $OpenBSD: strlcat.c,v 1.2 1999/06/17 16:28:58 millert Exp $ */ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 1998 Todd C. Miller * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #if defined(LIBC_SCCS) && !defined(lint) static char *rcsid = "$OpenBSD: strlcat.c,v 1.2 1999/06/17 16:28:58 millert Exp $"; #endif /* LIBC_SCCS and not lint */ #include __FBSDID("$FreeBSD$"); #include #include /* * Appends src to string dst of size siz (unlike strncat, siz is the * full size of dst, not space left). At most siz-1 characters * will be copied. Always NUL terminates (unless siz <= strlen(dst)). * Returns strlen(src) + MIN(siz, strlen(initial dst)). * If retval >= siz, truncation occurred. */ size_t strlcat(char *dst, const char *src, size_t siz) { char *d = dst; const char *s = src; size_t n = siz; size_t dlen; /* Find the end of dst and adjust bytes left but don't go past end */ while (n-- != 0 && *d != '\0') d++; dlen = d - dst; n = siz - dlen; if (n == 0) return(dlen + strlen(s)); while (*s != '\0') { if (n != 1) { *d++ = *s; n--; } s++; } *d = '\0'; return(dlen + (s - src)); /* count does not include NUL */ } Index: head/sys/libkern/strlen.c =================================================================== --- head/sys/libkern/strlen.c (revision 326270) +++ head/sys/libkern/strlen.c (revision 326271) @@ -1,129 +1,131 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2009, 2010 Xin LI * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include /* * Portable strlen() for 32-bit and 64-bit systems. * * Rationale: it is generally much more efficient to do word length * operations and avoid branches on modern computer systems, as * compared to byte-length operations with a lot of branches. * * The expression: * * ((x - 0x01....01) & ~x & 0x80....80) * * would evaluate to a non-zero value iff any of the bytes in the * original word is zero. * * On multi-issue processors, we can divide the above expression into: * a) (x - 0x01....01) * b) (~x & 0x80....80) * c) a & b * * Where, a) and b) can be partially computed in parallel. * * The algorithm above is found on "Hacker's Delight" by * Henry S. Warren, Jr. */ /* Magic numbers for the algorithm */ #if LONG_BIT == 32 static const unsigned long mask01 = 0x01010101; static const unsigned long mask80 = 0x80808080; #elif LONG_BIT == 64 static const unsigned long mask01 = 0x0101010101010101; static const unsigned long mask80 = 0x8080808080808080; #else #error Unsupported word size #endif #define LONGPTR_MASK (sizeof(long) - 1) /* * Helper macro to return string length if we caught the zero * byte. */ #define testbyte(x) \ do { \ if (p[x] == '\0') \ return (p - str + x); \ } while (0) size_t strlen(const char *str) { const char *p; const unsigned long *lp; long va, vb; /* * Before trying the hard (unaligned byte-by-byte access) way * to figure out whether there is a nul character, try to see * if there is a nul character is within this accessible word * first. * * p and (p & ~LONGPTR_MASK) must be equally accessible since * they always fall in the same memory page, as long as page * boundaries is integral multiple of word size. 
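
[Editorial aside, not part of the commit: the zero-byte test from the strlen() comment above, in isolation — (x - 0x01..01) & ~x & 0x80..80 is non-zero iff some byte of x is zero, which is what lets the word loop below branch once per word instead of once per byte.]

#include <assert.h>
#include <stdint.h>

static int
has_zero_byte(uint64_t x)
{
	return (((x - 0x0101010101010101ULL) & ~x &
	    0x8080808080808080ULL) != 0);
}

int
main(void)
{
	assert(!has_zero_byte(0x6162636465666768ULL));	/* "abcdefgh" */
	assert(has_zero_byte(0x6162630065666768ULL));	/* NUL inside */
	assert(has_zero_byte(0));
	return (0);
}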
*/ lp = (const unsigned long *)((uintptr_t)str & ~LONGPTR_MASK); va = (*lp - mask01); vb = ((~*lp) & mask80); lp++; if (va & vb) /* Check if we have \0 in the first part */ for (p = str; p < (const char *)lp; p++) if (*p == '\0') return (p - str); /* Scan the rest of the string using word-sized operations */ for (; ; lp++) { va = (*lp - mask01); vb = ((~*lp) & mask80); if (va & vb) { p = (const char *)(lp); testbyte(0); testbyte(1); testbyte(2); testbyte(3); #if (LONG_BIT >= 64) testbyte(4); testbyte(5); testbyte(6); testbyte(7); #endif } } /* NOTREACHED */ return (0); }
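For readers who want to see the zero-byte test in isolation: the following userland sketch (not part of this commit; haszero() and the sample words are invented for illustration) evaluates the same ((x - 0x01....01) & ~x & 0x80....80) expression on 64-bit words.

#include <stdio.h>
#include <stdint.h>

#define MASK01	0x0101010101010101ULL	/* 0x01 in every byte */
#define MASK80	0x8080808080808080ULL	/* 0x80 in every byte */

/* Nonzero iff some byte of x is zero (Hacker's Delight). */
static int
haszero(uint64_t x)
{
	return (((x - MASK01) & ~x & MASK80) != 0);
}

int
main(void)
{
	printf("%d\n", haszero(0x6162636465666768ULL));	/* "abcdefgh" -> 0 */
	printf("%d\n", haszero(0x6162630065666768ULL));	/* embedded NUL -> 1 */
	return (0);
}

The subtraction borrows through a zero byte and leaves its high bit set, while ~x masks out bytes whose high bit was already set in the input; ANDing with MASK80 therefore flags a word exactly when it contains a zero byte.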
Index: head/sys/libkern/strnlen.c =================================================================== --- head/sys/libkern/strnlen.c (revision 326270) +++ head/sys/libkern/strnlen.c (revision 326271) @@ -1,42 +1,44 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2009 David Schultz * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include size_t strnlen(const char *s, size_t maxlen) { size_t len; for (len = 0; len < maxlen; len++, s++) { if (!*s) break; } return (len); } Index: head/sys/libkern/strspn.c =================================================================== --- head/sys/libkern/strspn.c (revision 326270) +++ head/sys/libkern/strspn.c (revision 326271) @@ -1,71 +1,73 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2005 David Schultz * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #define IDX(c) ((u_char)(c) / LONG_BIT) #define BIT(c) ((u_long)1 << ((u_char)(c) % LONG_BIT)) size_t strspn(const char *s, const char *charset) { /* * NB: idx and bit are temporaries whose use causes gcc 3.4.2 to * generate better code. Without them, gcc gets a little confused. */ const char *s1; u_long bit; u_long tbl[(UCHAR_MAX + 1) / LONG_BIT]; int idx; if (*s == '\0') return (0); #if LONG_BIT == 64 /* always better to unroll on 64-bit architectures */ tbl[3] = tbl[2] = tbl[1] = tbl[0] = 0; #else for (idx = 0; idx < sizeof(tbl) / sizeof(tbl[0]); idx++) tbl[idx] = 0; #endif for (; *charset != '\0'; charset++) { idx = IDX(*charset); bit = BIT(*charset); tbl[idx] |= bit; } for (s1 = s; ; s1++) { idx = IDX(*s1); bit = BIT(*s1); if ((tbl[idx] & bit) == 0) break; } return (s1 - s); }
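The IDX()/BIT() macros above build a 256-bit membership bitmap: one pass over charset sets a bit per byte value, after which each test inside the scan loop is an index, a shift, and an AND against a four-word table on LP64. The standalone sketch below (not part of this commit; the character set and a main() driver are invented for illustration, with userland headers standing in for the kernel ones) shows the same table:

#include <stdio.h>
#include <limits.h>

#define IDX(c)	((unsigned char)(c) / LONG_BIT)
#define BIT(c)	((unsigned long)1 << ((unsigned char)(c) % LONG_BIT))

int
main(void)
{
	unsigned long tbl[(UCHAR_MAX + 1) / LONG_BIT] = { 0 };
	const char *charset = "0123456789abcdef";

	/* One bit per member byte value. */
	for (; *charset != '\0'; charset++)
		tbl[IDX(*charset)] |= BIT(*charset);

	/* Constant-time membership tests. */
	printf("'a' in set: %d\n", (tbl[IDX('a')] & BIT('a')) != 0);
	printf("'z' in set: %d\n", (tbl[IDX('z')] & BIT('z')) != 0);
	return (0);
}

This is why strspn() costs O(strlen(charset) + strlen(s)) rather than the naive O(strlen(charset) * strlen(s)) of testing every input byte against every charset byte.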
Index: head/sys/libkern/strvalid.c =================================================================== --- head/sys/libkern/strvalid.c (revision 326270) +++ head/sys/libkern/strvalid.c (revision 326271) @@ -1,53 +1,55 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by NAI Labs, * the Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include /* * Return (1) if the buffer pointed to by the kernel pointer 'buffer', * of length 'bufferlen', contains a valid NUL-terminated string. */ int strvalid(const char *buffer, size_t bufferlen) { size_t i; /* Must be NUL-terminated. */ for (i = 0; i < bufferlen; i++) if (buffer[i] == '\0') return (1); return (0); }
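strvalid() reports only whether a NUL appears within the stated bounds, which makes it a cheap guard for fixed-size buffers that arrive across a trust boundary before they are treated as strings. A hypothetical sketch follows; the structure, the check_label() helper, and the choice of EINVAL are all invented for this example and are not part of this commit.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/libkern.h>

/* Invented example structure: 'name' may arrive unterminated. */
struct label_args {
	char	name[64];
};

static int
check_label(const struct label_args *la)
{
	/* Refuse to treat the buffer as a string unless it really is one. */
	if (!strvalid(la->name, sizeof(la->name)))
		return (EINVAL);
	return (0);
}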