diff --git a/sys/amd64/linux32/linux32_machdep.c b/sys/amd64/linux32/linux32_machdep.c
index ba4dcc61af76..7f1c85cee201 100644
--- a/sys/amd64/linux32/linux32_machdep.c
+++ b/sys/amd64/linux32/linux32_machdep.c
@@ -1,614 +1,614 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2004 Tim J. Robbins
  * Copyright (c) 2002 Doug Rabson
  * Copyright (c) 2000 Marcel Moolenaar
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/param.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/reg.h>
 #include <sys/syscallsubr.h>
 
 #include <machine/md_var.h>
 #include <machine/specialreg.h>
 #include <x86/ifunc.h>
 
 #include <compat/freebsd32/freebsd32_util.h>
 #include <amd64/linux32/linux.h>
 #include <amd64/linux32/linux32_proto.h>
 #include <compat/linux/linux_emul.h>
 #include <compat/linux/linux_fork.h>
 #include <compat/linux/linux_ipc.h>
 #include <compat/linux/linux_mmap.h>
 #include <compat/linux/linux_signal.h>
 #include <compat/linux/linux_util.h>
 
 /* Forward declaration; the definition follows the structure below. */
 static void	bsd_to_linux_rusage(struct rusage *ru, struct l_rusage *lru);
 
 /*
  * Argument block that linux_old_select() reads from user space: the
  * descriptor count followed by four user addresses, each held in an
  * l_uintptr_t and converted with PTRIN() before use.
  */
 struct l_old_select_argv {
 	l_int		nfds;
 	l_uintptr_t	readfds;
 	l_uintptr_t	writefds;
 	l_uintptr_t	exceptfds;
 	l_uintptr_t	timeout;
 } __packed;
 
 /*
  * Fill *lru from *ru by assigning each resource-usage member to the
  * member of the same name.  Only the members listed here are written.
  */
 static void
 bsd_to_linux_rusage(struct rusage *ru, struct l_rusage *lru)
 {
 #define	LRU_SET(member)	(lru->member = ru->member)
 
 	/* User and system CPU time. */
 	LRU_SET(ru_utime.tv_sec);
 	LRU_SET(ru_utime.tv_usec);
 	LRU_SET(ru_stime.tv_sec);
 	LRU_SET(ru_stime.tv_usec);
 
 	/* Remaining counters, in declaration order of the original copy. */
 	LRU_SET(ru_maxrss);
 	LRU_SET(ru_ixrss);
 	LRU_SET(ru_idrss);
 	LRU_SET(ru_isrss);
 	LRU_SET(ru_minflt);
 	LRU_SET(ru_majflt);
 	LRU_SET(ru_nswap);
 	LRU_SET(ru_inblock);
 	LRU_SET(ru_oublock);
 	LRU_SET(ru_msgsnd);
 	LRU_SET(ru_msgrcv);
 	LRU_SET(ru_nsignals);
 	LRU_SET(ru_nvcsw);
 	LRU_SET(ru_nivcsw);
 
 #undef LRU_SET
 }
 
 int
 linux_copyout_rusage(struct rusage *ru, void *uaddr)
 {
 	struct l_rusage lru;
 
 	bsd_to_linux_rusage(ru, &lru);
 
 	return (copyout(&lru, uaddr, sizeof(struct l_rusage)));
 }
 
 /*
  * readv entry point: obtain a uio from uap->iovp/uap->iovcnt through
  * freebsd32_copyinuio(), pass it to kern_readv() for uap->fd, then release
  * the uio.  Returns the copy-in error if that step fails, otherwise the
  * result of kern_readv().
  */
 int
 linux_readv(struct thread *td, struct linux_readv_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_readv(td, uap->fd, auio);
 	/* The uio is released whether or not the read succeeded. */
-	free(auio, M_IOV);
+	freeuio(auio);
 	return (error);
 }
 
 /*
  * Pair read from user space by the LINUX_MSGRCV case of linux_ipc() when
  * the upper 16 bits of the "what" argument are zero: it carries the message
  * buffer address and the message type.
  */
 struct l_ipc_kludge {
 	l_uintptr_t msgp;
 	l_long msgtyp;
 } __packed;
 
 /*
  * Multiplexed System V IPC entry point.
  *
  * The low 16 bits of args->what select the operation.  Each case repacks
  * the generic arguments (arg1, arg2, arg3, ptr, arg5) into the argument
  * structure of the specific handler and returns that handler's result.
  * An unrecognised operation returns EINVAL.
  */
 int
 linux_ipc(struct thread *td, struct linux_ipc_args *args)
 {
 
 	switch (args->what & 0xFFFF) {
 	case LINUX_SEMOP: {
 
 		/* Same as the timed variant below, but with a NULL timeout. */
 		return (kern_semop(td, args->arg1, PTRIN(args->ptr),
 		    args->arg2, NULL));
 	}
 	case LINUX_SEMGET: {
 		struct linux_semget_args a;
 
 		a.key = args->arg1;
 		a.nsems = args->arg2;
 		a.semflg = args->arg3;
 		return (linux_semget(td, &a));
 	}
 	case LINUX_SEMCTL: {
 		struct linux_semctl_args a;
 		int error;
 
 		a.semid = args->arg1;
 		a.semnum = args->arg2;
 		a.cmd = args->arg3;
 		/* The semctl argument itself is fetched from user memory. */
 		error = copyin(PTRIN(args->ptr), &a.arg, sizeof(a.arg));
 		if (error)
 			return (error);
 		return (linux_semctl(td, &a));
 	}
 	case LINUX_SEMTIMEDOP: {
 		struct linux_semtimedop_args a;
 
 		a.semid = args->arg1;
 		a.tsops = PTRIN(args->ptr);
 		a.nsops = args->arg2;
 		a.timeout = PTRIN(args->arg5);
 		return (linux_semtimedop(td, &a));
 	}
 	case LINUX_MSGSND: {
 		struct linux_msgsnd_args a;
 
 		a.msqid = args->arg1;
 		a.msgp = PTRIN(args->ptr);
 		a.msgsz = args->arg2;
 		a.msgflg = args->arg3;
 		return (linux_msgsnd(td, &a));
 	}
 	case LINUX_MSGRCV: {
 		struct linux_msgrcv_args a;
 
 		a.msqid = args->arg1;
 		a.msgsz = args->arg2;
 		a.msgflg = args->arg3;
 		/*
 		 * With the upper 16 bits of "what" clear, ptr refers to a
 		 * struct l_ipc_kludge holding the buffer address and type;
 		 * otherwise they arrive directly in ptr and arg5.
 		 */
 		if ((args->what >> 16) == 0) {
 			struct l_ipc_kludge tmp;
 			int error;
 
 			if (args->ptr == 0)
 				return (EINVAL);
 			error = copyin(PTRIN(args->ptr), &tmp, sizeof(tmp));
 			if (error)
 				return (error);
 			a.msgp = PTRIN(tmp.msgp);
 			a.msgtyp = tmp.msgtyp;
 		} else {
 			a.msgp = PTRIN(args->ptr);
 			a.msgtyp = args->arg5;
 		}
 		return (linux_msgrcv(td, &a));
 	}
 	case LINUX_MSGGET: {
 		struct linux_msgget_args a;
 
 		a.key = args->arg1;
 		a.msgflg = args->arg2;
 		return (linux_msgget(td, &a));
 	}
 	case LINUX_MSGCTL: {
 		struct linux_msgctl_args a;
 
 		a.msqid = args->arg1;
 		a.cmd = args->arg2;
 		a.buf = PTRIN(args->ptr);
 		return (linux_msgctl(td, &a));
 	}
 	case LINUX_SHMAT: {
 		struct linux_shmat_args a;
 		l_uintptr_t addr;
 		int error;
 
 		a.shmid = args->arg1;
 		a.shmaddr = PTRIN(args->ptr);
 		a.shmflg = args->arg2;
 		error = linux_shmat(td, &a);
 		if (error != 0)
 			return (error);
 		/*
 		 * The value linux_shmat() left in td_retval[0] is stored
 		 * through the user pointer in arg3, and the syscall return
 		 * value is reset to 0.
 		 */
 		addr = td->td_retval[0];
 		error = copyout(&addr, PTRIN(args->arg3), sizeof(addr));
 		td->td_retval[0] = 0;
 		return (error);
 	}
 	case LINUX_SHMDT: {
 		struct linux_shmdt_args a;
 
 		a.shmaddr = PTRIN(args->ptr);
 		return (linux_shmdt(td, &a));
 	}
 	case LINUX_SHMGET: {
 		struct linux_shmget_args a;
 
 		a.key = args->arg1;
 		a.size = args->arg2;
 		a.shmflg = args->arg3;
 		return (linux_shmget(td, &a));
 	}
 	case LINUX_SHMCTL: {
 		struct linux_shmctl_args a;
 
 		a.shmid = args->arg1;
 		a.cmd = args->arg2;
 		a.buf = PTRIN(args->ptr);
 		return (linux_shmctl(td, &a));
 	}
 	default:
 		break;
 	}
 
 	/* No case matched the requested operation. */
 	return (EINVAL);
 }
 
 int
 linux_old_select(struct thread *td, struct linux_old_select_args *args)
 {
 	struct l_old_select_argv linux_args;
 	struct linux_select_args newsel;
 	int error;
 
 	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
 	if (error)
 		return (error);
 
 	newsel.nfds = linux_args.nfds;
 	newsel.readfds = PTRIN(linux_args.readfds);
 	newsel.writefds = PTRIN(linux_args.writefds);
 	newsel.exceptfds = PTRIN(linux_args.exceptfds);
 	newsel.timeout = PTRIN(linux_args.timeout);
 	return (linux_select(td, &newsel));
 }
 
 /*
  * Install the TLS descriptor found at user address desc for thread td.
  *
  * A copyin failure is logged and returned without touching the thread.
  * A copyout failure is logged and returned as well, but the pcb and the
  * trap frame are still updated in that case.
  */
 int
 linux_set_cloned_tls(struct thread *td, void *desc)
 {
 	struct l_user_desc info;
 	struct pcb *pcb;
 	int error;
 
 	error = copyin(desc, &info, sizeof(struct l_user_desc));
 	if (error) {
 		linux_msg(td, "set_cloned_tls copyin info failed!");
 		return (error);
 	}
 
 	/* Report GUGS32_SEL back to user space as the entry number. */
 	info.entry_number = GUGS32_SEL;
 	error = copyout(&info, desc, sizeof(struct l_user_desc));
 	if (error)
 		linux_msg(td, "set_cloned_tls copyout info failed!");
 
 	/* Record the new %gs base and make the frame use the TLS selector. */
 	pcb = td->td_pcb;
 	update_pcb_bases(pcb);
 	pcb->pcb_gsbase = (register_t)info.base_addr;
 	td->td_frame->tf_gs = GSEL(GUGS32_SEL, SEL_UPL);
 
 	return (error);
 }
 
 /*
  * Prepare the trap frame of a newly created thread: optionally switch it to
  * a caller-supplied stack and make the syscall appear to return 0.
  * Always returns 0.
  */
 int
 linux_set_upcall(struct thread *td, register_t stack)
 {
 
 	/* A zero stack value leaves the stack pointer unchanged. */
 	if (stack != 0)
 		td->td_frame->tf_rsp = stack;
 
 	/*
 	 * The new Linux thread goes back to user space along the same path
 	 * as its parent; it only sees a different return value.
 	 */
 	td->td_frame->tf_rax = 0;
 
 	return (0);
 }
 
 int
 linux_mmap(struct thread *td, struct linux_mmap_args *args)
 {
 	int error;
 	struct l_mmap_argv linux_args;
 
 	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
 	if (error)
 		return (error);
 
 	return (linux_mmap_common(td, linux_args.addr, linux_args.len,
 	    linux_args.prot, linux_args.flags, linux_args.fd,
 	    (uint32_t)linux_args.pgoff));
 }
 
 int
 linux_iopl(struct thread *td, struct linux_iopl_args *args)
 {
 	int error;
 
 	if (args->level < 0 || args->level > 3)
 		return (EINVAL);
 	if ((error = priv_check(td, PRIV_IO)) != 0)
 		return (error);
 	if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
 		return (error);
 	td->td_frame->tf_rflags = (td->td_frame->tf_rflags & ~PSL_IOPL) |
 	    (args->level * (PSL_IOPL / 3));
 
 	return (0);
 }
 
 int
 linux_sigaction(struct thread *td, struct linux_sigaction_args *args)
 {
 	l_osigaction_t osa;
 	l_sigaction_t act, oact;
 	int error;
 
 	if (args->nsa != NULL) {
 		error = copyin(args->nsa, &osa, sizeof(l_osigaction_t));
 		if (error)
 			return (error);
 		act.lsa_handler = osa.lsa_handler;
 		act.lsa_flags = osa.lsa_flags;
 		act.lsa_restorer = osa.lsa_restorer;
 		LINUX_SIGEMPTYSET(act.lsa_mask);
 		act.lsa_mask.__mask = osa.lsa_mask;
 	}
 
 	error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL,
 	    args->osa ? &oact : NULL);
 
 	if (args->osa != NULL && !error) {
 		osa.lsa_handler = oact.lsa_handler;
 		osa.lsa_flags = oact.lsa_flags;
 		osa.lsa_restorer = oact.lsa_restorer;
 		osa.lsa_mask = oact.lsa_mask.__mask;
 		error = copyout(&osa, args->osa, sizeof(l_osigaction_t));
 	}
 
 	return (error);
 }
 
 /*
  * Suspend the thread with the signal mask given in args->mask.
  *
  * Linux passes two further arguments, restart and oldmask, which are not
  * used here.  It seems that "restart" is actually a context pointer that
  * lets the signal be delivered with a different register set.
  */
 int
 linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args)
 {
 	l_sigset_t lmask;
 	sigset_t bmask;
 
 	/* Build a Linux signal set from the single mask word. */
 	LINUX_SIGEMPTYSET(lmask);
 	lmask.__mask = args->mask;
 
 	linux_to_bsd_sigset(&lmask, &bmask);
 
 	return (kern_sigsuspend(td, bmask));
 }
 
 /*
  * pause: suspend the thread using a snapshot of its current signal mask.
  * The mask is read while the process lock is held.
  */
 int
 linux_pause(struct thread *td, struct linux_pause_args *args)
 {
 	sigset_t curmask;
 	struct proc *proc;
 
 	proc = td->td_proc;
 	PROC_LOCK(proc);
 	curmask = td->td_sigmask;
 	PROC_UNLOCK(proc);
 
 	return (kern_sigsuspend(td, curmask));
 }
 
 int
 linux_gettimeofday(struct thread *td, struct linux_gettimeofday_args *uap)
 {
 	struct timeval atv;
 	l_timeval atv32;
 	struct timezone rtz;
 	int error = 0;
 
 	if (uap->tp) {
 		microtime(&atv);
 		atv32.tv_sec = atv.tv_sec;
 		atv32.tv_usec = atv.tv_usec;
 		error = copyout(&atv32, uap->tp, sizeof(atv32));
 	}
 	if (error == 0 && uap->tzp != NULL) {
 		rtz.tz_minuteswest = 0;
 		rtz.tz_dsttime = 0;
 		error = copyout(&rtz, uap->tzp, sizeof(rtz));
 	}
 	return (error);
 }
 
 /*
  * settimeofday: read the optional time and timezone from user space and
  * pass them to kern_settimeofday().  A NULL user pointer is forwarded as a
  * NULL kernel pointer.
  */
 int
 linux_settimeofday(struct thread *td, struct linux_settimeofday_args *uap)
 {
 	struct timezone atz, *tzp;
 	struct timeval atv, *tvp;
 	l_timeval atv32;
 	int error;
 
 	tvp = NULL;
 	tzp = NULL;
 
 	if (uap->tp) {
 		error = copyin(uap->tp, &atv32, sizeof(atv32));
 		if (error != 0)
 			return (error);
 		/* Convert the 32-bit layout to the native timeval. */
 		atv.tv_sec = atv32.tv_sec;
 		atv.tv_usec = atv32.tv_usec;
 		tvp = &atv;
 	}
 
 	if (uap->tzp) {
 		error = copyin(uap->tzp, &atz, sizeof(atz));
 		if (error != 0)
 			return (error);
 		tzp = &atz;
 	}
 
 	return (kern_settimeofday(td, tvp, tzp));
 }
 
 int
 linux_getrusage(struct thread *td, struct linux_getrusage_args *uap)
 {
 	struct rusage s;
 	int error;
 
 	error = kern_getrusage(td, uap->who, &s);
 	if (error != 0)
 		return (error);
 	if (uap->rusage != NULL)
 		error = linux_copyout_rusage(&s, uap->rusage);
 	return (error);
 }
 
 /*
  * set_thread_area: install the TLS descriptor at user address args->desc.
  *
  * The descriptor is copied in, its entry number is validated and rewritten
  * to GUGS32_SEL, the updated descriptor is copied back out, and the
  * descriptor's base address becomes the thread's %gs base.
  *
  * Returns the copyin/copyout error if either fails, EINVAL for an entry
  * number other than GUGS32_SEL, GUDATA_SEL, 6 or -1, and 0 otherwise.
  */
 int
 linux_set_thread_area(struct thread *td,
     struct linux_set_thread_area_args *args)
 {
 	struct l_user_desc info;
 	struct pcb *pcb;
 	int error;
 
 	error = copyin(args->desc, &info, sizeof(struct l_user_desc));
 	if (error)
 		return (error);
 
 	/*
 	 * Semantics of Linux version: every thread in the system has array
 	 * of three TLS descriptors. 1st is GLIBC TLS, 2nd is WINE, 3rd unknown.
 	 * This syscall loads one of the selected TLS descriptors with a value
 	 * and also loads GDT descriptors 6, 7 and 8 with the content of
 	 * the per-thread descriptors.
 	 *
 	 * Semantics of FreeBSD version: I think we can ignore that Linux has
 	 * three per-thread descriptors and use just the first one.
 	 * The tls_array[] is used only in [gs]et_thread_area() syscalls and
 	 * for loading the GDT descriptors. We use just one GDT descriptor
 	 * for TLS, so we will load just one.
 	 *
 	 * XXX: This doesn't work when a user space process tries to use more
 	 * than one TLS segment. Comment in the Linux source says wine might
 	 * do this.
 	 */
 
 	/*
 	 * GLIBC reads current %gs and call set_thread_area() with it.
 	 * We should let GUDATA_SEL and GUGS32_SEL proceed as well because
 	 * we use these segments.
 	 */
 	switch (info.entry_number) {
 	case GUGS32_SEL:
 	case GUDATA_SEL:
 	case 6:
 	case -1:
 		/* Every accepted value is mapped to the single TLS selector. */
 		info.entry_number = GUGS32_SEL;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	/*
 	 * We have to copy out the GDT entry we use.
 	 *
 	 * XXX: What if a user space program does not check the return value
 	 * and tries to use 6, 7 or 8?
 	 */
 	error = copyout(&info, args->desc, sizeof(struct l_user_desc));
 	if (error)
 		return (error);
 
 	/* Thread state is modified only after the copyout has succeeded. */
 	pcb = td->td_pcb;
 	update_pcb_bases(pcb);
 	pcb->pcb_gsbase = (register_t)info.base_addr;
 	update_gdt_gsbase(td, info.base_addr);
 
 	return (0);
 }
 
 /*
  * Fill *l_regset from *b_reg.  Each Linux member takes the value of the
  * reg32 member carrying the same name with an "r_" prefix; orig_eax has no
  * such counterpart and is loaded from r_eax.
  */
 void
 bsd_to_linux_regset32(const struct reg32 *b_reg,
     struct linux_pt_regset32 *l_regset)
 {
 #define	REGSET_CP(reg)	(l_regset->reg = b_reg->r_##reg)
 
 	REGSET_CP(ebx);
 	REGSET_CP(ecx);
 	REGSET_CP(edx);
 	REGSET_CP(esi);
 	REGSET_CP(edi);
 	REGSET_CP(ebp);
 	REGSET_CP(eax);
 	REGSET_CP(ds);
 	REGSET_CP(es);
 	REGSET_CP(fs);
 	REGSET_CP(gs);
 	/* No separate original-eax value exists in reg32. */
 	l_regset->orig_eax = b_reg->r_eax;
 	REGSET_CP(eip);
 	REGSET_CP(cs);
 	REGSET_CP(eflags);
 	REGSET_CP(esp);
 	REGSET_CP(ss);
 
 #undef REGSET_CP
 }
 
 int futex_xchgl_nosmap(int oparg, uint32_t *uaddr, int *oldval);
 int futex_xchgl_smap(int oparg, uint32_t *uaddr, int *oldval);
 /*
  * Resolver for futex_xchgl(): selects the _smap implementation when
  * cpu_stdext_feature has CPUID_STDEXT_SMAP set, the _nosmap one otherwise.
  */
 DEFINE_IFUNC(, int, futex_xchgl, (int, uint32_t *, int *))
 {
 
 	if ((cpu_stdext_feature & CPUID_STDEXT_SMAP) != 0)
 		return (futex_xchgl_smap);
 	return (futex_xchgl_nosmap);
 }
 
 int futex_addl_nosmap(int oparg, uint32_t *uaddr, int *oldval);
 int futex_addl_smap(int oparg, uint32_t *uaddr, int *oldval);
 /*
  * Resolver for futex_addl(): selects the _smap implementation when
  * cpu_stdext_feature has CPUID_STDEXT_SMAP set, the _nosmap one otherwise.
  */
 DEFINE_IFUNC(, int, futex_addl, (int, uint32_t *, int *))
 {
 
 	if ((cpu_stdext_feature & CPUID_STDEXT_SMAP) != 0)
 		return (futex_addl_smap);
 	return (futex_addl_nosmap);
 }
 
 int futex_orl_nosmap(int oparg, uint32_t *uaddr, int *oldval);
 int futex_orl_smap(int oparg, uint32_t *uaddr, int *oldval);
 /*
  * Resolver for futex_orl(): selects the _smap implementation when
  * cpu_stdext_feature has CPUID_STDEXT_SMAP set, the _nosmap one otherwise.
  */
 DEFINE_IFUNC(, int, futex_orl, (int, uint32_t *, int *))
 {
 
 	if ((cpu_stdext_feature & CPUID_STDEXT_SMAP) != 0)
 		return (futex_orl_smap);
 	return (futex_orl_nosmap);
 }
 
 int futex_andl_nosmap(int oparg, uint32_t *uaddr, int *oldval);
 int futex_andl_smap(int oparg, uint32_t *uaddr, int *oldval);
 /*
  * Resolver for futex_andl(): selects the _smap implementation when
  * cpu_stdext_feature has CPUID_STDEXT_SMAP set, the _nosmap one otherwise.
  */
 DEFINE_IFUNC(, int, futex_andl, (int, uint32_t *, int *))
 {
 
 	if ((cpu_stdext_feature & CPUID_STDEXT_SMAP) != 0)
 		return (futex_andl_smap);
 	return (futex_andl_nosmap);
 }
 
 int futex_xorl_nosmap(int oparg, uint32_t *uaddr, int *oldval);
 int futex_xorl_smap(int oparg, uint32_t *uaddr, int *oldval);
 /*
  * Resolver for futex_xorl(): selects the _smap implementation when
  * cpu_stdext_feature has CPUID_STDEXT_SMAP set, the _nosmap one otherwise.
  */
 DEFINE_IFUNC(, int, futex_xorl, (int, uint32_t *, int *))
 {
 
 	if ((cpu_stdext_feature & CPUID_STDEXT_SMAP) != 0)
 		return (futex_xorl_smap);
 	return (futex_xorl_nosmap);
 }
 
 /*
  * PTRACE_PEEKUSER is not implemented here: report the requested offset
  * through LINUX_RATELIMIT_MSG_OPT1() and fail with EINVAL.  td, pid and
  * data are unused.
  *
  * NOTE(review): the offset is passed as uintptr_t but formatted with %ld;
  * confirm the signed conversion is intended.
  */
 int
 linux_ptrace_peekuser(struct thread *td, pid_t pid, void *addr, void *data)
 {
 
 	LINUX_RATELIMIT_MSG_OPT1("PTRACE_PEEKUSER offset %ld not implemented; "
 	    "returning EINVAL", (uintptr_t)addr);
 	return (EINVAL);
 }
 
 /*
  * PTRACE_POKEUSER is not implemented here: report the requested offset
  * through LINUX_RATELIMIT_MSG_OPT1() and fail with EINVAL.  td, pid and
  * data are unused.
  *
  * NOTE(review): the offset is passed as uintptr_t but formatted with %ld;
  * confirm the signed conversion is intended.
  */
 int
 linux_ptrace_pokeuser(struct thread *td, pid_t pid, void *addr, void *data)
 {
 
 	LINUX_RATELIMIT_MSG_OPT1("PTRACE_POKEUSER offset %ld "
 	    "not implemented; returning EINVAL", (uintptr_t)addr);
 	return (EINVAL);
 }
diff --git a/sys/cddl/compat/opensolaris/kern/opensolaris_uio.c b/sys/cddl/compat/opensolaris/kern/opensolaris_uio.c
index e1cc8f802762..b2a08a55bba4 100644
--- a/sys/cddl/compat/opensolaris/kern/opensolaris_uio.c
+++ b/sys/cddl/compat/opensolaris/kern/opensolaris_uio.c
@@ -1,91 +1,91 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
 /*        All Rights Reserved   */
 
 /*
  * University Copyright- Copyright (c) 1982, 1986, 1988
  * The Regents of the University of California
  * All Rights Reserved
  *
  * University Acknowledgment- Portions of this document are derived from
  * software developed by the University of California, Berkeley, and its
  * contributors.
  */
 
 /*
  */
 
 #include <sys/types.h>
 #include <sys/uio.h>
 #include <sys/vnode.h>
 
 /*
  * Same as uiomove(), but the caller's uio structure is left unmodified:
  * the transfer is performed on a copy of *uio.
  *
  * p, n   - kernel buffer and byte count handed to vn_io_fault_uiomove().
  * rw     - must equal uio->uio_rw (asserted).
  * cbytes - receives the number of bytes copied, computed as the difference
  *          between the original and the copy's uio_resid.
  *
  * Returns the result of vn_io_fault_uiomove().
  */
 int
 uiocopy(void *p, size_t n, enum uio_rw rw, struct uio *uio, size_t *cbytes)
 {
 	struct iovec small_iovec[1];
 	struct uio small_uio_clone;
 	struct uio *uio_clone;
 	int error;
 
 	ASSERT3U(uio->uio_rw, ==, rw);
 	if (uio->uio_iovcnt == 1) {
 		/* Single iovec: copy uio and iovec onto the stack. */
 		small_uio_clone = *uio;
 		small_iovec[0] = *uio->uio_iov;
 		small_uio_clone.uio_iov = small_iovec;
 		uio_clone = &small_uio_clone;
 	} else {
 		uio_clone = cloneuio(uio);
 	}
 
 	error = vn_io_fault_uiomove(p, n, uio_clone);
 	*cbytes = uio->uio_resid - uio_clone->uio_resid;
 	/* Only a clone obtained from cloneuio() has to be released. */
 	if (uio_clone != &small_uio_clone)
-		free(uio_clone, M_IOV);
+		freeuio(uio_clone);
 	return (error);
 }
 
 /*
  * Advance *uio past its next n bytes without copying any data.  A request
  * larger than uio->uio_resid is ignored entirely.
  */
 void
 uioskip(uio_t *uio, size_t n)
 {
 	enum uio_seg saved_segflg;
 
 	/* Ignoring oversized requests keeps full compatibility with illumos. */
 	if (n <= uio->uio_resid) {
 		/* Run uiomove() in UIO_NOCOPY mode, then restore the flag. */
 		saved_segflg = uio->uio_segflg;
 		uio->uio_segflg = UIO_NOCOPY;
 		uiomove(NULL, n, uio->uio_rw, uio);
 		uio->uio_segflg = saved_segflg;
 	}
 }
diff --git a/sys/compat/freebsd32/freebsd32_misc.c b/sys/compat/freebsd32/freebsd32_misc.c
index f2d66cf74b2b..facbf00a4c51 100644
--- a/sys/compat/freebsd32/freebsd32_misc.c
+++ b/sys/compat/freebsd32/freebsd32_misc.c
@@ -1,4176 +1,4173 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2002 Doug Rabson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 #include "opt_ffclock.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ktrace.h"
 
 #define __ELF_WORD_SIZE 32
 
 #ifdef COMPAT_FREEBSD11
 #define	_WANT_FREEBSD11_KEVENT
 #endif
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/capsicum.h>
 #include <sys/clock.h>
 #include <sys/exec.h>
 #include <sys/fcntl.h>
 #include <sys/filedesc.h>
 #include <sys/imgact.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/file.h>		/* Must come after sys/malloc.h */
 #include <sys/imgact.h>
 #include <sys/mbuf.h>
 #include <sys/mman.h>
 #include <sys/module.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/procctl.h>
 #include <sys/ptrace.h>
 #include <sys/reboot.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/selinfo.h>
 #include <sys/eventvar.h>	/* Must come after sys/selinfo.h */
 #include <sys/pipe.h>		/* Must come after sys/selinfo.h */
 #include <sys/signal.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/systm.h>
 #include <sys/thr.h>
 #include <sys/timerfd.h>
 #include <sys/timex.h>
 #include <sys/unistd.h>
 #include <sys/ucontext.h>
 #include <sys/vnode.h>
 #include <sys/wait.h>
 #include <sys/ipc.h>
 #include <sys/msg.h>
 #include <sys/sem.h>
 #include <sys/shm.h>
 #include <sys/timeffc.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #ifdef INET
 #include <netinet/in.h>
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 
 #include <machine/cpu.h>
 #include <machine/elf.h>
 #ifdef __amd64__
 #include <machine/md_var.h>
 #endif
 
 #include <security/audit/audit.h>
 
 #include <compat/freebsd32/freebsd32_util.h>
 #include <compat/freebsd32/freebsd32.h>
 #include <compat/freebsd32/freebsd32_ipc.h>
 #include <compat/freebsd32/freebsd32_misc.h>
 #include <compat/freebsd32/freebsd32_signal.h>
 #include <compat/freebsd32/freebsd32_proto.h>
 
 /*
  * Non-zero by default.  register_compat32_feature() registers the compat32
  * feature names only while this is non-zero.
  */
 int compat_freebsd_32bit = 1;
 
 /*
  * SYSINIT hook: register the two compat32 feature names, unless
  * compat_freebsd_32bit is zero.  The argument is unused.
  */
 static void
 register_compat32_feature(void *arg)
 {
 	if (compat_freebsd_32bit) {
 		FEATURE_ADD("compat_freebsd32",
 		    "Compatible with 32-bit FreeBSD");
 		FEATURE_ADD("compat_freebsd_32bit",
 		    "Compatible with 32-bit FreeBSD (legacy feature name)");
 	}
 }
 SYSINIT(freebsd32, SI_SUB_EXEC, SI_ORDER_ANY, register_compat32_feature,
     NULL);
 
 /*
  * I/O descriptor with an int operation code and three 32-bit fields.
  * NOTE(review): presumably the 32-bit layout of struct ptrace_io_desc; its
  * users are not in this part of the file -- confirm.
  */
 struct ptrace_io_desc32 {
 	int		piod_op;
 	uint32_t	piod_offs;
 	uint32_t	piod_addr;
 	uint32_t	piod_len;
 };
 
 /*
  * VM map entry description in which the start, end, offset and path
  * members are stored as 32-bit integers.
  * NOTE(review): presumably the 32-bit layout of struct ptrace_vm_entry; its
  * users are not in this part of the file -- confirm.
  */
 struct ptrace_vm_entry32 {
 	int		pve_entry;
 	int		pve_timestamp;
 	uint32_t	pve_start;
 	uint32_t	pve_end;
 	uint32_t	pve_offset;
 	u_int		pve_prot;
 	u_int		pve_pathlen;
 	int32_t		pve_fileid;
 	u_int		pve_fsid;
 	uint32_t	pve_path;
 };
 
 /*
  * Compile-time checks on the sizes of the 32-bit compatibility structures.
  * Several of them have one expected size when __amd64__ is defined and a
  * different one otherwise; the remaining checks are unconditional.
  */
 #ifdef __amd64__
 CTASSERT(sizeof(struct timeval32) == 8);
 CTASSERT(sizeof(struct timespec32) == 8);
 CTASSERT(sizeof(struct itimerval32) == 16);
 CTASSERT(sizeof(struct bintime32) == 12);
 #else
 CTASSERT(sizeof(struct timeval32) == 16);
 CTASSERT(sizeof(struct timespec32) == 16);
 CTASSERT(sizeof(struct itimerval32) == 32);
 CTASSERT(sizeof(struct bintime32) == 16);
 #endif
 CTASSERT(sizeof(struct ostatfs32) == 256);
 #ifdef __amd64__
 CTASSERT(sizeof(struct rusage32) == 72);
 #else
 CTASSERT(sizeof(struct rusage32) == 88);
 #endif
 CTASSERT(sizeof(struct sigaltstack32) == 12);
 #ifdef __amd64__
 CTASSERT(sizeof(struct kevent32) == 56);
 #else
 CTASSERT(sizeof(struct kevent32) == 64);
 #endif
 CTASSERT(sizeof(struct iovec32) == 8);
 CTASSERT(sizeof(struct msghdr32) == 28);
 #ifdef __amd64__
 CTASSERT(sizeof(struct stat32) == 208);
 CTASSERT(sizeof(struct freebsd11_stat32) == 96);
 #else
 CTASSERT(sizeof(struct stat32) == 224);
 CTASSERT(sizeof(struct freebsd11_stat32) == 120);
 #endif
 CTASSERT(sizeof(struct sigaction32) == 24);
 
 /* Forward declarations of file-local helpers. */
 static int freebsd32_kevent_copyout(void *arg, struct kevent *kevp, int count);
 static int freebsd32_kevent_copyin(void *arg, struct kevent *kevp, int count);
 static int freebsd32_user_clock_nanosleep(struct thread *td, clockid_t clock_id,
     int flags, const struct timespec32 *ua_rqtp, struct timespec32 *ua_rmtp);
 
 /*
  * Convert *s into *s32 member by member: the two time members go through
  * TV_CP(), every other member through CP().  Members not listed here are
  * not written.
  */
 void
 freebsd32_rusage_out(const struct rusage *s, struct rusage32 *s32)
 {
 
 	TV_CP(*s, *s32, ru_utime);
 	TV_CP(*s, *s32, ru_stime);
 	CP(*s, *s32, ru_maxrss);
 	CP(*s, *s32, ru_ixrss);
 	CP(*s, *s32, ru_idrss);
 	CP(*s, *s32, ru_isrss);
 	CP(*s, *s32, ru_minflt);
 	CP(*s, *s32, ru_majflt);
 	CP(*s, *s32, ru_nswap);
 	CP(*s, *s32, ru_inblock);
 	CP(*s, *s32, ru_oublock);
 	CP(*s, *s32, ru_msgsnd);
 	CP(*s, *s32, ru_msgrcv);
 	CP(*s, *s32, ru_nsignals);
 	CP(*s, *s32, ru_nvcsw);
 	CP(*s, *s32, ru_nivcsw);
 }
 
 /*
  * wait4 for 32-bit callers: call kern_wait(), then copy out the exit status
  * and the resource usage (converted to struct rusage32) to whichever of
  * uap->status and uap->rusage is non-NULL.  The first failure ends the
  * call and is returned.
  */
 int
 freebsd32_wait4(struct thread *td, struct freebsd32_wait4_args *uap)
 {
 	struct rusage ru, *rup;
 	struct rusage32 ru32;
 	int error, status;
 
 	/* Ask for resource usage only when the caller wants it. */
 	rup = (uap->rusage != NULL) ? &ru : NULL;
 
 	error = kern_wait(td, uap->pid, &status, uap->options, rup);
 	if (error != 0)
 		return (error);
 
 	if (uap->status != NULL) {
 		error = copyout(&status, uap->status, sizeof(status));
 		if (error != 0)
 			return (error);
 	}
 	if (uap->rusage != NULL) {
 		freebsd32_rusage_out(&ru, &ru32);
 		error = copyout(&ru32, uap->rusage, sizeof(ru32));
 	}
 	return (error);
 }
 
 /*
  * wait6 for 32-bit callers: call kern_wait6() with the id reassembled by
  * PAIR32TO64(), then copy out, in this order, the status, the resource
  * usage pair and the signal information to whichever user pointers are
  * non-NULL.  The first failure ends the call and is returned.
  */
 int
 freebsd32_wait6(struct thread *td, struct freebsd32_wait6_args *uap)
 {
 	struct __wrusage wru, *wrup;
 	struct __wrusage32 wru32;
 	struct __siginfo si, *sip;
 	struct siginfo32 si32;
 	int error, status;
 
 	wrup = (uap->wrusage != NULL) ? &wru : NULL;
 	sip = NULL;
 	if (uap->info != NULL) {
 		/* Start from a zeroed siginfo before the kernel fills it. */
 		bzero(&si, sizeof(si));
 		sip = &si;
 	}
 
 	error = kern_wait6(td, uap->idtype, PAIR32TO64(id_t, uap->id),
 	    &status, uap->options, wrup, sip);
 	if (error != 0)
 		return (error);
 
 	if (uap->status != NULL) {
 		error = copyout(&status, uap->status, sizeof(status));
 		if (error != 0)
 			return (error);
 	}
 	if (uap->wrusage != NULL) {
 		freebsd32_rusage_out(&wru.wru_self, &wru32.wru_self);
 		freebsd32_rusage_out(&wru.wru_children, &wru32.wru_children);
 		error = copyout(&wru32, uap->wrusage, sizeof(wru32));
 		if (error != 0)
 			return (error);
 	}
 	if (uap->info != NULL) {
 		siginfo_to_siginfo32(&si, &si32);
 		error = copyout(&si32, uap->info, sizeof(si32));
 	}
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD4
 /*
  * Fill the struct ostatfs32 *out from *in.
  *
  * statfs_scale_blocks() is applied to *in first (presumably rescaling its
  * block counts in place so they fit INT32_MAX -- TODO confirm).  *out is
  * zeroed, then members are either copied with CP() or clamped to INT32_MAX
  * with MIN().  The three name strings are copied with strlcpy() using the
  * bounds shown.
  */
 static void
 copy_statfs(struct statfs *in, struct ostatfs32 *out)
 {
 
 	statfs_scale_blocks(in, INT32_MAX);
 	bzero(out, sizeof(*out));
 	CP(*in, *out, f_bsize);
 	out->f_iosize = MIN(in->f_iosize, INT32_MAX);
 	CP(*in, *out, f_blocks);
 	CP(*in, *out, f_bfree);
 	CP(*in, *out, f_bavail);
 	out->f_files = MIN(in->f_files, INT32_MAX);
 	out->f_ffree = MIN(in->f_ffree, INT32_MAX);
 	CP(*in, *out, f_fsid);
 	CP(*in, *out, f_owner);
 	CP(*in, *out, f_type);
 	CP(*in, *out, f_flags);
 	out->f_syncwrites = MIN(in->f_syncwrites, INT32_MAX);
 	out->f_asyncwrites = MIN(in->f_asyncwrites, INT32_MAX);
 	strlcpy(out->f_fstypename,
 	      in->f_fstypename, MFSNAMELEN);
 	/* Mount names are bounded by the smaller of the two name lengths. */
 	strlcpy(out->f_mntonname,
 	      in->f_mntonname, min(MNAMELEN, FREEBSD4_OMNAMELEN));
 	out->f_syncreads = MIN(in->f_syncreads, INT32_MAX);
 	out->f_asyncreads = MIN(in->f_asyncreads, INT32_MAX);
 	strlcpy(out->f_mntfromname,
 	      in->f_mntfromname, min(MNAMELEN, FREEBSD4_OMNAMELEN));
 }
 #endif
 
 int
 freebsd32_getfsstat(struct thread *td, struct freebsd32_getfsstat_args *uap)
 {
 	size_t count;
 	int error;
 
 	if (uap->bufsize < 0 || uap->bufsize > SIZE_MAX)
 		return (EINVAL);
 	error = kern_getfsstat(td, &uap->buf, uap->bufsize, &count,
 	    UIO_USERSPACE, uap->mode);
 	if (error == 0)
 		td->td_retval[0] = count;
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD4
 /*
  * FreeBSD 4 getfsstat for 32-bit callers.
  *
  * The user buffer size is expressed in struct ostatfs32 entries; the same
  * number of native struct statfs entries is requested from
  * kern_getfsstat() in kernel space.  Each returned entry is converted with
  * copy_statfs() and copied out, advancing uap->buf one entry at a time.
  * On success the count reported by kern_getfsstat() becomes the syscall
  * result.
  */
 int
 freebsd4_freebsd32_getfsstat(struct thread *td,
     struct freebsd4_freebsd32_getfsstat_args *uap)
 {
 	struct statfs *buf, *sp;
 	struct ostatfs32 stat32;
 	size_t count, size, copycount;
 	int error;
 
 	count = uap->bufsize / sizeof(struct ostatfs32);
 	size = count * sizeof(struct statfs);
 	error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE, uap->mode);
 	if (size > 0) {
 		sp = buf;
 		copycount = count;
 		/* The loop stops at the first copyout failure. */
 		while (copycount > 0 && error == 0) {
 			copy_statfs(sp, &stat32);
 			error = copyout(&stat32, uap->buf, sizeof(stat32));
 			sp++;
 			uap->buf++;
 			copycount--;
 		}
 		/*
 		 * NOTE(review): buf is freed whenever size > 0, including
 		 * when kern_getfsstat() failed -- confirm it always sets
 		 * buf in that case.
 		 */
 		free(buf, M_STATFS);
 	}
 	if (error == 0)
 		td->td_retval[0] = count;
 	return (error);
 }
 #endif
 
 #ifdef COMPAT_FREEBSD11
 /*
  * FreeBSD 11 getfsstat for 32-bit callers: forwards buf, bufsize and mode
  * unchanged to kern_freebsd11_getfsstat() and returns its result.
  */
 int
 freebsd11_freebsd32_getfsstat(struct thread *td,
     struct freebsd11_freebsd32_getfsstat_args *uap)
 {
 	int error;
 
 	error = kern_freebsd11_getfsstat(td, uap->buf, uap->bufsize,
 	    uap->mode);
 	return (error);
 }
 #endif
 
 int
 freebsd32_sigaltstack(struct thread *td,
 		      struct freebsd32_sigaltstack_args *uap)
 {
 	struct sigaltstack32 s32;
 	struct sigaltstack ss, oss, *ssp;
 	int error;
 
 	if (uap->ss != NULL) {
 		error = copyin(uap->ss, &s32, sizeof(s32));
 		if (error)
 			return (error);
 		PTRIN_CP(s32, ss, ss_sp);
 		CP(s32, ss, ss_size);
 		CP(s32, ss, ss_flags);
 		ssp = &ss;
 	} else
 		ssp = NULL;
 	error = kern_sigaltstack(td, ssp, &oss);
 	if (error == 0 && uap->oss != NULL) {
 		PTROUT_CP(oss, s32, ss_sp);
 		CP(oss, s32, ss_size);
 		CP(oss, s32, ss_flags);
 		error = copyout(&s32, uap->oss, sizeof(s32));
 	}
 	return (error);
 }
 
 /*
  * Custom version of exec_copyin_args() so that we can translate
  * the pointers.
  *
  * argv and envv are user-space arrays of 32-bit values, each terminated by
  * a zero entry.  Every non-zero entry is converted with PTRIN() and the
  * string it refers to is added to *args.  envv may be NULL; a NULL argv
  * yields EFAULT.
  *
  * Returns 0 on success.  On any failure after exec_alloc_args() has
  * succeeded, the arguments are released with exec_free_args() and the
  * error is returned.
  */
 int
 freebsd32_exec_copyin_args(struct image_args *args, const char *fname,
     enum uio_seg segflg, uint32_t *argv, uint32_t *envv)
 {
 	char *argp, *envp;
 	uint32_t *p32, arg;
 	int error;
 
 	bzero(args, sizeof(*args));
 	if (argv == NULL)
 		return (EFAULT);
 
 	/*
 	 * Allocate demand-paged memory for the file name, argument, and
 	 * environment strings.
 	 */
 	error = exec_alloc_args(args);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Copy the file name.
 	 */
 	error = exec_args_add_fname(args, fname, segflg);
 	if (error != 0)
 		goto err_exit;
 
 	/*
 	 * extract arguments first
 	 */
 	p32 = argv;
 	for (;;) {
 		/* Fetch the next 32-bit pointer; zero ends the list. */
 		error = copyin(p32++, &arg, sizeof(arg));
 		if (error)
 			goto err_exit;
 		if (arg == 0)
 			break;
 		argp = PTRIN(arg);
 		error = exec_args_add_arg(args, argp, UIO_USERSPACE);
 		if (error != 0)
 			goto err_exit;
 	}
 
 	/*
 	 * extract environment strings
 	 */
 	if (envv) {
 		p32 = envv;
 		for (;;) {
 			error = copyin(p32++, &arg, sizeof(arg));
 			if (error)
 				goto err_exit;
 			if (arg == 0)
 				break;
 			envp = PTRIN(arg);
 			error = exec_args_add_env(args, envp, UIO_USERSPACE);
 			if (error != 0)
 				goto err_exit;
 		}
 	}
 
 	return (0);
 
 err_exit:
 	exec_free_args(args);
 	return (error);
 }
 
 int
 freebsd32_execve(struct thread *td, struct freebsd32_execve_args *uap)
 {
 	struct image_args eargs;
 	struct vmspace *oldvmspace;
 	int error;
 
 	error = pre_execve(td, &oldvmspace);
 	if (error != 0)
 		return (error);
 	error = freebsd32_exec_copyin_args(&eargs, uap->fname, UIO_USERSPACE,
 	    uap->argv, uap->envv);
 	if (error == 0)
 		error = kern_execve(td, &eargs, NULL, oldvmspace);
 	post_execve(td, error, oldvmspace);
 	AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td);
 	return (error);
 }
 
 int
 freebsd32_fexecve(struct thread *td, struct freebsd32_fexecve_args *uap)
 {
 	struct image_args eargs;
 	struct vmspace *oldvmspace;
 	int error;
 
 	error = pre_execve(td, &oldvmspace);
 	if (error != 0)
 		return (error);
 	error = freebsd32_exec_copyin_args(&eargs, NULL, UIO_SYSSPACE,
 	    uap->argv, uap->envv);
 	if (error == 0) {
 		eargs.fd = uap->fd;
 		error = kern_execve(td, &eargs, NULL, oldvmspace);
 	}
 	post_execve(td, error, oldvmspace);
 	AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td);
 	return (error);
 }
 
 int
 freebsd32_mknodat(struct thread *td, struct freebsd32_mknodat_args *uap)
 {
 
 	return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE,
 	    uap->mode, PAIR32TO64(dev_t, uap->dev)));
 }
 
 int
 freebsd32_mprotect(struct thread *td, struct freebsd32_mprotect_args *uap)
 {
 	int prot;
 
 	prot = uap->prot;
 #if defined(__amd64__)
 	if (i386_read_exec && (prot & PROT_READ) != 0)
 		prot |= PROT_EXEC;
 #endif
 	return (kern_mprotect(td, (uintptr_t)PTRIN(uap->addr), uap->len,
 	    prot, 0));
 }
 
 int
 freebsd32_mmap(struct thread *td, struct freebsd32_mmap_args *uap)
 {
 	int prot;
 
 	prot = uap->prot;
 #if defined(__amd64__)
 	if (i386_read_exec && (prot & PROT_READ))
 		prot |= PROT_EXEC;
 #endif
 
 	return (kern_mmap(td, &(struct mmap_req){
 		.mr_hint = (uintptr_t)uap->addr,
 		.mr_len = uap->len,
 		.mr_prot = prot,
 		.mr_flags = uap->flags,
 		.mr_fd = uap->fd,
 		.mr_pos = PAIR32TO64(off_t, uap->pos),
 	    }));
 }
 
 #ifdef COMPAT_FREEBSD6
 int
 freebsd6_freebsd32_mmap(struct thread *td,
     struct freebsd6_freebsd32_mmap_args *uap)
 {
 	int prot;
 
 	prot = uap->prot;
 #if defined(__amd64__)
 	if (i386_read_exec && (prot & PROT_READ))
 		prot |= PROT_EXEC;
 #endif
 
 	return (kern_mmap(td, &(struct mmap_req){
 		.mr_hint = (uintptr_t)uap->addr,
 		.mr_len = uap->len,
 		.mr_prot = prot,
 		.mr_flags = uap->flags,
 		.mr_fd = uap->fd,
 		.mr_pos = PAIR32TO64(off_t, uap->pos),
 	    }));
 }
 #endif
 
 #ifdef COMPAT_43
 int
 ofreebsd32_mmap(struct thread *td, struct ofreebsd32_mmap_args *uap)
 {
 	return (kern_ommap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
 	    uap->flags, uap->fd, uap->pos));
 }
 #endif
 
 int
 freebsd32_setitimer(struct thread *td, struct freebsd32_setitimer_args *uap)
 {
 	struct itimerval itv, oitv, *itvp;	
 	struct itimerval32 i32;
 	int error;
 
 	if (uap->itv != NULL) {
 		error = copyin(uap->itv, &i32, sizeof(i32));
 		if (error)
 			return (error);
 		TV_CP(i32, itv, it_interval);
 		TV_CP(i32, itv, it_value);
 		itvp = &itv;
 	} else
 		itvp = NULL;
 	error = kern_setitimer(td, uap->which, itvp, &oitv);
 	if (error || uap->oitv == NULL)
 		return (error);
 	TV_CP(oitv, i32, it_interval);
 	TV_CP(oitv, i32, it_value);
 	return (copyout(&i32, uap->oitv, sizeof(i32)));
 }
 
 int
 freebsd32_getitimer(struct thread *td, struct freebsd32_getitimer_args *uap)
 {
 	struct itimerval itv;
 	struct itimerval32 i32;
 	int error;
 
 	error = kern_getitimer(td, uap->which, &itv);
 	if (error || uap->itv == NULL)
 		return (error);
 	TV_CP(itv, i32, it_interval);
 	TV_CP(itv, i32, it_value);
 	return (copyout(&i32, uap->itv, sizeof(i32)));
 }
 
 int
 freebsd32_select(struct thread *td, struct freebsd32_select_args *uap)
 {
 	struct timeval32 tv32;
 	struct timeval tv, *tvp;
 	int error;
 
 	if (uap->tv != NULL) {
 		error = copyin(uap->tv, &tv32, sizeof(tv32));
 		if (error)
 			return (error);
 		CP(tv32, tv, tv_sec);
 		CP(tv32, tv, tv_usec);
 		tvp = &tv;
 	} else
 		tvp = NULL;
 	/*
 	 * XXX Do pointers need PTRIN()?
 	 */
 	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
 	    sizeof(int32_t) * 8));
 }
 
 int
 freebsd32_pselect(struct thread *td, struct freebsd32_pselect_args *uap)
 {
 	struct timespec32 ts32;
 	struct timespec ts;
 	struct timeval tv, *tvp;
 	sigset_t set, *uset;
 	int error;
 
 	if (uap->ts != NULL) {
 		error = copyin(uap->ts, &ts32, sizeof(ts32));
 		if (error != 0)
 			return (error);
 		CP(ts32, ts, tv_sec);
 		CP(ts32, ts, tv_nsec);
 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
 		tvp = &tv;
 	} else
 		tvp = NULL;
 	if (uap->sm != NULL) {
 		error = copyin(uap->sm, &set, sizeof(set));
 		if (error != 0)
 			return (error);
 		uset = &set;
 	} else
 		uset = NULL;
 	/*
 	 * XXX Do pointers need PTRIN()?
 	 */
 	error = kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
 	    uset, sizeof(int32_t) * 8);
 	return (error);
 }
 
 /*
  * Copy 'count' items into the destination list pointed to by uap->eventlist.
  */
 static int
 freebsd32_kevent_copyout(void *arg, struct kevent *kevp, int count)
 {
 	struct freebsd32_kevent_args *uap;
 	struct kevent32	ks32[KQ_NEVENTS];
 	uint64_t e;
 	int i, j, error;
 
 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
 	uap = (struct freebsd32_kevent_args *)arg;
 
 	for (i = 0; i < count; i++) {
 		CP(kevp[i], ks32[i], ident);
 		CP(kevp[i], ks32[i], filter);
 		CP(kevp[i], ks32[i], flags);
 		CP(kevp[i], ks32[i], fflags);
 #if BYTE_ORDER == LITTLE_ENDIAN
 		ks32[i].data1 = kevp[i].data;
 		ks32[i].data2 = kevp[i].data >> 32;
 #else
 		ks32[i].data1 = kevp[i].data >> 32;
 		ks32[i].data2 = kevp[i].data;
 #endif
 		PTROUT_CP(kevp[i], ks32[i], udata);
 		for (j = 0; j < nitems(kevp->ext); j++) {
 			e = kevp[i].ext[j];
 #if BYTE_ORDER == LITTLE_ENDIAN
 			ks32[i].ext64[2 * j] = e;
 			ks32[i].ext64[2 * j + 1] = e >> 32;
 #else
 			ks32[i].ext64[2 * j] = e >> 32;
 			ks32[i].ext64[2 * j + 1] = e;
 #endif
 		}
 	}
 	error = copyout(ks32, uap->eventlist, count * sizeof *ks32);
 	if (error == 0)
 		uap->eventlist += count;
 	return (error);
 }
 
 /*
  * Copy 'count' items from the list pointed to by uap->changelist.
  */
 static int
 freebsd32_kevent_copyin(void *arg, struct kevent *kevp, int count)
 {
 	struct freebsd32_kevent_args *uap;
 	struct kevent32	ks32[KQ_NEVENTS];
 	uint64_t e;
 	int i, j, error;
 
 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
 	uap = (struct freebsd32_kevent_args *)arg;
 
 	error = copyin(uap->changelist, ks32, count * sizeof *ks32);
 	if (error)
 		goto done;
 	uap->changelist += count;
 
 	for (i = 0; i < count; i++) {
 		CP(ks32[i], kevp[i], ident);
 		CP(ks32[i], kevp[i], filter);
 		CP(ks32[i], kevp[i], flags);
 		CP(ks32[i], kevp[i], fflags);
 		kevp[i].data = PAIR32TO64(uint64_t, ks32[i].data);
 		PTRIN_CP(ks32[i], kevp[i], udata);
 		for (j = 0; j < nitems(kevp->ext); j++) {
 #if BYTE_ORDER == LITTLE_ENDIAN
 			e = ks32[i].ext64[2 * j + 1];
 			e <<= 32;
 			e += ks32[i].ext64[2 * j];
 #else
 			e = ks32[i].ext64[2 * j];
 			e <<= 32;
 			e += ks32[i].ext64[2 * j + 1];
 #endif
 			kevp[i].ext[j] = e;
 		}
 	}
 done:
 	return (error);
 }
 
 /*
  * kevent(2) for 32-bit processes.
  *
  * The optional timeout is converted from struct timespec32, and the
  * change/event lists are translated by the freebsd32_kevent_copyin() and
  * freebsd32_kevent_copyout() callbacks that kern_kevent() receives via
  * k_ops.  With KTRACE, the user's change list is traced before the call
  * and the returned events after a successful call.
  */
 int
 freebsd32_kevent(struct thread *td, struct freebsd32_kevent_args *uap)
 {
 	struct timespec32 ts32;
 	struct timespec ts, *tsp;
 	struct kevent_copyops k_ops = {
 		.arg = uap,
 		.k_copyout = freebsd32_kevent_copyout,
 		.k_copyin = freebsd32_kevent_copyin,
 	};
 #ifdef KTRACE
 	/*
 	 * Remember the start of the event list: the copy-out callback
 	 * advances uap->eventlist as it stores events.
 	 */
 	struct kevent32 *eventlist = uap->eventlist;
 #endif
 	int error;
 
 	if (uap->timeout) {
 		error = copyin(uap->timeout, &ts32, sizeof(ts32));
 		if (error)
 			return (error);
 		CP(ts32, ts, tv_sec);
 		CP(ts32, ts, tv_nsec);
 		tsp = &ts;
 	} else
 		tsp = NULL;
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_STRUCT_ARRAY))
 		ktrstructarray("kevent32", UIO_USERSPACE, uap->changelist,
 		    uap->nchanges, sizeof(struct kevent32));
 #endif
 	error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
 	    &k_ops, tsp);
 #ifdef KTRACE
 	/* td_retval[0] is used as the number of events to trace. */
 	if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY))
 		ktrstructarray("kevent32", UIO_USERSPACE, eventlist,
 		    td->td_retval[0], sizeof(struct kevent32));
 #endif
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD11
 static int
 freebsd32_kevent11_copyout(void *arg, struct kevent *kevp, int count)
 {
 	struct freebsd11_freebsd32_kevent_args *uap;
 	struct freebsd11_kevent32 ks32[KQ_NEVENTS];
 	int i, error;
 
 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
 	uap = (struct freebsd11_freebsd32_kevent_args *)arg;
 
 	for (i = 0; i < count; i++) {
 		CP(kevp[i], ks32[i], ident);
 		CP(kevp[i], ks32[i], filter);
 		CP(kevp[i], ks32[i], flags);
 		CP(kevp[i], ks32[i], fflags);
 		CP(kevp[i], ks32[i], data);
 		PTROUT_CP(kevp[i], ks32[i], udata);
 	}
 	error = copyout(ks32, uap->eventlist, count * sizeof *ks32);
 	if (error == 0)
 		uap->eventlist += count;
 	return (error);
 }
 
 /*
  * Copy 'count' items from the list pointed to by uap->changelist.
  */
 static int
 freebsd32_kevent11_copyin(void *arg, struct kevent *kevp, int count)
 {
 	struct freebsd11_freebsd32_kevent_args *uap;
 	struct freebsd11_kevent32 ks32[KQ_NEVENTS];
 	int i, j, error;
 
 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
 	uap = (struct freebsd11_freebsd32_kevent_args *)arg;
 
 	error = copyin(uap->changelist, ks32, count * sizeof *ks32);
 	if (error)
 		goto done;
 	uap->changelist += count;
 
 	for (i = 0; i < count; i++) {
 		CP(ks32[i], kevp[i], ident);
 		CP(ks32[i], kevp[i], filter);
 		CP(ks32[i], kevp[i], flags);
 		CP(ks32[i], kevp[i], fflags);
 		CP(ks32[i], kevp[i], data);
 		PTRIN_CP(ks32[i], kevp[i], udata);
 		for (j = 0; j < nitems(kevp->ext); j++)
 			kevp[i].ext[j] = 0;
 	}
 done:
 	return (error);
 }
 
 /*
  * FreeBSD 11 compatible kevent(2) for 32-bit processes.
  *
  * Structured like freebsd32_kevent() but uses the
  * struct freebsd11_kevent32 layout and the matching
  * freebsd32_kevent11_copyin()/freebsd32_kevent11_copyout() callbacks.
  */
 int
 freebsd11_freebsd32_kevent(struct thread *td,
     struct freebsd11_freebsd32_kevent_args *uap)
 {
 	struct timespec32 ts32;
 	struct timespec ts, *tsp;
 	struct kevent_copyops k_ops = {
 		.arg = uap,
 		.k_copyout = freebsd32_kevent11_copyout,
 		.k_copyin = freebsd32_kevent11_copyin,
 	};
 #ifdef KTRACE
 	/*
 	 * Remember the start of the event list: the copy-out callback
 	 * advances uap->eventlist as it stores events.
 	 */
 	struct freebsd11_kevent32 *eventlist = uap->eventlist;
 #endif
 	int error;
 
 	if (uap->timeout) {
 		error = copyin(uap->timeout, &ts32, sizeof(ts32));
 		if (error)
 			return (error);
 		CP(ts32, ts, tv_sec);
 		CP(ts32, ts, tv_nsec);
 		tsp = &ts;
 	} else
 		tsp = NULL;
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_STRUCT_ARRAY))
 		ktrstructarray("freebsd11_kevent32", UIO_USERSPACE,
 		    uap->changelist, uap->nchanges,
 		    sizeof(struct freebsd11_kevent32));
 #endif
 	error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
 	    &k_ops, tsp);
 #ifdef KTRACE
 	/* td_retval[0] is used as the number of events to trace. */
 	if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY))
 		ktrstructarray("freebsd11_kevent32", UIO_USERSPACE,
 		    eventlist, td->td_retval[0],
 		    sizeof(struct freebsd11_kevent32));
 #endif
 	return (error);
 }
 #endif
 
 int
 freebsd32_gettimeofday(struct thread *td,
 		       struct freebsd32_gettimeofday_args *uap)
 {
 	struct timeval atv;
 	struct timeval32 atv32;
 	struct timezone rtz;
 	int error = 0;
 
 	if (uap->tp) {
 		microtime(&atv);
 		CP(atv, atv32, tv_sec);
 		CP(atv, atv32, tv_usec);
 		error = copyout(&atv32, uap->tp, sizeof (atv32));
 	}
 	if (error == 0 && uap->tzp != NULL) {
 		rtz.tz_minuteswest = 0;
 		rtz.tz_dsttime = 0;
 		error = copyout(&rtz, uap->tzp, sizeof (rtz));
 	}
 	return (error);
 }
 
 int
 freebsd32_getrusage(struct thread *td, struct freebsd32_getrusage_args *uap)
 {
 	struct rusage32 s32;
 	struct rusage s;
 	int error;
 
 	error = kern_getrusage(td, uap->who, &s);
 	if (error == 0) {
 		freebsd32_rusage_out(&s, &s32);
 		error = copyout(&s32, uap->rusage, sizeof(s32));
 	}
 	return (error);
 }
 
 static void
 ptrace_lwpinfo_to32(const struct ptrace_lwpinfo *pl,
     struct ptrace_lwpinfo32 *pl32)
 {
 
 	bzero(pl32, sizeof(*pl32));
 	pl32->pl_lwpid = pl->pl_lwpid;
 	pl32->pl_event = pl->pl_event;
 	pl32->pl_flags = pl->pl_flags;
 	pl32->pl_sigmask = pl->pl_sigmask;
 	pl32->pl_siglist = pl->pl_siglist;
 	siginfo_to_siginfo32(&pl->pl_siginfo, &pl32->pl_siginfo);
 	strcpy(pl32->pl_tdname, pl->pl_tdname);
 	pl32->pl_child_pid = pl->pl_child_pid;
 	pl32->pl_syscall_code = pl->pl_syscall_code;
 	pl32->pl_syscall_narg = pl->pl_syscall_narg;
 }
 
 static void
 ptrace_sc_ret_to32(const struct ptrace_sc_ret *psr,
     struct ptrace_sc_ret32 *psr32)
 {
 
 	bzero(psr32, sizeof(*psr32));
 	psr32->sr_retval[0] = psr->sr_retval[0];
 	psr32->sr_retval[1] = psr->sr_retval[1];
 	psr32->sr_error = psr->sr_error;
 }
 
 /*
  * ptrace(2) for 32-bit processes.
  *
  * Works in three steps:
  *  1. For requests whose argument is a structure, copy the 32-bit form
  *     in from uap->addr and convert it into the native union 'r'; 'addr'
  *     then points at 'r' and 'data' may be replaced by the native size.
  *     All other requests pass uap->addr through unchanged.
  *  2. Call kern_ptrace() with the native arguments.
  *  3. For requests that return data, convert the result back to the
  *     32-bit form and copy it out to uap->addr.
  *
  * Returns ENOSYS when allow_ptrace is false, EINVAL for a size argument
  * that does not match the 32-bit structure, or the first error from
  * copyin(), kern_ptrace() or copyout().
  */
 int
 freebsd32_ptrace(struct thread *td, struct freebsd32_ptrace_args *uap)
 {
 	union {
 		struct ptrace_io_desc piod;
 		struct ptrace_lwpinfo pl;
 		struct ptrace_vm_entry pve;
 		struct ptrace_coredump pc;
 		struct ptrace_sc_remote sr;
 		struct dbreg32 dbreg;
 		struct fpreg32 fpreg;
 		struct reg32 reg;
 		struct iovec vec;
 		register_t args[nitems(td->td_sa.args)];
 		struct ptrace_sc_ret psr;
 		int ptevents;
 	} r;
 	union {
 		struct ptrace_io_desc32 piod;
 		struct ptrace_lwpinfo32 pl;
 		struct ptrace_vm_entry32 pve;
 		struct ptrace_coredump32 pc;
 		struct ptrace_sc_remote32 sr;
 		uint32_t args[nitems(td->td_sa.args)];
 		struct ptrace_sc_ret32 psr;
 		struct iovec32 vec;
 	} r32;
 	syscallarg_t pscr_args[nitems(td->td_sa.args)];
 	u_int pscr_args32[nitems(td->td_sa.args)];
 	void *addr;
 	int data, error, i;
 
 	if (!allow_ptrace)
 		return (ENOSYS);
 	error = 0;
 
 	AUDIT_ARG_PID(uap->pid);
 	AUDIT_ARG_CMD(uap->req);
 	AUDIT_ARG_VALUE(uap->data);
 	/* Default to the native scratch union; overridden in 'default'. */
 	addr = &r;
 	data = uap->data;
 	switch (uap->req) {
 	case PT_GET_EVENT_MASK:
 	case PT_GET_SC_ARGS:
 	case PT_GET_SC_RET:
 		break;
 	case PT_LWPINFO:
 		if (uap->data > sizeof(r32.pl))
 			return (EINVAL);
 
 		/*
 		 * Pass size of native structure in 'data'.  Truncate
 		 * if necessary to avoid siginfo.
 		 */
 		data = sizeof(r.pl);
 		if (uap->data < offsetof(struct ptrace_lwpinfo32, pl_siginfo) +
 		    sizeof(struct siginfo32))
 			data = offsetof(struct ptrace_lwpinfo, pl_siginfo);
 		break;
 	case PT_GETREGS:
 		bzero(&r.reg, sizeof(r.reg));
 		break;
 	case PT_GETFPREGS:
 		bzero(&r.fpreg, sizeof(r.fpreg));
 		break;
 	case PT_GETDBREGS:
 		bzero(&r.dbreg, sizeof(r.dbreg));
 		break;
 	case PT_SETREGS:
 		error = copyin(uap->addr, &r.reg, sizeof(r.reg));
 		break;
 	case PT_SETFPREGS:
 		error = copyin(uap->addr, &r.fpreg, sizeof(r.fpreg));
 		break;
 	case PT_SETDBREGS:
 		error = copyin(uap->addr, &r.dbreg, sizeof(r.dbreg));
 		break;
 	case PT_GETREGSET:
 	case PT_SETREGSET:
 		error = copyin(uap->addr, &r32.vec, sizeof(r32.vec));
 		if (error != 0)
 			break;
 
 		r.vec.iov_len = r32.vec.iov_len;
 		r.vec.iov_base = PTRIN(r32.vec.iov_base);
 		break;
 	case PT_SET_EVENT_MASK:
 		if (uap->data != sizeof(r.ptevents))
 			error = EINVAL;
 		else
 			error = copyin(uap->addr, &r.ptevents, uap->data);
 		break;
 	case PT_IO:
 		error = copyin(uap->addr, &r32.piod, sizeof(r32.piod));
 		if (error)
 			break;
 		CP(r32.piod, r.piod, piod_op);
 		PTRIN_CP(r32.piod, r.piod, piod_offs);
 		PTRIN_CP(r32.piod, r.piod, piod_addr);
 		CP(r32.piod, r.piod, piod_len);
 		break;
 	case PT_VM_ENTRY:
 		error = copyin(uap->addr, &r32.pve, sizeof(r32.pve));
 		if (error)
 			break;
 
 		CP(r32.pve, r.pve, pve_entry);
 		CP(r32.pve, r.pve, pve_timestamp);
 		CP(r32.pve, r.pve, pve_start);
 		CP(r32.pve, r.pve, pve_end);
 		CP(r32.pve, r.pve, pve_offset);
 		CP(r32.pve, r.pve, pve_prot);
 		CP(r32.pve, r.pve, pve_pathlen);
 		CP(r32.pve, r.pve, pve_fileid);
 		CP(r32.pve, r.pve, pve_fsid);
 		PTRIN_CP(r32.pve, r.pve, pve_path);
 		break;
 	case PT_COREDUMP:
 		if (uap->data != sizeof(r32.pc)) {
 			error = EINVAL;
 			break;
 		}
 		error = copyin(uap->addr, &r32.pc, uap->data);
 		if (error != 0)
 			break;
 		/* Convert only after r32.pc has actually been filled in. */
 		CP(r32.pc, r.pc, pc_fd);
 		CP(r32.pc, r.pc, pc_flags);
 		r.pc.pc_limit = PAIR32TO64(off_t, r32.pc.pc_limit);
 		data = sizeof(r.pc);
 		break;
 	case PT_SC_REMOTE:
 		if (uap->data != sizeof(r32.sr)) {
 			error = EINVAL;
 			break;
 		}
 		error = copyin(uap->addr, &r32.sr, uap->data);
 		if (error != 0)
 			break;
 		CP(r32.sr, r.sr, pscr_syscall);
 		CP(r32.sr, r.sr, pscr_nargs);
 		/* Bound the count before it sizes the copyin below. */
 		if (r.sr.pscr_nargs > nitems(td->td_sa.args)) {
 			error = EINVAL;
 			break;
 		}
 		error = copyin(PTRIN(r32.sr.pscr_args), pscr_args32,
 		    sizeof(u_int) * r32.sr.pscr_nargs);
 		if (error != 0)
 			break;
 		for (i = 0; i < r32.sr.pscr_nargs; i++)
 			pscr_args[i] = pscr_args32[i];
 		r.sr.pscr_args = pscr_args;
 		break;
 	default:
 		addr = uap->addr;
 		break;
 	}
 	if (error)
 		return (error);
 
 	error = kern_ptrace(td, uap->req, uap->pid, addr, data);
 	if (error)
 		return (error);
 
 	switch (uap->req) {
 	case PT_VM_ENTRY:
 		CP(r.pve, r32.pve, pve_entry);
 		CP(r.pve, r32.pve, pve_timestamp);
 		CP(r.pve, r32.pve, pve_start);
 		CP(r.pve, r32.pve, pve_end);
 		CP(r.pve, r32.pve, pve_offset);
 		CP(r.pve, r32.pve, pve_prot);
 		CP(r.pve, r32.pve, pve_pathlen);
 		CP(r.pve, r32.pve, pve_fileid);
 		CP(r.pve, r32.pve, pve_fsid);
 		error = copyout(&r32.pve, uap->addr, sizeof(r32.pve));
 		break;
 	case PT_IO:
 		CP(r.piod, r32.piod, piod_len);
 		error = copyout(&r32.piod, uap->addr, sizeof(r32.piod));
 		break;
 	case PT_GETREGS:
 		error = copyout(&r.reg, uap->addr, sizeof(r.reg));
 		break;
 	case PT_GETFPREGS:
 		error = copyout(&r.fpreg, uap->addr, sizeof(r.fpreg));
 		break;
 	case PT_GETDBREGS:
 		error = copyout(&r.dbreg, uap->addr, sizeof(r.dbreg));
 		break;
 	case PT_GETREGSET:
 		r32.vec.iov_len = r.vec.iov_len;
 		error = copyout(&r32.vec, uap->addr, sizeof(r32.vec));
 		break;
 	case PT_GET_EVENT_MASK:
 		/* NB: The size in uap->data is validated in kern_ptrace(). */
 		error = copyout(&r.ptevents, uap->addr, uap->data);
 		break;
 	case PT_LWPINFO:
 		ptrace_lwpinfo_to32(&r.pl, &r32.pl);
 		error = copyout(&r32.pl, uap->addr, uap->data);
 		break;
 	case PT_GET_SC_ARGS:
 		for (i = 0; i < nitems(r.args); i++)
 			r32.args[i] = (uint32_t)r.args[i];
 		error = copyout(r32.args, uap->addr, MIN(uap->data,
 		    sizeof(r32.args)));
 		break;
 	case PT_GET_SC_RET:
 		ptrace_sc_ret_to32(&r.psr, &r32.psr);
 		error = copyout(&r32.psr, uap->addr, MIN(uap->data,
 		    sizeof(r32.psr)));
 		break;
 	case PT_SC_REMOTE:
 		/* Only the return-value member is written back to the user. */
 		ptrace_sc_ret_to32(&r.sr.pscr_ret, &r32.sr.pscr_ret);
 		error = copyout(&r32.sr.pscr_ret, uap->addr +
 		    offsetof(struct ptrace_sc_remote32, pscr_ret),
 		    sizeof(r32.sr.pscr_ret));
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Build a native struct uio from an array of 'iovcnt' 32-bit iovecs at
  * user address 'iovp'.
  *
  * Returns EINVAL when iovcnt exceeds UIO_MAXIOV or when the summed
  * segment lengths would exceed INT_MAX, or the copyin() error for a
  * segment.  On success *uiop points at the new uio, which the caller
  * must release; on failure *uiop is NULL and nothing is left allocated.
  *
  * The lines marked '-' and '+' are the two sides of the diff this text
  * is part of: the '+' side obtains and releases the uio through
  * allocuio()/freeuio() instead of a hand-sized malloc()/free().
  */
 int
 freebsd32_copyinuio(struct iovec32 *iovp, u_int iovcnt, struct uio **uiop)
 {
 	struct iovec32 iov32;
 	struct iovec *iov;
 	struct uio *uio;
-	u_int iovlen;
 	int error, i;
 
 	*uiop = NULL;
 	if (iovcnt > UIO_MAXIOV)
 		return (EINVAL);
-	iovlen = iovcnt * sizeof(struct iovec);
-	uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
-	iov = (struct iovec *)(uio + 1);
+	uio = allocuio(iovcnt);
+	iov = uio->uio_iov;
 	/* Fetch each 32-bit iovec and widen its base pointer and length. */
 	for (i = 0; i < iovcnt; i++) {
 		error = copyin(&iovp[i], &iov32, sizeof(struct iovec32));
 		if (error) {
-			free(uio, M_IOV);
+			freeuio(uio);
 			return (error);
 		}
 		iov[i].iov_base = PTRIN(iov32.iov_base);
 		iov[i].iov_len = iov32.iov_len;
 	}
-	uio->uio_iov = iov;
 	uio->uio_iovcnt = iovcnt;
 	uio->uio_segflg = UIO_USERSPACE;
 	uio->uio_offset = -1;
 	uio->uio_resid = 0;
 	/* Sum the segment lengths, refusing a total above INT_MAX. */
 	for (i = 0; i < iovcnt; i++) {
 		if (iov->iov_len > INT_MAX - uio->uio_resid) {
-			free(uio, M_IOV);
+			freeuio(uio);
 			return (EINVAL);
 		}
 		uio->uio_resid += iov->iov_len;
 		iov++;
 	}
 	*uiop = uio;
 	return (0);
 }
 
 /*
  * readv(2) for 32-bit processes: convert the iovec array with
  * freebsd32_copyinuio(), call kern_readv(), then release the uio.
  */
 int
 freebsd32_readv(struct thread *td, struct freebsd32_readv_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_readv(td, uap->fd, auio);
-	free(auio, M_IOV);
+	freeuio(auio);
 	return (error);
 }
 
 /*
  * writev(2) for 32-bit processes: convert the iovec array with
  * freebsd32_copyinuio(), call kern_writev(), then release the uio.
  */
 int
 freebsd32_writev(struct thread *td, struct freebsd32_writev_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_writev(td, uap->fd, auio);
-	free(auio, M_IOV);
+	freeuio(auio);
 	return (error);
 }
 
 /*
  * preadv(2) for 32-bit processes: convert the iovec array with
  * freebsd32_copyinuio(), rebuild the 64-bit offset from its two 32-bit
  * halves, call kern_preadv(), then release the uio.
  */
 int
 freebsd32_preadv(struct thread *td, struct freebsd32_preadv_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_preadv(td, uap->fd, auio, PAIR32TO64(off_t,uap->offset));
-	free(auio, M_IOV);
+	freeuio(auio);
 	return (error);
 }
 
 /*
  * pwritev(2) for 32-bit processes: convert the iovec array with
  * freebsd32_copyinuio(), rebuild the 64-bit offset from its two 32-bit
  * halves, call kern_pwritev(), then release the uio.
  */
 int
 freebsd32_pwritev(struct thread *td, struct freebsd32_pwritev_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_pwritev(td, uap->fd, auio, PAIR32TO64(off_t,uap->offset));
-	free(auio, M_IOV);
+	freeuio(auio);
 	return (error);
 }
 
 /*
  * Copy in 'iovcnt' 32-bit iovecs from user address 'iovp32' and return
  * them as a newly allocated native array in *iovp.
  *
  * 'error' is the value to return when iovcnt exceeds UIO_MAXIOV, which
  * lets each caller choose its own errno.  A copyin() failure frees the
  * array and returns that error; in both cases *iovp is left NULL.  On
  * success the caller owns the array and releases it with
  * free(..., M_IOV).
  */
 int
 freebsd32_copyiniov(struct iovec32 *iovp32, u_int iovcnt, struct iovec **iovp,
     int error)
 {
 	struct iovec32 one32;
 	struct iovec *vec;
 	u_int nbytes;
 	int n;
 
 	*iovp = NULL;
 	if (iovcnt > UIO_MAXIOV)
 		return (error);
 
 	nbytes = iovcnt * sizeof(struct iovec);
 	vec = malloc(nbytes, M_IOV, M_WAITOK);
 	for (n = 0; n < iovcnt; n++) {
 		error = copyin(&iovp32[n], &one32, sizeof(struct iovec32));
 		if (error != 0) {
 			free(vec, M_IOV);
 			return (error);
 		}
 		vec[n].iov_base = PTRIN(one32.iov_base);
 		vec[n].iov_len = one32.iov_len;
 	}
 
 	*iovp = vec;
 	return (0);
 }
 
 static int
 freebsd32_copyinmsghdr(const struct msghdr32 *msg32, struct msghdr *msg)
 {
 	struct msghdr32 m32;
 	int error;
 
 	error = copyin(msg32, &m32, sizeof(m32));
 	if (error)
 		return (error);
 	msg->msg_name = PTRIN(m32.msg_name);
 	msg->msg_namelen = m32.msg_namelen;
 	msg->msg_iov = PTRIN(m32.msg_iov);
 	msg->msg_iovlen = m32.msg_iovlen;
 	msg->msg_control = PTRIN(m32.msg_control);
 	msg->msg_controllen = m32.msg_controllen;
 	msg->msg_flags = m32.msg_flags;
 	return (0);
 }
 
 static int
 freebsd32_copyoutmsghdr(struct msghdr *msg, struct msghdr32 *msg32)
 {
 	struct msghdr32 m32;
 	int error;
 
 	m32.msg_name = PTROUT(msg->msg_name);
 	m32.msg_namelen = msg->msg_namelen;
 	m32.msg_iov = PTROUT(msg->msg_iov);
 	m32.msg_iovlen = msg->msg_iovlen;
 	m32.msg_control = PTROUT(msg->msg_control);
 	m32.msg_controllen = msg->msg_controllen;
 	m32.msg_flags = msg->msg_flags;
 	error = copyout(&m32, msg32, sizeof(m32));
 	return (error);
 }
 
 /*
  * Control-message layout helpers for the 32-bit ABI.  They play the
  * role of the native CMSG_* macros but align to sizeof(int).
  */
 /* Mask of the low address bits that must be clear for int alignment. */
 #define FREEBSD32_ALIGNBYTES	(sizeof(int) - 1)
 /* Round p up to the next multiple of sizeof(int). */
 #define FREEBSD32_ALIGN(p)	\
 	(((u_long)(p) + FREEBSD32_ALIGNBYTES) & ~FREEBSD32_ALIGNBYTES)
 /* Space taken by a control message carrying l bytes of data. */
 #define	FREEBSD32_CMSG_SPACE(l)	\
 	(FREEBSD32_ALIGN(sizeof(struct cmsghdr)) + FREEBSD32_ALIGN(l))
 
 /* Address of the data that follows the (aligned) header at cmsg. */
 #define	FREEBSD32_CMSG_DATA(cmsg)	((unsigned char *)(cmsg) + \
 				 FREEBSD32_ALIGN(sizeof(struct cmsghdr)))
 
 /*
  * Convert the payload of a control message to its 32-bit layout, in
  * place.
  *
  * cm      - header of the message; selects the conversion by level/type.
  * data    - payload, overwritten with the converted value when one of
  *           the SOL_SOCKET timestamp types below is recognised.
  * datalen - current payload length in bytes.
  *
  * Returns the payload length to report to the 32-bit caller: the size of
  * the converted structure for recognised types, otherwise 'datalen'
  * unchanged (and 'data' untouched).
  */
 static size_t
 freebsd32_cmsg_convert(const struct cmsghdr *cm, void *data, socklen_t datalen)
 {
 	size_t copylen;
 	union {
 		struct timespec32 ts;
 		struct timeval32 tv;
 		struct bintime32 bt;
 	} tmp32;
 
 	union {
 		struct timespec ts;
 		struct timeval tv;
 		struct bintime bt;
 	} *in;
 
 	in = data;
 	copylen = 0;
 	switch (cm->cmsg_level) {
 	case SOL_SOCKET:
 		switch (cm->cmsg_type) {
 		case SCM_TIMESTAMP:
 			TV_CP(*in, tmp32, tv);
 			copylen = sizeof(tmp32.tv);
 			break;
 
 		case SCM_BINTIME:
 			BT_CP(*in, tmp32, bt);
 			copylen = sizeof(tmp32.bt);
 			break;
 
 		case SCM_REALTIME:
 		case SCM_MONOTONIC:
 			TS_CP(*in, tmp32, ts);
 			copylen = sizeof(tmp32.ts);
 			break;
 
 		default:
 			break;
 		}
 		/* Do not fall through into the outer default. */
 		break;
 
 	default:
 		break;
 	}
 
 	/* Nothing recognised: the payload is passed through unchanged. */
 	if (copylen == 0)
 		return (datalen);
 
 	KASSERT((datalen >= copylen), ("corrupted cmsghdr"));
 
 	bcopy(&tmp32, data, copylen);
 	return (copylen);
 }
 
 /*
  * Copy the control messages held in the mbuf chain 'control' out to the
  * user buffer described by msg->msg_control and msg->msg_controllen.
  *
  * Each payload is first converted with freebsd32_cmsg_convert(); headers
  * and payloads are then laid out on FREEBSD32_ALIGN boundaries.  On
  * return msg->msg_controllen holds the sum of FREEBSD32_CMSG_SPACE() for
  * the messages that were copied, and MSG_CTRUNC is set in msg->msg_flags
  * when the user buffer ran out.
  *
  * Returns 0, EINVAL for a message whose length does not fit in its mbuf,
  * or a copyout() error.
  */
 static int
 freebsd32_copy_msg_out(struct msghdr *msg, struct mbuf *control)
 {
 	struct cmsghdr *cm;
 	void *data;
 	socklen_t clen, datalen, datalen_out, oldclen;
 	int error;
 	caddr_t ctlbuf;
 	int len, copylen;
 	struct mbuf *m;
 	error = 0;
 
 	/* 'len' tracks the user buffer space left; controllen is rebuilt. */
 	len    = msg->msg_controllen;
 	msg->msg_controllen = 0;
 
 	ctlbuf = msg->msg_control;
 	for (m = control; m != NULL && len > 0; m = m->m_next) {
 		cm = mtod(m, struct cmsghdr *);
 		clen = m->m_len;
 		while (cm != NULL) {
 			/*
 			 * NOTE(review): this break leaves only the inner
 			 * loop; the outer loop then moves on to the next
 			 * mbuf, where a successful copyout() would overwrite
 			 * 'error' -- confirm this is intended.
 			 */
 			if (sizeof(struct cmsghdr) > clen ||
 			    cm->cmsg_len > clen) {
 				error = EINVAL;
 				break;
 			}
 
 			data   = CMSG_DATA(cm);
 			datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
 			datalen_out = freebsd32_cmsg_convert(cm, data, datalen);
 
 			/*
 			 * Copy out the message header.  Preserve the native
 			 * message size in case we need to inspect the message
 			 * contents later.
 			 */
 			copylen = sizeof(struct cmsghdr);
 			if (len < copylen) {
 				msg->msg_flags |= MSG_CTRUNC;
 				m_dispose_extcontrolm(m);
 				goto exit;
 			}
 			/* Present the 32-bit length only for the copyout. */
 			oldclen = cm->cmsg_len;
 			cm->cmsg_len = FREEBSD32_ALIGN(sizeof(struct cmsghdr)) +
 			    datalen_out;
 			error = copyout(cm, ctlbuf, copylen);
 			cm->cmsg_len = oldclen;
 			if (error != 0)
 				goto exit;
 
 			ctlbuf += FREEBSD32_ALIGN(copylen);
 			len    -= FREEBSD32_ALIGN(copylen);
 
 			copylen = datalen_out;
 			if (len < copylen) {
 				msg->msg_flags |= MSG_CTRUNC;
 				m_dispose_extcontrolm(m);
 				break;
 			}
 
 			/* Copy out the message data. */
 			error = copyout(data, ctlbuf, copylen);
 			if (error)
 				goto exit;
 
 			ctlbuf += FREEBSD32_ALIGN(copylen);
 			len    -= FREEBSD32_ALIGN(copylen);
 
 			/* Step to the next message using the native spacing. */
 			if (CMSG_SPACE(datalen) < clen) {
 				clen -= CMSG_SPACE(datalen);
 				cm = (struct cmsghdr *)
 				    ((caddr_t)cm + CMSG_SPACE(datalen));
 			} else {
 				clen = 0;
 				cm = NULL;
 			}
 
 			msg->msg_controllen +=
 			    FREEBSD32_CMSG_SPACE(datalen_out);
 		}
 	}
 	/* User buffer exactly used up while mbufs remain: truncated. */
 	if (len == 0 && m != NULL) {
 		msg->msg_flags |= MSG_CTRUNC;
 		m_dispose_extcontrolm(m);
 	}
 
 exit:
 	return (error);
 }
 
 /*
  * recvmsg(2) for 32-bit processes.
  *
  * The msghdr32 and its iovec array are converted to native form,
  * kern_recvit() does the receive, and on success the control messages
  * (if the caller asked for them) and the updated header are converted
  * back and copied out.  The temporary iovec array and any control mbufs
  * are released on every path after the iovec copy-in.
  */
 int
 freebsd32_recvmsg(struct thread *td, struct freebsd32_recvmsg_args *uap)
 {
 	struct msghdr msg;
 	struct iovec *uiov, *iov;
 	struct mbuf *control = NULL;
 	struct mbuf **controlp;
 	int error;
 
 	error = freebsd32_copyinmsghdr(uap->msg, &msg);
 	if (error)
 		return (error);
 	error = freebsd32_copyiniov((void *)msg.msg_iov, msg.msg_iovlen, &iov,
 	    EMSGSIZE);
 	if (error)
 		return (error);
 	msg.msg_flags = uap->flags;
 	/* Keep the user's iovec pointer so it can be restored for copyout. */
 	uiov = msg.msg_iov;
 	msg.msg_iov = iov;
 
 	/* Only collect control data when the caller supplied a buffer. */
 	controlp = (msg.msg_control != NULL) ?  &control : NULL;
 	error = kern_recvit(td, uap->s, &msg, UIO_USERSPACE, controlp);
 	if (error == 0) {
 		msg.msg_iov = uiov;
 
 		if (control != NULL)
 			error = freebsd32_copy_msg_out(&msg, control);
 		else
 			msg.msg_controllen = 0;
 
 		if (error == 0)
 			error = freebsd32_copyoutmsghdr(&msg, uap->msg);
 	}
 	free(iov, M_IOV);
 
 	if (control != NULL) {
 		if (error != 0)
 			m_dispose_extcontrolm(control);
 		m_freem(control);
 	}
 
 	return (error);
 }
 
 #ifdef COMPAT_43
 /* COMPAT_43 recvmsg is not provided for 32-bit processes: always ENOSYS. */
 int
 ofreebsd32_recvmsg(struct thread *td, struct ofreebsd32_recvmsg_args *uap)
 {
 	return (ENOSYS);
 }
 #endif
 
 /*
  * Copy-in the array of control messages constructed using alignment
  * and padding suitable for a 32-bit environment and construct an
  * mbuf using alignment and padding suitable for a 64-bit kernel.
  * The alignment and padding are defined indirectly by CMSG_DATA(),
  * CMSG_SPACE() and CMSG_LEN().
  *
  * mp     - receives the new MT_CONTROL mbuf; written only on success.
  * buf    - user address of the 32-bit control buffer.
  * buflen - its length in bytes.
  *
  * Returns 0, EINVAL for an oversized buffer, a malformed message or a
  * failed mbuf allocation, or the copyin() error.
  */
 static int
 freebsd32_copyin_control(struct mbuf **mp, caddr_t buf, u_int buflen)
 {
 	struct cmsghdr *cm;
 	struct mbuf *m;
 	void *in, *in1, *md;
 	u_int msglen, outlen;
 	int error;
 
 	/* Enforce the size limit of the native implementation. */
 	if (buflen > MCLBYTES)
 		return (EINVAL);
 
 	/* Work on a kernel copy so both passes see the same bytes. */
 	in = malloc(buflen, M_TEMP, M_WAITOK);
 	error = copyin(buf, in, buflen);
 	if (error != 0)
 		goto out;
 
 	/*
 	 * Make a pass over the input buffer to determine the amount of space
 	 * required for 64 bit-aligned copies of the control messages.
 	 */
 	in1 = in;
 	outlen = 0;
 	while (buflen > 0) {
 		if (buflen < sizeof(*cm)) {
 			error = EINVAL;
 			break;
 		}
 		cm = (struct cmsghdr *)in1;
 		if (cm->cmsg_len < FREEBSD32_ALIGN(sizeof(*cm)) ||
 		    cm->cmsg_len > buflen) {
 			error = EINVAL;
 			break;
 		}
 		msglen = FREEBSD32_ALIGN(cm->cmsg_len);
 		/* Rounding up must not wrap around. */
 		if (msglen < cm->cmsg_len) {
 			error = EINVAL;
 			break;
 		}
 		/* The native ABI permits the final padding to be omitted. */
 		if (msglen > buflen)
 			msglen = buflen;
 		buflen -= msglen;
 
 		in1 = (char *)in1 + msglen;
 		outlen += CMSG_ALIGN(sizeof(*cm)) +
 		    CMSG_ALIGN(msglen - FREEBSD32_ALIGN(sizeof(*cm)));
 	}
 	if (error != 0)
 		goto out;
 
 	/*
 	 * Allocate up to MJUMPAGESIZE space for the re-aligned and
 	 * re-padded control messages.  This allows a full MCLBYTES of
 	 * 32-bit sized and aligned messages to fit and avoids an ABI
 	 * mismatch with the native implementation.
 	 */
 	m = m_get2(outlen, M_WAITOK, MT_CONTROL, 0);
 	if (m == NULL) {
 		error = EINVAL;
 		goto out;
 	}
 	m->m_len = outlen;
 	md = mtod(m, void *);
 
 	/*
 	 * Make a second pass over input messages, copying them into the output
 	 * buffer.
 	 */
 	in1 = in;
 	while (outlen > 0) {
 		/* Copy the message header and align the length field. */
 		cm = md;
 		memcpy(cm, in1, sizeof(*cm));
 		msglen = cm->cmsg_len - FREEBSD32_ALIGN(sizeof(*cm));
 		cm->cmsg_len = CMSG_ALIGN(sizeof(*cm)) + msglen;
 
 		/* Copy the message body. */
 		in1 = (char *)in1 + FREEBSD32_ALIGN(sizeof(*cm));
 		md = (char *)md + CMSG_ALIGN(sizeof(*cm));
 		memcpy(md, in1, msglen);
 		in1 = (char *)in1 + FREEBSD32_ALIGN(msglen);
 		md = (char *)md + CMSG_ALIGN(msglen);
 		KASSERT(outlen >= CMSG_ALIGN(sizeof(*cm)) + CMSG_ALIGN(msglen),
 		    ("outlen %u underflow, msglen %u", outlen, msglen));
 		outlen -= CMSG_ALIGN(sizeof(*cm)) + CMSG_ALIGN(msglen);
 	}
 
 	*mp = m;
 out:
 	/* The staging copy is released on success and failure alike. */
 	free(in, M_TEMP);
 	return (error);
 }
 
 /*
  * sendmsg(2) for 32-bit processes.
  *
  * The msghdr32 and its iovec array are converted to native form, the
  * destination address (if any) is fetched with getsockaddr(), and any
  * control data is rebuilt for the native layout by
  * freebsd32_copyin_control() before kern_sendit() is called.  The
  * temporary iovec array and socket address are freed here.
  */
 int
 freebsd32_sendmsg(struct thread *td, struct freebsd32_sendmsg_args *uap)
 {
 	struct msghdr msg;
 	struct iovec *iov;
 	struct mbuf *control = NULL;
 	struct sockaddr *to = NULL;
 	int error;
 
 	error = freebsd32_copyinmsghdr(uap->msg, &msg);
 	if (error)
 		return (error);
 	error = freebsd32_copyiniov((void *)msg.msg_iov, msg.msg_iovlen, &iov,
 	    EMSGSIZE);
 	if (error)
 		return (error);
 	msg.msg_iov = iov;
 	if (msg.msg_name != NULL) {
 		error = getsockaddr(&to, msg.msg_name, msg.msg_namelen);
 		if (error) {
 			to = NULL;
 			goto out;
 		}
 		msg.msg_name = to;
 	}
 
 	if (msg.msg_control) {
 		if (msg.msg_controllen < sizeof(struct cmsghdr)) {
 			error = EINVAL;
 			goto out;
 		}
 
 		error = freebsd32_copyin_control(&control, msg.msg_control,
 		    msg.msg_controllen);
 		if (error)
 			goto out;
 
 		/* The control data now travels in the 'control' mbuf. */
 		msg.msg_control = NULL;
 		msg.msg_controllen = 0;
 	}
 
 	/*
 	 * NOTE(review): 'control' is not freed in this function; presumably
 	 * kern_sendit() takes ownership of it -- confirm.
 	 */
 	error = kern_sendit(td, uap->s, &msg, uap->flags, control,
 	    UIO_USERSPACE);
 
 out:
 	free(iov, M_IOV);
 	if (to)
 		free(to, M_SONAME);
 	return (error);
 }
 
 #ifdef COMPAT_43
 /* COMPAT_43 sendmsg is not provided for 32-bit processes: always ENOSYS. */
 int
 ofreebsd32_sendmsg(struct thread *td, struct ofreebsd32_sendmsg_args *uap)
 {
 	return (ENOSYS);
 }
 #endif
 
 
 int
 freebsd32_settimeofday(struct thread *td,
 		       struct freebsd32_settimeofday_args *uap)
 {
 	struct timeval32 tv32;
 	struct timeval tv, *tvp;
 	struct timezone tz, *tzp;
 	int error;
 
 	if (uap->tv) {
 		error = copyin(uap->tv, &tv32, sizeof(tv32));
 		if (error)
 			return (error);
 		CP(tv32, tv, tv_sec);
 		CP(tv32, tv, tv_usec);
 		tvp = &tv;
 	} else
 		tvp = NULL;
 	if (uap->tzp) {
 		error = copyin(uap->tzp, &tz, sizeof(tz));
 		if (error)
 			return (error);
 		tzp = &tz;
 	} else
 		tzp = NULL;
 	return (kern_settimeofday(td, tvp, tzp));
 }
 
 int
 freebsd32_utimes(struct thread *td, struct freebsd32_utimes_args *uap)
 {
 	struct timeval32 s32[2];
 	struct timeval s[2], *sp;
 	int error;
 
 	if (uap->tptr != NULL) {
 		error = copyin(uap->tptr, s32, sizeof(s32));
 		if (error)
 			return (error);
 		CP(s32[0], s[0], tv_sec);
 		CP(s32[0], s[0], tv_usec);
 		CP(s32[1], s[1], tv_sec);
 		CP(s32[1], s[1], tv_usec);
 		sp = s;
 	} else
 		sp = NULL;
 	return (kern_utimesat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    sp, UIO_SYSSPACE));
 }
 
 int
 freebsd32_lutimes(struct thread *td, struct freebsd32_lutimes_args *uap)
 {
 	struct timeval32 s32[2];
 	struct timeval s[2], *sp;
 	int error;
 
 	if (uap->tptr != NULL) {
 		error = copyin(uap->tptr, s32, sizeof(s32));
 		if (error)
 			return (error);
 		CP(s32[0], s[0], tv_sec);
 		CP(s32[0], s[0], tv_usec);
 		CP(s32[1], s[1], tv_sec);
 		CP(s32[1], s[1], tv_usec);
 		sp = s;
 	} else
 		sp = NULL;
 	return (kern_lutimes(td, uap->path, UIO_USERSPACE, sp, UIO_SYSSPACE));
 }
 
 int
 freebsd32_futimes(struct thread *td, struct freebsd32_futimes_args *uap)
 {
 	struct timeval32 s32[2];
 	struct timeval s[2], *sp;
 	int error;
 
 	if (uap->tptr != NULL) {
 		error = copyin(uap->tptr, s32, sizeof(s32));
 		if (error)
 			return (error);
 		CP(s32[0], s[0], tv_sec);
 		CP(s32[0], s[0], tv_usec);
 		CP(s32[1], s[1], tv_sec);
 		CP(s32[1], s[1], tv_usec);
 		sp = s;
 	} else
 		sp = NULL;
 	return (kern_futimes(td, uap->fd, sp, UIO_SYSSPACE));
 }
 
 int
 freebsd32_futimesat(struct thread *td, struct freebsd32_futimesat_args *uap)
 {
 	struct timeval32 s32[2];
 	struct timeval s[2], *sp;
 	int error;
 
 	if (uap->times != NULL) {
 		error = copyin(uap->times, s32, sizeof(s32));
 		if (error)
 			return (error);
 		CP(s32[0], s[0], tv_sec);
 		CP(s32[0], s[0], tv_usec);
 		CP(s32[1], s[1], tv_sec);
 		CP(s32[1], s[1], tv_usec);
 		sp = s;
 	} else
 		sp = NULL;
 	return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
 		sp, UIO_SYSSPACE));
 }
 
 int
 freebsd32_futimens(struct thread *td, struct freebsd32_futimens_args *uap)
 {
 	struct timespec32 ts32[2];
 	struct timespec ts[2], *tsp;
 	int error;
 
 	if (uap->times != NULL) {
 		error = copyin(uap->times, ts32, sizeof(ts32));
 		if (error)
 			return (error);
 		CP(ts32[0], ts[0], tv_sec);
 		CP(ts32[0], ts[0], tv_nsec);
 		CP(ts32[1], ts[1], tv_sec);
 		CP(ts32[1], ts[1], tv_nsec);
 		tsp = ts;
 	} else
 		tsp = NULL;
 	return (kern_futimens(td, uap->fd, tsp, UIO_SYSSPACE));
 }
 
 int
 freebsd32_utimensat(struct thread *td, struct freebsd32_utimensat_args *uap)
 {
 	struct timespec32 ts32[2];
 	struct timespec ts[2], *tsp;
 	int error;
 
 	if (uap->times != NULL) {
 		error = copyin(uap->times, ts32, sizeof(ts32));
 		if (error)
 			return (error);
 		CP(ts32[0], ts[0], tv_sec);
 		CP(ts32[0], ts[0], tv_nsec);
 		CP(ts32[1], ts[1], tv_sec);
 		CP(ts32[1], ts[1], tv_nsec);
 		tsp = ts;
 	} else
 		tsp = NULL;
 	return (kern_utimensat(td, uap->fd, uap->path, UIO_USERSPACE,
 	    tsp, UIO_SYSSPACE, uap->flag));
 }
 
 int
 freebsd32_adjtime(struct thread *td, struct freebsd32_adjtime_args *uap)
 {
 	struct timeval32 tv32;
 	struct timeval delta, olddelta, *deltap;
 	int error;
 
 	if (uap->delta) {
 		error = copyin(uap->delta, &tv32, sizeof(tv32));
 		if (error)
 			return (error);
 		CP(tv32, delta, tv_sec);
 		CP(tv32, delta, tv_usec);
 		deltap = &delta;
 	} else
 		deltap = NULL;
 	error = kern_adjtime(td, deltap, &olddelta);
 	if (uap->olddelta && error == 0) {
 		CP(olddelta, tv32, tv_sec);
 		CP(olddelta, tv32, tv_usec);
 		error = copyout(&tv32, uap->olddelta, sizeof(tv32));
 	}
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD4
 int
 freebsd4_freebsd32_statfs(struct thread *td, struct freebsd4_freebsd32_statfs_args *uap)
 {
 	struct ostatfs32 s32;
 	struct statfs *sp;
 	int error;
 
 	sp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = kern_statfs(td, uap->path, UIO_USERSPACE, sp);
 	if (error == 0) {
 		copy_statfs(sp, &s32);
 		error = copyout(&s32, uap->buf, sizeof(s32));
 	}
 	free(sp, M_STATFS);
 	return (error);
 }
 #endif
 
 #ifdef COMPAT_FREEBSD4
 int
 freebsd4_freebsd32_fstatfs(struct thread *td, struct freebsd4_freebsd32_fstatfs_args *uap)
 {
 	struct ostatfs32 s32;
 	struct statfs *sp;
 	int error;
 
 	sp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = kern_fstatfs(td, uap->fd, sp);
 	if (error == 0) {
 		copy_statfs(sp, &s32);
 		error = copyout(&s32, uap->buf, sizeof(s32));
 	}
 	free(sp, M_STATFS);
 	return (error);
 }
 #endif
 
 #ifdef COMPAT_FREEBSD4
 int
 freebsd4_freebsd32_fhstatfs(struct thread *td, struct freebsd4_freebsd32_fhstatfs_args *uap)
 {
 	struct ostatfs32 s32;
 	struct statfs *sp;
 	fhandle_t fh;
 	int error;
 
 	if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
 		return (error);
 	sp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = kern_fhstatfs(td, fh, sp);
 	if (error == 0) {
 		copy_statfs(sp, &s32);
 		error = copyout(&s32, uap->buf, sizeof(s32));
 	}
 	free(sp, M_STATFS);
 	return (error);
 }
 #endif
 
 int
 freebsd32_pread(struct thread *td, struct freebsd32_pread_args *uap)
 {
 
 	return (kern_pread(td, uap->fd, uap->buf, uap->nbyte,
 	    PAIR32TO64(off_t, uap->offset)));
 }
 
 int
 freebsd32_pwrite(struct thread *td, struct freebsd32_pwrite_args *uap)
 {
 
 	return (kern_pwrite(td, uap->fd, uap->buf, uap->nbyte,
 	    PAIR32TO64(off_t, uap->offset)));
 }
 
 #ifdef COMPAT_43
 int
 ofreebsd32_lseek(struct thread *td, struct ofreebsd32_lseek_args *uap)
 {
 
 	return (kern_lseek(td, uap->fd, uap->offset, uap->whence));
 }
 #endif
 
 /*
  * lseek for 32-bit processes.
  *
  * Reassembles the 64-bit offset, calls kern_lseek(), then splits the
  * 64-bit result stored in td->td_uretoff.tdu_off across the two 32-bit
  * return slots.  The split is done whether or not kern_lseek() failed;
  * the kern_lseek() status is what is returned.
  */
 int
 freebsd32_lseek(struct thread *td, struct freebsd32_lseek_args *uap)
 {
 	off_t newpos;
 	int error;
 
 	error = kern_lseek(td, uap->fd, PAIR32TO64(off_t, uap->offset),
 	    uap->whence);
 
 	/* Read the full result before overwriting the return slots. */
 	newpos = td->td_uretoff.tdu_off;
 	td->td_retval[RETVAL_LO] = newpos & 0xffffffff;	/* %eax */
 	td->td_retval[RETVAL_HI] = newpos >> 32;	/* %edx */
 	return (error);
 }
 
 int
 freebsd32_truncate(struct thread *td, struct freebsd32_truncate_args *uap)
 {
 
 	return (kern_truncate(td, uap->path, UIO_USERSPACE,
 	    PAIR32TO64(off_t, uap->length)));
 }
 
 #ifdef COMPAT_43
 int
 ofreebsd32_truncate(struct thread *td, struct ofreebsd32_truncate_args *uap)
 {
 	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
 }
 #endif
 
 int
 freebsd32_ftruncate(struct thread *td, struct freebsd32_ftruncate_args *uap)
 {
 
 	return (kern_ftruncate(td, uap->fd, PAIR32TO64(off_t, uap->length)));
 }
 
 #ifdef COMPAT_43
 int
 ofreebsd32_ftruncate(struct thread *td, struct ofreebsd32_ftruncate_args *uap)
 {
 	return (kern_ftruncate(td, uap->fd, uap->length));
 }
 
 int
 ofreebsd32_getdirentries(struct thread *td,
     struct ofreebsd32_getdirentries_args *uap)
 {
 	struct ogetdirentries_args ap;
 	int error;
 	long loff;
 	int32_t loff_cut;
 
 	ap.fd = uap->fd;
 	ap.buf = uap->buf;
 	ap.count = uap->count;
 	ap.basep = NULL;
 	error = kern_ogetdirentries(td, &ap, &loff);
 	if (error == 0) {
 		loff_cut = loff;
 		error = copyout(&loff_cut, uap->basep, sizeof(int32_t));
 	}
 	return (error);
 }
 #endif
 
 #if defined(COMPAT_FREEBSD11)
 int
 freebsd11_freebsd32_getdirentries(struct thread *td,
     struct freebsd11_freebsd32_getdirentries_args *uap)
 {
 	long base;
 	int32_t base32;
 	int error;
 
 	error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count,
 	    &base, NULL);
 	if (error)
 		return (error);
 	if (uap->basep != NULL) {
 		base32 = base;
 		error = copyout(&base32, uap->basep, sizeof(int32_t));
 	}
 	return (error);
 }
 #endif /* COMPAT_FREEBSD11 */
 
 #ifdef COMPAT_FREEBSD6
 /* versions with the 'int pad' argument */
 int
 freebsd6_freebsd32_pread(struct thread *td, struct freebsd6_freebsd32_pread_args *uap)
 {
 
 	return (kern_pread(td, uap->fd, uap->buf, uap->nbyte,
 	    PAIR32TO64(off_t, uap->offset)));
 }
 
 int
 freebsd6_freebsd32_pwrite(struct thread *td, struct freebsd6_freebsd32_pwrite_args *uap)
 {
 
 	return (kern_pwrite(td, uap->fd, uap->buf, uap->nbyte,
 	    PAIR32TO64(off_t, uap->offset)));
 }
 
 /*
  * FreeBSD 6 compatible lseek (argument layout with the 'int pad' member).
  *
  * Reassembles the 64-bit offset, calls kern_lseek(), then splits the
  * 64-bit result across the two 32-bit return slots.  The result is read
  * through td->td_uretoff.tdu_off, exactly as freebsd32_lseek() does,
  * rather than by casting td_retval to an off_t pointer.
  * NOTE(review): this relies on tdu_off sharing storage with td_retval,
  * as freebsd32_lseek() already assumes - confirm against sys/proc.h.
  * Returns the kern_lseek() status.
  */
 int
 freebsd6_freebsd32_lseek(struct thread *td, struct freebsd6_freebsd32_lseek_args *uap)
 {
 	int error;
 	off_t pos;
 
 	error = kern_lseek(td, uap->fd, PAIR32TO64(off_t, uap->offset),
 	    uap->whence);
 	/* Expand the quad return into two parts for eax and edx */
 	pos = td->td_uretoff.tdu_off;
 	td->td_retval[RETVAL_LO] = pos & 0xffffffff;	/* %eax */
 	td->td_retval[RETVAL_HI] = pos >> 32;		/* %edx */
 	return (error);
 }
 
 int
 freebsd6_freebsd32_truncate(struct thread *td, struct freebsd6_freebsd32_truncate_args *uap)
 {
 
 	return (kern_truncate(td, uap->path, UIO_USERSPACE,
 	    PAIR32TO64(off_t, uap->length)));
 }
 
 int
 freebsd6_freebsd32_ftruncate(struct thread *td, struct freebsd6_freebsd32_ftruncate_args *uap)
 {
 
 	return (kern_ftruncate(td, uap->fd, PAIR32TO64(off_t, uap->length)));
 }
 #endif /* COMPAT_FREEBSD6 */
 
 /*
  * 32-bit userland layout of the sendfile header/trailer descriptor.
  * The two pointer members are carried as 32-bit integers and are turned
  * back into pointers with PTRIN()/PTRIN_CP() by freebsd32_do_sendfile().
  */
 struct sf_hdtr32 {
 	uint32_t headers;	/* 32-bit user address of the header iovecs */
 	int hdr_cnt;		/* count passed with the header iovecs */
 	uint32_t trailers;	/* 32-bit user address of the trailer iovecs */
 	int trl_cnt;		/* count passed with the trailer iovecs */
 };
 
 /*
  * Common implementation of sendfile for 32-bit processes.
  *
  * td     - calling thread.
  * uap    - syscall arguments; uap->nbytes may be modified in compat mode.
  * compat - non-zero for the FreeBSD 4 entry point (only consulted when
  *          COMPAT_FREEBSD4 is defined).
  *
  * Rejects a negative offset with EINVAL, converts the optional 32-bit
  * header/trailer descriptor into kernel uio structures, looks up the file
  * with CAP_PREAD rights and calls fo_sendfile().  If uap->sbytes is
  * non-NULL the byte count is copied out; a failure of that copyout is
  * deliberately ignored.  Any uio that was built is released at "out".
  */
 static int
 freebsd32_do_sendfile(struct thread *td,
     struct freebsd32_sendfile_args *uap, int compat)
 {
 	struct sf_hdtr32 hdtr32;
 	struct sf_hdtr hdtr;
 	struct uio *hdr_uio, *trl_uio;
 	struct file *fp;
 	cap_rights_t rights;
 	struct iovec32 *iov32;
 	off_t offset, sbytes;
 	int error;
 
 	offset = PAIR32TO64(off_t, uap->offset);
 	if (offset < 0)
 		return (EINVAL);
 
 	/* Must be NULL before the first "goto out" so cleanup is safe. */
 	hdr_uio = trl_uio = NULL;
 
 	if (uap->hdtr != NULL) {
 		error = copyin(uap->hdtr, &hdtr32, sizeof(hdtr32));
 		if (error)
 			goto out;
 		PTRIN_CP(hdtr32, hdtr, headers);
 		CP(hdtr32, hdtr, hdr_cnt);
 		PTRIN_CP(hdtr32, hdtr, trailers);
 		CP(hdtr32, hdtr, trl_cnt);
 
 		if (hdtr.headers != NULL) {
 			iov32 = PTRIN(hdtr32.headers);
 			error = freebsd32_copyinuio(iov32,
 			    hdtr32.hdr_cnt, &hdr_uio);
 			if (error)
 				goto out;
 #ifdef COMPAT_FREEBSD4
 			/*
 			 * In FreeBSD < 5.0 the nbytes to send also included
 			 * the header.  If compat is specified subtract the
 			 * header size from nbytes.
 			 */
 			if (compat) {
 				/* Clamp at zero instead of underflowing. */
 				if (uap->nbytes > hdr_uio->uio_resid)
 					uap->nbytes -= hdr_uio->uio_resid;
 				else
 					uap->nbytes = 0;
 			}
 #endif
 		}
 		if (hdtr.trailers != NULL) {
 			iov32 = PTRIN(hdtr32.trailers);
 			error = freebsd32_copyinuio(iov32,
 			    hdtr32.trl_cnt, &trl_uio);
 			if (error)
 				goto out;
 		}
 	}
 
 	AUDIT_ARG_FD(uap->fd);
 
 	if ((error = fget_read(td, uap->fd,
 	    cap_rights_init_one(&rights, CAP_PREAD), &fp)) != 0)
 		goto out;
 
 	error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, offset,
 	    uap->nbytes, &sbytes, uap->flags, td);
 	fdrop(fp, td);
 
 	/* Best effort: the copyout result is intentionally discarded. */
 	if (uap->sbytes != NULL)
 		(void)copyout(&sbytes, uap->sbytes, sizeof(off_t));
 
 out:
 	if (hdr_uio)
-		free(hdr_uio, M_IOV);
+		freeuio(hdr_uio);
 	if (trl_uio)
-		free(trl_uio, M_IOV);
+		freeuio(trl_uio);
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD4
 /*
  * FreeBSD 4 compatible sendfile for 32-bit processes.  The argument
  * structure is reinterpreted as the current one and the shared
  * implementation is invoked with compat set to 1.
  */
 int
 freebsd4_freebsd32_sendfile(struct thread *td,
     struct freebsd4_freebsd32_sendfile_args *uap)
 {
 	struct freebsd32_sendfile_args *cur_uap;
 
 	cur_uap = (struct freebsd32_sendfile_args *)uap;
 	return (freebsd32_do_sendfile(td, cur_uap, 1));
 }
 #endif
 
 /*
  * sendfile for 32-bit processes: calls the shared implementation with
  * compat set to 0.
  */
 int
 freebsd32_sendfile(struct thread *td, struct freebsd32_sendfile_args *uap)
 {
 	int error;
 
 	error = freebsd32_do_sendfile(td, uap, 0);
 	return (error);
 }
 
 /*
  * Fill a struct stat32 from a native struct stat.
  *
  * Every named field is copied with CP()/TS_CP(); padding and spare
  * members are zeroed explicitly so that no stale bytes remain in *out.
  */
 static void
 copy_stat(struct stat *in, struct stat32 *out)
 {
 
 	/* Padding and reserved members first; none overlap copied fields. */
 	out->st_padding0 = 0;
 	out->st_padding1 = 0;
 #ifdef __STAT32_TIME_T_EXT
 	out->st_atim_ext = 0;
 	out->st_mtim_ext = 0;
 	out->st_ctim_ext = 0;
 	out->st_btim_ext = 0;
 #endif
 	bzero(out->st_spare, sizeof(out->st_spare));
 
 #ifndef __amd64__
 	/*
 	 * 32-bit architectures other than i386 have 64-bit time_t.  This
 	 * results in struct timespec32 with 12 bytes for tv_sec and tv_nsec,
 	 * and 4 bytes of padding.  Zero the padding holes in struct stat32
 	 * before the timestamps are filled in below.
 	 */
 	bzero(&out->st_atim, sizeof(out->st_atim));
 	bzero(&out->st_mtim, sizeof(out->st_mtim));
 	bzero(&out->st_ctim, sizeof(out->st_ctim));
 	bzero(&out->st_birthtim, sizeof(out->st_birthtim));
 #endif
 
 	/* Identity and ownership. */
 	CP(*in, *out, st_dev);
 	CP(*in, *out, st_ino);
 	CP(*in, *out, st_mode);
 	CP(*in, *out, st_nlink);
 	CP(*in, *out, st_uid);
 	CP(*in, *out, st_gid);
 	CP(*in, *out, st_rdev);
 
 	/* Timestamps. */
 	TS_CP(*in, *out, st_atim);
 	TS_CP(*in, *out, st_mtim);
 	TS_CP(*in, *out, st_ctim);
 	TS_CP(*in, *out, st_birthtim);
 
 	/* Size and miscellaneous attributes. */
 	CP(*in, *out, st_size);
 	CP(*in, *out, st_blocks);
 	CP(*in, *out, st_blksize);
 	CP(*in, *out, st_flags);
 	CP(*in, *out, st_gen);
 }
 
 #ifdef COMPAT_43
 /*
  * Fill an old-style struct ostat32 from a native struct stat.
  *
  * The destination is zeroed first, then each field is copied.  st_size
  * is capped at INT32_MAX rather than copied directly.
  */
 static void
 copy_ostat(struct stat *in, struct ostat32 *out)
 {
 
 	bzero(out, sizeof(*out));
 
 	CP(*in, *out, st_dev);
 	CP(*in, *out, st_ino);
 	CP(*in, *out, st_mode);
 	CP(*in, *out, st_nlink);
 	CP(*in, *out, st_uid);
 	CP(*in, *out, st_gid);
 	CP(*in, *out, st_rdev);
 
 	/* Cap the reported size at INT32_MAX. */
 	if (in->st_size < INT32_MAX)
 		out->st_size = in->st_size;
 	else
 		out->st_size = INT32_MAX;
 
 	TS_CP(*in, *out, st_atim);
 	TS_CP(*in, *out, st_mtim);
 	TS_CP(*in, *out, st_ctim);
 
 	CP(*in, *out, st_blksize);
 	CP(*in, *out, st_blocks);
 	CP(*in, *out, st_flags);
 	CP(*in, *out, st_gen);
 }
 #endif
 
 #ifdef COMPAT_43
 int
 ofreebsd32_stat(struct thread *td, struct ofreebsd32_stat_args *uap)
 {
 	struct stat sb;
 	struct ostat32 sb32;
 	int error;
 
 	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, &sb);
 	if (error)
 		return (error);
 	copy_ostat(&sb, &sb32);
 	error = copyout(&sb32, uap->ub, sizeof (sb32));
 	return (error);
 }
 #endif
 
 int
 freebsd32_fstat(struct thread *td, struct freebsd32_fstat_args *uap)
 {
 	struct stat ub;
 	struct stat32 ub32;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error)
 		return (error);
 	copy_stat(&ub, &ub32);
 	error = copyout(&ub32, uap->sb, sizeof(ub32));
 	return (error);
 }
 
 #ifdef COMPAT_43
 int
 ofreebsd32_fstat(struct thread *td, struct ofreebsd32_fstat_args *uap)
 {
 	struct stat ub;
 	struct ostat32 ub32;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error)
 		return (error);
 	copy_ostat(&ub, &ub32);
 	error = copyout(&ub32, uap->sb, sizeof(ub32));
 	return (error);
 }
 #endif
 
 int
 freebsd32_fstatat(struct thread *td, struct freebsd32_fstatat_args *uap)
 {
 	struct stat ub;
 	struct stat32 ub32;
 	int error;
 
 	error = kern_statat(td, uap->flag, uap->fd, uap->path, UIO_USERSPACE,
 	    &ub);
 	if (error)
 		return (error);
 	copy_stat(&ub, &ub32);
 	error = copyout(&ub32, uap->buf, sizeof(ub32));
 	return (error);
 }
 
 #ifdef COMPAT_43
 int
 ofreebsd32_lstat(struct thread *td, struct ofreebsd32_lstat_args *uap)
 {
 	struct stat sb;
 	struct ostat32 sb32;
 	int error;
 
 	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
 	    UIO_USERSPACE, &sb);
 	if (error)
 		return (error);
 	copy_ostat(&sb, &sb32);
 	error = copyout(&sb32, uap->ub, sizeof (sb32));
 	return (error);
 }
 #endif
 
 int
 freebsd32_fhstat(struct thread *td, struct freebsd32_fhstat_args *uap)
 {
 	struct stat sb;
 	struct stat32 sb32;
 	struct fhandle fh;
 	int error;
 
 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
         if (error != 0)
                 return (error);
 	error = kern_fhstat(td, fh, &sb);
 	if (error != 0)
 		return (error);
 	copy_stat(&sb, &sb32);
 	error = copyout(&sb32, uap->sb, sizeof (sb32));
 	return (error);
 }
 
 #if defined(COMPAT_FREEBSD11)
 extern int ino64_trunc_error;
 
 /*
  * Convert a native struct stat into the FreeBSD 11 32-bit layout.
  *
  * st_ino, st_nlink, st_dev and st_rdev are narrower in the old layout.
  * When a value does not survive the narrowing, ino64_trunc_error selects
  * the policy: 1 fails the conversion with EOVERFLOW; 2 saturates st_ino
  * and st_nlink to their maximum (st_dev and st_rdev have no saturating
  * mode); any other value keeps the truncated value.
  *
  * Returns 0 on success or EOVERFLOW.
  */
 static int
 freebsd11_cvtstat32(struct stat *in, struct freebsd11_stat32 *out)
 {
 
 #ifndef __amd64__
 	/*
 	 * 32-bit architectures other than i386 have 64-bit time_t.  This
 	 * results in struct timespec32 with 12 bytes for tv_sec and tv_nsec,
 	 * and 4 bytes of padding.  Zero the padding holes in freebsd11_stat32.
 	 */
 	bzero(&out->st_atim, sizeof(out->st_atim));
 	bzero(&out->st_mtim, sizeof(out->st_mtim));
 	bzero(&out->st_ctim, sizeof(out->st_ctim));
 	bzero(&out->st_birthtim, sizeof(out->st_birthtim));
 #endif
 
 	/* Inode number: may fail or saturate on truncation. */
 	CP(*in, *out, st_ino);
 	if (in->st_ino != out->st_ino) {
 		switch (ino64_trunc_error) {
 		case 1:
 			return (EOVERFLOW);
 		case 2:
 			out->st_ino = UINT32_MAX;
 			break;
 		default:
 			break;
 		}
 	}
 
 	/* Link count: may fail or saturate on truncation. */
 	CP(*in, *out, st_nlink);
 	if (in->st_nlink != out->st_nlink) {
 		switch (ino64_trunc_error) {
 		case 1:
 			return (EOVERFLOW);
 		case 2:
 			out->st_nlink = UINT16_MAX;
 			break;
 		default:
 			break;
 		}
 	}
 
 	/* Device number: may only fail on truncation. */
 	out->st_dev = in->st_dev;
 	if (out->st_dev != in->st_dev) {
 		switch (ino64_trunc_error) {
 		case 1:
 			return (EOVERFLOW);
 		default:
 			break;
 		}
 	}
 
 	CP(*in, *out, st_mode);
 	CP(*in, *out, st_uid);
 	CP(*in, *out, st_gid);
 
 	/* Special-file device number: may only fail on truncation. */
 	out->st_rdev = in->st_rdev;
 	if (out->st_rdev != in->st_rdev) {
 		switch (ino64_trunc_error) {
 		case 1:
 			return (EOVERFLOW);
 		default:
 			break;
 		}
 	}
 
 	TS_CP(*in, *out, st_atim);
 	TS_CP(*in, *out, st_mtim);
 	TS_CP(*in, *out, st_ctim);
 	CP(*in, *out, st_size);
 	CP(*in, *out, st_blocks);
 	CP(*in, *out, st_blksize);
 	CP(*in, *out, st_flags);
 	CP(*in, *out, st_gen);
 	TS_CP(*in, *out, st_birthtim);
 	out->st_lspare = 0;
 
 	/* Zero everything that follows st_birthtim in the structure. */
 	bzero((char *)&out->st_birthtim + sizeof(out->st_birthtim),
 	    sizeof(*out) - offsetof(struct freebsd11_stat32,
 	    st_birthtim) - sizeof(out->st_birthtim));
 	return (0);
 }
 
 int
 freebsd11_freebsd32_stat(struct thread *td,
     struct freebsd11_freebsd32_stat_args *uap)
 {
 	struct stat sb;
 	struct freebsd11_stat32 sb32;
 	int error;
 
 	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, &sb);
 	if (error != 0)
 		return (error);
 	error = freebsd11_cvtstat32(&sb, &sb32);
 	if (error == 0)
 		error = copyout(&sb32, uap->ub, sizeof (sb32));
 	return (error);
 }
 
 int
 freebsd11_freebsd32_fstat(struct thread *td,
     struct freebsd11_freebsd32_fstat_args *uap)
 {
 	struct stat sb;
 	struct freebsd11_stat32 sb32;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &sb);
 	if (error != 0)
 		return (error);
 	error = freebsd11_cvtstat32(&sb, &sb32);
 	if (error == 0)
 		error = copyout(&sb32, uap->sb, sizeof (sb32));
 	return (error);
 }
 
 int
 freebsd11_freebsd32_fstatat(struct thread *td,
     struct freebsd11_freebsd32_fstatat_args *uap)
 {
 	struct stat sb;
 	struct freebsd11_stat32 sb32;
 	int error;
 
 	error = kern_statat(td, uap->flag, uap->fd, uap->path, UIO_USERSPACE,
 	    &sb);
 	if (error != 0)
 		return (error);
 	error = freebsd11_cvtstat32(&sb, &sb32);
 	if (error == 0)
 		error = copyout(&sb32, uap->buf, sizeof (sb32));
 	return (error);
 }
 
 int
 freebsd11_freebsd32_lstat(struct thread *td,
     struct freebsd11_freebsd32_lstat_args *uap)
 {
 	struct stat sb;
 	struct freebsd11_stat32 sb32;
 	int error;
 
 	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
 	    UIO_USERSPACE, &sb);
 	if (error != 0)
 		return (error);
 	error = freebsd11_cvtstat32(&sb, &sb32);
 	if (error == 0)
 		error = copyout(&sb32, uap->ub, sizeof (sb32));
 	return (error);
 }
 
 int
 freebsd11_freebsd32_fhstat(struct thread *td,
     struct freebsd11_freebsd32_fhstat_args *uap)
 {
 	struct stat sb;
 	struct freebsd11_stat32 sb32;
 	struct fhandle fh;
 	int error;
 
 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
         if (error != 0)
                 return (error);
 	error = kern_fhstat(td, fh, &sb);
 	if (error != 0)
 		return (error);
 	error = freebsd11_cvtstat32(&sb, &sb32);
 	if (error == 0)
 		error = copyout(&sb32, uap->sb, sizeof (sb32));
 	return (error);
 }
 
 static int
 freebsd11_cvtnstat32(struct stat *sb, struct nstat32 *nsb32)
 {
 	struct nstat nsb;
 	int error;
 
 	error = freebsd11_cvtnstat(sb, &nsb);
 	if (error != 0)
 		return (error);
 
 	bzero(nsb32, sizeof(*nsb32));
 	CP(nsb, *nsb32, st_dev);
 	CP(nsb, *nsb32, st_ino);
 	CP(nsb, *nsb32, st_mode);
 	CP(nsb, *nsb32, st_nlink);
 	CP(nsb, *nsb32, st_uid);
 	CP(nsb, *nsb32, st_gid);
 	CP(nsb, *nsb32, st_rdev);
 	CP(nsb, *nsb32, st_atim.tv_sec);
 	CP(nsb, *nsb32, st_atim.tv_nsec);
 	CP(nsb, *nsb32, st_mtim.tv_sec);
 	CP(nsb, *nsb32, st_mtim.tv_nsec);
 	CP(nsb, *nsb32, st_ctim.tv_sec);
 	CP(nsb, *nsb32, st_ctim.tv_nsec);
 	CP(nsb, *nsb32, st_size);
 	CP(nsb, *nsb32, st_blocks);
 	CP(nsb, *nsb32, st_blksize);
 	CP(nsb, *nsb32, st_flags);
 	CP(nsb, *nsb32, st_gen);
 	CP(nsb, *nsb32, st_birthtim.tv_sec);
 	CP(nsb, *nsb32, st_birthtim.tv_nsec);
 	return (0);
 }
 
 /*
  * FreeBSD 11 compatible nstat for 32-bit processes.
  *
  * Stats uap->path relative to AT_FDCWD, converts the result with
  * freebsd11_cvtnstat32() and, only if the conversion succeeded, copies
  * the nstat32 out to uap->ub.
  *
  * The copyout is guarded by "error == 0", matching
  * freebsd11_freebsd32_nlstat() and freebsd11_freebsd32_nfstat().  The
  * guard was previously inverted, so a successful call never wrote the
  * result and a failed conversion copied out an unfilled buffer.
  *
  * Returns 0 on success, otherwise the first error encountered.
  */
 int
 freebsd11_freebsd32_nstat(struct thread *td,
     struct freebsd11_freebsd32_nstat_args *uap)
 {
 	struct stat sb;
 	struct nstat32 nsb;
 	int error;
 
 	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, &sb);
 	if (error != 0)
 		return (error);
 	error = freebsd11_cvtnstat32(&sb, &nsb);
 	if (error == 0)
 		error = copyout(&nsb, uap->ub, sizeof (nsb));
 	return (error);
 }
 
 int
 freebsd11_freebsd32_nlstat(struct thread *td,
     struct freebsd11_freebsd32_nlstat_args *uap)
 {
 	struct stat sb;
 	struct nstat32 nsb;
 	int error;
 
 	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
 	    UIO_USERSPACE, &sb);
 	if (error != 0)
 		return (error);
 	error = freebsd11_cvtnstat32(&sb, &nsb);
 	if (error == 0)
 		error = copyout(&nsb, uap->ub, sizeof (nsb));
 	return (error);
 }
 
 int
 freebsd11_freebsd32_nfstat(struct thread *td,
     struct freebsd11_freebsd32_nfstat_args *uap)
 {
 	struct nstat32 nub;
 	struct stat ub;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error != 0)
 		return (error);
 	error = freebsd11_cvtnstat32(&ub, &nub);
 	if (error == 0)
 		error = copyout(&nub, uap->sb, sizeof(nub));
 	return (error);
 }
 #endif
 
 /*
  * __sysctl for 32-bit processes.
  *
  * Validates the name length (2..CTL_MAXNAME), copies in the MIB name,
  * fetches the 32-bit old-length word if uap->oldlenp is set, and calls
  * userland_sysctl() with SCTL_MASK32.  On success the resulting length is
  * stored back as a 32-bit word.
  *
  * The fetched word is only used after fueword32() has reported success;
  * previously it was read into oldlen before the status was checked.
  *
  * Returns EINVAL for a bad name length, EFAULT if the old-length word
  * cannot be read or written, or the copyin()/userland_sysctl() error.
  */
 int
 freebsd32___sysctl(struct thread *td, struct freebsd32___sysctl_args *uap)
 {
 	int error, name[CTL_MAXNAME];
 	size_t j, oldlen;
 	uint32_t tmp;
 
 	if (uap->namelen > CTL_MAXNAME || uap->namelen < 2)
 		return (EINVAL);
 	error = copyin(uap->name, name, uap->namelen * sizeof(int));
 	if (error != 0)
 		return (error);
 
 	oldlen = 0;
 	if (uap->oldlenp != NULL) {
 		if (fueword32(uap->oldlenp, &tmp) != 0)
 			return (EFAULT);
 		oldlen = tmp;
 	}
 
 	error = userland_sysctl(td, name, uap->namelen,
 	    uap->old, &oldlen, 1,
 	    uap->new, uap->newlen, &j, SCTL_MASK32);
 	if (error != 0)
 		return (error);
 	if (uap->oldlenp != NULL && suword32(uap->oldlenp, j) != 0)
 		error = EFAULT;
 	return (error);
 }
 
 /*
  * __sysctlbyname for 32-bit processes.
  *
  * Fetches the 32-bit old-length word if uap->oldlenp is set, calls
  * kern___sysctlbyname() with SCTL_MASK32, and on success stores the
  * resulting length back as a 32-bit word.
  *
  * The fetched word is only used after fueword32() has reported success;
  * previously it was read into oldlen before the status was checked.
  *
  * Returns EFAULT if the old-length word cannot be read or written, or
  * the error from kern___sysctlbyname().
  */
 int
 freebsd32___sysctlbyname(struct thread *td,
     struct freebsd32___sysctlbyname_args *uap)
 {
 	size_t oldlen, rv;
 	int error;
 	uint32_t tmp;
 
 	oldlen = 0;
 	if (uap->oldlenp != NULL) {
 		if (fueword32(uap->oldlenp, &tmp) != 0)
 			return (EFAULT);
 		oldlen = tmp;
 	}
 
 	error = kern___sysctlbyname(td, uap->name, uap->namelen, uap->old,
 	    &oldlen, uap->new, uap->newlen, &rv, SCTL_MASK32, 1);
 	if (error != 0)
 		return (error);
 	if (uap->oldlenp != NULL && suword32(uap->oldlenp, rv) != 0)
 		error = EFAULT;
 	return (error);
 }
 
 int
 freebsd32_jail(struct thread *td, struct freebsd32_jail_args *uap)
 {
 	uint32_t version;
 	int error;
 	struct jail j;
 
 	error = copyin(uap->jail, &version, sizeof(uint32_t));
 	if (error)
 		return (error);
 
 	switch (version) {
 	case 0:
 	{
 		/* FreeBSD single IPv4 jails. */
 		struct jail32_v0 j32_v0;
 
 		bzero(&j, sizeof(struct jail));
 		error = copyin(uap->jail, &j32_v0, sizeof(struct jail32_v0));
 		if (error)
 			return (error);
 		CP(j32_v0, j, version);
 		PTRIN_CP(j32_v0, j, path);
 		PTRIN_CP(j32_v0, j, hostname);
 		j.ip4s = htonl(j32_v0.ip_number);	/* jail_v0 is host order */
 		break;
 	}
 
 	case 1:
 		/*
 		 * Version 1 was used by multi-IPv4 jail implementations
 		 * that never made it into the official kernel.
 		 */
 		return (EINVAL);
 
 	case 2:	/* JAIL_API_VERSION */
 	{
 		/* FreeBSD multi-IPv4/IPv6,noIP jails. */
 		struct jail32 j32;
 
 		error = copyin(uap->jail, &j32, sizeof(struct jail32));
 		if (error)
 			return (error);
 		CP(j32, j, version);
 		PTRIN_CP(j32, j, path);
 		PTRIN_CP(j32, j, hostname);
 		PTRIN_CP(j32, j, jailname);
 		CP(j32, j, ip4s);
 		CP(j32, j, ip6s);
 		PTRIN_CP(j32, j, ip4);
 		PTRIN_CP(j32, j, ip6);
 		break;
 	}
 
 	default:
 		/* Sci-Fi jails are not supported, sorry. */
 		return (EINVAL);
 	}
 	return (kern_jail(td, &j));
 }
 
 /*
  * jail_set for 32-bit processes.
  *
  * Rejects an odd iovec count with EINVAL, converts the 32-bit iovec array
  * into a kernel uio with freebsd32_copyinuio(), calls kern_jail_set() and
  * releases the uio before returning the kern_jail_set() status.
  */
 int
 freebsd32_jail_set(struct thread *td, struct freebsd32_jail_set_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	/* Check that we have an even number of iovecs. */
 	if (uap->iovcnt & 1)
 		return (EINVAL);
 
 	error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_jail_set(td, auio, uap->flags);
-	free(auio, M_IOV);
+	freeuio(auio);
 	return (error);
 }
 
 /*
  * jail_get for 32-bit processes.
  *
  * Rejects an odd iovec count with EINVAL, converts the 32-bit iovec array
  * into a kernel uio, and calls kern_jail_get().  On success each kernel
  * iovec is converted back to an iovec32 and copied out over the caller's
  * array; the loop stops at the first copyout failure.  The uio is
  * released on every path after it was created.
  */
 int
 freebsd32_jail_get(struct thread *td, struct freebsd32_jail_get_args *uap)
 {
 	struct iovec32 iov32;
 	struct uio *auio;
 	int error, i;
 
 	/* Check that we have an even number of iovecs. */
 	if (uap->iovcnt & 1)
 		return (EINVAL);
 
 	error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_jail_get(td, auio, uap->flags);
 	if (error == 0)
 		/* Write the (possibly updated) iovecs back to userland. */
 		for (i = 0; i < uap->iovcnt; i++) {
 			PTROUT_CP(auio->uio_iov[i], iov32, iov_base);
 			CP(auio->uio_iov[i], iov32, iov_len);
 			error = copyout(&iov32, uap->iovp + i, sizeof(iov32));
 			if (error != 0)
 				break;
 		}
-	free(auio, M_IOV);
+	freeuio(auio);
 	return (error);
 }
 
 int
 freebsd32_sigaction(struct thread *td, struct freebsd32_sigaction_args *uap)
 {
 	struct sigaction32 s32;
 	struct sigaction sa, osa, *sap;
 	int error;
 
 	if (uap->act) {
 		error = copyin(uap->act, &s32, sizeof(s32));
 		if (error)
 			return (error);
 		sa.sa_handler = PTRIN(s32.sa_u);
 		CP(s32, sa, sa_flags);
 		CP(s32, sa, sa_mask);
 		sap = &sa;
 	} else
 		sap = NULL;
 	error = kern_sigaction(td, uap->sig, sap, &osa, 0);
 	if (error == 0 && uap->oact != NULL) {
 		s32.sa_u = PTROUT(osa.sa_handler);
 		CP(osa, s32, sa_flags);
 		CP(osa, s32, sa_mask);
 		error = copyout(&s32, uap->oact, sizeof(s32));
 	}
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD4
 int
 freebsd4_freebsd32_sigaction(struct thread *td,
 			     struct freebsd4_freebsd32_sigaction_args *uap)
 {
 	struct sigaction32 s32;
 	struct sigaction sa, osa, *sap;
 	int error;
 
 	if (uap->act) {
 		error = copyin(uap->act, &s32, sizeof(s32));
 		if (error)
 			return (error);
 		sa.sa_handler = PTRIN(s32.sa_u);
 		CP(s32, sa, sa_flags);
 		CP(s32, sa, sa_mask);
 		sap = &sa;
 	} else
 		sap = NULL;
 	error = kern_sigaction(td, uap->sig, sap, &osa, KSA_FREEBSD4);
 	if (error == 0 && uap->oact != NULL) {
 		s32.sa_u = PTROUT(osa.sa_handler);
 		CP(osa, s32, sa_flags);
 		CP(osa, s32, sa_mask);
 		error = copyout(&s32, uap->oact, sizeof(s32));
 	}
 	return (error);
 }
 #endif
 
 #ifdef COMPAT_43
 /*
  * 32-bit userland layout of the old-style sigaction structure, as copied
  * in and out by ofreebsd32_sigaction().
  */
 struct osigaction32 {
 	uint32_t	sa_u;		/* handler address as a 32-bit value */
 	osigset_t	sa_mask;	/* old-format signal mask */
 	int		sa_flags;	/* action flags */
 };
 
 #define	ONSIG	32
 
 int
 ofreebsd32_sigaction(struct thread *td,
 			     struct ofreebsd32_sigaction_args *uap)
 {
 	struct osigaction32 s32;
 	struct sigaction sa, osa, *sap;
 	int error;
 
 	if (uap->signum <= 0 || uap->signum >= ONSIG)
 		return (EINVAL);
 
 	if (uap->nsa) {
 		error = copyin(uap->nsa, &s32, sizeof(s32));
 		if (error)
 			return (error);
 		sa.sa_handler = PTRIN(s32.sa_u);
 		CP(s32, sa, sa_flags);
 		OSIG2SIG(s32.sa_mask, sa.sa_mask);
 		sap = &sa;
 	} else
 		sap = NULL;
 	error = kern_sigaction(td, uap->signum, sap, &osa, KSA_OSIGSET);
 	if (error == 0 && uap->osa != NULL) {
 		s32.sa_u = PTROUT(osa.sa_handler);
 		CP(osa, s32, sa_flags);
 		SIG2OSIG(osa.sa_mask, s32.sa_mask);
 		error = copyout(&s32, uap->osa, sizeof(s32));
 	}
 	return (error);
 }
 
 /*
  * 32-bit userland layout of the sigvec structure, as copied in and out by
  * ofreebsd32_sigvec().
  */
 struct sigvec32 {
 	uint32_t	sv_handler;	/* handler address as a 32-bit value */
 	int		sv_mask;	/* old-format signal mask */
 	int		sv_flags;	/* sigvec flags */
 };
 
 int
 ofreebsd32_sigvec(struct thread *td,
 			  struct ofreebsd32_sigvec_args *uap)
 {
 	struct sigvec32 vec;
 	struct sigaction sa, osa, *sap;
 	int error;
 
 	if (uap->signum <= 0 || uap->signum >= ONSIG)
 		return (EINVAL);
 
 	if (uap->nsv) {
 		error = copyin(uap->nsv, &vec, sizeof(vec));
 		if (error)
 			return (error);
 		sa.sa_handler = PTRIN(vec.sv_handler);
 		OSIG2SIG(vec.sv_mask, sa.sa_mask);
 		sa.sa_flags = vec.sv_flags;
 		sa.sa_flags ^= SA_RESTART;
 		sap = &sa;
 	} else
 		sap = NULL;
 	error = kern_sigaction(td, uap->signum, sap, &osa, KSA_OSIGSET);
 	if (error == 0 && uap->osv != NULL) {
 		vec.sv_handler = PTROUT(osa.sa_handler);
 		SIG2OSIG(osa.sa_mask, vec.sv_mask);
 		vec.sv_flags = osa.sa_flags;
 		vec.sv_flags &= ~SA_NOCLDWAIT;
 		vec.sv_flags ^= SA_RESTART;
 		error = copyout(&vec, uap->osv, sizeof(vec));
 	}
 	return (error);
 }
 
 /*
  * 32-bit userland layout of the sigstack structure, as copied in and out
  * by ofreebsd32_sigstack().
  */
 struct sigstack32 {
 	uint32_t	ss_sp;		/* stack pointer as a 32-bit value */
 	int		ss_onstack;	/* on-stack indicator */
 };
 
 int
 ofreebsd32_sigstack(struct thread *td,
 			    struct ofreebsd32_sigstack_args *uap)
 {
 	struct sigstack32 s32;
 	struct sigstack nss, oss;
 	int error = 0, unss;
 
 	if (uap->nss != NULL) {
 		error = copyin(uap->nss, &s32, sizeof(s32));
 		if (error)
 			return (error);
 		nss.ss_sp = PTRIN(s32.ss_sp);
 		CP(s32, nss, ss_onstack);
 		unss = 1;
 	} else {
 		unss = 0;
 	}
 	oss.ss_sp = td->td_sigstk.ss_sp;
 	oss.ss_onstack = sigonstack(cpu_getstack(td));
 	if (unss) {
 		td->td_sigstk.ss_sp = nss.ss_sp;
 		td->td_sigstk.ss_size = 0;
 		td->td_sigstk.ss_flags |= (nss.ss_onstack & SS_ONSTACK);
 		td->td_pflags |= TDP_ALTSTACK;
 	}
 	if (uap->oss != NULL) {
 		s32.ss_sp = PTROUT(oss.ss_sp);
 		CP(oss, s32, ss_onstack);
 		error = copyout(&s32, uap->oss, sizeof(s32));
 	}
 	return (error);
 }
 #endif
 
 int
 freebsd32_nanosleep(struct thread *td, struct freebsd32_nanosleep_args *uap)
 {
 
 	return (freebsd32_user_clock_nanosleep(td, CLOCK_REALTIME,
 	    TIMER_RELTIME, uap->rqtp, uap->rmtp));
 }
 
 /*
  * clock_nanosleep for 32-bit processes.  The sleep itself is done by
  * freebsd32_user_clock_nanosleep(); its status is passed through
  * kern_posix_error() before being returned.
  */
 int
 freebsd32_clock_nanosleep(struct thread *td,
     struct freebsd32_clock_nanosleep_args *uap)
 {
 
 	return (kern_posix_error(td,
 	    freebsd32_user_clock_nanosleep(td, uap->clock_id, uap->flags,
 	    uap->rqtp, uap->rmtp)));
 }
 
 /*
  * Shared body of the 32-bit nanosleep/clock_nanosleep system calls.
  *
  * Copies in and widens the requested time, then calls
  * kern_clock_nanosleep().  The remaining time is copied out only when the
  * sleep was interrupted (EINTR), ua_rmtp is non-NULL and the sleep was
  * not TIMER_ABSTIME.  A failure of that copyout replaces the EINTR
  * status.
  */
 static int
 freebsd32_user_clock_nanosleep(struct thread *td, clockid_t clock_id,
     int flags, const struct timespec32 *ua_rqtp, struct timespec32 *ua_rmtp)
 {
 	struct timespec32 remain32, request32;
 	struct timespec remain, request;
 	int error, cperr;
 
 	error = copyin(ua_rqtp, &request32, sizeof(request32));
 	if (error != 0)
 		return (error);
 	CP(request32, request, tv_sec);
 	CP(request32, request, tv_nsec);
 
 	error = kern_clock_nanosleep(td, clock_id, flags, &request, &remain);
 	if (error != EINTR || ua_rmtp == NULL ||
 	    (flags & TIMER_ABSTIME) != 0)
 		return (error);
 
 	CP(remain, remain32, tv_sec);
 	CP(remain, remain32, tv_nsec);
 	cperr = copyout(&remain32, ua_rmtp, sizeof(remain32));
 	return (cperr != 0 ? cperr : error);
 }
 
 int
 freebsd32_clock_gettime(struct thread *td,
 			struct freebsd32_clock_gettime_args *uap)
 {
 	struct timespec	ats;
 	struct timespec32 ats32;
 	int error;
 
 	error = kern_clock_gettime(td, uap->clock_id, &ats);
 	if (error == 0) {
 		CP(ats, ats32, tv_sec);
 		CP(ats, ats32, tv_nsec);
 		error = copyout(&ats32, uap->tp, sizeof(ats32));
 	}
 	return (error);
 }
 
 int
 freebsd32_clock_settime(struct thread *td,
 			struct freebsd32_clock_settime_args *uap)
 {
 	struct timespec	ats;
 	struct timespec32 ats32;
 	int error;
 
 	error = copyin(uap->tp, &ats32, sizeof(ats32));
 	if (error)
 		return (error);
 	CP(ats32, ats, tv_sec);
 	CP(ats32, ats, tv_nsec);
 
 	return (kern_clock_settime(td, uap->clock_id, &ats));
 }
 
 int
 freebsd32_clock_getres(struct thread *td,
 		       struct freebsd32_clock_getres_args *uap)
 {
 	struct timespec	ts;
 	struct timespec32 ts32;
 	int error;
 
 	if (uap->tp == NULL)
 		return (0);
 	error = kern_clock_getres(td, uap->clock_id, &ts);
 	if (error == 0) {
 		CP(ts, ts32, tv_sec);
 		CP(ts, ts32, tv_nsec);
 		error = copyout(&ts32, uap->tp, sizeof(ts32));
 	}
 	return (error);
 }
 
 int freebsd32_ktimer_create(struct thread *td,
     struct freebsd32_ktimer_create_args *uap)
 {
 	struct sigevent32 ev32;
 	struct sigevent ev, *evp;
 	int error, id;
 
 	if (uap->evp == NULL) {
 		evp = NULL;
 	} else {
 		evp = &ev;
 		error = copyin(uap->evp, &ev32, sizeof(ev32));
 		if (error != 0)
 			return (error);
 		error = convert_sigevent32(&ev32, &ev);
 		if (error != 0)
 			return (error);
 	}
 	error = kern_ktimer_create(td, uap->clock_id, evp, &id, -1);
 	if (error == 0) {
 		error = copyout(&id, uap->timerid, sizeof(int));
 		if (error != 0)
 			kern_ktimer_delete(td, id);
 	}
 	return (error);
 }
 
 int
 freebsd32_ktimer_settime(struct thread *td,
     struct freebsd32_ktimer_settime_args *uap)
 {
 	struct itimerspec32 val32, oval32;
 	struct itimerspec val, oval, *ovalp;
 	int error;
 
 	error = copyin(uap->value, &val32, sizeof(val32));
 	if (error != 0)
 		return (error);
 	ITS_CP(val32, val);
 	ovalp = uap->ovalue != NULL ? &oval : NULL;
 	error = kern_ktimer_settime(td, uap->timerid, uap->flags, &val, ovalp);
 	if (error == 0 && uap->ovalue != NULL) {
 		ITS_CP(oval, oval32);
 		error = copyout(&oval32, uap->ovalue, sizeof(oval32));
 	}
 	return (error);
 }
 
 int
 freebsd32_ktimer_gettime(struct thread *td,
     struct freebsd32_ktimer_gettime_args *uap)
 {
 	struct itimerspec32 val32;
 	struct itimerspec val;
 	int error;
 
 	error = kern_ktimer_gettime(td, uap->timerid, &val);
 	if (error == 0) {
 		ITS_CP(val, val32);
 		error = copyout(&val32, uap->value, sizeof(val32));
 	}
 	return (error);
 }
 
 int
 freebsd32_timerfd_gettime(struct thread *td,
     struct freebsd32_timerfd_gettime_args *uap)
 {
 	struct itimerspec curr_value;
 	struct itimerspec32 curr_value32;
 	int error;
 
 	error = kern_timerfd_gettime(td, uap->fd, &curr_value);
 	if (error == 0) {
 		CP(curr_value, curr_value32, it_value.tv_sec);
 		CP(curr_value, curr_value32, it_value.tv_nsec);
 		CP(curr_value, curr_value32, it_interval.tv_sec);
 		CP(curr_value, curr_value32, it_interval.tv_nsec);
 		error = copyout(&curr_value32, uap->curr_value,
 		    sizeof(curr_value32));
 	}
 
 	return (error);
 }
 
 int
 freebsd32_timerfd_settime(struct thread *td,
     struct freebsd32_timerfd_settime_args *uap)
 {
 	struct itimerspec new_value, old_value;
 	struct itimerspec32 new_value32, old_value32;
 	int error;
 
 	error = copyin(uap->new_value, &new_value32, sizeof(new_value32));
 	if (error != 0)
 		return (error);
 	CP(new_value32, new_value, it_value.tv_sec);
 	CP(new_value32, new_value, it_value.tv_nsec);
 	CP(new_value32, new_value, it_interval.tv_sec);
 	CP(new_value32, new_value, it_interval.tv_nsec);
 	if (uap->old_value == NULL) {
 		error = kern_timerfd_settime(td, uap->fd, uap->flags,
 		    &new_value, NULL);
 	} else {
 		error = kern_timerfd_settime(td, uap->fd, uap->flags,
 		    &new_value, &old_value);
 		if (error == 0) {
 			CP(old_value, old_value32, it_value.tv_sec);
 			CP(old_value, old_value32, it_value.tv_nsec);
 			CP(old_value, old_value32, it_interval.tv_sec);
 			CP(old_value, old_value32, it_interval.tv_nsec);
 			error = copyout(&old_value32, uap->old_value,
 			    sizeof(old_value32));
 		}
 	}
 	return (error);
 }
 
 /*
  * 32-bit compat clock_getcpuclockid2(): reassemble the 64-bit id from its
  * two 32-bit halves, look up the CPU-time clock id and copy it out to
  * uap->clock_id.
  */
 int
 freebsd32_clock_getcpuclockid2(struct thread *td,
     struct freebsd32_clock_getcpuclockid2_args *uap)
 {
 	clockid_t cid;
 	int rv;
 
 	rv = kern_clock_getcpuclockid2(td, PAIR32TO64(id_t, uap->id),
 	    uap->which, &cid);
 	if (rv != 0)
 		return (rv);
 	return (copyout(&cid, uap->clock_id, sizeof(clockid_t)));
 }
 
 int
 freebsd32_thr_new(struct thread *td,
 		  struct freebsd32_thr_new_args *uap)
 {
 	struct thr_param32 param32;
 	struct thr_param param;
 	int error;
 
 	if (uap->param_size < 0 ||
 	    uap->param_size > sizeof(struct thr_param32))
 		return (EINVAL);
 	bzero(&param, sizeof(struct thr_param));
 	bzero(&param32, sizeof(struct thr_param32));
 	error = copyin(uap->param, &param32, uap->param_size);
 	if (error != 0)
 		return (error);
 	param.start_func = PTRIN(param32.start_func);
 	param.arg = PTRIN(param32.arg);
 	param.stack_base = PTRIN(param32.stack_base);
 	param.stack_size = param32.stack_size;
 	param.tls_base = PTRIN(param32.tls_base);
 	param.tls_size = param32.tls_size;
 	param.child_tid = PTRIN(param32.child_tid);
 	param.parent_tid = PTRIN(param32.parent_tid);
 	param.flags = param32.flags;
 	param.rtp = PTRIN(param32.rtp);
 	param.spare[0] = PTRIN(param32.spare[0]);
 	param.spare[1] = PTRIN(param32.spare[1]);
 	param.spare[2] = PTRIN(param32.spare[2]);
 
 	return (kern_thr_new(td, &param));
 }
 
 int
 freebsd32_thr_suspend(struct thread *td, struct freebsd32_thr_suspend_args *uap)
 {
 	struct timespec32 ts32;
 	struct timespec ts, *tsp;
 	int error;
 
 	error = 0;
 	tsp = NULL;
 	if (uap->timeout != NULL) {
 		error = copyin((const void *)uap->timeout, (void *)&ts32,
 		    sizeof(struct timespec32));
 		if (error != 0)
 			return (error);
 		ts.tv_sec = ts32.tv_sec;
 		ts.tv_nsec = ts32.tv_nsec;
 		tsp = &ts;
 	}
 	return (kern_thr_suspend(td, tsp));
 }
 
 /*
  * Fill a struct siginfo32 from a native siginfo_t.
  *
  * The destination is zeroed first, so any member not assigned below is
  * left as zero.  si_addr is stored as an integer and only the sival_int
  * member of si_value is copied.
  */
 void
 siginfo_to_siginfo32(const siginfo_t *src, struct siginfo32 *dst)
 {
 	bzero(dst, sizeof(*dst));
 	dst->si_signo = src->si_signo;
 	dst->si_errno = src->si_errno;
 	dst->si_code = src->si_code;
 	dst->si_pid = src->si_pid;
 	dst->si_uid = src->si_uid;
 	dst->si_status = src->si_status;
 	dst->si_addr = (uintptr_t)src->si_addr;
 	dst->si_value.sival_int = src->si_value.sival_int;
 	dst->si_timerid = src->si_timerid;
 	dst->si_overrun = src->si_overrun;
 }
 
 /*
  * Fallback declaration of the argument structure, used only when
  * _FREEBSD32_SYSPROTO_H_ has not been defined.
  */
 #ifndef _FREEBSD32_SYSPROTO_H_
 struct freebsd32_sigqueue_args {
         pid_t pid;
         int signum;
         /* union sigval32 */ int value;
 };
 #endif
 /*
  * 32-bit compat sigqueue(): build a native union sigval from the 32-bit
  * value supplied by the caller and queue signal uap->signum to uap->pid.
  */
 int
 freebsd32_sigqueue(struct thread *td, struct freebsd32_sigqueue_args *uap)
 {
 	union sigval sv;
 
 	/*
 	 * On 32-bit ABIs, sival_int and sival_ptr are the same.
 	 * On 64-bit little-endian ABIs, the low bits are the same.
 	 * In 64-bit big-endian ABIs, sival_int overlaps with
 	 * sival_ptr's HIGH bits.  We choose to support sival_int
 	 * rather than sival_ptr in this case as it seems to be
 	 * more common.
 	 */
 	/* Zero the whole union so bytes outside sival_int are defined. */
 	bzero(&sv, sizeof(sv));
 	sv.sival_int = (uint32_t)(uint64_t)uap->value;
 
 	return (kern_sigqueue(td, uap->pid, uap->signum, &sv));
 }
 
 int
 freebsd32_sigtimedwait(struct thread *td, struct freebsd32_sigtimedwait_args *uap)
 {
 	struct timespec32 ts32;
 	struct timespec ts;
 	struct timespec *timeout;
 	sigset_t set;
 	ksiginfo_t ksi;
 	struct siginfo32 si32;
 	int error;
 
 	if (uap->timeout) {
 		error = copyin(uap->timeout, &ts32, sizeof(ts32));
 		if (error)
 			return (error);
 		ts.tv_sec = ts32.tv_sec;
 		ts.tv_nsec = ts32.tv_nsec;
 		timeout = &ts;
 	} else
 		timeout = NULL;
 
 	error = copyin(uap->set, &set, sizeof(set));
 	if (error)
 		return (error);
 
 	error = kern_sigtimedwait(td, set, &ksi, timeout);
 	if (error)
 		return (error);
 
 	if (uap->info) {
 		siginfo_to_siginfo32(&ksi.ksi_info, &si32);
 		error = copyout(&si32, uap->info, sizeof(struct siginfo32));
 	}
 
 	if (error == 0)
 		td->td_retval[0] = ksi.ksi_signo;
 	return (error);
 }
 
 /*
  * MPSAFE
  */
 int
 freebsd32_sigwaitinfo(struct thread *td, struct freebsd32_sigwaitinfo_args *uap)
 {
 	ksiginfo_t ksi;
 	struct siginfo32 si32;
 	sigset_t set;
 	int error;
 
 	error = copyin(uap->set, &set, sizeof(set));
 	if (error)
 		return (error);
 
 	error = kern_sigtimedwait(td, set, &ksi, NULL);
 	if (error)
 		return (error);
 
 	if (uap->info) {
 		siginfo_to_siginfo32(&ksi.ksi_info, &si32);
 		error = copyout(&si32, uap->info, sizeof(struct siginfo32));
 	}	
 	if (error == 0)
 		td->td_retval[0] = ksi.ksi_signo;
 	return (error);
 }
 
 int
 freebsd32_cpuset_setid(struct thread *td,
     struct freebsd32_cpuset_setid_args *uap)
 {
 
 	return (kern_cpuset_setid(td, uap->which,
 	    PAIR32TO64(id_t, uap->id), uap->setid));
 }
 
 int
 freebsd32_cpuset_getid(struct thread *td,
     struct freebsd32_cpuset_getid_args *uap)
 {
 
 	return (kern_cpuset_getid(td, uap->level, uap->which,
 	    PAIR32TO64(id_t, uap->id), uap->setid));
 }
 
 /*
  * Copy a bit set of `size' bytes in from a 32-bit process.
  *
  * On little-endian builds this is a plain copyin().  On big-endian builds
  * the data is copied in and then the two 32-bit halves of each set word
  * are exchanged in place.
  *
  * Returns 0 on success or the error from copyin().
  */
 static int
 copyin32_set(const void *u, void *k, size_t size)
 {
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
 	int rv;
 	struct bitset *kb = k;
 	int *p;
 
 	rv = copyin(u, k, size);
 	if (rv != 0)
 		return (rv);
 
 	/* View the kernel copy as an array of ints, two per set word. */
 	p = (int *)kb->__bits;
 	/* Loop through swapping words.
 	 * `size' is in bytes, we need bits. */
 	for (int i = 0; i < __bitset_words(size * 8); i++) {
 		int tmp = p[0];
 		p[0] = p[1];
 		p[1] = tmp;
 		p += 2;
 	}
 	return (0);
 #else
 	return (copyin(u, k, size));
 #endif
 }
 
 /*
  * Copy a bit set of `size' bytes out to a 32-bit process.
  *
  * On little-endian builds this is a plain copyout().  On big-endian builds
  * each set word is written as two 32-bit stores with the halves exchanged,
  * the inverse of what copyin32_set() does on the way in.
  *
  * Returns 0 on success, EFAULT if a store to user memory fails on the
  * big-endian path, or the error from copyout() otherwise.
  */
 static int
 copyout32_set(const void *k, void *u, size_t size)
 {
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
 	const struct bitset *kb = k;
 	struct bitset *ub = u;
 	const int *kp = (const int *)kb->__bits;
 	int *up = (int *)ub->__bits;
 	int rv;
 
 	/*
 	 * One pass over the set, two ints per word.  Both cursors must
 	 * advance each iteration; otherwise only the first word would
 	 * ever be written.
 	 * `size' is in bytes, we need bits.
 	 */
 	for (int i = 0; i < __bitset_words(size * 8); i++) {
 		rv = suword32(up, kp[1]);
 		if (rv == 0)
 			rv = suword32(up + 1, kp[0]);
 		if (rv != 0)
 			return (EFAULT);
 		kp += 2;
 		up += 2;
 	}
 	return (0);
 #else
 	return (copyout(k, u, size));
 #endif
 }
 
 /*
  * Copy callbacks handed to the cpuset/domainset routines below so that
  * masks are transferred with copyin32_set()/copyout32_set().
  */
 static const struct cpuset_copy_cb cpuset_copy32_cb = {
 	.cpuset_copyin = copyin32_set,
 	.cpuset_copyout = copyout32_set
 };
 
 int
 freebsd32_cpuset_getaffinity(struct thread *td,
     struct freebsd32_cpuset_getaffinity_args *uap)
 {
 
 	return (user_cpuset_getaffinity(td, uap->level, uap->which,
 	    PAIR32TO64(id_t,uap->id), uap->cpusetsize, uap->mask,
 	    &cpuset_copy32_cb));
 }
 
 int
 freebsd32_cpuset_setaffinity(struct thread *td,
     struct freebsd32_cpuset_setaffinity_args *uap)
 {
 
 	return (user_cpuset_setaffinity(td, uap->level, uap->which,
 	    PAIR32TO64(id_t,uap->id), uap->cpusetsize, uap->mask,
 	    &cpuset_copy32_cb));
 }
 
 int
 freebsd32_cpuset_getdomain(struct thread *td,
     struct freebsd32_cpuset_getdomain_args *uap)
 {
 
 	return (kern_cpuset_getdomain(td, uap->level, uap->which,
 	    PAIR32TO64(id_t,uap->id), uap->domainsetsize, uap->mask, uap->policy,
 	    &cpuset_copy32_cb));
 }
 
 int
 freebsd32_cpuset_setdomain(struct thread *td,
     struct freebsd32_cpuset_setdomain_args *uap)
 {
 
 	return (kern_cpuset_setdomain(td, uap->level, uap->which,
 	    PAIR32TO64(id_t,uap->id), uap->domainsetsize, uap->mask, uap->policy,
 	    &cpuset_copy32_cb));
 }
 
 /*
  * 32-bit compat nmount(): validate the option vector count, copy the
  * 32-bit iovec array in as a uio and hand it to vfs_donmount().
  *
  * Returns EINVAL when iovcnt is odd or smaller than 4, the error from
  * freebsd32_copyinuio(), or the result of vfs_donmount().
  */
 int
 freebsd32_nmount(struct thread *td,
     struct freebsd32_nmount_args /* {
     	struct iovec *iovp;
     	unsigned int iovcnt;
     	int flags;
     } */ *uap)
 {
 	struct uio *auio;
 	uint64_t flags;
 	int error;
 
 	/*
 	 * Mount flags are now 64-bits. On 32-bit architectures only
 	 * 32-bits are passed in, but from here on everything handles
 	 * 64-bit flags correctly.
 	 */
 	flags = uap->flags;
 
 	AUDIT_ARG_FFLAGS(flags);
 
 	/*
 	 * Filter out MNT_ROOTFS.  We do not want clients of nmount() in
 	 * userspace to set this flag, but we must filter it out if we want
 	 * MNT_UPDATE on the root file system to work.
 	 * MNT_ROOTFS should only be set by the kernel when mounting its
 	 * root file system.
 	 */
 	flags &= ~MNT_ROOTFS;
 
 	/*
 	 * check that we have an even number of iovec's
 	 * and that we have at least two options.
 	 */
 	if ((uap->iovcnt & 1) || (uap->iovcnt < 4))
 		return (EINVAL);
 
 	error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = vfs_donmount(td, flags, auio);
 
 	/* The uio is released whether or not the mount succeeded. */
-	free(auio, M_IOV);
+	freeuio(auio);
 	return error;
 }
 
 /*
  * Disabled skeleton (never compiled) showing the usual shape of a 32-bit
  * wrapper: copy the 32-bit argument in, translate it, call the native
  * kern_*() routine, then translate and copy the result back out.
  */
 #if 0
 int
 freebsd32_xxx(struct thread *td, struct freebsd32_xxx_args *uap)
 {
 	struct yyy32 *p32, s32;
 	struct yyy *p = NULL, s;
 	struct xxx_arg ap;
 	int error;
 
 	if (uap->zzz) {
 		error = copyin(uap->zzz, &s32, sizeof(s32));
 		if (error)
 			return (error);
 		/* translate in */
 		p = &s;
 	}
 	error = kern_xxx(td, p);
 	if (error)
 		return (error);
 	if (uap->zzz) {
 		/* translate out */
 		error = copyout(&s32, p32, sizeof(s32));
 	}
 	return (error);
 }
 #endif
 
 /*
  * Module event handler for 32-bit syscall modules: forwards to
  * kern_syscall_module_handler() using the freebsd32_sysent table.
  */
 int
 syscall32_module_handler(struct module *mod, int what, void *arg)
 {
 
 	return (kern_syscall_module_handler(freebsd32_sysent, mod, what, arg));
 }
 
 /*
  * Register the syscalls described by `sd' in the freebsd32_sysent table
  * by forwarding to kern_syscall_helper_register().
  */
 int
 syscall32_helper_register(struct syscall_helper_data *sd, int flags)
 {
 
 	return (kern_syscall_helper_register(freebsd32_sysent, sd, flags));
 }
 
 /*
  * Unregister the syscalls described by `sd' from the freebsd32_sysent
  * table by forwarding to kern_syscall_helper_unregister().
  */
 int
 syscall32_helper_unregister(struct syscall_helper_data *sd)
 {
 
 	return (kern_syscall_helper_unregister(freebsd32_sysent, sd));
 }
 
 /*
  * Build the top of the initial user stack for a 32-bit image.
  *
  * Working downward from the ps_strings address, this copies out (in
  * order) the signal code when PROC_HAS_SHP() is false, the executable
  * path when both execpath and auxargs are set, a random stack-protector
  * canary, a 32-bit copy of the pagesizes[] array and the argument and
  * environment strings.  It then reserves room for the ELF auxargs array
  * (when present) and writes the argv[]/envp[] pointer vectors, recording
  * each location in `imgp' and in the user-space ps_strings structure.
  *
  * On success *stack_base holds the address of the start of the pointer
  * vector and 0 is returned.  Otherwise the error from copyout() or
  * sv_copyout_auxargs() is returned, or EFAULT when a suword32() store
  * fails.
  */
 int
 freebsd32_copyout_strings(struct image_params *imgp, uintptr_t *stack_base)
 {
 	struct sysentvec *sysent;
 	int argc, envc, i;
 	uint32_t *vectp;
 	char *stringp;
 	uintptr_t destp, ustringp;
 	struct freebsd32_ps_strings *arginfo;
 	char canary[sizeof(long) * 8];
 	int32_t pagesizes32[MAXPAGESIZES];
 	size_t execpath_len;
 	int error, szsigcode;
 
 	sysent = imgp->sysent;
 
 	/* Everything below is placed at decreasing addresses from here. */
 	arginfo = (struct freebsd32_ps_strings *)PROC_PS_STRINGS(imgp->proc);
 	imgp->ps_strings = arginfo;
 	destp =	(uintptr_t)arginfo;
 
 	/*
 	 * Install sigcode.
 	 */
 	if (!PROC_HAS_SHP(imgp->proc)) {
 		szsigcode = *sysent->sv_szsigcode;
 		destp -= szsigcode;
 		destp = rounddown2(destp, sizeof(uint32_t));
 		error = copyout(sysent->sv_sigcode, (void *)destp,
 		    szsigcode);
 		if (error != 0)
 			return (error);
 	}
 
 	/*
 	 * Copy the image path for the rtld.
 	 */
 	if (imgp->execpath != NULL && imgp->auxargs != NULL) {
 		/* The length includes the terminating NUL. */
 		execpath_len = strlen(imgp->execpath) + 1;
 		destp -= execpath_len;
 		imgp->execpathp = (void *)destp;
 		error = copyout(imgp->execpath, imgp->execpathp, execpath_len);
 		if (error != 0)
 			return (error);
 	}
 
 	/*
 	 * Prepare the canary for SSP.
 	 */
 	arc4rand(canary, sizeof(canary), 0);
 	destp -= sizeof(canary);
 	imgp->canary = (void *)destp;
 	error = copyout(canary, imgp->canary, sizeof(canary));
 	if (error != 0)
 		return (error);
 	imgp->canarylen = sizeof(canary);
 
 	/*
 	 * Prepare the pagesizes array.
 	 */
 	/* Each entry is narrowed to 32 bits for the 32-bit image. */
 	for (i = 0; i < MAXPAGESIZES; i++)
 		pagesizes32[i] = (uint32_t)pagesizes[i];
 	destp -= sizeof(pagesizes32);
 	destp = rounddown2(destp, sizeof(uint32_t));
 	imgp->pagesizes = (void *)destp;
 	error = copyout(pagesizes32, imgp->pagesizes, sizeof(pagesizes32));
 	if (error != 0)
 		return (error);
 	imgp->pagesizeslen = sizeof(pagesizes32);
 
 	/*
 	 * Allocate room for the argument and environment strings.
 	 */
 	/* NOTE(review): presumably ARG_MAX - stringspace is the bytes in use -- confirm. */
 	destp -= ARG_MAX - imgp->args->stringspace;
 	destp = rounddown2(destp, sizeof(uint32_t));
 	ustringp = destp;
 
 	if (imgp->auxargs) {
 		/*
 		 * Allocate room on the stack for the ELF auxargs
 		 * array.  It has up to AT_COUNT entries.
 		 */
 		destp -= AT_COUNT * sizeof(Elf32_Auxinfo);
 		destp = rounddown2(destp, sizeof(uint32_t));
 	}
 
 	vectp = (uint32_t *)destp;
 
 	/*
 	 * Allocate room for the argv[] and env vectors including the
 	 * terminating NULL pointers.
 	 */
 	vectp -= imgp->args->argc + 1 + imgp->args->envc + 1;
 
 	/*
 	 * vectp also becomes our initial stack base
 	 */
 	*stack_base = (uintptr_t)vectp;
 
 	stringp = imgp->args->begin_argv;
 	argc = imgp->args->argc;
 	envc = imgp->args->envc;
 	/*
 	 * Copy out strings - arguments and environment.
 	 */
 	error = copyout(stringp, (void *)ustringp,
 	    ARG_MAX - imgp->args->stringspace);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Fill in "ps_strings" struct for ps, w, etc.
 	 */
 	imgp->argv = vectp;
 	if (suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp) != 0 ||
 	    suword32(&arginfo->ps_nargvstr, argc) != 0)
 		return (EFAULT);
 
 	/*
 	 * Fill in argument portion of vector table.
 	 */
 	for (; argc > 0; --argc) {
 		if (suword32(vectp++, ustringp) != 0)
 			return (EFAULT);
 		/*
 		 * Walk the kernel copy of the string to its NUL while
 		 * advancing the user address in step, then skip the NUL.
 		 */
 		while (*stringp++ != 0)
 			ustringp++;
 		ustringp++;
 	}
 
 	/* a null vector table pointer separates the argp's from the envp's */
 	if (suword32(vectp++, 0) != 0)
 		return (EFAULT);
 
 	imgp->envv = vectp;
 	if (suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp) != 0 ||
 	    suword32(&arginfo->ps_nenvstr, envc) != 0)
 		return (EFAULT);
 
 	/*
 	 * Fill in environment portion of vector table.
 	 */
 	for (; envc > 0; --envc) {
 		if (suword32(vectp++, ustringp) != 0)
 			return (EFAULT);
 		while (*stringp++ != 0)
 			ustringp++;
 		ustringp++;
 	}
 
 	/* end of vector table is a null pointer */
 	if (suword32(vectp, 0) != 0)
 		return (EFAULT);
 
 	if (imgp->auxargs) {
 		/* The auxargs go immediately after the terminating NULL. */
 		vectp++;
 		error = imgp->sysent->sv_copyout_auxargs(imgp,
 		    (uintptr_t)vectp);
 		if (error != 0)
 			return (error);
 	}
 
 	return (0);
 }
 
 int
 freebsd32_kldstat(struct thread *td, struct freebsd32_kldstat_args *uap)
 {
 	struct kld_file_stat *stat;
 	struct kld_file_stat32 *stat32;
 	int error, version;
 
 	if ((error = copyin(&uap->stat->version, &version, sizeof(version)))
 	    != 0)
 		return (error);
 	if (version != sizeof(struct kld_file_stat_1_32) &&
 	    version != sizeof(struct kld_file_stat32))
 		return (EINVAL);
 
 	stat = malloc(sizeof(*stat), M_TEMP, M_WAITOK | M_ZERO);
 	stat32 = malloc(sizeof(*stat32), M_TEMP, M_WAITOK | M_ZERO);
 	error = kern_kldstat(td, uap->fileid, stat);
 	if (error == 0) {
 		bcopy(&stat->name[0], &stat32->name[0], sizeof(stat->name));
 		CP(*stat, *stat32, refs);
 		CP(*stat, *stat32, id);
 		PTROUT_CP(*stat, *stat32, address);
 		CP(*stat, *stat32, size);
 		bcopy(&stat->pathname[0], &stat32->pathname[0],
 		    sizeof(stat->pathname));
 		stat32->version  = version;
 		error = copyout(stat32, uap->stat, version);
 	}
 	free(stat, M_TEMP);
 	free(stat32, M_TEMP);
 	return (error);
 }
 
 int
 freebsd32_posix_fallocate(struct thread *td,
     struct freebsd32_posix_fallocate_args *uap)
 {
 	int error;
 
 	error = kern_posix_fallocate(td, uap->fd,
 	    PAIR32TO64(off_t, uap->offset), PAIR32TO64(off_t, uap->len));
 	return (kern_posix_error(td, error));
 }
 
 int
 freebsd32_posix_fadvise(struct thread *td,
     struct freebsd32_posix_fadvise_args *uap)
 {
 	int error;
 
 	error = kern_posix_fadvise(td, uap->fd, PAIR32TO64(off_t, uap->offset),
 	    PAIR32TO64(off_t, uap->len), uap->advice);
 	return (kern_posix_error(td, error));
 }
 
 /*
  * Convert a 32-bit sigevent into a native struct sigevent.
  *
  * Only the members relevant to the notification type are copied.
  * Returns 0 on success or EINVAL for an unrecognised sigev_notify value.
  */
 int
 convert_sigevent32(struct sigevent32 *sig32, struct sigevent *sig)
 {
 
 	CP(*sig32, *sig, sigev_notify);
 
 	switch (sig->sigev_notify) {
 	case SIGEV_NONE:
 		/* Nothing beyond the notification type is needed. */
 		return (0);
 
 	case SIGEV_THREAD_ID:
 		/* Thread id first, then the same members as SIGEV_SIGNAL. */
 		CP(*sig32, *sig, sigev_notify_thread_id);
 		CP(*sig32, *sig, sigev_signo);
 		PTRIN_CP(*sig32, *sig, sigev_value.sival_ptr);
 		return (0);
 
 	case SIGEV_SIGNAL:
 		CP(*sig32, *sig, sigev_signo);
 		PTRIN_CP(*sig32, *sig, sigev_value.sival_ptr);
 		return (0);
 
 	case SIGEV_KEVENT:
 		CP(*sig32, *sig, sigev_notify_kqueue);
 		CP(*sig32, *sig, sigev_notify_kevent_flags);
 		PTRIN_CP(*sig32, *sig, sigev_value.sival_ptr);
 		return (0);
 
 	default:
 		return (EINVAL);
 	}
 }
 
 /*
  * 32-bit compat procctl().
  *
  * Commands at or above PROC_PROCCTL_MD_MIN are forwarded to cpu_procctl().
  * For the rest, the first switch prepares `data' for kern_procctl(),
  * copying the argument in from userland where the command takes input,
  * and the second switch copies results back out where the command
  * produces output.  Unknown commands return EINVAL.
  */
 int
 freebsd32_procctl(struct thread *td, struct freebsd32_procctl_args *uap)
 {
 	void *data;
 	/* Native argument storage; one member is used per command. */
 	union {
 		struct procctl_reaper_status rs;
 		struct procctl_reaper_pids rp;
 		struct procctl_reaper_kill rk;
 	} x;
 	/* 32-bit layouts that need conversion before use. */
 	union {
 		struct procctl_reaper_pids32 rp;
 	} x32;
 	int error, error1, flags, signum;
 
 	if (uap->com >= PROC_PROCCTL_MD_MIN)
 		return (cpu_procctl(td, uap->idtype, PAIR32TO64(id_t, uap->id),
 		    uap->com, PTRIN(uap->data)));
 
 	switch (uap->com) {
 	case PROC_ASLR_CTL:
 	case PROC_PROTMAX_CTL:
 	case PROC_SPROTECT:
 	case PROC_STACKGAP_CTL:
 	case PROC_TRACE_CTL:
 	case PROC_TRAPCAP_CTL:
 	case PROC_NO_NEW_PRIVS_CTL:
 	case PROC_WXMAP_CTL:
 		/* These commands take an int read from userland. */
 		error = copyin(PTRIN(uap->data), &flags, sizeof(flags));
 		if (error != 0)
 			return (error);
 		data = &flags;
 		break;
 	case PROC_REAP_ACQUIRE:
 	case PROC_REAP_RELEASE:
 		/* No argument is accepted for these two. */
 		if (uap->data != NULL)
 			return (EINVAL);
 		data = NULL;
 		break;
 	case PROC_REAP_STATUS:
 		/* Output only; copied out after the call. */
 		data = &x.rs;
 		break;
 	case PROC_REAP_GETPIDS:
 		/* Convert the 32-bit request, widening the rp_pids pointer. */
 		error = copyin(uap->data, &x32.rp, sizeof(x32.rp));
 		if (error != 0)
 			return (error);
 		CP(x32.rp, x.rp, rp_count);
 		PTRIN_CP(x32.rp, x.rp, rp_pids);
 		data = &x.rp;
 		break;
 	case PROC_REAP_KILL:
 		/* Copied in without conversion. */
 		error = copyin(uap->data, &x.rk, sizeof(x.rk));
 		if (error != 0)
 			return (error);
 		data = &x.rk;
 		break;
 	case PROC_ASLR_STATUS:
 	case PROC_PROTMAX_STATUS:
 	case PROC_STACKGAP_STATUS:
 	case PROC_TRACE_STATUS:
 	case PROC_TRAPCAP_STATUS:
 	case PROC_NO_NEW_PRIVS_STATUS:
 	case PROC_WXMAP_STATUS:
 		/* Output only; the int is copied out after the call. */
 		data = &flags;
 		break;
 	case PROC_PDEATHSIG_CTL:
 		error = copyin(uap->data, &signum, sizeof(signum));
 		if (error != 0)
 			return (error);
 		data = &signum;
 		break;
 	case PROC_PDEATHSIG_STATUS:
 		data = &signum;
 		break;
 	default:
 		return (EINVAL);
 	}
 	error = kern_procctl(td, uap->idtype, PAIR32TO64(id_t, uap->id),
 	    uap->com, data);
 	switch (uap->com) {
 	case PROC_REAP_STATUS:
 		if (error == 0)
 			error = copyout(&x.rs, uap->data, sizeof(x.rs));
 		break;
 	case PROC_REAP_KILL:
 		/*
 		 * The structure is copied out even when the call failed;
 		 * the copyout error is reported only if the call itself
 		 * succeeded.
 		 */
 		error1 = copyout(&x.rk, uap->data, sizeof(x.rk));
 		if (error == 0)
 			error = error1;
 		break;
 	case PROC_ASLR_STATUS:
 	case PROC_PROTMAX_STATUS:
 	case PROC_STACKGAP_STATUS:
 	case PROC_TRACE_STATUS:
 	case PROC_TRAPCAP_STATUS:
 	case PROC_NO_NEW_PRIVS_STATUS:
 	case PROC_WXMAP_STATUS:
 		if (error == 0)
 			error = copyout(&flags, uap->data, sizeof(flags));
 		break;
 	case PROC_PDEATHSIG_STATUS:
 		if (error == 0)
 			error = copyout(&signum, uap->data, sizeof(signum));
 		break;
 	}
 	return (error);
 }
 
 /*
  * 32-bit compat fcntl(): widen the 32-bit argument to a long and forward
  * to kern_fcntl_freebsd().  For commands whose argument is a set of flags
  * or a pointer the value is converted as unsigned; for every other
  * command it is assigned as-is.
  */
 int
 freebsd32_fcntl(struct thread *td, struct freebsd32_fcntl_args *uap)
 {
 	long arg;
 
 	/* Default conversion; overridden below for flag/pointer commands. */
 	arg = uap->arg;
 
 	switch (uap->cmd) {
 	case F_SETLK_REMOTE:
 	case F_SETLKW:
 	case F_SETLK:
 	case F_GETLK:
 	case F_SETFD:
 	case F_SETFL:
 	case F_OGETLK:
 	case F_OSETLK:
 	case F_OSETLKW:
 	case F_KINFO:
 		arg = (unsigned int)(uap->arg);
 		break;
 	default:
 		break;
 	}
 
 	return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, arg));
 }
 
 int
 freebsd32_ppoll(struct thread *td, struct freebsd32_ppoll_args *uap)
 {
 	struct timespec32 ts32;
 	struct timespec ts, *tsp;
 	sigset_t set, *ssp;
 	int error;
 
 	if (uap->ts != NULL) {
 		error = copyin(uap->ts, &ts32, sizeof(ts32));
 		if (error != 0)
 			return (error);
 		CP(ts32, ts, tv_sec);
 		CP(ts32, ts, tv_nsec);
 		tsp = &ts;
 	} else
 		tsp = NULL;
 	if (uap->set != NULL) {
 		error = copyin(uap->set, &set, sizeof(set));
 		if (error != 0)
 			return (error);
 		ssp = &set;
 	} else
 		ssp = NULL;
 
 	return (kern_poll(td, uap->fds, uap->nfds, tsp, ssp));
 }
 
 int
 freebsd32_sched_rr_get_interval(struct thread *td,
     struct freebsd32_sched_rr_get_interval_args *uap)
 {
 	struct timespec ts;
 	struct timespec32 ts32;
 	int error;
 
 	error = kern_sched_rr_get_interval(td, uap->pid, &ts);
 	if (error == 0) {
 		CP(ts, ts32, tv_sec);
 		CP(ts, ts32, tv_nsec);
 		error = copyout(&ts32, uap->interval, sizeof(ts32));
 	}
 	return (error);
 }
 
 /*
  * Copy a native struct timex into its 32-bit counterpart, one member at
  * a time.  Only the members listed here are written to *dst.
  */
 static void
 timex_to_32(struct timex32 *dst, struct timex *src)
 {
 	CP(*src, *dst, modes);
 	CP(*src, *dst, offset);
 	CP(*src, *dst, freq);
 	CP(*src, *dst, maxerror);
 	CP(*src, *dst, esterror);
 	CP(*src, *dst, status);
 	CP(*src, *dst, constant);
 	CP(*src, *dst, precision);
 	CP(*src, *dst, tolerance);
 	CP(*src, *dst, ppsfreq);
 	CP(*src, *dst, jitter);
 	CP(*src, *dst, shift);
 	CP(*src, *dst, stabil);
 	CP(*src, *dst, jitcnt);
 	CP(*src, *dst, calcnt);
 	CP(*src, *dst, errcnt);
 	CP(*src, *dst, stbcnt);
 }
 
 /*
  * Copy a 32-bit struct timex32 into a native struct timex, one member at
  * a time.  Only the members listed here are written to *dst.
  */
 static void
 timex_from_32(struct timex *dst, struct timex32 *src)
 {
 	CP(*src, *dst, modes);
 	CP(*src, *dst, offset);
 	CP(*src, *dst, freq);
 	CP(*src, *dst, maxerror);
 	CP(*src, *dst, esterror);
 	CP(*src, *dst, status);
 	CP(*src, *dst, constant);
 	CP(*src, *dst, precision);
 	CP(*src, *dst, tolerance);
 	CP(*src, *dst, ppsfreq);
 	CP(*src, *dst, jitter);
 	CP(*src, *dst, shift);
 	CP(*src, *dst, stabil);
 	CP(*src, *dst, jitcnt);
 	CP(*src, *dst, calcnt);
 	CP(*src, *dst, errcnt);
 	CP(*src, *dst, stbcnt);
 }
 
 /*
  * 32-bit compat ntp_adjtime(): read the 32-bit timex from uap->tp, run
  * kern_ntp_adjtime() on the native form, and write the updated structure
  * back to uap->tp.  The value produced by kern_ntp_adjtime() is returned
  * through td_retval[0] only when the copyout succeeds.
  */
 int
 freebsd32_ntp_adjtime(struct thread *td, struct freebsd32_ntp_adjtime_args *uap)
 {
 	struct timex32 tx32;
 	struct timex tx;
 	int rv, state;
 
 	rv = copyin(uap->tp, &tx32, sizeof(tx32));
 	if (rv != 0)
 		return (rv);
 
 	timex_from_32(&tx, &tx32);
 	rv = kern_ntp_adjtime(td, &tx, &state);
 	if (rv != 0)
 		return (rv);
 
 	timex_to_32(&tx32, &tx);
 	rv = copyout(&tx32, uap->tp, sizeof(tx32));
 	if (rv == 0)
 		td->td_retval[0] = state;
 	return (rv);
 }
 
 #ifdef FFCLOCK
 extern struct mtx ffclock_mtx;
 extern struct ffclock_estimate ffclock_estimate;
 extern int8_t ffclock_updated;
 
 /*
  * 32-bit compat ffclock_setestimate(): after a PRIV_CLOCK_SETTIME check,
  * read the caller's struct ffclock_estimate32, convert it to the native
  * layout and install it as the global ffclock_estimate under ffclock_mtx.
  *
  * Returns the error from priv_check() or copyin(), otherwise 0.
  */
 int
 freebsd32_ffclock_setestimate(struct thread *td,
     struct freebsd32_ffclock_setestimate_args *uap)
 {
 	struct ffclock_estimate cest;
 	struct ffclock_estimate32 cest32;
 	int error;
 
 	/* Reuse of PRIV_CLOCK_SETTIME. */
 	if ((error = priv_check(td, PRIV_CLOCK_SETTIME)) != 0)
 		return (error);
 
 	if ((error = copyin(uap->cest, &cest32,
 	    sizeof(struct ffclock_estimate32))) != 0)
 		return (error);
 
 	/*
 	 * CP() takes (source, destination, member).  The data just read
 	 * from userland lives in cest32, so it must be the source here;
 	 * with the arguments the other way round the uninitialised cest
 	 * would overwrite it and then be installed below.
 	 */
 	CP(cest32.update_time, cest.update_time, sec);
 	memcpy(&cest.update_time.frac, &cest32.update_time.frac, sizeof(uint64_t));
 	CP(cest32, cest, update_ffcount);
 	CP(cest32, cest, leapsec_next);
 	CP(cest32, cest, period);
 	CP(cest32, cest, errb_abs);
 	CP(cest32, cest, errb_rate);
 	CP(cest32, cest, status);
 	CP(cest32, cest, leapsec_total);
 	CP(cest32, cest, leapsec);
 
 	mtx_lock(&ffclock_mtx);
 	memcpy(&ffclock_estimate, &cest, sizeof(struct ffclock_estimate));
 	ffclock_updated++;
 	mtx_unlock(&ffclock_mtx);
 	return (error);
 }
 
 /*
  * 32-bit compat ffclock_getestimate(): snapshot the global
  * ffclock_estimate under ffclock_mtx, convert it to the 32-bit layout and
  * copy it out to uap->cest.
  *
  * Returns the result of copyout().
  */
 int
 freebsd32_ffclock_getestimate(struct thread *td,
     struct freebsd32_ffclock_getestimate_args *uap)
 {
 	struct ffclock_estimate cest;
 	struct ffclock_estimate32 cest32;
 	int error;
 
 	mtx_lock(&ffclock_mtx);
 	memcpy(&cest, &ffclock_estimate, sizeof(struct ffclock_estimate));
 	mtx_unlock(&ffclock_mtx);
 
 	/*
 	 * CP() takes (source, destination, member).  The snapshot lives
 	 * in cest, so it must be the source here; with the arguments the
 	 * other way round the uninitialised cest32 would be copied out to
 	 * userland instead of the estimate.
 	 */
 	CP(cest.update_time, cest32.update_time, sec);
 	memcpy(&cest32.update_time.frac, &cest.update_time.frac, sizeof(uint64_t));
 	CP(cest, cest32, update_ffcount);
 	CP(cest, cest32, leapsec_next);
 	CP(cest, cest32, period);
 	CP(cest, cest32, errb_abs);
 	CP(cest, cest32, errb_rate);
 	CP(cest, cest32, status);
 	CP(cest, cest32, leapsec_total);
 	CP(cest, cest32, leapsec);
 
 	error = copyout(&cest32, uap->cest, sizeof(struct ffclock_estimate32));
 	return (error);
 }
 #else /* !FFCLOCK */
 /*
  * Stub used when the kernel is built without FFCLOCK: always fails with
  * ENOSYS.
  */
 int
 freebsd32_ffclock_setestimate(struct thread *td,
     struct freebsd32_ffclock_setestimate_args *uap)
 {
 	return (ENOSYS);
 }
 
 /*
  * Stub used when the kernel is built without FFCLOCK: always fails with
  * ENOSYS.
  */
 int
 freebsd32_ffclock_getestimate(struct thread *td,
     struct freebsd32_ffclock_getestimate_args *uap)
 {
 	return (ENOSYS);
 }
 #endif /* FFCLOCK */
 
 #ifdef COMPAT_43
 /*
  * Old (COMPAT_43) sethostid() for 32-bit processes: widen the host id to
  * a long and set it through the kern.hostid sysctl.
  */
 int
 ofreebsd32_sethostid(struct thread *td, struct ofreebsd32_sethostid_args *uap)
 {
 	int mib[] = { CTL_KERN, KERN_HOSTID };
 	long newid = uap->hostid;
 
 	return (kernel_sysctl(td, mib, nitems(mib), NULL, NULL, &newid,
 	    sizeof(newid), NULL, 0));
 }
 #endif
diff --git a/sys/compat/linux/linux_file.c b/sys/compat/linux/linux_file.c
index 6eab8980a5c5..1279490ae8be 100644
--- a/sys/compat/linux/linux_file.c
+++ b/sys/compat/linux/linux_file.c
@@ -1,1877 +1,1877 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 1994-1995 Søren Schmidt
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/dirent.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/lock.h>
 #include <sys/mman.h>
 #include <sys/selinfo.h>
 #include <sys/pipe.h>
 #include <sys/proc.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysproto.h>
 #include <sys/tty.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 
 #ifdef COMPAT_LINUX32
 #include <compat/freebsd32/freebsd32_misc.h>
 #include <compat/freebsd32/freebsd32_util.h>
 #include <machine/../linux32/linux.h>
 #include <machine/../linux32/linux32_proto.h>
 #else
 #include <machine/../linux/linux.h>
 #include <machine/../linux/linux_proto.h>
 #endif
 #include <compat/linux/linux_misc.h>
 #include <compat/linux/linux_util.h>
 #include <compat/linux/linux_file.h>
 
 static int	linux_common_open(struct thread *, int, const char *, int, int,
 		    enum uio_seg);
 static int	linux_do_accessat(struct thread *t, int, const char *, int, int);
 static int	linux_getdents_error(struct thread *, int, int);
 
 /*
  * Translation table for file-seal bits.  Each entry is built with
  * BITMAP_1t1_LINUX(), pairing a native F_SEAL_* flag with its Linux
  * counterpart.
  */
 static struct bsd_to_linux_bitmap seal_bitmap[] = {
 	BITMAP_1t1_LINUX(F_SEAL_SEAL),
 	BITMAP_1t1_LINUX(F_SEAL_SHRINK),
 	BITMAP_1t1_LINUX(F_SEAL_GROW),
 	BITMAP_1t1_LINUX(F_SEAL_WRITE),
 };
 
 /*
  * Build one mfd_bitmap entry mapping the native MFD_HUGE_<size> value to
  * the Linux LINUX_HUGETLB_FLAG_ENCODE_<size> value.  The macro is local to
  * the table below and is undefined right after it.
  */
 #define	MFD_HUGETLB_ENTRY(_size)					\
 	{								\
 		.bsd_value = MFD_HUGE_##_size,				\
 		.linux_value = LINUX_HUGETLB_FLAG_ENCODE_##_size	\
 	}
 /* Translation table for memfd_create() flags, including huge page sizes. */
 static struct bsd_to_linux_bitmap mfd_bitmap[] = {
 	BITMAP_1t1_LINUX(MFD_CLOEXEC),
 	BITMAP_1t1_LINUX(MFD_ALLOW_SEALING),
 	BITMAP_1t1_LINUX(MFD_HUGETLB),
 	MFD_HUGETLB_ENTRY(64KB),
 	MFD_HUGETLB_ENTRY(512KB),
 	MFD_HUGETLB_ENTRY(1MB),
 	MFD_HUGETLB_ENTRY(2MB),
 	MFD_HUGETLB_ENTRY(8MB),
 	MFD_HUGETLB_ENTRY(16MB),
 	MFD_HUGETLB_ENTRY(32MB),
 	MFD_HUGETLB_ENTRY(256MB),
 	MFD_HUGETLB_ENTRY(512MB),
 	MFD_HUGETLB_ENTRY(1GB),
 	MFD_HUGETLB_ENTRY(2GB),
 	MFD_HUGETLB_ENTRY(16GB),
 };
 #undef MFD_HUGETLB_ENTRY
 
 #ifdef LINUX_LEGACY_SYSCALLS
 /*
  * creat(2): open args->path relative to the current directory for writing,
  * creating it if needed and truncating it.
  */
 int
 linux_creat(struct thread *td, struct linux_creat_args *args)
 {
 	const int oflags = O_WRONLY | O_CREAT | O_TRUNC;
 
 	return (kern_openat(td, AT_FDCWD, args->path, UIO_USERSPACE, oflags,
 	    args->mode));
 }
 #endif
 
 static int
 linux_common_openflags(int l_flags)
 {
 	int bsd_flags;
 
 	bsd_flags = 0;
 	switch (l_flags & LINUX_O_ACCMODE) {
 	case LINUX_O_WRONLY:
 		bsd_flags |= O_WRONLY;
 		break;
 	case LINUX_O_RDWR:
 		bsd_flags |= O_RDWR;
 		break;
 	default:
 		bsd_flags |= O_RDONLY;
 	}
 	if (l_flags & LINUX_O_NDELAY)
 		bsd_flags |= O_NONBLOCK;
 	if (l_flags & LINUX_O_APPEND)
 		bsd_flags |= O_APPEND;
 	if (l_flags & LINUX_O_SYNC)
 		bsd_flags |= O_FSYNC;
 	if (l_flags & LINUX_O_CLOEXEC)
 		bsd_flags |= O_CLOEXEC;
 	if (l_flags & LINUX_O_NONBLOCK)
 		bsd_flags |= O_NONBLOCK;
 	if (l_flags & LINUX_O_ASYNC)
 		bsd_flags |= O_ASYNC;
 	if (l_flags & LINUX_O_CREAT)
 		bsd_flags |= O_CREAT;
 	if (l_flags & LINUX_O_TRUNC)
 		bsd_flags |= O_TRUNC;
 	if (l_flags & LINUX_O_EXCL)
 		bsd_flags |= O_EXCL;
 	if (l_flags & LINUX_O_NOCTTY)
 		bsd_flags |= O_NOCTTY;
 	if (l_flags & LINUX_O_DIRECT)
 		bsd_flags |= O_DIRECT;
 	if (l_flags & LINUX_O_NOFOLLOW)
 		bsd_flags |= O_NOFOLLOW;
 	if (l_flags & LINUX_O_DIRECTORY)
 		bsd_flags |= O_DIRECTORY;
 	if (l_flags & LINUX_O_PATH)
 		bsd_flags |= O_PATH;
 	/* XXX LINUX_O_NOATIME: unable to be easily implemented. */
 	return (bsd_flags);
 }
 
 /*
  * Common backend for the Linux open(2)/openat(2) family.
  *
  * Translates the Linux flags, opens the file with kern_openat() and then,
  * unless O_NOCTTY was requested or the process already has P_CONTROLT set,
  * attempts a TIOCSCTTY ioctl on the new descriptor if the process is a
  * session leader.  The result of that ioctl is deliberately ignored.
  *
  * Returns 0 or an errno value; EMLINK from kern_openat() is reported as
  * ELOOP.
  */
 static int
 linux_common_open(struct thread *td, int dirfd, const char *path, int l_flags,
     int mode, enum uio_seg seg)
 {
 	struct proc *p = td->td_proc;
 	struct file *fp;
 	int fd;
 	int bsd_flags, error;
 
 	bsd_flags = linux_common_openflags(l_flags);
 	error = kern_openat(td, dirfd, path, seg, bsd_flags, mode);
 	if (error != 0) {
 		/* Map the native EMLINK onto ELOOP for the Linux caller. */
 		if (error == EMLINK)
 			error = ELOOP;
 		goto done;
 	}
 	/* Unlocked fast-path check; repeated under PROC_LOCK below. */
 	if (p->p_flag & P_CONTROLT)
 		goto done;
 	if (bsd_flags & O_NOCTTY)
 		goto done;
 
 	/*
 	 * XXX In between kern_openat() and fget(), another process
 	 * having the same filedesc could use that fd without
 	 * checking below.
 	 */
 	fd = td->td_retval[0];
 	if (fget(td, fd, &cap_ioctl_rights, &fp) == 0) {
 		/* Only vnode-backed descriptors are considered. */
 		if (fp->f_type != DTYPE_VNODE) {
 			fdrop(fp, td);
 			goto done;
 		}
 		sx_slock(&proctree_lock);
 		PROC_LOCK(p);
 		if (SESS_LEADER(p) && !(p->p_flag & P_CONTROLT)) {
 			/* Both locks are released before issuing the ioctl. */
 			PROC_UNLOCK(p);
 			sx_sunlock(&proctree_lock);
 			/* XXXPJD: Verify if TIOCSCTTY is allowed. */
 			(void) fo_ioctl(fp, TIOCSCTTY, (caddr_t) 0,
 			    td->td_ucred, td);
 		} else {
 			PROC_UNLOCK(p);
 			sx_sunlock(&proctree_lock);
 		}
 		fdrop(fp, td);
 	}
 
 done:
 	return (error);
 }
 
 /*
  * openat(2): map LINUX_AT_FDCWD to the native AT_FDCWD and hand off to
  * linux_common_open() with a userspace path.
  */
 int
 linux_openat(struct thread *td, struct linux_openat_args *args)
 {
 	int dirfd;
 
 	dirfd = args->dfd;
 	if (dirfd == LINUX_AT_FDCWD)
 		dirfd = AT_FDCWD;
 	return (linux_common_open(td, dirfd, args->filename, args->flags,
 	    args->mode, UIO_USERSPACE));
 }
 
 #ifdef LINUX_LEGACY_SYSCALLS
 int
 linux_open(struct thread *td, struct linux_open_args *args)
 {
 
 	return (linux_common_open(td, AT_FDCWD, args->path, args->flags,
 	    args->mode, UIO_USERSPACE));
 }
 #endif
 
 /*
  * name_to_handle_at(2): obtain a file handle for a path.
  *
  * Only LINUX_AT_SYMLINK_FOLLOW and LINUX_AT_EMPTY_PATH are accepted in
  * args->flags.  The native handle is fetched with kern_getfhat(); a mount
  * id derived from the handle's fsid is always written to args->mnt_id
  * first.  If the caller's handle_bytes is smaller than sizeof(fhandle_t)
  * the required size is written back and EOVERFLOW is returned; otherwise
  * handle_type is set to 0 and the raw fhandle_t is copied out.
  */
 int
 linux_name_to_handle_at(struct thread *td,
     struct linux_name_to_handle_at_args *args)
 {
 	static const l_int valid_flags = (LINUX_AT_SYMLINK_FOLLOW |
 	    LINUX_AT_EMPTY_PATH);
 	static const l_uint fh_size = sizeof(fhandle_t);
 
 	fhandle_t fh;
 	l_uint fh_bytes;
 	l_int mount_id;
 	int error, fd, bsd_flags;
 
 	if (args->flags & ~valid_flags)
 		return (EINVAL);
 
 	fd = args->dirfd;
 	if (fd == LINUX_AT_FDCWD)
 		fd = AT_FDCWD;
 
 	/* Symlinks are not followed unless the caller asks for it. */
 	bsd_flags = 0;
 	if (!(args->flags & LINUX_AT_SYMLINK_FOLLOW))
 		bsd_flags |= AT_SYMLINK_NOFOLLOW;
 	if ((args->flags & LINUX_AT_EMPTY_PATH) != 0)
 		bsd_flags |= AT_EMPTY_PATH;
 
 	error = kern_getfhat(td, bsd_flags, fd, args->name,
 	    UIO_USERSPACE, &fh, UIO_SYSSPACE);
 	if (error != 0)
 		return (error);
 
 	/* Emit mount_id -- required before EOVERFLOW case. */
 	mount_id = (fh.fh_fsid.val[0] ^ fh.fh_fsid.val[1]);
 	error = copyout(&mount_id, args->mnt_id, sizeof(mount_id));
 	if (error != 0)
 		return (error);
 
 	/* Check if there is room for handle. */
 	error = copyin(&args->handle->handle_bytes, &fh_bytes,
 	    sizeof(fh_bytes));
 	if (error != 0)
 		return (error);
 
 	if (fh_bytes < fh_size) {
 		/* Tell the caller how much space is needed. */
 		error = copyout(&fh_size, &args->handle->handle_bytes,
 		    sizeof(fh_size));
 		if (error == 0)
 			error = EOVERFLOW;
 		return (error);
 	}
 
 	/* Emit handle. */
 	/* mount_id is reused here purely as a zero-valued l_int source. */
 	mount_id = 0;
 	/*
 	 * We don't use handle_type for anything yet, but initialize a known
 	 * value.
 	 */
 	error = copyout(&mount_id, &args->handle->handle_type,
 	    sizeof(mount_id));
 	if (error != 0)
 		return (error);
 
 	error = copyout(&fh, &args->handle->f_handle,
 	    sizeof(fh));
 	return (error);
 }
 
 int
 linux_open_by_handle_at(struct thread *td,
     struct linux_open_by_handle_at_args *args)
 {
 	l_uint fh_bytes;
 	int bsd_flags, error;
 
 	error = copyin(&args->handle->handle_bytes, &fh_bytes,
 	    sizeof(fh_bytes));
 	if (error != 0)
 		return (error);
 
 	if (fh_bytes < sizeof(fhandle_t))
 		return (EINVAL);
 
 	bsd_flags = linux_common_openflags(args->flags);
 	return (kern_fhopen(td, (void *)&args->handle->f_handle, bsd_flags));
 }
 
 int
 linux_lseek(struct thread *td, struct linux_lseek_args *args)
 {
 
 	return (kern_lseek(td, args->fdes, args->off, args->whence));
 }
 
 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
 int
 linux_llseek(struct thread *td, struct linux_llseek_args *args)
 {
 	int error;
 	off_t off;
 
 	off = (args->olow) | (((off_t) args->ohigh) << 32);
 
 	error = kern_lseek(td, args->fd, off, args->whence);
 	if (error != 0)
 		return (error);
 
 	error = copyout(td->td_retval, args->res, sizeof(off_t));
 	if (error != 0)
 		return (error);
 
 	td->td_retval[0] = 0;
 	return (0);
 }
 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
 
 /*
  * Note that linux_getdents(2) and linux_getdents64(2) have the same
  * arguments. They only differ in the definition of struct dirent they
  * operate on.
  * Note that linux_readdir(2) is a special case of linux_getdents(2)
  * where count is always equal to 1, meaning that the buffer is one
  * dirent-structure in size and that the code can't handle more anyway.
  * Note that linux_readdir(2) can't be implemented by means of
  * linux_getdents(2): when the *dent buffer size is equal to 1,
  * linux_getdents(2) would trash the user stack.
  */
 
 /*
  * Pick the errno to report after a failed directory read on fd.
  *
  * Linux returns ENOTDIR when fd is not a directory, so the vnode type is
  * checked here: a failure to look up the vnode is returned as is, a
  * non-directory yields ENOTDIR, and otherwise the caller's original error
  * (err) is passed through.
  */
 static int
 linux_getdents_error(struct thread *td, int fd, int err)
 {
 	struct file *fp;
 	int error, isdir;
 
 	error = getvnode(td, fd, &cap_read_rights, &fp);
 	if (error != 0)
 		return (error);
 
 	isdir = (fp->f_vnode->v_type == VDIR);
 	fdrop(fp, td);
 
 	return (isdir ? err : ENOTDIR);
 }
 
 /* Directory record layout used by linux_getdents() and linux_readdir(). */
 struct l_dirent {
 	l_ulong		d_ino;		/* filled from the native d_fileno */
 	l_off_t		d_off;		/* filled from the native d_off */
 	l_ushort	d_reclen;	/* length value reported to the caller */
 	char		d_name[LINUX_NAME_MAX + 1];	/* NUL-terminated name */
 };
 
 /* Directory record layout used by linux_getdents64(). */
 struct l_dirent64 {
 	uint64_t	d_ino;		/* filled from the native d_fileno */
 	int64_t		d_off;		/* filled from the native d_off */
 	l_ushort	d_reclen;	/* length of this record in bytes */
 	u_char		d_type;		/* filled from the native d_type */
 	char		d_name[LINUX_NAME_MAX + 1];	/* NUL-terminated name */
 };
 
 /*
  * Linux uses the last byte in the dirent buffer to store d_type,
  * at least glibc-2.7 requires it. That is why l_dirent is padded with 2 bytes
  * (one for the name's terminating NUL, one for d_type).
  *
  * LINUX_RECLEN(namlen) is the size of a struct l_dirent record holding a
  * name of namlen bytes, rounded up to a multiple of sizeof(l_ulong).
  */
 #define LINUX_RECLEN(namlen)						\
     roundup(offsetof(struct l_dirent, d_name) + (namlen) + 2, sizeof(l_ulong))
 
 /*
  * LINUX_RECLEN64(namlen) is the size of a struct l_dirent64 record holding
  * a name of namlen bytes plus one extra byte, rounded up to a multiple of
  * sizeof(uint64_t).
  */
 #define LINUX_RECLEN64(namlen)						\
     roundup(offsetof(struct l_dirent64, d_name) + (namlen) + 1,		\
     sizeof(uint64_t))
 
 #ifdef LINUX_LEGACY_SYSCALLS
 /*
  * getdents(2): read native directory entries and convert each one to a
  * struct l_dirent in the caller's buffer.
  *
  * At most min(args->count, MAXBSIZE) bytes of native entries are read into
  * a kernel buffer.  Each entry is rebuilt in a scratch record and copied
  * out; the total number of bytes written is returned in td_retval[0].
  * EINVAL is returned as soon as the remaining user buffer cannot hold the
  * next converted record.
  */
 int
 linux_getdents(struct thread *td, struct linux_getdents_args *args)
 {
 	struct dirent *bdp;
 	caddr_t inp, buf;		/* BSD-format */
 	int len, reclen;		/* BSD-format */
 	caddr_t outp;			/* Linux-format */
 	int resid, linuxreclen;		/* Linux-format */
 	caddr_t lbuf;			/* Linux-format */
 	off_t base;
 	struct l_dirent *linux_dirent;
 	int buflen, error;
 	size_t retval;
 
 	buflen = min(args->count, MAXBSIZE);
 	buf = malloc(buflen, M_LINUX, M_WAITOK);
 
 	error = kern_getdirentries(td, args->fd, buf, buflen,
 	    &base, NULL, UIO_SYSSPACE);
 	if (error != 0) {
 		/* Possibly replace the error with ENOTDIR. */
 		error = linux_getdents_error(td, args->fd, error);
 		goto out1;
 	}
 
 	/* Scratch record large enough for the longest possible name. */
 	lbuf = malloc(LINUX_RECLEN(LINUX_NAME_MAX), M_LINUX, M_WAITOK | M_ZERO);
 
 	/* Number of bytes of native entries now held in buf. */
 	len = td->td_retval[0];
 	inp = buf;
 	outp = (caddr_t)args->dent;
 	resid = args->count;
 	retval = 0;
 
 	while (len > 0) {
 		bdp = (struct dirent *) inp;
 		reclen = bdp->d_reclen;
 		linuxreclen = LINUX_RECLEN(bdp->d_namlen);
 		/*
 		 * No more space in the user supplied dirent buffer.
 		 * Return EINVAL.
 		 */
 		if (resid < linuxreclen) {
 			error = EINVAL;
 			goto out;
 		}
 
 		linux_dirent = (struct l_dirent*)lbuf;
 		linux_dirent->d_ino = bdp->d_fileno;
 		linux_dirent->d_off = bdp->d_off;
 		linux_dirent->d_reclen = linuxreclen;
 		/*
 		 * Copy d_type to last byte of l_dirent buffer
 		 */
 		lbuf[linuxreclen - 1] = bdp->d_type;
 		/* The size bound keeps the name clear of the d_type byte. */
 		strlcpy(linux_dirent->d_name, bdp->d_name,
 		    linuxreclen - offsetof(struct l_dirent, d_name)-1);
 		error = copyout(linux_dirent, outp, linuxreclen);
 		if (error != 0)
 			goto out;
 
 		/* Advance past the consumed native entry... */
 		inp += reclen;
 		base += reclen;
 		len -= reclen;
 
 		/* ...and past the record just written to the user buffer. */
 		retval += linuxreclen;
 		outp += linuxreclen;
 		resid -= linuxreclen;
 	}
 	td->td_retval[0] = retval;
 
 out:
 	free(lbuf, M_LINUX);
 out1:
 	free(buf, M_LINUX);
 	return (error);
 }
 #endif
 
 /*
  * getdents64(2): read native directory entries and convert each one to a
  * struct l_dirent64 in the caller's buffer.
  *
  * At most min(args->count, MAXBSIZE) bytes of native entries are read into
  * a kernel buffer.  Each entry is rebuilt in a scratch record and copied
  * out; the total number of bytes written is returned in td_retval[0].
  * EINVAL is returned as soon as the remaining user buffer cannot hold the
  * next converted record.
  */
 int
 linux_getdents64(struct thread *td, struct linux_getdents64_args *args)
 {
 	struct dirent *bdp;
 	caddr_t inp, buf;		/* BSD-format */
 	int len, reclen;		/* BSD-format */
 	caddr_t outp;			/* Linux-format */
 	int resid, linuxreclen;		/* Linux-format */
 	off_t base;
 	struct l_dirent64 *linux_dirent64;
 	int buflen, error;
 	size_t retval;
 
 	buflen = min(args->count, MAXBSIZE);
 	buf = malloc(buflen, M_LINUX, M_WAITOK);
 
 	error = kern_getdirentries(td, args->fd, buf, buflen,
 	    &base, NULL, UIO_SYSSPACE);
 	if (error != 0) {
 		/* Possibly replace the error with ENOTDIR. */
 		error = linux_getdents_error(td, args->fd, error);
 		goto out1;
 	}
 
 	/* Scratch record large enough for the longest possible name. */
 	linux_dirent64 = malloc(LINUX_RECLEN64(LINUX_NAME_MAX), M_LINUX,
 	    M_WAITOK | M_ZERO);
 
 	/* Number of bytes of native entries now held in buf. */
 	len = td->td_retval[0];
 	inp = buf;
 	outp = (caddr_t)args->dirent;
 	resid = args->count;
 	retval = 0;
 
 	while (len > 0) {
 		bdp = (struct dirent *) inp;
 		reclen = bdp->d_reclen;
 		linuxreclen = LINUX_RECLEN64(bdp->d_namlen);
 		/*
 		 * No more space in the user supplied dirent buffer.
 		 * Return EINVAL.
 		 */
 		if (resid < linuxreclen) {
 			error = EINVAL;
 			goto out;
 		}
 
 		linux_dirent64->d_ino = bdp->d_fileno;
 		linux_dirent64->d_off = bdp->d_off;
 		linux_dirent64->d_reclen = linuxreclen;
 		linux_dirent64->d_type = bdp->d_type;
 		strlcpy(linux_dirent64->d_name, bdp->d_name,
 		    linuxreclen - offsetof(struct l_dirent64, d_name));
 		error = copyout(linux_dirent64, outp, linuxreclen);
 		if (error != 0)
 			goto out;
 
 		/* Advance past the consumed native entry... */
 		inp += reclen;
 		base += reclen;
 		len -= reclen;
 
 		/* ...and past the record just written to the user buffer. */
 		retval += linuxreclen;
 		outp += linuxreclen;
 		resid -= linuxreclen;
 	}
 	td->td_retval[0] = retval;
 
 out:
 	free(linux_dirent64, M_LINUX);
 out1:
 	free(buf, M_LINUX);
 	return (error);
 }
 
 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
 /*
  * readdir(2) (old single-entry interface): read one native directory entry
  * and return it to the caller as a struct l_dirent.
  *
  * The native read uses a buffer of exactly sizeof(struct dirent).  When
  * nothing was read (td_retval[0] == 0) the call returns with that value
  * untouched; otherwise td_retval[0] is set to the size of the record copied
  * out to args->dent.
  */
 int
 linux_readdir(struct thread *td, struct linux_readdir_args *args)
 {
 	struct dirent *bdp;
 	caddr_t buf;			/* BSD-format */
 	int linuxreclen;		/* Linux-format */
 	off_t base;
 	struct l_dirent *linux_dirent;	/* Linux-format */
 	int buflen, error;
 
 	buflen = sizeof(*bdp);
 	buf = malloc(buflen, M_LINUX, M_WAITOK);
 
 	error = kern_getdirentries(td, args->fd, buf, buflen,
 	    &base, NULL, UIO_SYSSPACE);
 	if (error != 0) {
 		/* Possibly replace the error with ENOTDIR. */
 		error = linux_getdents_error(td, args->fd, error);
 		goto out;
 	}
 	/* Nothing was read: return with error == 0 and retval 0. */
 	if (td->td_retval[0] == 0)
 		goto out;
 
 	linux_dirent = malloc(LINUX_RECLEN(LINUX_NAME_MAX), M_LINUX,
 	    M_WAITOK | M_ZERO);
 
 	/* Only the first entry in the buffer is converted. */
 	bdp = (struct dirent *) buf;
 	linuxreclen = LINUX_RECLEN(bdp->d_namlen);
 
 	linux_dirent->d_ino = bdp->d_fileno;
 	linux_dirent->d_off = bdp->d_off;
 	/*
 	 * NOTE(review): d_reclen carries the name length here rather than
 	 * the record length -- presumably to match the old Linux readdir
 	 * ABI; confirm.
 	 */
 	linux_dirent->d_reclen = bdp->d_namlen;
 	strlcpy(linux_dirent->d_name, bdp->d_name,
 	    linuxreclen - offsetof(struct l_dirent, d_name));
 	error = copyout(linux_dirent, args->dent, linuxreclen);
 	if (error == 0)
 		td->td_retval[0] = linuxreclen;
 
 	free(linux_dirent, M_LINUX);
 out:
 	free(buf, M_LINUX);
 	return (error);
 }
 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
 
 /*
  * These exist mainly for hooks for doing /compat/linux translation.
  */
 
 #ifdef LINUX_LEGACY_SYSCALLS
 /*
  * access(2): reject any mode bit outside F_OK/X_OK/W_OK/R_OK with EINVAL
  * (Linux convention), then check args->path relative to the current
  * directory.
  */
 int
 linux_access(struct thread *td, struct linux_access_args *args)
 {
 	const int valid_modes = F_OK | X_OK | W_OK | R_OK;
 
 	if ((args->amode & ~valid_modes) != 0)
 		return (EINVAL);
 
 	return (kern_accessat(td, AT_FDCWD, args->path, UIO_USERSPACE, 0,
 	    args->amode));
 }
 #endif
 
 /*
  * Shared implementation of faccessat(2) and faccessat2(2).
  *
  * Rejects mode bits outside F_OK/X_OK/W_OK/R_OK with EINVAL (Linux
  * convention), maps LINUX_AT_FDCWD to AT_FDCWD and calls kern_accessat()
  * with the already-translated native flags.
  */
 static int
 linux_do_accessat(struct thread *td, int ldfd, const char *filename,
     int amode, int flags)
 {
 	const int valid_modes = F_OK | X_OK | W_OK | R_OK;
 	int dirfd;
 
 	if ((amode & ~valid_modes) != 0)
 		return (EINVAL);
 
 	dirfd = ldfd;
 	if (dirfd == LINUX_AT_FDCWD)
 		dirfd = AT_FDCWD;
 	return (kern_accessat(td, dirfd, filename, UIO_USERSPACE, flags,
 	    amode));
 }
 
 /* faccessat(2): access check with no additional flags. */
 int
 linux_faccessat(struct thread *td, struct linux_faccessat_args *args)
 {
 	const int no_flags = 0;
 
 	return (linux_do_accessat(td, args->dfd, args->filename, args->amode,
 	    no_flags));
 }
 
 /*
  * faccessat2(2): access check with flags.
  *
  * Only LINUX_AT_EACCESS and LINUX_AT_EMPTY_PATH are translated; any other
  * flag is logged and rejected with EINVAL.
  */
 int
 linux_faccessat2(struct thread *td, struct linux_faccessat2_args *args)
 {
 	int bsd_flags, unknown;
 
 	/* XXX. AT_SYMLINK_NOFOLLOW is not supported by kern_accessat */
 	unknown = args->flags & ~(LINUX_AT_EACCESS | LINUX_AT_EMPTY_PATH);
 	if (unknown != 0) {
 		linux_msg(td, "faccessat2 unsupported flag 0x%x", unknown);
 		return (EINVAL);
 	}
 
 	bsd_flags = 0;
 	if ((args->flags & LINUX_AT_EACCESS) != 0)
 		bsd_flags |= AT_EACCESS;
 	if ((args->flags & LINUX_AT_EMPTY_PATH) != 0)
 		bsd_flags |= AT_EMPTY_PATH;
 
 	return (linux_do_accessat(td, args->dfd, args->filename, args->amode,
 	    bsd_flags));
 }
 
 
 #ifdef LINUX_LEGACY_SYSCALLS
 int
 linux_unlink(struct thread *td, struct linux_unlink_args *args)
 {
 	int error;
 	struct stat st;
 
 	error = kern_funlinkat(td, AT_FDCWD, args->path, FD_NONE,
 	    UIO_USERSPACE, 0, 0);
 	if (error == EPERM) {
 		/* Introduce POSIX noncompliant behaviour of Linux */
 		if (kern_statat(td, 0, AT_FDCWD, args->path,
 		    UIO_USERSPACE, &st) == 0) {
 			if (S_ISDIR(st.st_mode))
 				error = EISDIR;
 		}
 	}
 
 	return (error);
 }
 #endif
 
 static int
 linux_unlinkat_impl(struct thread *td, enum uio_seg pathseg, const char *path,
     int dfd, struct linux_unlinkat_args *args)
 {
 	struct stat st;
 	int error;
 
 	if (args->flag & LINUX_AT_REMOVEDIR)
 		error = kern_frmdirat(td, dfd, path, FD_NONE, pathseg, 0);
 	else
 		error = kern_funlinkat(td, dfd, path, FD_NONE, pathseg, 0, 0);
 	if (error == EPERM && !(args->flag & LINUX_AT_REMOVEDIR)) {
 		/* Introduce POSIX noncompliant behaviour of Linux */
 		if (kern_statat(td, AT_SYMLINK_NOFOLLOW, dfd, path,
 		    pathseg, &st) == 0 && S_ISDIR(st.st_mode))
 			error = EISDIR;
 	}
 	return (error);
 }
 
 /*
  * unlinkat(2): only LINUX_AT_REMOVEDIR is accepted in args->flag; anything
  * else yields EINVAL.  LINUX_AT_FDCWD is mapped to AT_FDCWD.
  */
 int
 linux_unlinkat(struct thread *td, struct linux_unlinkat_args *args)
 {
 	int dirfd;
 
 	if ((args->flag & ~LINUX_AT_REMOVEDIR) != 0)
 		return (EINVAL);
 
 	dirfd = args->dfd;
 	if (dirfd == LINUX_AT_FDCWD)
 		dirfd = AT_FDCWD;
 	return (linux_unlinkat_impl(td, UIO_USERSPACE, args->pathname,
 	    dirfd, args));
 }
 
 int
 linux_chdir(struct thread *td, struct linux_chdir_args *args)
 {
 
 	return (kern_chdir(td, args->path, UIO_USERSPACE));
 }
 
 #ifdef LINUX_LEGACY_SYSCALLS
 int
 linux_chmod(struct thread *td, struct linux_chmod_args *args)
 {
 
 	return (kern_fchmodat(td, AT_FDCWD, args->path, UIO_USERSPACE,
 	    args->mode, 0));
 }
 #endif
 
 /*
  * fchmodat(2): map LINUX_AT_FDCWD to AT_FDCWD and change the mode of
  * args->filename; no flags are passed to kern_fchmodat().
  */
 int
 linux_fchmodat(struct thread *td, struct linux_fchmodat_args *args)
 {
 	int dirfd;
 
 	dirfd = args->dfd;
 	if (dirfd == LINUX_AT_FDCWD)
 		dirfd = AT_FDCWD;
 	return (kern_fchmodat(td, dirfd, args->filename, UIO_USERSPACE,
 	    args->mode, 0));
 }
 
 #ifdef LINUX_LEGACY_SYSCALLS
 int
 linux_mkdir(struct thread *td, struct linux_mkdir_args *args)
 {
 
 	return (kern_mkdirat(td, AT_FDCWD, args->path, UIO_USERSPACE, args->mode));
 }
 #endif
 
 /* mkdirat(2): map LINUX_AT_FDCWD to AT_FDCWD and create args->pathname. */
 int
 linux_mkdirat(struct thread *td, struct linux_mkdirat_args *args)
 {
 	int dirfd;
 
 	dirfd = args->dfd;
 	if (dirfd == LINUX_AT_FDCWD)
 		dirfd = AT_FDCWD;
 	return (kern_mkdirat(td, dirfd, args->pathname, UIO_USERSPACE,
 	    args->mode));
 }
 
 #ifdef LINUX_LEGACY_SYSCALLS
 int
 linux_rmdir(struct thread *td, struct linux_rmdir_args *args)
 {
 
 	return (kern_frmdirat(td, AT_FDCWD, args->path, FD_NONE,
 	    UIO_USERSPACE, 0));
 }
 
 int
 linux_rename(struct thread *td, struct linux_rename_args *args)
 {
 
 	return (kern_renameat(td, AT_FDCWD, args->from, AT_FDCWD,
 	    args->to, UIO_USERSPACE));
 }
 #endif
 
 int
 linux_renameat(struct thread *td, struct linux_renameat_args *args)
 {
 	struct linux_renameat2_args renameat2_args = {
 	    .olddfd = args->olddfd,
 	    .oldname = args->oldname,
 	    .newdfd = args->newdfd,
 	    .newname = args->newname,
 	    .flags = 0
 	};
 
 	return (linux_renameat2(td, &renameat2_args));
 }
 
 /*
  * renameat2(2): rename with optional flags.
  *
  * No rename flag is currently implemented: any non-zero args->flags ends in
  * EINVAL, whether the combination is invalid or merely unsupported.  With
  * flags == 0 the LINUX_AT_FDCWD descriptors are mapped to AT_FDCWD and the
  * rename is performed by kern_renameat().
  */
 int
 linux_renameat2(struct thread *td, struct linux_renameat2_args *args)
 {
 	int olddfd, newdfd;
 
 	if (args->flags != 0) {
 		/* Unknown flag bits. */
 		if (args->flags & ~(LINUX_RENAME_EXCHANGE |
 		    LINUX_RENAME_NOREPLACE | LINUX_RENAME_WHITEOUT))
 			return (EINVAL);
 		/* EXCHANGE cannot be combined with NOREPLACE or WHITEOUT. */
 		if (args->flags & LINUX_RENAME_EXCHANGE &&
 		    args->flags & (LINUX_RENAME_NOREPLACE |
 		    LINUX_RENAME_WHITEOUT))
 			return (EINVAL);
 #if 0
 		/*
 		 * This spams the console on Ubuntu Focal.
 		 *
 		 * What's needed here is a general mechanism to let users know
 		 * about missing features without hogging the system.
 		 */
 		linux_msg(td, "renameat2 unsupported flags 0x%x",
 		    args->flags);
 #endif
 		/* Valid but unsupported flags are rejected as well. */
 		return (EINVAL);
 	}
 
 	olddfd = (args->olddfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->olddfd;
 	newdfd = (args->newdfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->newdfd;
 	return (kern_renameat(td, olddfd, args->oldname, newdfd,
 	    args->newname, UIO_USERSPACE));
 }
 
 #ifdef LINUX_LEGACY_SYSCALLS
 int
 linux_symlink(struct thread *td, struct linux_symlink_args *args)
 {
 
 	return (kern_symlinkat(td, args->path, AT_FDCWD, args->to,
 	    UIO_USERSPACE));
 }
 #endif
 
 /*
  * symlinkat(2): map LINUX_AT_FDCWD to AT_FDCWD for the new link's directory
  * and create args->newname with args->oldname as its target string.
  */
 int
 linux_symlinkat(struct thread *td, struct linux_symlinkat_args *args)
 {
 	int dirfd;
 
 	dirfd = args->newdfd;
 	if (dirfd == LINUX_AT_FDCWD)
 		dirfd = AT_FDCWD;
 	return (kern_symlinkat(td, args->oldname, dirfd, args->newname,
 	    UIO_USERSPACE));
 }
 
 #ifdef LINUX_LEGACY_SYSCALLS
 int
 linux_readlink(struct thread *td, struct linux_readlink_args *args)
 {
 
 	if (args->count <= 0)
 		return (EINVAL);
 
 	return (kern_readlinkat(td, AT_FDCWD, args->name, UIO_USERSPACE,
 	    args->buf, UIO_USERSPACE, args->count));
 }
 #endif
 
 /*
  * readlinkat(2): a non-positive buffer size is rejected with EINVAL;
  * LINUX_AT_FDCWD is mapped to AT_FDCWD before reading the link.
  */
 int
 linux_readlinkat(struct thread *td, struct linux_readlinkat_args *args)
 {
 	int dirfd;
 
 	if (args->bufsiz <= 0)
 		return (EINVAL);
 
 	dirfd = args->dfd;
 	if (dirfd == LINUX_AT_FDCWD)
 		dirfd = AT_FDCWD;
 	return (kern_readlinkat(td, dirfd, args->path, UIO_USERSPACE,
 	    args->buf, UIO_USERSPACE, args->bufsiz));
 }
 
 int
 linux_truncate(struct thread *td, struct linux_truncate_args *args)
 {
 
 	return (kern_truncate(td, args->path, UIO_USERSPACE, args->length));
 }
 
 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
 /*
  * truncate64(2): on amd64 with COMPAT_LINUX32 the length is assembled with
  * PAIR32TO64(); otherwise args->length is used directly.
  */
 int
 linux_truncate64(struct thread *td, struct linux_truncate64_args *args)
 {
 	off_t newlen;
 
 #if defined(__amd64__) && defined(COMPAT_LINUX32)
 	newlen = PAIR32TO64(off_t, args->length);
 #else
 	newlen = args->length;
 #endif
 	return (kern_truncate(td, args->path, UIO_USERSPACE, newlen));
 }
 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
 
 int
 linux_ftruncate(struct thread *td, struct linux_ftruncate_args *args)
 {
 
 	return (kern_ftruncate(td, args->fd, args->length));
 }
 
 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
 /*
  * ftruncate64(2): on amd64 with COMPAT_LINUX32 the length is assembled with
  * PAIR32TO64(); otherwise args->length is used directly.
  */
 int
 linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args)
 {
 	off_t newlen;
 
 #if defined(__amd64__) && defined(COMPAT_LINUX32)
 	newlen = PAIR32TO64(off_t, args->length);
 #else
 	newlen = args->length;
 #endif
 	return (kern_ftruncate(td, args->fd, newlen));
 }
 #endif
 
 #ifdef LINUX_LEGACY_SYSCALLS
 int
 linux_link(struct thread *td, struct linux_link_args *args)
 {
 
 	return (kern_linkat(td, AT_FDCWD, AT_FDCWD, args->path, args->to,
 	    UIO_USERSPACE, AT_SYMLINK_FOLLOW));
 }
 #endif
 
 /*
  * linkat(2): only LINUX_AT_SYMLINK_FOLLOW and LINUX_AT_EMPTY_PATH are
  * accepted in args->flag (EINVAL otherwise); both are translated to their
  * native counterparts and LINUX_AT_FDCWD is mapped to AT_FDCWD.
  */
 int
 linux_linkat(struct thread *td, struct linux_linkat_args *args)
 {
 	int bsd_flag, fromfd, tofd;
 
 	if ((args->flag & ~(LINUX_AT_SYMLINK_FOLLOW |
 	    LINUX_AT_EMPTY_PATH)) != 0)
 		return (EINVAL);
 
 	bsd_flag = 0;
 	if ((args->flag & LINUX_AT_SYMLINK_FOLLOW) != 0)
 		bsd_flag |= AT_SYMLINK_FOLLOW;
 	if ((args->flag & LINUX_AT_EMPTY_PATH) != 0)
 		bsd_flag |= AT_EMPTY_PATH;
 
 	fromfd = args->olddfd;
 	if (fromfd == LINUX_AT_FDCWD)
 		fromfd = AT_FDCWD;
 	tofd = args->newdfd;
 	if (tofd == LINUX_AT_FDCWD)
 		tofd = AT_FDCWD;
 
 	return (kern_linkat(td, fromfd, tofd, args->oldname, args->newname,
 	    UIO_USERSPACE, bsd_flag));
 }
 
 int
 linux_fdatasync(struct thread *td, struct linux_fdatasync_args *uap)
 {
 
 	return (kern_fsync(td, uap->fd, false));
 }
 
 /*
  * sync_file_range(2): the range and flags are validated (negative offset
  * or length, or unknown flag bits, yield EINVAL) but are otherwise unused;
  * the whole file is synced through kern_fsync().
  */
 int
 linux_sync_file_range(struct thread *td, struct linux_sync_file_range_args *uap)
 {
 	off_t len, start;
 
 #if defined(__amd64__) && defined(COMPAT_LINUX32)
 	len = PAIR32TO64(off_t, uap->nbytes);
 	start = PAIR32TO64(off_t, uap->offset);
 #else
 	len = uap->nbytes;
 	start = uap->offset;
 #endif
 
 	if (start < 0 || len < 0)
 		return (EINVAL);
 	if ((uap->flags & ~(LINUX_SYNC_FILE_RANGE_WAIT_BEFORE |
 	    LINUX_SYNC_FILE_RANGE_WRITE |
 	    LINUX_SYNC_FILE_RANGE_WAIT_AFTER)) != 0)
 		return (EINVAL);
 
 	return (kern_fsync(td, uap->fd, false));
 }
 
 int
 linux_pread(struct thread *td, struct linux_pread_args *uap)
 {
 	struct vnode *vp;
 	off_t offset;
 	int error;
 
 #if defined(__amd64__) && defined(COMPAT_LINUX32)
 	offset = PAIR32TO64(off_t, uap->offset);
 #else
 	offset = uap->offset;
 #endif
 
 	error = kern_pread(td, uap->fd, uap->buf, uap->nbyte, offset);
 	if (error == 0) {
 		/* This seems to violate POSIX but Linux does it. */
 		error = fgetvp(td, uap->fd, &cap_pread_rights, &vp);
 		if (error != 0)
 			return (error);
 		if (vp->v_type == VDIR)
 			error = EISDIR;
 		vrele(vp);
 	}
 	return (error);
 }
 
 int
 linux_pwrite(struct thread *td, struct linux_pwrite_args *uap)
 {
 	off_t offset;
 
 #if defined(__amd64__) && defined(COMPAT_LINUX32)
 	offset = PAIR32TO64(off_t, uap->offset);
 #else
 	offset = uap->offset;
 #endif
 
 	return (linux_enobufs2eagain(td, uap->fd,
 	    kern_pwrite(td, uap->fd, uap->buf, uap->nbyte, offset)));
 }
 
 #define HALF_LONG_BITS ((sizeof(l_long) * NBBY / 2))
 
 static inline off_t
 pos_from_hilo(unsigned long high, unsigned long low)
 {
 
 	return (((off_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
 }
 
 /*
  * preadv(2): vectored read at an explicit offset.
  *
  * The offset is built from uap->pos_h and uap->pos_l; a negative result is
  * rejected with EINVAL.  The iovec array is copied in (with the 32-bit
  * helper under COMPAT_LINUX32), used for kern_preadv() and then released.
  */
 int
 linux_preadv(struct thread *td, struct linux_preadv_args *uap)
 {
 	struct uio *auio;
 	int error;
 	off_t offset;
 
 	/*
 	 * According http://man7.org/linux/man-pages/man2/preadv.2.html#NOTES
 	 * pos_l and pos_h, respectively, contain the
 	 * low order and high order 32 bits of offset.
 	 */
 	offset = pos_from_hilo(uap->pos_h, uap->pos_l);
 	if (offset < 0)
 		return (EINVAL);
 #ifdef COMPAT_LINUX32
 	error = freebsd32_copyinuio(PTRIN(uap->vec), uap->vlen, &auio);
 #else
 	error = copyinuio(uap->vec, uap->vlen, &auio);
 #endif
 	if (error != 0)
 		return (error);
 	error = kern_preadv(td, uap->fd, auio, offset);
-	free(auio, M_IOV);
+	freeuio(auio);
 	return (error);
 }
 
 /*
  * pwritev(2): vectored write at an explicit offset.
  *
  * The offset is built from uap->pos_h and uap->pos_l; a negative result is
  * rejected with EINVAL.  The iovec array is copied in (with the 32-bit
  * helper under COMPAT_LINUX32), used for kern_pwritev() and then released.
  * The write result is passed through linux_enobufs2eagain().
  */
 int
 linux_pwritev(struct thread *td, struct linux_pwritev_args *uap)
 {
 	struct uio *auio;
 	int error;
 	off_t offset;
 
 	/*
 	 * According http://man7.org/linux/man-pages/man2/pwritev.2.html#NOTES
 	 * pos_l and pos_h, respectively, contain the
 	 * low order and high order 32 bits of offset.
 	 */
 	offset = pos_from_hilo(uap->pos_h, uap->pos_l);
 	if (offset < 0)
 		return (EINVAL);
 #ifdef COMPAT_LINUX32
 	error = freebsd32_copyinuio(PTRIN(uap->vec), uap->vlen, &auio);
 #else
 	error = copyinuio(uap->vec, uap->vlen, &auio);
 #endif
 	if (error != 0)
 		return (error);
 	error = kern_pwritev(td, uap->fd, auio, offset);
-	free(auio, M_IOV);
+	freeuio(auio);
 	return (linux_enobufs2eagain(td, uap->fd, error));
 }
 
 int
 linux_mount(struct thread *td, struct linux_mount_args *args)
 {
 	struct mntarg *ma = NULL;
 	char *fstypename, *mntonname, *mntfromname, *data;
 	int error, fsflags;
 
 	fstypename = malloc(MNAMELEN, M_TEMP, M_WAITOK);
 	mntonname = malloc(MNAMELEN, M_TEMP, M_WAITOK);
 	mntfromname = malloc(MNAMELEN, M_TEMP, M_WAITOK);
 	data = NULL;
 	error = copyinstr(args->filesystemtype, fstypename, MNAMELEN - 1,
 	    NULL);
 	if (error != 0)
 		goto out;
 	if (args->specialfile != NULL) {
 		error = copyinstr(args->specialfile, mntfromname, MNAMELEN - 1, NULL);
 		if (error != 0)
 			goto out;
 	} else {
 		mntfromname[0] = '\0';
 	}
 	error = copyinstr(args->dir, mntonname, MNAMELEN - 1, NULL);
 	if (error != 0)
 		goto out;
 
 	if (strcmp(fstypename, "ext2") == 0) {
 		strcpy(fstypename, "ext2fs");
 	} else if (strcmp(fstypename, "proc") == 0) {
 		strcpy(fstypename, "linprocfs");
 	} else if (strcmp(fstypename, "vfat") == 0) {
 		strcpy(fstypename, "msdosfs");
 	} else if (strcmp(fstypename, "fuse") == 0 ||
 	    strncmp(fstypename, "fuse.", 5) == 0) {
 		char *fuse_options, *fuse_option, *fuse_name;
 
 		strcpy(mntfromname, "/dev/fuse");
 		strcpy(fstypename, "fusefs");
 		data = malloc(MNAMELEN, M_TEMP, M_WAITOK);
 		error = copyinstr(args->data, data, MNAMELEN - 1, NULL);
 		if (error != 0)
 			goto out;
 
 		fuse_options = data;
 		while ((fuse_option = strsep(&fuse_options, ",")) != NULL) {
 			fuse_name = strsep(&fuse_option, "=");
 			if (fuse_name == NULL || fuse_option == NULL)
 				goto out;
 			ma = mount_arg(ma, fuse_name, fuse_option, -1);
 		}
 
 		/*
 		 * The FUSE server uses Linux errno values instead of FreeBSD
 		 * ones; add a flag to tell fuse(4) to do errno translation.
 		 */
 		ma = mount_arg(ma, "linux_errnos", "1", -1);
 	}
 
 	fsflags = 0;
 
 	/*
 	 * Linux SYNC flag is not included; the closest equivalent
 	 * FreeBSD has is !ASYNC, which is our default.
 	 */
 	if (args->rwflag & LINUX_MS_RDONLY)
 		fsflags |= MNT_RDONLY;
 	if (args->rwflag & LINUX_MS_NOSUID)
 		fsflags |= MNT_NOSUID;
 	if (args->rwflag & LINUX_MS_NOEXEC)
 		fsflags |= MNT_NOEXEC;
 	if (args->rwflag & LINUX_MS_REMOUNT)
 		fsflags |= MNT_UPDATE;
 
 	ma = mount_arg(ma, "fstype", fstypename, -1);
 	ma = mount_arg(ma, "fspath", mntonname, -1);
 	ma = mount_arg(ma, "from", mntfromname, -1);
 	error = kernel_mount(ma, fsflags);
 out:
 	free(fstypename, M_TEMP);
 	free(mntonname, M_TEMP);
 	free(mntfromname, M_TEMP);
 	free(data, M_TEMP);
 	return (error);
 }
 
 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
 int
 linux_oldumount(struct thread *td, struct linux_oldumount_args *args)
 {
 
 	return (kern_unmount(td, args->path, 0));
 }
 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
 
 #ifdef LINUX_LEGACY_SYSCALLS
 /*
  * umount2(2): only LINUX_MNT_FORCE is supported and is translated to
  * MNT_FORCE.  The bit is cleared from args->flags in place; if any other
  * bit remains, it is logged and EINVAL is returned.
  */
 int
 linux_umount(struct thread *td, struct linux_umount_args *args)
 {
 	int bsd_flags = 0;
 
 	if ((args->flags & LINUX_MNT_FORCE) != 0) {
 		bsd_flags |= MNT_FORCE;
 		args->flags &= ~LINUX_MNT_FORCE;
 	}
 
 	/* Whatever is left over is something we do not implement. */
 	if (args->flags != 0) {
 		linux_msg(td, "unsupported umount2 flags %#x", args->flags);
 		return (EINVAL);
 	}
 	return (kern_unmount(td, args->path, bsd_flags));
 }
 #endif
 
 /*
  * fcntl family of syscalls
  */
 
 /*
  * Linux struct flock as exchanged with userspace for record locking.  The
  * layout is packed when building the 32-bit compat layer on amd64.
  */
 struct l_flock {
 	l_short		l_type;		/* LINUX_F_RDLCK, _WRLCK or _UNLCK */
 	l_short		l_whence;	/* copied to/from the native l_whence */
 	l_off_t		l_start;	/* copied to/from the native l_start */
 	l_off_t		l_len;		/* copied to/from the native l_len */
 	l_pid_t		l_pid;		/* copied to/from the native l_pid */
 }
 #if defined(__amd64__) && defined(COMPAT_LINUX32)
 __packed
 #endif
 ;
 
 /*
  * Convert a Linux struct l_flock into a native struct flock.  An
  * unrecognized lock type is stored as -1; l_sysid is always set to 0.
  */
 static void
 linux_to_bsd_flock(struct l_flock *linux_flock, struct flock *bsd_flock)
 {
 	if (linux_flock->l_type == LINUX_F_RDLCK)
 		bsd_flock->l_type = F_RDLCK;
 	else if (linux_flock->l_type == LINUX_F_WRLCK)
 		bsd_flock->l_type = F_WRLCK;
 	else if (linux_flock->l_type == LINUX_F_UNLCK)
 		bsd_flock->l_type = F_UNLCK;
 	else
 		bsd_flock->l_type = -1;
 
 	bsd_flock->l_whence = linux_flock->l_whence;
 	bsd_flock->l_start = (off_t)linux_flock->l_start;
 	bsd_flock->l_len = (off_t)linux_flock->l_len;
 	bsd_flock->l_pid = (pid_t)linux_flock->l_pid;
 	bsd_flock->l_sysid = 0;
 }
 
 /*
  * Convert a native struct flock into a Linux struct l_flock.  For a lock
  * type other than F_RDLCK, F_WRLCK or F_UNLCK the destination l_type is
  * left untouched.
  */
 static void
 bsd_to_linux_flock(struct flock *bsd_flock, struct l_flock *linux_flock)
 {
 	if (bsd_flock->l_type == F_RDLCK)
 		linux_flock->l_type = LINUX_F_RDLCK;
 	else if (bsd_flock->l_type == F_WRLCK)
 		linux_flock->l_type = LINUX_F_WRLCK;
 	else if (bsd_flock->l_type == F_UNLCK)
 		linux_flock->l_type = LINUX_F_UNLCK;
 
 	linux_flock->l_whence = bsd_flock->l_whence;
 	linux_flock->l_start = (l_off_t)bsd_flock->l_start;
 	linux_flock->l_len = (l_off_t)bsd_flock->l_len;
 	linux_flock->l_pid = (l_pid_t)bsd_flock->l_pid;
 }
 
 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
 /*
  * Linux-layout lock record used by linux_fcntl64() for the
  * LINUX_F_GETLK64/SETLK64/SETLKW64 commands.  It differs from struct
  * l_flock in that l_start and l_len are l_loff_t.  Declared __packed only
  * for the amd64 COMPAT_LINUX32 build.
  */
 struct l_flock64 {
 	l_short		l_type;		/* LINUX_F_RDLCK/WRLCK/UNLCK */
 	l_short		l_whence;
 	l_loff_t	l_start;
 	l_loff_t	l_len;
 	l_pid_t		l_pid;
 }
 #if defined(__amd64__) && defined(COMPAT_LINUX32)
 __packed
 #endif
 ;
 
 /*
  * Fill *bsd_flock from a Linux struct l_flock64.  An unrecognized lock
  * type is stored as -1, and l_sysid is always cleared.
  */
 static void
 linux_to_bsd_flock64(struct l_flock64 *linux_flock, struct flock *bsd_flock)
 {
 	if (linux_flock->l_type == LINUX_F_RDLCK)
 		bsd_flock->l_type = F_RDLCK;
 	else if (linux_flock->l_type == LINUX_F_WRLCK)
 		bsd_flock->l_type = F_WRLCK;
 	else if (linux_flock->l_type == LINUX_F_UNLCK)
 		bsd_flock->l_type = F_UNLCK;
 	else
 		bsd_flock->l_type = -1;
 
 	/* The remaining fields are carried over with a plain cast. */
 	bsd_flock->l_sysid = 0;
 	bsd_flock->l_pid = (pid_t)linux_flock->l_pid;
 	bsd_flock->l_len = (off_t)linux_flock->l_len;
 	bsd_flock->l_start = (off_t)linux_flock->l_start;
 	bsd_flock->l_whence = linux_flock->l_whence;
 }
 
 /*
  * Fill *linux_flock (64-bit offset layout) from a native struct flock.
  * If the native lock type is none of F_RDLCK, F_WRLCK or F_UNLCK,
  * linux_flock->l_type is left as it was.
  */
 static void
 bsd_to_linux_flock64(struct flock *bsd_flock, struct l_flock64 *linux_flock)
 {
 	if (bsd_flock->l_type == F_RDLCK)
 		linux_flock->l_type = LINUX_F_RDLCK;
 	else if (bsd_flock->l_type == F_WRLCK)
 		linux_flock->l_type = LINUX_F_WRLCK;
 	else if (bsd_flock->l_type == F_UNLCK)
 		linux_flock->l_type = LINUX_F_UNLCK;
 
 	linux_flock->l_pid = (l_pid_t)bsd_flock->l_pid;
 	linux_flock->l_len = (l_loff_t)bsd_flock->l_len;
 	linux_flock->l_start = (l_loff_t)bsd_flock->l_start;
 	linux_flock->l_whence = bsd_flock->l_whence;
 }
 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
 
 /*
  * Common implementation of the Linux fcntl commands, shared by
  * linux_fcntl() and linux_fcntl64().
  *
  * Each Linux command is translated to its native counterpart and passed to
  * kern_fcntl(); file status flags, seal bits and struct l_flock records
  * are converted on the way in and out.  Results are returned through
  * td->td_retval[0] where the command has one.
  *
  * Returns 0 on success or an errno value.  Commands that are not handled
  * here are logged with linux_msg() and fail with EINVAL.
  */
 static int
 fcntl_common(struct thread *td, struct linux_fcntl_args *args)
 {
 	struct l_flock linux_flock;
 	struct flock bsd_flock;
 	struct pipe *fpipe;
 	struct file *fp;
 	long arg;
 	int error, result;
 
 	switch (args->cmd) {
 	case LINUX_F_DUPFD:
 		return (kern_fcntl(td, args->fd, F_DUPFD, args->arg));
 
 	case LINUX_F_GETFD:
 		return (kern_fcntl(td, args->fd, F_GETFD, 0));
 
 	case LINUX_F_SETFD:
 		return (kern_fcntl(td, args->fd, F_SETFD, args->arg));
 
 	case LINUX_F_GETFL:
 		error = kern_fcntl(td, args->fd, F_GETFL, 0);
 		/*
 		 * Only translate the flag word when the native call
 		 * succeeded; on failure td_retval[0] does not hold flags.
 		 */
 		if (error != 0)
 			return (error);
 		result = td->td_retval[0];
 		td->td_retval[0] = 0;
 		if (result & O_RDONLY)
 			td->td_retval[0] |= LINUX_O_RDONLY;
 		if (result & O_WRONLY)
 			td->td_retval[0] |= LINUX_O_WRONLY;
 		if (result & O_RDWR)
 			td->td_retval[0] |= LINUX_O_RDWR;
 		if (result & O_NDELAY)
 			td->td_retval[0] |= LINUX_O_NONBLOCK;
 		if (result & O_APPEND)
 			td->td_retval[0] |= LINUX_O_APPEND;
 		if (result & O_FSYNC)
 			td->td_retval[0] |= LINUX_O_SYNC;
 		if (result & O_ASYNC)
 			td->td_retval[0] |= LINUX_O_ASYNC;
 #ifdef LINUX_O_NOFOLLOW
 		if (result & O_NOFOLLOW)
 			td->td_retval[0] |= LINUX_O_NOFOLLOW;
 #endif
 #ifdef LINUX_O_DIRECT
 		if (result & O_DIRECT)
 			td->td_retval[0] |= LINUX_O_DIRECT;
 #endif
 		return (0);
 
 	case LINUX_F_SETFL:
 		/* Build the native flag word from the Linux bits we know. */
 		arg = 0;
 		if (args->arg & LINUX_O_NDELAY)
 			arg |= O_NONBLOCK;
 		if (args->arg & LINUX_O_APPEND)
 			arg |= O_APPEND;
 		if (args->arg & LINUX_O_SYNC)
 			arg |= O_FSYNC;
 		if (args->arg & LINUX_O_ASYNC)
 			arg |= O_ASYNC;
 #ifdef LINUX_O_NOFOLLOW
 		if (args->arg & LINUX_O_NOFOLLOW)
 			arg |= O_NOFOLLOW;
 #endif
 #ifdef LINUX_O_DIRECT
 		if (args->arg & LINUX_O_DIRECT)
 			arg |= O_DIRECT;
 #endif
 		return (kern_fcntl(td, args->fd, F_SETFL, arg));
 
 	case LINUX_F_GETLK:
 		/* Round trip: copy in, query natively, convert, copy out. */
 		error = copyin((void *)args->arg, &linux_flock,
 		    sizeof(linux_flock));
 		if (error)
 			return (error);
 		linux_to_bsd_flock(&linux_flock, &bsd_flock);
 		error = kern_fcntl(td, args->fd, F_GETLK, (intptr_t)&bsd_flock);
 		if (error)
 			return (error);
 		bsd_to_linux_flock(&bsd_flock, &linux_flock);
 		return (copyout(&linux_flock, (void *)args->arg,
 		    sizeof(linux_flock)));
 
 	case LINUX_F_SETLK:
 		error = copyin((void *)args->arg, &linux_flock,
 		    sizeof(linux_flock));
 		if (error)
 			return (error);
 		linux_to_bsd_flock(&linux_flock, &bsd_flock);
 		return (kern_fcntl(td, args->fd, F_SETLK,
 		    (intptr_t)&bsd_flock));
 
 	case LINUX_F_SETLKW:
 		error = copyin((void *)args->arg, &linux_flock,
 		    sizeof(linux_flock));
 		if (error)
 			return (error);
 		linux_to_bsd_flock(&linux_flock, &bsd_flock);
 		return (kern_fcntl(td, args->fd, F_SETLKW,
 		     (intptr_t)&bsd_flock));
 
 	case LINUX_F_GETOWN:
 		return (kern_fcntl(td, args->fd, F_GETOWN, 0));
 
 	case LINUX_F_SETOWN:
 		/*
 		 * XXX some Linux applications depend on F_SETOWN having no
 		 * significant effect for pipes (SIGIO is not delivered for
 		 * pipes under Linux-2.2.35 at least).
 		 */
 		error = fget(td, args->fd,
 		    &cap_fcntl_rights, &fp);
 		if (error)
 			return (error);
 		if (fp->f_type == DTYPE_PIPE) {
 			fdrop(fp, td);
 			return (EINVAL);
 		}
 		fdrop(fp, td);
 
 		return (kern_fcntl(td, args->fd, F_SETOWN, args->arg));
 
 	case LINUX_F_DUPFD_CLOEXEC:
 		return (kern_fcntl(td, args->fd, F_DUPFD_CLOEXEC, args->arg));
 	/*
 	 * Our F_SEAL_* values match Linux one for maximum compatibility.  So we
 	 * only needed to account for different values for fcntl(2) commands.
 	 */
 	case LINUX_F_GET_SEALS:
 		error = kern_fcntl(td, args->fd, F_GET_SEALS, 0);
 		if (error != 0)
 			return (error);
 		td->td_retval[0] = bsd_to_linux_bits(td->td_retval[0],
 		    seal_bitmap, 0);
 		return (0);
 
 	case LINUX_F_ADD_SEALS:
 		return (kern_fcntl(td, args->fd, F_ADD_SEALS,
 		    linux_to_bsd_bits(args->arg, seal_bitmap, 0)));
 
 	case LINUX_F_GETPIPE_SZ:
 		/* Only pipes have a buffer size to report. */
 		error = fget(td, args->fd,
 		    &cap_fcntl_rights, &fp);
 		if (error != 0)
 			return (error);
 		if (fp->f_type != DTYPE_PIPE) {
 			fdrop(fp, td);
 			return (EINVAL);
 		}
 		fpipe = fp->f_data;
 		td->td_retval[0] = fpipe->pipe_buffer.size;
 		fdrop(fp, td);
 		return (0);
 
 	default:
 		linux_msg(td, "unsupported fcntl cmd %d", args->cmd);
 		return (EINVAL);
 	}
 }
 
 /*
  * Linux fcntl entry point; all of the work is done by fcntl_common().
  */
 int
 linux_fcntl(struct thread *td, struct linux_fcntl_args *args)
 {
 	int error;
 
 	error = fcntl_common(td, args);
 	return (error);
 }
 
 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
 int
 linux_fcntl64(struct thread *td, struct linux_fcntl64_args *args)
 {
 	struct l_flock64 linux_flock;
 	struct flock bsd_flock;
 	struct linux_fcntl_args fcntl_args;
 	int error;
 
 	switch (args->cmd) {
 	case LINUX_F_GETLK64:
 		error = copyin((void *)args->arg, &linux_flock,
 		    sizeof(linux_flock));
 		if (error)
 			return (error);
 		linux_to_bsd_flock64(&linux_flock, &bsd_flock);
 		error = kern_fcntl(td, args->fd, F_GETLK, (intptr_t)&bsd_flock);
 		if (error)
 			return (error);
 		bsd_to_linux_flock64(&bsd_flock, &linux_flock);
 		return (copyout(&linux_flock, (void *)args->arg,
 			    sizeof(linux_flock)));
 
 	case LINUX_F_SETLK64:
 		error = copyin((void *)args->arg, &linux_flock,
 		    sizeof(linux_flock));
 		if (error)
 			return (error);
 		linux_to_bsd_flock64(&linux_flock, &bsd_flock);
 		return (kern_fcntl(td, args->fd, F_SETLK,
 		    (intptr_t)&bsd_flock));
 
 	case LINUX_F_SETLKW64:
 		error = copyin((void *)args->arg, &linux_flock,
 		    sizeof(linux_flock));
 		if (error)
 			return (error);
 		linux_to_bsd_flock64(&linux_flock, &bsd_flock);
 		return (kern_fcntl(td, args->fd, F_SETLKW,
 		    (intptr_t)&bsd_flock));
 	}
 
 	fcntl_args.fd = args->fd;
 	fcntl_args.cmd = args->cmd;
 	fcntl_args.arg = args->arg;
 	return (fcntl_common(td, &fcntl_args));
 }
 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
 
 #ifdef LINUX_LEGACY_SYSCALLS
 int
 linux_chown(struct thread *td, struct linux_chown_args *args)
 {
 
 	return (kern_fchownat(td, AT_FDCWD, args->path, UIO_USERSPACE,
 	    args->uid, args->gid, 0));
 }
 #endif
 
 /*
  * Linux fchownat: translate the Linux AT_* flags and directory descriptor
  * to their native values and call kern_fchownat().  Flags other than
  * LINUX_AT_SYMLINK_NOFOLLOW and LINUX_AT_EMPTY_PATH are logged and
  * rejected with EINVAL.
  */
 int
 linux_fchownat(struct thread *td, struct linux_fchownat_args *args)
 {
 	int bsd_flags, dirfd, unknown;
 
 	unknown = args->flag &
 	    ~(LINUX_AT_SYMLINK_NOFOLLOW | LINUX_AT_EMPTY_PATH);
 	if (unknown != 0) {
 		linux_msg(td, "fchownat unsupported flag 0x%x", unknown);
 		return (EINVAL);
 	}
 
 	bsd_flags = 0;
 	if ((args->flag & LINUX_AT_SYMLINK_NOFOLLOW) != 0)
 		bsd_flags |= AT_SYMLINK_NOFOLLOW;
 	if ((args->flag & LINUX_AT_EMPTY_PATH) != 0)
 		bsd_flags |= AT_EMPTY_PATH;
 
 	dirfd = args->dfd;
 	if (args->dfd == LINUX_AT_FDCWD)
 		dirfd = AT_FDCWD;
 
 	return (kern_fchownat(td, dirfd, args->filename, UIO_USERSPACE,
 	    args->uid, args->gid, bsd_flags));
 }
 
 #ifdef LINUX_LEGACY_SYSCALLS
 int
 linux_lchown(struct thread *td, struct linux_lchown_args *args)
 {
 
 	return (kern_fchownat(td, AT_FDCWD, args->path, UIO_USERSPACE, args->uid,
 	    args->gid, AT_SYMLINK_NOFOLLOW));
 }
 #endif
 
 /*
  * Map a LINUX_POSIX_FADV_* advice value to the native POSIX_FADV_* value.
  * Returns -1 for any value that is not recognized.
  */
 static int
 convert_fadvice(int advice)
 {
 	int native;
 
 	switch (advice) {
 	case LINUX_POSIX_FADV_NORMAL:
 		native = POSIX_FADV_NORMAL;
 		break;
 	case LINUX_POSIX_FADV_RANDOM:
 		native = POSIX_FADV_RANDOM;
 		break;
 	case LINUX_POSIX_FADV_SEQUENTIAL:
 		native = POSIX_FADV_SEQUENTIAL;
 		break;
 	case LINUX_POSIX_FADV_WILLNEED:
 		native = POSIX_FADV_WILLNEED;
 		break;
 	case LINUX_POSIX_FADV_DONTNEED:
 		native = POSIX_FADV_DONTNEED;
 		break;
 	case LINUX_POSIX_FADV_NOREUSE:
 		native = POSIX_FADV_NOREUSE;
 		break;
 	default:
 		native = -1;
 		break;
 	}
 	return (native);
 }
 
 /*
  * Linux fadvise64: translate the advice value and call
  * kern_posix_fadvise().  On the amd64 COMPAT_LINUX32 build the offset is
  * assembled with PAIR32TO64().  Unknown advice values fail with EINVAL.
  */
 int
 linux_fadvise64(struct thread *td, struct linux_fadvise64_args *args)
 {
 	off_t start;
 	int bsd_advice;
 
 	bsd_advice = convert_fadvice(args->advice);
 	if (bsd_advice == -1)
 		return (EINVAL);
 
 #if defined(__amd64__) && defined(COMPAT_LINUX32)
 	start = PAIR32TO64(off_t, args->offset);
 #else
 	start = args->offset;
 #endif
 
 	return (kern_posix_fadvise(td, args->fd, start, args->len,
 	    bsd_advice));
 }
 
 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
 /*
  * Linux fadvise64_64: same as linux_fadvise64() except that the length is
  * also assembled with PAIR32TO64() on the amd64 COMPAT_LINUX32 build.
  * Unknown advice values fail with EINVAL.
  */
 int
 linux_fadvise64_64(struct thread *td, struct linux_fadvise64_64_args *args)
 {
 	off_t length, start;
 	int bsd_advice;
 
 	bsd_advice = convert_fadvice(args->advice);
 	if (bsd_advice == -1)
 		return (EINVAL);
 
 #if defined(__amd64__) && defined(COMPAT_LINUX32)
 	start = PAIR32TO64(off_t, args->offset);
 	length = PAIR32TO64(off_t, args->len);
 #else
 	start = args->offset;
 	length = args->len;
 #endif
 
 	return (kern_posix_fadvise(td, args->fd, start, length, bsd_advice));
 }
 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
 
 #ifdef LINUX_LEGACY_SYSCALLS
 int
 linux_pipe(struct thread *td, struct linux_pipe_args *args)
 {
 	int fildes[2];
 	int error;
 
 	error = kern_pipe(td, fildes, 0, NULL, NULL);
 	if (error != 0)
 		return (error);
 
 	error = copyout(fildes, args->pipefds, sizeof(fildes));
 	if (error != 0) {
 		(void)kern_close(td, fildes[0]);
 		(void)kern_close(td, fildes[1]);
 	}
 
 	return (error);
 }
 #endif
 
 int
 linux_pipe2(struct thread *td, struct linux_pipe2_args *args)
 {
 	int fildes[2];
 	int error, flags;
 
 	if ((args->flags & ~(LINUX_O_NONBLOCK | LINUX_O_CLOEXEC)) != 0)
 		return (EINVAL);
 
 	flags = 0;
 	if ((args->flags & LINUX_O_NONBLOCK) != 0)
 		flags |= O_NONBLOCK;
 	if ((args->flags & LINUX_O_CLOEXEC) != 0)
 		flags |= O_CLOEXEC;
 	error = kern_pipe(td, fildes, flags, NULL, NULL);
 	if (error != 0)
 		return (error);
 
 	error = copyout(fildes, args->pipefds, sizeof(fildes));
 	if (error != 0) {
 		(void)kern_close(td, fildes[0]);
 		(void)kern_close(td, fildes[1]);
 	}
 
 	return (error);
 }
 
 /*
  * Linux dup3: duplicate oldfd onto newfd via F_DUP2FD, or
  * F_DUP2FD_CLOEXEC when LINUX_O_CLOEXEC is given.  Identical descriptors
  * and any other flag bit fail with EINVAL.
  */
 int
 linux_dup3(struct thread *td, struct linux_dup3_args *args)
 {
 	intptr_t target;
 	int bsd_cmd;
 
 	if (args->oldfd == args->newfd ||
 	    (args->flags & ~LINUX_O_CLOEXEC) != 0)
 		return (EINVAL);
 
 	bsd_cmd = (args->flags & LINUX_O_CLOEXEC) ? F_DUP2FD_CLOEXEC :
 	    F_DUP2FD;
 	target = args->newfd;
 
 	return (kern_fcntl(td, args->oldfd, bsd_cmd, target));
 }
 
 /*
  * Linux fallocate.  Only the posix_fallocate behaviour (mode 0) is
  * emulated; any other mode fails with EOPNOTSUPP.  On the amd64
  * COMPAT_LINUX32 build offset and length are assembled with PAIR32TO64().
  */
 int
 linux_fallocate(struct thread *td, struct linux_fallocate_args *args)
 {
 	off_t length, start;
 
 	if (args->mode != 0)
 		return (EOPNOTSUPP);
 
 #if defined(__amd64__) && defined(COMPAT_LINUX32)
 	start = PAIR32TO64(off_t, args->offset);
 	length = PAIR32TO64(off_t, args->len);
 #else
 	start = args->offset;
 	length = args->len;
 #endif
 
 	return (kern_posix_fallocate(td, args->fd, start, length));
 }
 
 int
 linux_copy_file_range(struct thread *td, struct linux_copy_file_range_args
     *args)
 {
 	l_loff_t inoff, outoff, *inoffp, *outoffp;
 	int error, flags;
 
 	/*
 	 * copy_file_range(2) on Linux doesn't define any flags (yet), so is
 	 * the native implementation.  Enforce it.
 	 */
 	if (args->flags != 0) {
 		linux_msg(td, "copy_file_range unsupported flags 0x%x",
 		    args->flags);
 		return (EINVAL);
 	}
 	flags = 0;
 	inoffp = outoffp = NULL;
 	if (args->off_in != NULL) {
 		error = copyin(args->off_in, &inoff, sizeof(l_loff_t));
 		if (error != 0)
 			return (error);
 		inoffp = &inoff;
 	}
 	if (args->off_out != NULL) {
 		error = copyin(args->off_out, &outoff, sizeof(l_loff_t));
 		if (error != 0)
 			return (error);
 		outoffp = &outoff;
 	}
 
 	error = kern_copy_file_range(td, args->fd_in, inoffp, args->fd_out,
 	    outoffp, args->len, flags);
 	if (error == 0 && args->off_in != NULL)
 		error = copyout(inoffp, args->off_in, sizeof(l_loff_t));
 	if (error == 0 && args->off_out != NULL)
 		error = copyout(outoffp, args->off_out, sizeof(l_loff_t));
 	return (error);
 }
 
 #define	LINUX_MEMFD_PREFIX	"memfd:"
 
 /*
  * Linux memfd_create.  The user-supplied name is prefixed with
  * LINUX_MEMFD_PREFIX and used as the name of an anonymous shared memory
  * object opened read/write with SHM_GROW_ON_WRITE.
  *
  * Returns EINVAL for an over-long name, for unknown flags, or for a huge
  * page size given without MFD_HUGETLB; returns ENOSYS for MFD_HUGETLB,
  * which is not supported.
  */
 int
 linux_memfd_create(struct thread *td, struct linux_memfd_create_args *args)
 {
 	char memfd_name[LINUX_NAME_MAX + 1];
 	int bsd_flags, error, open_flags, shm_flags;
 
 	/*
 	 * Copy the user's name straight into the stack buffer, just past
 	 * where the prefix will go.  This avoids a heap allocation, so none
 	 * of the early returns below need any cleanup.
 	 */
 	error = copyinstr(args->uname_ptr,
 	    memfd_name + sizeof(LINUX_MEMFD_PREFIX) - 1,
 	    LINUX_NAME_MAX - sizeof(LINUX_MEMFD_PREFIX) - 1, NULL);
 	if (error == ENAMETOOLONG)
 		return (EINVAL);
 	if (error != 0)
 		return (error);
 	memcpy(memfd_name, LINUX_MEMFD_PREFIX, sizeof(LINUX_MEMFD_PREFIX) - 1);
 
 	bsd_flags = linux_to_bsd_bits(args->flags, mfd_bitmap, 0);
 	if ((bsd_flags & ~(MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB |
 	    MFD_HUGE_MASK)) != 0)
 		return (EINVAL);
 	if ((bsd_flags & MFD_HUGETLB) != 0) {
 		/* We don't actually support HUGETLB. */
 		return (ENOSYS);
 	}
 	if ((bsd_flags & MFD_HUGE_MASK) != 0) {
 		/* Size specified but no HUGETLB. */
 		return (EINVAL);
 	}
 
 	open_flags = O_RDWR;
 	if ((bsd_flags & MFD_CLOEXEC) != 0)
 		open_flags |= O_CLOEXEC;
 	shm_flags = SHM_GROW_ON_WRITE;
 	if ((bsd_flags & MFD_ALLOW_SEALING) != 0)
 		shm_flags |= SHM_ALLOW_SEALING;
 
 	return (kern_shm_open2(td, SHM_ANON, open_flags, 0, shm_flags, NULL,
 	    memfd_name));
 }
 
 int
 linux_splice(struct thread *td, struct linux_splice_args *args)
 {
 
 	linux_msg(td, "syscall splice not really implemented");
 
 	/*
 	 * splice(2) is documented to return EINVAL in various circumstances;
 	 * returning it instead of ENOSYS should hint the caller to use fallback
 	 * instead.
 	 */
 	return (EINVAL);
 }
 
 /*
  * Linux close_range.  Only LINUX_CLOSE_RANGE_CLOEXEC is accepted; it is
  * mapped to CLOSE_RANGE_CLOEXEC before calling kern_close_range().
  *
  * CLOSE_RANGE_UNSHARE is deliberately rejected with EINVAL.  On Linux it
  * unshares the calling thread's descriptor table from other threads or
  * processes sharing it before closing the files.  FreeBSD has no
  * compatible mechanism, because Linux shares process resources such as
  * the descriptor table per thread while FreeBSD shares them per process.
  * The flag stays unsupported until this Linux API stabilizes.
  */
 int
 linux_close_range(struct thread *td, struct linux_close_range_args *args)
 {
 	u_int bsd_flags;
 
 	if ((args->flags & ~(LINUX_CLOSE_RANGE_CLOEXEC)) != 0 ||
 	    args->first > args->last)
 		return (EINVAL);
 
 	bsd_flags = 0;
 	if ((args->flags & LINUX_CLOSE_RANGE_CLOEXEC) != 0)
 		bsd_flags |= CLOSE_RANGE_CLOEXEC;
 
 	return (kern_close_range(td, bsd_flags, args->first, args->last));
 }
 
 int
 linux_enobufs2eagain(struct thread *td, int fd, int error)
 {
 	struct file *fp;
 
 	if (error != ENOBUFS)
 		return (error);
 	if (fget(td, fd, &cap_no_rights, &fp) != 0)
 		return (error);
 	if (fp->f_type == DTYPE_SOCKET && (fp->f_flag & FNONBLOCK) != 0)
 		error = EAGAIN;
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 linux_write(struct thread *td, struct linux_write_args *args)
 {
 	struct write_args bargs = {
 		.fd	= args->fd,
 		.buf	= args->buf,
 		.nbyte	= args->nbyte,
 	};
 
 	return (linux_enobufs2eagain(td, args->fd, sys_write(td, &bargs)));
 }
 
 /*
  * Linux writev: copy in the caller's iovec array as a struct uio (using
  * the 32-bit copy-in routine under COMPAT_LINUX32), perform the write with
  * kern_writev(), release the uio, and pass the result through
  * linux_enobufs2eagain().
  */
 int
 linux_writev(struct thread *td, struct linux_writev_args *args)
 {
 	struct uio *auio;
 	int error;
 
 #ifdef COMPAT_LINUX32
 	error = freebsd32_copyinuio(PTRIN(args->iovp), args->iovcnt, &auio);
 #else
 	error = copyinuio(args->iovp, args->iovcnt, &auio);
 #endif
 	if (error != 0)
 		return (error);
 	error = kern_writev(td, args->fd, auio);
-	free(auio, M_IOV);
+	freeuio(auio);
 	return (linux_enobufs2eagain(td, args->fd, error));
 }
diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c
index 91466c46bc62..1f775f78e581 100644
--- a/sys/kern/kern_jail.c
+++ b/sys/kern/kern_jail.c
@@ -1,5014 +1,5014 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 1999 Poul-Henning Kamp.
  * Copyright (c) 2008 Bjoern A. Zeeb.
  * Copyright (c) 2009 James Gritton.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_nfs.h"
 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
 #include <sys/sysproto.h>
 #include <sys/malloc.h>
 #include <sys/osd.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/epoch.h>
 #include <sys/taskqueue.h>
 #include <sys/fcntl.h>
 #include <sys/jail.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/racct.h>
 #include <sys/rctl.h>
 #include <sys/refcount.h>
 #include <sys/sx.h>
 #include <sys/sysent.h>
 #include <sys/namei.h>
 #include <sys/mount.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/uuid.h>
 #include <sys/vnode.h>
 
 #include <net/if.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif /* DDB */
 
 #include <security/mac/mac_framework.h>
 
 #define	PRISON0_HOSTUUID_MODULE	"hostuuid"
 
 MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
 static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures");
 
 /* Keep struct prison prison0 and some code in kern_jail_set() readable. */
 #ifdef INET
 #ifdef INET6
 #define	_PR_IP_SADDRSEL	PR_IP4_SADDRSEL|PR_IP6_SADDRSEL
 #else
 #define	_PR_IP_SADDRSEL	PR_IP4_SADDRSEL
 #endif
 #else /* !INET */
 #ifdef INET6
 #define	_PR_IP_SADDRSEL	PR_IP6_SADDRSEL
 #else
 #define	_PR_IP_SADDRSEL	0
 #endif
 #endif
 
 /*
  * prison0 describes what is "real" about the system.
  *
  * Only fields that can be given constant initializers are set here; the
  * remaining ones (cpuset, osreldate, osrelease and a preloaded hostuuid)
  * are filled in by prison0_init().  The flag word additionally carries
  * PR_VNET when the kernel is built with VIMAGE.
  */
 struct prison prison0 = {
 	.pr_id		= 0,
 	.pr_name	= "0",
 	.pr_ref		= 1,
 	.pr_uref	= 1,
 	.pr_path	= "/",
 	.pr_securelevel	= -1,
 	.pr_devfs_rsnum = 0,
 	.pr_state	= PRISON_STATE_ALIVE,
 	.pr_childmax	= JAIL_MAX,
 	.pr_hostuuid	= DEFAULT_HOSTUUID,
 	.pr_children	= LIST_HEAD_INITIALIZER(prison0.pr_children),
 #ifdef VIMAGE
 	.pr_flags	= PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
 #else
 	.pr_flags	= PR_HOST|_PR_IP_SADDRSEL,
 #endif
 	.pr_allow	= PR_ALLOW_ALL_STATIC,
 };
 /* prison0's mutex is set up by SYSINIT rather than statically. */
 MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
 
 /*
  * Describes one boolean jail parameter: the parameter name that turns the
  * flag on, the name that turns it off, and the flag bit itself.  The flag
  * is volatile because the pr_flag_allow table is read without a lock.
  */
 struct bool_flags {
 	const char	*name;		/* e.g. "persist" */
 	const char	*noname;	/* e.g. "nopersist" */
 	volatile u_int	 flag;		/* PR_* / PR_ALLOW_* bit; 0 if unused */
 };
 /*
  * Describes one "jailsys" parameter by name together with two flag masks.
  * NOTE(review): the exact meaning of `disable' and `new' is established by
  * the code that consumes pr_flag_jailsys, which is not visible here.
  */
 struct jailsys_flags {
 	const char	*name;
 	unsigned	 disable;
 	unsigned	 new;
 };
 
 /* allprison, allprison_racct and lastprid are protected by allprison_lock. */
 struct	sx allprison_lock;
 SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
 struct	prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
 LIST_HEAD(, prison_racct) allprison_racct;
 int	lastprid = 0;
 
 static int get_next_prid(struct prison **insprp);
 static int do_jail_attach(struct thread *td, struct prison *pr, int drflags);
 static void prison_complete(void *context, int pending);
 static void prison_deref(struct prison *pr, int flags);
 static void prison_deref_kill(struct prison *pr, struct prisonlist *freeprison);
 static int prison_lock_xlock(struct prison *pr, int flags);
 static void prison_cleanup(struct prison *pr);
 static void prison_free_not_last(struct prison *pr);
 static void prison_proc_free_not_last(struct prison *pr);
 static void prison_proc_relink(struct prison *opr, struct prison *npr,
     struct proc *p);
 static void prison_set_allow_locked(struct prison *pr, unsigned flag,
     int enable);
 static char *prison_path(struct prison *pr1, struct prison *pr2);
 #ifdef RACCT
 static void prison_racct_attach(struct prison *pr);
 static void prison_racct_modify(struct prison *pr);
 static void prison_racct_detach(struct prison *pr);
 #endif
 
 /* Flags for prison_deref */
 #define	PD_DEREF	0x01	/* Decrement pr_ref */
 #define	PD_DEUREF	0x02	/* Decrement pr_uref */
 #define	PD_KILL		0x04	/* Remove jail, kill processes, etc */
 #define	PD_LOCKED	0x10	/* pr_mtx is held */
 #define	PD_LIST_SLOCKED	0x20	/* allprison_lock is held shared */
 #define	PD_LIST_XLOCKED	0x40	/* allprison_lock is held exclusive */
 #define PD_OP_FLAGS	0x07	/* Operation flags */
 #define PD_LOCK_FLAGS	0x70	/* Lock status flags */
 
 /*
  * Parameter names corresponding to PR_* flag values.  Size values are for kvm
  * as we cannot figure out the size of a sparse array, or an array without a
  * terminating entry.
  */
 static struct bool_flags pr_flag_bool[] = {
 	{"persist", "nopersist", PR_PERSIST},
 #ifdef INET
 	{"ip4.saddrsel", "ip4.nosaddrsel", PR_IP4_SADDRSEL},
 #endif
 #ifdef INET6
 	{"ip6.saddrsel", "ip6.nosaddrsel", PR_IP6_SADDRSEL},
 #endif
 };
 const size_t pr_flag_bool_size = sizeof(pr_flag_bool);
 
 static struct jailsys_flags pr_flag_jailsys[] = {
 	{"host", 0, PR_HOST},
 #ifdef VIMAGE
 	{"vnet", 0, PR_VNET},
 #endif
 #ifdef INET
 	{"ip4", PR_IP4_USER, PR_IP4_USER},
 #endif
 #ifdef INET6
 	{"ip6", PR_IP6_USER, PR_IP6_USER},
 #endif
 };
 const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys);
 
 /*
  * Make this array full-size so dynamic parameters can be added.
  * It is protected by prison0.pr_mtx, but lockless reading is allowed
  * with an atomic check of the flag values.
  *
  * Slots past the static entries are zero-initialized; readers such as
  * kern_jail() stop at the first entry whose flag is 0.
  */
 static struct bool_flags pr_flag_allow[NBBY * NBPW] = {
 	{"allow.set_hostname", "allow.noset_hostname", PR_ALLOW_SET_HOSTNAME},
 	{"allow.sysvipc", "allow.nosysvipc", PR_ALLOW_SYSVIPC},
 	{"allow.raw_sockets", "allow.noraw_sockets", PR_ALLOW_RAW_SOCKETS},
 	{"allow.chflags", "allow.nochflags", PR_ALLOW_CHFLAGS},
 	{"allow.mount", "allow.nomount", PR_ALLOW_MOUNT},
 	{"allow.quotas", "allow.noquotas", PR_ALLOW_QUOTAS},
 	{"allow.socket_af", "allow.nosocket_af", PR_ALLOW_SOCKET_AF},
 	{"allow.mlock", "allow.nomlock", PR_ALLOW_MLOCK},
 	{"allow.reserved_ports", "allow.noreserved_ports",
 	 PR_ALLOW_RESERVED_PORTS},
 	{"allow.read_msgbuf", "allow.noread_msgbuf", PR_ALLOW_READ_MSGBUF},
 	{"allow.unprivileged_proc_debug", "allow.nounprivileged_proc_debug",
 	 PR_ALLOW_UNPRIV_DEBUG},
 	{"allow.suser", "allow.nosuser", PR_ALLOW_SUSER},
 #ifdef VIMAGE
 	{"allow.nfsd", "allow.nonfsd", PR_ALLOW_NFSD},
 #endif
 };
 static unsigned pr_allow_all = PR_ALLOW_ALL_STATIC;
 /* Size of the whole table in bytes, not its element count. */
 const size_t pr_flag_allow_size = sizeof(pr_flag_allow);
 
 #define	JAIL_DEFAULT_ALLOW		(PR_ALLOW_SET_HOSTNAME | \
 					 PR_ALLOW_RESERVED_PORTS | \
 					 PR_ALLOW_UNPRIV_DEBUG | \
 					 PR_ALLOW_SUSER)
 #define	JAIL_DEFAULT_ENFORCE_STATFS	2
 #define	JAIL_DEFAULT_DEVFS_RSNUM	0
 static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
 static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
 static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM;
 #if defined(INET) || defined(INET6)
 static unsigned jail_max_af_ips = 255;
 #endif
 
 /*
  * Initialize the parts of prison0 that can't be static-initialized with
  * constants.  This is called from proc0_init() after creating thread0 cpuset.
  *
  * Sets pr_cpuset, pr_osreldate and pr_osrelease, and replaces the default
  * pr_hostuuid with a preloaded one when the loader supplied a module of
  * type PRISON0_HOSTUUID_MODULE whose contents pass validate_uuid().
  */
 void
 prison0_init(void)
 {
 	uint8_t *file, *data;
 	size_t size;
 	char buf[sizeof(prison0.pr_hostuuid)];
 	bool valid;
 
 	prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset);
 	prison0.pr_osreldate = osreldate;
 	strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease));
 
 	/* If we have a preloaded hostuuid, use it. */
 	file = preload_search_by_type(PRISON0_HOSTUUID_MODULE);
 	if (file != NULL) {
 		data = preload_fetch_addr(file);
 		size = preload_fetch_size(file);
 		if (data != NULL) {
 			/*
 			 * The preloaded data may include trailing whitespace, almost
 			 * certainly a newline; skip over any whitespace or
 			 * non-printable characters to be safe.
 			 */
 			while (size > 0 && data[size - 1] <= 0x20) {
 				size--;
 			}
 
 			valid = false;
 
 			/*
 			 * Not NUL-terminated when passed from loader, but
 			 * validate_uuid requires that due to using sscanf (as
 			 * does the subsequent strlcpy, since it still reads
 			 * past the given size to return the true length);
 			 * bounce to a temporary buffer to fix.
 			 */
 			/* Too long to fit with its terminator: reject. */
 			if (size >= sizeof(buf))
 				goto done;
 
 			memcpy(buf, data, size);
 			buf[size] = '\0';
 
 			if (validate_uuid(buf, size, NULL, 0) != 0)
 				goto done;
 
 			valid = true;
 			(void)strlcpy(prison0.pr_hostuuid, buf,
 			    sizeof(prison0.pr_hostuuid));
 
 done:
 			/* The raw data is printed with an explicit length. */
 			if (bootverbose && !valid) {
 				printf("hostuuid: preload data malformed: '%.*s'\n",
 				    (int)size, data);
 			}
 		}
 	}
 	if (bootverbose)
 		printf("hostuuid: using %s\n", prison0.pr_hostuuid);
 }
 
 /*
  * struct jail_args {
  *	struct jail *jail;
  * };
  */
 int
 sys_jail(struct thread *td, struct jail_args *uap)
 {
 	uint32_t version;
 	int error;
 	struct jail j;
 
 	error = copyin(uap->jail, &version, sizeof(uint32_t));
 	if (error)
 		return (error);
 
 	switch (version) {
 	case 0:
 	{
 		struct jail_v0 j0;
 
 		/* FreeBSD single IPv4 jails. */
 		bzero(&j, sizeof(struct jail));
 		error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
 		if (error)
 			return (error);
 		j.version = j0.version;
 		j.path = j0.path;
 		j.hostname = j0.hostname;
 		j.ip4s = htonl(j0.ip_number);	/* jail_v0 is host order */
 		break;
 	}
 
 	case 1:
 		/*
 		 * Version 1 was used by multi-IPv4 jail implementations
 		 * that never made it into the official kernel.
 		 */
 		return (EINVAL);
 
 	case 2:	/* JAIL_API_VERSION */
 		/* FreeBSD multi-IPv4/IPv6,noIP jails. */
 		error = copyin(uap->jail, &j, sizeof(struct jail));
 		if (error)
 			return (error);
 		break;
 
 	default:
 		/* Sci-Fi jails are not supported, sorry. */
 		return (EINVAL);
 	}
 	return (kern_jail(td, &j));
 }
 
 /*
  * Create and attach to a jail described by an old-style struct jail.
  *
  * The structure is translated into the name/value iovec list that
  * kern_jail_set() expects: each parameter occupies two consecutive iovecs,
  * the first holding the parameter name and the second its value.  The
  * list is then passed to kern_jail_set() with JAIL_CREATE | JAIL_ATTACH.
  *
  * Strings and address arrays are copied in from user space into a single
  * M_TEMP allocation that is freed before returning.  Returns 0 or an
  * errno value; EINVAL is returned when an address count exceeds
  * jail_max_af_ips or names a family the kernel was built without.
  */
 int
 kern_jail(struct thread *td, struct jail *j)
 {
 	/*
 	 * Room for the four fixed pairs (enforce_statfs, path,
 	 * host.hostname, name), one pair per allow flag, and one pair per
 	 * configured address family.
 	 */
 	struct iovec optiov[2 * (4 + nitems(pr_flag_allow)
 #ifdef INET
 			    + 1
 #endif
 #ifdef INET6
 			    + 1
 #endif
 			    )];
 	struct uio opt;
 	char *u_path, *u_hostname, *u_name;
 	struct bool_flags *bf;
 #ifdef INET
 	uint32_t ip4s;
 	struct in_addr *u_ip4;
 #endif
 #ifdef INET6
 	struct in6_addr *u_ip6;
 #endif
 	size_t tmplen;
 	int error, enforce_statfs;
 
 	bzero(&optiov, sizeof(optiov));
 	opt.uio_iov = optiov;
 	opt.uio_iovcnt = 0;
 	opt.uio_offset = -1;
 	opt.uio_resid = -1;
 	opt.uio_segflg = UIO_SYSSPACE;
 	opt.uio_rw = UIO_READ;
 	opt.uio_td = td;
 
 	/* Set permissions for top-level jails from sysctls. */
 	if (!jailed(td->td_ucred)) {
 		/*
 		 * Emit either the "allow.x" or the "allow.nox" name for each
 		 * populated table entry.  The value iovec of each pair is
 		 * skipped and so stays zeroed from the bzero() above.
 		 */
 		for (bf = pr_flag_allow;
 		     bf < pr_flag_allow + nitems(pr_flag_allow) &&
 			atomic_load_int(&bf->flag) != 0;
 		     bf++) {
 			optiov[opt.uio_iovcnt].iov_base = __DECONST(char *,
 			    (jail_default_allow & bf->flag)
 			    ? bf->name : bf->noname);
 			optiov[opt.uio_iovcnt].iov_len =
 			    strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
 			opt.uio_iovcnt += 2;
 		}
 		optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
 		optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
 		opt.uio_iovcnt++;
 		enforce_statfs = jail_default_enforce_statfs;
 		optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
 		optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
 		opt.uio_iovcnt++;
 	}
 
 	/*
 	 * Size one buffer for path, hostname and name, followed by the
 	 * IPv4 and IPv6 address arrays.  Address counts are validated
 	 * before they are used to size the allocation.
 	 */
 	tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
 #ifdef INET
 	ip4s = (j->version == 0) ? 1 : j->ip4s;
 	if (ip4s > jail_max_af_ips)
 		return (EINVAL);
 	tmplen += ip4s * sizeof(struct in_addr);
 #else
 	if (j->ip4s > 0)
 		return (EINVAL);
 #endif
 #ifdef INET6
 	if (j->ip6s > jail_max_af_ips)
 		return (EINVAL);
 	tmplen += j->ip6s * sizeof(struct in6_addr);
 #else
 	if (j->ip6s > 0)
 		return (EINVAL);
 #endif
 	/* Carve the single allocation into its sub-buffers. */
 	u_path = malloc(tmplen, M_TEMP, M_WAITOK);
 	u_hostname = u_path + MAXPATHLEN;
 	u_name = u_hostname + MAXHOSTNAMELEN;
 #ifdef INET
 	u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
 #endif
 #ifdef INET6
 #ifdef INET
 	u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
 #else
 	u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
 #endif
 #endif
 	/* From here on every error path must free u_path. */
 	optiov[opt.uio_iovcnt].iov_base = "path";
 	optiov[opt.uio_iovcnt].iov_len = sizeof("path");
 	opt.uio_iovcnt++;
 	optiov[opt.uio_iovcnt].iov_base = u_path;
 	error = copyinstr(j->path, u_path, MAXPATHLEN,
 	    &optiov[opt.uio_iovcnt].iov_len);
 	if (error) {
 		free(u_path, M_TEMP);
 		return (error);
 	}
 	opt.uio_iovcnt++;
 	optiov[opt.uio_iovcnt].iov_base = "host.hostname";
 	optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
 	opt.uio_iovcnt++;
 	optiov[opt.uio_iovcnt].iov_base = u_hostname;
 	error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
 	    &optiov[opt.uio_iovcnt].iov_len);
 	if (error) {
 		free(u_path, M_TEMP);
 		return (error);
 	}
 	opt.uio_iovcnt++;
 	/* The jail name is optional. */
 	if (j->jailname != NULL) {
 		optiov[opt.uio_iovcnt].iov_base = "name";
 		optiov[opt.uio_iovcnt].iov_len = sizeof("name");
 		opt.uio_iovcnt++;
 		optiov[opt.uio_iovcnt].iov_base = u_name;
 		error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
 		    &optiov[opt.uio_iovcnt].iov_len);
 		if (error) {
 			free(u_path, M_TEMP);
 			return (error);
 		}
 		opt.uio_iovcnt++;
 	}
 #ifdef INET
 	optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
 	optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
 	opt.uio_iovcnt++;
 	optiov[opt.uio_iovcnt].iov_base = u_ip4;
 	optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
 	/* A version 0 request carries its single address in ip4s itself. */
 	if (j->version == 0)
 		u_ip4->s_addr = j->ip4s;
 	else {
 		error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
 		if (error) {
 			free(u_path, M_TEMP);
 			return (error);
 		}
 	}
 	opt.uio_iovcnt++;
 #endif
 #ifdef INET6
 	optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
 	optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
 	opt.uio_iovcnt++;
 	optiov[opt.uio_iovcnt].iov_base = u_ip6;
 	optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
 	error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
 	if (error) {
 		free(u_path, M_TEMP);
 		return (error);
 	}
 	opt.uio_iovcnt++;
 #endif
 	KASSERT(opt.uio_iovcnt <= nitems(optiov),
 		("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
 	error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
 	free(u_path, M_TEMP);
 	return (error);
 }
 
 /*
  * struct jail_set_args {
  *	struct iovec *iovp;
  *	unsigned int iovcnt;
  *	int flags;
  * };
  */
 /*
  * System call wrapper around kern_jail_set(): copies the caller's iovec
  * array into a kernel uio, passes it on together with the flags, and
  * releases the uio afterwards.
  *
  * Returns EINVAL for an odd iovec count, the copyinuio() error if the
  * copy-in fails, and otherwise whatever kern_jail_set() returns.
  */
 int
 sys_jail_set(struct thread *td, struct jail_set_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	/* Check that we have an even number of iovecs. */
 	if (uap->iovcnt & 1)
 		return (EINVAL);
 
 	/* On failure nothing is handed back in auio, so just return. */
 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_jail_set(td, auio, uap->flags);
 	/* The uio is released regardless of the kern_jail_set() result. */
-	free(auio, M_IOV);
+	freeuio(auio);
 	return (error);
 }
 
 #if defined(INET) || defined(INET6)
 /* Three-way comparison of two addresses; also used as a qsort() callback. */
 typedef int prison_addr_cmp_t(const void *, const void *);
 /* Predicate deciding whether a single address is acceptable. */
 typedef bool prison_addr_valid_t(const void *);
 /*
  * Per address family properties, indexed by pr_family_t.  Entries for
  * families that are not compiled in are left zero-initialized.
  */
 static const struct pr_family {
 	size_t			size;		/* bytes per address */
 	prison_addr_cmp_t	*cmp;		/* ordering/equality test */
 	prison_addr_valid_t	*valid;		/* sanity check on copyin */
 	int			ip_flag;	/* pr_flags bit for this family */
 } pr_families[PR_FAMILY_MAX] = {
 #ifdef INET
 	[PR_INET] = {
 		.size = sizeof(struct in_addr),
 		.cmp = prison_qcmp_v4,
 		.valid = prison_valid_v4,
 		.ip_flag = PR_IP4_USER,
 	 },
 #endif
 #ifdef INET6
 	[PR_INET6] = {
 		.size = sizeof(struct in6_addr),
 		.cmp = prison_qcmp_v6,
 		.valid = prison_valid_v6,
 		.ip_flag = PR_IP6_USER,
 	},
 #endif
 };
 
 /*
  * Network address lists (pr_addrs) allocation for jails.  The addresses
  * are accessed locklessly by the network stack, thus need to be protected by
  * the network epoch.
  *
  * A list is a single allocation: this header followed directly by 'ips'
  * addresses of one family, stored back to back as raw bytes.
  */
 struct prison_ip {
 	/*
 	 * Handed to NET_EPOCH_CALL() when the list is released.  Must stay
 	 * the first member: the deferred free passes this pointer to free().
 	 */
 	struct epoch_context ctx;
 	/* Number of addresses stored in pr_ip[]. */
 	uint32_t	ips;
 #ifdef FUTURE_C
 	/*
 	 * XXX Variable-length automatic arrays in union may be
 	 * supported in future C.
 	 */
 	union {
 		char pr_ip[];
 		struct in_addr pr_ip4[];
 		struct in6_addr pr_ip6[];
 	};
 #else /* No future C :( */
 	/* Address storage; index it through PR_IP(), never directly. */
 	char pr_ip[];
 #endif
 };
 
 /*
  * Locate entry 'idx' of an address list.  The list is raw bytes, so the
  * entry is found by scaling the index with the address size of family 'af'.
  * The arguments are only checked by assertions.
  */
 static char *
 PR_IP(struct prison_ip *pip, const pr_family_t af, int idx)
 {
 	size_t stride;
 
 	MPASS(pip);
 	MPASS(af < PR_FAMILY_MAX);
 	MPASS(idx >= 0 && idx < pip->ips);
 
 	stride = pr_families[af].size;
 	return (&pip->pr_ip[stride * idx]);
 }
 
 /*
  * Allocate an address list with room for 'cnt' addresses of family 'af'.
  * 'flags' are passed straight to malloc(9).  Only the element count is
  * initialized; returns NULL if the allocation fails.
  */
 static struct prison_ip *
 prison_ip_alloc(const pr_family_t af, uint32_t cnt, int flags)
 {
 	const size_t bytes = sizeof(struct prison_ip) +
 	    cnt * pr_families[af].size;
 	struct prison_ip *list;
 
 	list = malloc(bytes, M_PRISON, flags);
 	if (list == NULL)
 		return (NULL);
 	list->ips = cnt;
 	return (list);
 }
 
 /*
  * Build an address list from the 'cnt' addresses found at 'op': copy them,
  * sort everything but the first entry, then validate.  Returns the new
  * list, or NULL (with the list already freed) if an address is invalid or
  * duplicated.
  * kern_jail_set() helper.
  */
 static struct prison_ip *
 prison_ip_copyin(const pr_family_t af, void *op, uint32_t cnt)
 {
 	prison_addr_cmp_t *const cmp = pr_families[af].cmp;
 	const size_t size = pr_families[af].size;
 	struct prison_ip *list;
 	char *cur, *next;
 
 	list = prison_ip_alloc(af, cnt, M_WAITOK);
 	bcopy(op, list->pr_ip, cnt * size);
 
 	/*
 	 * Entry 0 keeps its position: it is the primary address exactly as
 	 * userland supplied it, used for unbound outgoing connections and
 	 * for "loopback" traffic when source address selection finds
 	 * nothing better.  Only the remaining entries get sorted.
 	 */
 	if (cnt > 1)
 		qsort(PR_IP(list, af, 1), cnt - 1, size, cmp);
 
 	/*
 	 * Basic sanity only: each address must pass the family's validity
 	 * test, and no address may repeat.  With the tail sorted, a
 	 * duplicate is either next to its twin or equal to the primary.
 	 * Other bogus addresses are the user's problem.
 	 */
 	for (int i = 0; i < cnt; i++) {
 		cur = PR_IP(list, af, i);
 		if (!pr_families[af].valid(cur))
 			goto reject;
 		if (i + 1 >= cnt)
 			continue;
 		next = PR_IP(list, af, i + 1);
 		if (cmp(PR_IP(list, af, 0), next) == 0 ||
 		    cmp(cur, next) == 0)
 			goto reject;
 	}
 
 	return (list);
 
 reject:
 	free(list, M_PRISON);
 	return (NULL);
 }
 
 /*
  * Give 'pr' its own copy of the parent's address list for family 'af'.
  * Does nothing when the parent has no list.
  * kern_jail_set() helper.
  */
 static void
 prison_ip_dup(struct prison *ppr, struct prison *pr, const pr_family_t af)
 {
 	const struct prison_ip *src = ppr->pr_addrs[af];
 	struct prison_ip *copy;
 
 	if (src == NULL)
 		return;
 
 	copy = prison_ip_alloc(af, src->ips, M_WAITOK);
 	bcopy(src->pr_ip, copy->pr_ip, copy->ips * pr_families[af].size);
 	pr->pr_addrs[af] = copy;
 }
 
 /*
  * Make sure the new set of IP addresses is a subset of the parent's list.
  * Don't worry about the parent being unlocked, as any setting is done with
  * allprison_lock held.
  * kern_jail_set() helper.
  *
  * ppip is the parent's list (may be NULL), pip the candidate list.
  * Returns true if every candidate address was found in the parent,
  * false otherwise, including when the parent has no list at all.
  */
 static bool
 prison_ip_parent_match(struct prison_ip *ppip, struct prison_ip *pip,
     const pr_family_t af)
 {
 	prison_addr_cmp_t *const cmp = pr_families[af].cmp;
 	int i, j;
 
 	if (ppip == NULL)
 		return (false);
 
 	/*
 	 * The candidate's primary address (entry 0) is not part of the
 	 * sorted range, so look for it with a linear scan of the whole
 	 * parent list.
 	 */
 	for (i = 0; i < ppip->ips; i++)
 		if (cmp(PR_IP(pip, af, 0), PR_IP(ppip, af, i)) == 0)
 			break;
 
 	if (i == ppip->ips)
 		/* Main address not present in parent. */
 		return (false);
 
 	if (pip->ips > 1) {
 		/*
 		 * Walk the remaining candidates.  'j' is a cursor into the
 		 * parent's entries 1..n that is never rewound, so this only
 		 * works if both tails are in the same cmp order (the
 		 * candidate's tail is sorted on copyin; the parent's is
 		 * presumably sorted the same way).
 		 */
 		for (i = j = 1; i < pip->ips; i++) {
 			if (cmp(PR_IP(pip, af, i), PR_IP(ppip, af, 0)) == 0)
 				/* Equals to parent primary address. */
 				continue;
 			for (; j < ppip->ips; j++)
 				if (cmp(PR_IP(pip, af, i),
 				    PR_IP(ppip, af, j)) == 0)
 					break;
 			/* Cursor ran off the end: this candidate is missing. */
 			if (j == ppip->ips)
 				break;
 		}
 		if (j == ppip->ips)
 			/* Address not present in parent. */
 			return (false);
 	}
 	return (true);
 }
 
 /*
  * Check for conflicting IP addresses.  We permit them if there is no more
  * than one IP on each jail.  If there is a duplicate on a jail with more
  * than one IP stop checking and return error.
  * kern_jail_set() helper.
  *
  * ppr is the parent of the prison being set, pr that prison itself, and
  * pip the proposed address list.  Returns true when no conflict was found,
  * false as soon as one of the proposed addresses is found in another
  * prison's list.
  */
 static bool
 prison_ip_conflict_check(const struct prison *ppr, const struct prison *pr,
     struct prison_ip *pip, pr_family_t af)
 {
 	const struct prison *tppr, *tpr;
 	int descend;
 
 	/*
 	 * Pick the root of the search: with VIMAGE, the closest ancestor
 	 * (starting at ppr) flagged PR_VNET, or prison0 if there is none;
 	 * without VIMAGE, always prison0.
 	 */
 #ifdef VIMAGE
 	for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent)
 		if (tppr->pr_flags & PR_VNET)
 			break;
 #else
 	tppr = &prison0;
 #endif
 	FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
 		/*
 		 * Ignore the prison being set, prisons flagged PR_VNET
 		 * (VIMAGE only), and prisons that are not alive.  Clearing
 		 * 'descend' presumably makes the iterator skip their
 		 * children as well -- confirm against the macro.
 		 */
 		if (tpr == pr ||
 #ifdef VIMAGE
 		    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
 #endif
 		    !prison_isalive(tpr)) {
 			descend = 0;
 			continue;
 		}
 		/* Without the family's user flag this prison is not compared. */
 		if (!(tpr->pr_flags & pr_families[af].ip_flag))
 			continue;
 		descend = 0;
 		/* A shared address is tolerated when both sides have only one. */
 		if (tpr->pr_addrs[af] == NULL ||
 		    (pip->ips == 1 && tpr->pr_addrs[af]->ips == 1))
 			continue;
 		/* prison_ip_check() returning 0 means tpr already has it. */
 		for (int i = 0; i < pip->ips; i++)
 			if (prison_ip_check(tpr, af, PR_IP(pip, af, i)) == 0)
 				return (false);
 	}
 
 	return (true);
 }
 
 _Static_assert(offsetof(struct prison_ip, ctx) == 0,
     "prison must start with epoch context");
 /*
  * Epoch callback that releases an address list.  The context is the first
  * member of struct prison_ip (asserted above), so its address is also the
  * address of the allocation.
  */
 static void
 prison_ip_free_deferred(epoch_context_t ctx)
 {
 	struct prison_ip *pip;
 
 	pip = (struct prison_ip *)ctx;
 	free(pip, M_PRISON);
 }
 
 /*
  * Schedule an address list for release through the network epoch.
  * A NULL list is accepted and ignored.
  */
 static void
 prison_ip_free(struct prison_ip *pip)
 {
 
 	if (pip == NULL)
 		return;
 	NET_EPOCH_CALL(prison_ip_free_deferred, &pip->ctx);
 }
 
 /*
  * Install 'new' (which may be NULL) as the address list of 'pr' for family
  * 'af' and hand the previous list to prison_ip_free().  The prison mutex
  * must be held.
  */
 static void
 prison_ip_set(struct prison *pr, const pr_family_t af, struct prison_ip *new)
 {
 	struct prison_ip *prev;
 
 	mtx_assert(&pr->pr_mtx, MA_OWNED);
 
 	prev = pr->pr_addrs[af];
 	atomic_store_ptr(&pr->pr_addrs[af], new);
 	prison_ip_free(prev);
 }
 
 /*
  * Restrict a prison's IP address list with its parent's, possibly replacing
  * it.  Return true if succeed, otherwise should redo.
  * kern_jail_set() helper.
  */
 static bool
 prison_ip_restrict(struct prison *pr, const pr_family_t af,
     struct prison_ip **newp)
 {
 	struct prison_ip *ppip = pr->pr_parent->pr_addrs[af];
 	struct prison_ip *pip = pr->pr_addrs[af];
 	int (*const cmp)(const void *, const void *) = pr_families[af].cmp;
 	const size_t size = pr_families[af].size;
 	struct prison_ip *new = newp != NULL ? *newp : NULL;
 	uint32_t ips;
 
 	mtx_assert(&pr->pr_mtx, MA_OWNED);
 
 	/*
 	 * Due to epoch-synchronized access to the IP address lists we always
 	 * allocate a new list even if the old one has enough space.  We could
 	 * atomically update an IPv4 address inside a list, but that would
 	 * screw up sorting, and in case of IPv6 we can't even atomically write
 	 * one.
 	 */
 	if (ppip == NULL) {
 		if (pip != NULL)
 			prison_ip_set(pr, af, NULL);
 		return (true);
 	}
 
 	if (!(pr->pr_flags & pr_families[af].ip_flag)) {
 		if (new == NULL) {
 			new = prison_ip_alloc(af, ppip->ips, M_NOWAIT);
 			if (new == NULL)
 				return (false); /* Redo */
 		}
 		/* This has no user settings, so just copy the parent's list. */
 		MPASS(new->ips == ppip->ips);
 		bcopy(ppip->pr_ip, new->pr_ip, ppip->ips * size);
 		prison_ip_set(pr, af, new);
 		if (newp != NULL)
 			*newp = NULL; /* Used */
 	} else if (pip != NULL) {
 		/* Remove addresses that aren't in the parent. */
 		int i;
 
 		i = 0; /* index in pip */
 		ips = 0; /* index in new */
 
 		if (new == NULL) {
 			new = prison_ip_alloc(af, pip->ips, M_NOWAIT);
 			if (new == NULL)
 				return (false); /* Redo */
 		}
 
 		for (int pi = 0; pi < ppip->ips; pi++)
 			if (cmp(PR_IP(pip, af, 0), PR_IP(ppip, af, pi)) == 0) {
 				/* Found our primary address in parent. */
 				bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips),
 				    size);
 				i++;
 				ips++;
 				break;
 			}
 		for (int pi = 1; i < pip->ips; ) {
 			/* Check against primary, which is unsorted. */
 			if (cmp(PR_IP(pip, af, i), PR_IP(ppip, af, 0)) == 0) {
 				/* Matches parent's primary address. */
 				bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips),
 				    size);
 				i++;
 				ips++;
 				continue;
 			}
 			/* The rest are sorted. */
 			switch (pi >= ppip->ips ? -1 :
 				cmp(PR_IP(pip, af, i), PR_IP(ppip, af, pi))) {
 			case -1:
 				i++;
 				break;
 			case 0:
 				bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips),
 				    size);
 				i++;
 				pi++;
 				ips++;
 				break;
 			case 1:
 				pi++;
 				break;
 			}
 		}
 		if (ips == 0) {
 			if (newp == NULL || *newp == NULL)
 				prison_ip_free(new);
 			new = NULL;
 		} else {
 			/* Shrink to real size */
 			KASSERT((new->ips >= ips),
 			    ("Out-of-bounds write to prison_ip %p", new));
 			new->ips = ips;
 		}
 		prison_ip_set(pr, af, new);
 		if (newp != NULL)
 			*newp = NULL; /* Used */
 	}
 	return (true);
 }
 
 /*
  * Fast-path check if an address belongs to a prison.
  *
  * Returns 0 when 'addr' is in the prison's list for family 'af',
  * EAFNOSUPPORT when the prison has no list for that family, and
  * EADDRNOTAVAIL when the list exists but does not contain the address.
  * The caller must hold the prison mutex, be inside the network epoch, or
  * hold allprison_lock exclusively.
  */
 int
 prison_ip_check(const struct prison *pr, const pr_family_t af,
     const void *addr)
 {
 	int (*const cmp)(const void *, const void *) = pr_families[af].cmp;
 	struct prison_ip *list;
 	int lo, hi, mid, order;
 
 	MPASS(mtx_owned(&pr->pr_mtx) ||
 	    in_epoch(net_epoch_preempt) ||
 	    sx_xlocked(&allprison_lock));
 
 	list = atomic_load_ptr(&pr->pr_addrs[af]);
 	if (__predict_false(list == NULL))
 		return (EAFNOSUPPORT);
 
 	/* Entry 0 is the primary address and sits outside the sorted range. */
 	if (cmp(PR_IP(list, af, 0), addr) == 0)
 		return (0);
 
 	/*
 	 * Binary search over the sorted remainder.  'lo' and 'hi' count from
 	 * the second entry, hence the +1 when probing.
 	 */
 	lo = 0;
 	hi = list->ips - 2;
 	while (lo <= hi) {
 		mid = (lo + hi) / 2;
 		order = cmp(PR_IP(list, af, mid + 1), addr);
 		if (order == 0)
 			return (0);
 		if (order > 0)
 			hi = mid - 1;
 		else
 			lo = mid + 1;
 	}
 
 	return (EADDRNOTAVAIL);
 }
 
 /*
  * Return a pointer to the prison's primary address (the first entry of its
  * list) for family 'af'.  The prison mutex must be held and the list must
  * exist.  Historically this required the mutex; nothing would prevent
  * supporting epoch-protected access as well.  Is it used in fast path?
  * in{6}_jail.c helper
  */
 const void *
 prison_ip_get0(const struct prison *pr, const pr_family_t af)
 {
 	const struct prison_ip *list;
 
 	mtx_assert(&pr->pr_mtx, MA_OWNED);
 	list = pr->pr_addrs[af];
 	MPASS(list);
 
 	return (list->pr_ip);
 }
 
 u_int
 prison_ip_cnt(const struct prison *pr, const pr_family_t af)
 {
 
 	return (pr->pr_addrs[af]->ips);
 }
 #endif	/* defined(INET) || defined(INET6) */
 
 int
 kern_jail_set(struct thread *td, struct uio *optuio, int flags)
 {
 	struct nameidata nd;
 #ifdef INET
 	struct prison_ip *ip4;
 #endif
 #ifdef INET6
 	struct prison_ip *ip6;
 #endif
 	struct vfsopt *opt;
 	struct vfsoptlist *opts;
 	struct prison *pr, *deadpr, *inspr, *mypr, *ppr, *tpr;
 	struct vnode *root;
 	char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
 	char *g_path, *osrelstr;
 	struct bool_flags *bf;
 	struct jailsys_flags *jsf;
 #if defined(INET) || defined(INET6)
 	void *op;
 #endif
 	unsigned long hid;
 	size_t namelen, onamelen, pnamelen;
 	int born, created, cuflags, descend, drflags, enforce;
 	int error, errmsg_len, errmsg_pos;
 	int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel;
 	int jid, jsys, len, level;
 	int childmax, osreldt, rsnum, slevel;
 #ifdef INET
 	int ip4s;
 	bool redo_ip4;
 #endif
 #ifdef INET6
 	int ip6s;
 	bool redo_ip6;
 #endif
 	uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
 	uint64_t pr_allow_diff;
 	unsigned tallow;
 	char numbuf[12];
 
 	error = priv_check(td, PRIV_JAIL_SET);
 	if (!error && (flags & JAIL_ATTACH))
 		error = priv_check(td, PRIV_JAIL_ATTACH);
 	if (error)
 		return (error);
 	mypr = td->td_ucred->cr_prison;
 	if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
 		return (EPERM);
 	if (flags & ~JAIL_SET_MASK)
 		return (EINVAL);
 
 	/*
 	 * Check all the parameters before committing to anything.  Not all
 	 * errors can be caught early, but we may as well try.  Also, this
 	 * takes care of some expensive stuff (path lookup) before getting
 	 * the allprison lock.
 	 *
 	 * XXX Jails are not filesystems, and jail parameters are not mount
 	 *     options.  But it makes more sense to re-use the vfsopt code
 	 *     than duplicate it under a different name.
 	 */
 	error = vfs_buildopts(optuio, &opts);
 	if (error)
 		return (error);
 #ifdef INET
 	ip4 = NULL;
 #endif
 #ifdef INET6
 	ip6 = NULL;
 #endif
 	g_path = NULL;
 
 	cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
 	if (!cuflags) {
 		error = EINVAL;
 		vfs_opterror(opts, "no valid operation (create or update)");
 		goto done_errmsg;
 	}
 
 	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
 	if (error == ENOENT)
 		jid = 0;
 	else if (error != 0)
 		goto done_free;
 
 	error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
 	if (error == ENOENT)
 		gotslevel = 0;
 	else if (error != 0)
 		goto done_free;
 	else
 		gotslevel = 1;
 
 	error =
 	    vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
 	if (error == ENOENT)
 		gotchildmax = 0;
 	else if (error != 0)
 		goto done_free;
 	else
 		gotchildmax = 1;
 
 	error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
 	if (error == ENOENT)
 		gotenforce = 0;
 	else if (error != 0)
 		goto done_free;
 	else if (enforce < 0 || enforce > 2) {
 		error = EINVAL;
 		goto done_free;
 	} else
 		gotenforce = 1;
 
 	error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum));
 	if (error == ENOENT)
 		gotrsnum = 0;
 	else if (error != 0)
 		goto done_free;
 	else
 		gotrsnum = 1;
 
 	pr_flags = ch_flags = 0;
 	for (bf = pr_flag_bool;
 	     bf < pr_flag_bool + nitems(pr_flag_bool);
 	     bf++) {
 		vfs_flagopt(opts, bf->name, &pr_flags, bf->flag);
 		vfs_flagopt(opts, bf->noname, &ch_flags, bf->flag);
 	}
 	ch_flags |= pr_flags;
 	for (jsf = pr_flag_jailsys;
 	     jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
 	     jsf++) {
 		error = vfs_copyopt(opts, jsf->name, &jsys, sizeof(jsys));
 		if (error == ENOENT)
 			continue;
 		if (error != 0)
 			goto done_free;
 		switch (jsys) {
 		case JAIL_SYS_DISABLE:
 			if (!jsf->disable) {
 				error = EINVAL;
 				goto done_free;
 			}
 			pr_flags |= jsf->disable;
 			break;
 		case JAIL_SYS_NEW:
 			pr_flags |= jsf->new;
 			break;
 		case JAIL_SYS_INHERIT:
 			break;
 		default:
 			error = EINVAL;
 			goto done_free;
 		}
 		ch_flags |= jsf->new | jsf->disable;
 	}
 	if ((flags & (JAIL_CREATE | JAIL_ATTACH)) == JAIL_CREATE
 	    && !(pr_flags & PR_PERSIST)) {
 		error = EINVAL;
 		vfs_opterror(opts, "new jail must persist or attach");
 		goto done_errmsg;
 	}
 #ifdef VIMAGE
 	if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
 		error = EINVAL;
 		vfs_opterror(opts, "vnet cannot be changed after creation");
 		goto done_errmsg;
 	}
 #endif
 #ifdef INET
 	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
 		error = EINVAL;
 		vfs_opterror(opts, "ip4 cannot be changed after creation");
 		goto done_errmsg;
 	}
 #endif
 #ifdef INET6
 	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
 		error = EINVAL;
 		vfs_opterror(opts, "ip6 cannot be changed after creation");
 		goto done_errmsg;
 	}
 #endif
 
 	pr_allow = ch_allow = 0;
 	for (bf = pr_flag_allow;
 	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
 		atomic_load_int(&bf->flag) != 0;
 	     bf++) {
 		vfs_flagopt(opts, bf->name, &pr_allow, bf->flag);
 		vfs_flagopt(opts, bf->noname, &ch_allow, bf->flag);
 	}
 	ch_allow |= pr_allow;
 
 	error = vfs_getopt(opts, "name", (void **)&name, &len);
 	if (error == ENOENT)
 		name = NULL;
 	else if (error != 0)
 		goto done_free;
 	else {
 		if (len == 0 || name[len - 1] != '\0') {
 			error = EINVAL;
 			goto done_free;
 		}
 		if (len > MAXHOSTNAMELEN) {
 			error = ENAMETOOLONG;
 			goto done_free;
 		}
 	}
 
 	error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
 	if (error == ENOENT)
 		host = NULL;
 	else if (error != 0)
 		goto done_free;
 	else {
 		ch_flags |= PR_HOST;
 		pr_flags |= PR_HOST;
 		if (len == 0 || host[len - 1] != '\0') {
 			error = EINVAL;
 			goto done_free;
 		}
 		if (len > MAXHOSTNAMELEN) {
 			error = ENAMETOOLONG;
 			goto done_free;
 		}
 	}
 
 	error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
 	if (error == ENOENT)
 		domain = NULL;
 	else if (error != 0)
 		goto done_free;
 	else {
 		ch_flags |= PR_HOST;
 		pr_flags |= PR_HOST;
 		if (len == 0 || domain[len - 1] != '\0') {
 			error = EINVAL;
 			goto done_free;
 		}
 		if (len > MAXHOSTNAMELEN) {
 			error = ENAMETOOLONG;
 			goto done_free;
 		}
 	}
 
 	error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
 	if (error == ENOENT)
 		uuid = NULL;
 	else if (error != 0)
 		goto done_free;
 	else {
 		ch_flags |= PR_HOST;
 		pr_flags |= PR_HOST;
 		if (len == 0 || uuid[len - 1] != '\0') {
 			error = EINVAL;
 			goto done_free;
 		}
 		if (len > HOSTUUIDLEN) {
 			error = ENAMETOOLONG;
 			goto done_free;
 		}
 	}
 
 #ifdef COMPAT_FREEBSD32
 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
 		uint32_t hid32;
 
 		error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
 		hid = hid32;
 	} else
 #endif
 		error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
 	if (error == ENOENT)
 		gothid = 0;
 	else if (error != 0)
 		goto done_free;
 	else {
 		gothid = 1;
 		ch_flags |= PR_HOST;
 		pr_flags |= PR_HOST;
 	}
 
 #ifdef INET
 	error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
 	if (error == ENOENT)
 		ip4s = 0;
 	else if (error != 0)
 		goto done_free;
 	else if (ip4s & (sizeof(struct in_addr) - 1)) {
 		error = EINVAL;
 		goto done_free;
 	} else {
 		ch_flags |= PR_IP4_USER;
 		pr_flags |= PR_IP4_USER;
 		if (ip4s > 0) {
 			ip4s /= sizeof(struct in_addr);
 			if (ip4s > jail_max_af_ips) {
 				error = EINVAL;
 				vfs_opterror(opts, "too many IPv4 addresses");
 				goto done_errmsg;
 			}
 			ip4 = prison_ip_copyin(PR_INET, op, ip4s);
 			if (ip4 == NULL) {
 				error = EINVAL;
 				goto done_free;
 			}
 		}
 	}
 #endif
 
 #ifdef INET6
 	error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
 	if (error == ENOENT)
 		ip6s = 0;
 	else if (error != 0)
 		goto done_free;
 	else if (ip6s & (sizeof(struct in6_addr) - 1)) {
 		error = EINVAL;
 		goto done_free;
 	} else {
 		ch_flags |= PR_IP6_USER;
 		pr_flags |= PR_IP6_USER;
 		if (ip6s > 0) {
 			ip6s /= sizeof(struct in6_addr);
 			if (ip6s > jail_max_af_ips) {
 				error = EINVAL;
 				vfs_opterror(opts, "too many IPv6 addresses");
 				goto done_errmsg;
 			}
 			ip6 = prison_ip_copyin(PR_INET6, op, ip6s);
 			if (ip6 == NULL) {
 				error = EINVAL;
 				goto done_free;
 			}
 		}
 	}
 #endif
 
 #if defined(VIMAGE) && (defined(INET) || defined(INET6))
 	if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
 		error = EINVAL;
 		vfs_opterror(opts,
 		    "vnet jails cannot have IP address restrictions");
 		goto done_errmsg;
 	}
 #endif
 
 	error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len);
 	if (error == ENOENT)
 		osrelstr = NULL;
 	else if (error != 0)
 		goto done_free;
 	else {
 		if (flags & JAIL_UPDATE) {
 			error = EINVAL;
 			vfs_opterror(opts,
 			    "osrelease cannot be changed after creation");
 			goto done_errmsg;
 		}
 		if (len == 0 || osrelstr[len - 1] != '\0') {
 			error = EINVAL;
 			goto done_free;
 		}
 		if (len >= OSRELEASELEN) {
 			error = ENAMETOOLONG;
 			vfs_opterror(opts,
 			    "osrelease string must be 1-%d bytes long",
 			    OSRELEASELEN - 1);
 			goto done_errmsg;
 		}
 	}
 
 	error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt));
 	if (error == ENOENT)
 		osreldt = 0;
 	else if (error != 0)
 		goto done_free;
 	else {
 		if (flags & JAIL_UPDATE) {
 			error = EINVAL;
 			vfs_opterror(opts,
 			    "osreldate cannot be changed after creation");
 			goto done_errmsg;
 		}
 		if (osreldt == 0) {
 			error = EINVAL;
 			vfs_opterror(opts, "osreldate cannot be 0");
 			goto done_errmsg;
 		}
 	}
 
 	root = NULL;
 	error = vfs_getopt(opts, "path", (void **)&path, &len);
 	if (error == ENOENT)
 		path = NULL;
 	else if (error != 0)
 		goto done_free;
 	else {
 		if (flags & JAIL_UPDATE) {
 			error = EINVAL;
 			vfs_opterror(opts,
 			    "path cannot be changed after creation");
 			goto done_errmsg;
 		}
 		if (len == 0 || path[len - 1] != '\0') {
 			error = EINVAL;
 			goto done_free;
 		}
 		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path);
 		error = namei(&nd);
 		if (error)
 			goto done_free;
 		root = nd.ni_vp;
 		NDFREE_PNBUF(&nd);
 		g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 		strlcpy(g_path, path, MAXPATHLEN);
 		error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN);
 		if (error == 0) {
 			path = g_path;
 		} else {
 			/* exit on other errors */
 			goto done_free;
 		}
 		if (root->v_type != VDIR) {
 			error = ENOTDIR;
 			vput(root);
 			goto done_free;
 		}
 		VOP_UNLOCK(root);
 	}
 
 	/*
 	 * Find the specified jail, or at least its parent.
 	 * This abuses the file error codes ENOENT and EEXIST.
 	 */
 	pr = NULL;
 	inspr = NULL;
 	if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
 		namelc = strrchr(name, '.');
 		jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
 		if (*p != '\0')
 			jid = 0;
 	}
 	sx_xlock(&allprison_lock);
 	drflags = PD_LIST_XLOCKED;
 	ppr = mypr;
 	if (!prison_isalive(ppr)) {
 		/* This jail is dying.  This process will surely follow. */
 		error = EAGAIN;
 		goto done_deref;
 	}
 	if (jid != 0) {
 		if (jid < 0) {
 			error = EINVAL;
 			vfs_opterror(opts, "negative jid");
 			goto done_deref;
 		}
 		/*
 		 * See if a requested jid already exists.  Keep track of
 		 * where it can be inserted later.
 		 */
 		TAILQ_FOREACH(inspr, &allprison, pr_list) {
 			if (inspr->pr_id < jid)
 				continue;
 			if (inspr->pr_id > jid)
 				break;
 			pr = inspr;
 			mtx_lock(&pr->pr_mtx);
 			drflags |= PD_LOCKED;
 			inspr = NULL;
 			break;
 		}
 		if (pr != NULL) {
 			/* Create: jid must not exist. */
 			if (cuflags == JAIL_CREATE) {
 				/*
 				 * Even creators that cannot see the jail will
 				 * get EEXIST.
 				 */
 				error = EEXIST;
 				vfs_opterror(opts, "jail %d already exists",
 				    jid);
 				goto done_deref;
 			}
 			if (!prison_ischild(mypr, pr)) {
 				/*
 				 * Updaters get ENOENT if they cannot see the
 				 * jail.  This is true even for CREATE | UPDATE,
 				 * which normally cannot give this error.
 				 */
 				error = ENOENT;
 				vfs_opterror(opts, "jail %d not found", jid);
 				goto done_deref;
 			}
 			ppr = pr->pr_parent;
 			if (!prison_isalive(ppr)) {
 				error = ENOENT;
 				vfs_opterror(opts, "jail %d is dying",
 				    ppr->pr_id);
 				goto done_deref;
 			}
 			if (!prison_isalive(pr)) {
 				if (!(flags & JAIL_DYING)) {
 					error = ENOENT;
 					vfs_opterror(opts, "jail %d is dying",
 					    jid);
 					goto done_deref;
 				}
 				if ((flags & JAIL_ATTACH) ||
 				    (pr_flags & PR_PERSIST)) {
 					/*
 					 * A dying jail might be resurrected
 					 * (via attach or persist), but first
 					 * it must determine if another jail
 					 * has claimed its name.  Accomplish
 					 * this by implicitly re-setting the
 					 * name.
 					 */
 					if (name == NULL)
 						name = prison_name(mypr, pr);
 				}
 			}
 		} else {
 			/* Update: jid must exist. */
 			if (cuflags == JAIL_UPDATE) {
 				error = ENOENT;
 				vfs_opterror(opts, "jail %d not found", jid);
 				goto done_deref;
 			}
 		}
 	}
 	/*
 	 * If the caller provided a name, look for a jail by that name.
 	 * This has different semantics for creates and updates keyed by jid
 	 * (where the name must not already exist in a different jail),
 	 * and updates keyed by the name itself (where the name must exist
 	 * because that is the jail being updated).
 	 */
 	namelc = NULL;
 	if (name != NULL) {
 		namelc = strrchr(name, '.');
 		if (namelc == NULL)
 			namelc = name;
 		else {
 			/*
 			 * This is a hierarchical name.  Split it into the
 			 * parent and child names, and make sure the parent
 			 * exists or matches an already found jail.
 			 */
 			if (pr != NULL) {
 				if (strncmp(name, ppr->pr_name, namelc - name)
 				    || ppr->pr_name[namelc - name] != '\0') {
 					error = EINVAL;
 					vfs_opterror(opts,
 					    "cannot change jail's parent");
 					goto done_deref;
 				}
 			} else {
 				*namelc = '\0';
 				ppr = prison_find_name(mypr, name);
 				if (ppr == NULL) {
 					error = ENOENT;
 					vfs_opterror(opts,
 					    "jail \"%s\" not found", name);
 					goto done_deref;
 				}
 				mtx_unlock(&ppr->pr_mtx);
 				if (!prison_isalive(ppr)) {
 					error = ENOENT;
 					vfs_opterror(opts,
 					    "jail \"%s\" is dying", name);
 					goto done_deref;
 				}
 				*namelc = '.';
 			}
 			namelc++;
 		}
 		if (namelc[0] != '\0') {
 			pnamelen =
 			    (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
 			deadpr = NULL;
 			FOREACH_PRISON_CHILD(ppr, tpr) {
 				if (tpr != pr &&
 				    !strcmp(tpr->pr_name + pnamelen, namelc)) {
 					if (prison_isalive(tpr)) {
 						if (pr == NULL &&
 						    cuflags != JAIL_CREATE) {
 							/*
 							 * Use this jail
 							 * for updates.
 							 */
 							pr = tpr;
 							mtx_lock(&pr->pr_mtx);
 							drflags |= PD_LOCKED;
 							break;
 						}
 						/*
 						 * Create, or update(jid):
 						 * name must not exist in an
 						 * active sibling jail.
 						 */
 						error = EEXIST;
 						vfs_opterror(opts,
 						   "jail \"%s\" already exists",
 						   name);
 						goto done_deref;
 					}
 					if (pr == NULL &&
 					    cuflags != JAIL_CREATE) {
 						deadpr = tpr;
 					}
 				}
 			}
 			/* If no active jail is found, use a dying one. */
 			if (deadpr != NULL && pr == NULL) {
 				if (flags & JAIL_DYING) {
 					pr = deadpr;
 					mtx_lock(&pr->pr_mtx);
 					drflags |= PD_LOCKED;
 				} else if (cuflags == JAIL_UPDATE) {
 					error = ENOENT;
 					vfs_opterror(opts,
 					    "jail \"%s\" is dying", name);
 					goto done_deref;
 				}
 			}
 			/* Update: name must exist if no jid. */
 			else if (cuflags == JAIL_UPDATE && pr == NULL) {
 				error = ENOENT;
 				vfs_opterror(opts, "jail \"%s\" not found",
 				    name);
 				goto done_deref;
 			}
 		}
 	}
 	/* Update: must provide a jid or name. */
 	else if (cuflags == JAIL_UPDATE && pr == NULL) {
 		error = ENOENT;
 		vfs_opterror(opts, "update specified no jail");
 		goto done_deref;
 	}
 
 	/* If there's no prison to update, create a new one and link it in. */
 	created = pr == NULL;
 	if (created) {
 		for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
 			if (tpr->pr_childcount >= tpr->pr_childmax) {
 				error = EPERM;
 				vfs_opterror(opts, "prison limit exceeded");
 				goto done_deref;
 			}
 		if (jid == 0 && (jid = get_next_prid(&inspr)) == 0) {
 			error = EAGAIN;
 			vfs_opterror(opts, "no available jail IDs");
 			goto done_deref;
 		}
 
 		pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
 		pr->pr_state = PRISON_STATE_INVALID;
 		refcount_init(&pr->pr_ref, 1);
 		refcount_init(&pr->pr_uref, 0);
 		drflags |= PD_DEREF;
 		LIST_INIT(&pr->pr_children);
 		mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
 		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
 
 		pr->pr_id = jid;
 		if (inspr != NULL)
 			TAILQ_INSERT_BEFORE(inspr, pr, pr_list);
 		else
 			TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
 
 		pr->pr_parent = ppr;
 		prison_hold(ppr);
 		prison_proc_hold(ppr);
 		LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
 		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
 			tpr->pr_childcount++;
 
 		/* Set some default values, and inherit some from the parent. */
 		if (namelc == NULL)
 			namelc = "";
 		if (path == NULL) {
 			path = "/";
 			root = mypr->pr_root;
 			vref(root);
 		}
 		strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
 		pr->pr_flags |= PR_HOST;
 #if defined(INET) || defined(INET6)
 #ifdef VIMAGE
 		if (!(pr_flags & PR_VNET))
 #endif
 		{
 #ifdef INET
 			if (!(ch_flags & PR_IP4_USER))
 				pr->pr_flags |= PR_IP4 | PR_IP4_USER;
 			else if (!(pr_flags & PR_IP4_USER)) {
 				pr->pr_flags |= ppr->pr_flags & PR_IP4;
 				prison_ip_dup(ppr, pr, PR_INET);
 			}
 #endif
 #ifdef INET6
 			if (!(ch_flags & PR_IP6_USER))
 				pr->pr_flags |= PR_IP6 | PR_IP6_USER;
 			else if (!(pr_flags & PR_IP6_USER)) {
 				pr->pr_flags |= ppr->pr_flags & PR_IP6;
 				prison_ip_dup(ppr, pr, PR_INET6);
 			}
 #endif
 		}
 #endif
 		/* Source address selection is always on by default. */
 		pr->pr_flags |= _PR_IP_SADDRSEL;
 
 		pr->pr_securelevel = ppr->pr_securelevel;
 		pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
 		pr->pr_enforce_statfs = jail_default_enforce_statfs;
 		pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum;
 
 		pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate;
 		if (osrelstr == NULL)
 			strlcpy(pr->pr_osrelease, ppr->pr_osrelease,
 			    sizeof(pr->pr_osrelease));
 		else
 			strlcpy(pr->pr_osrelease, osrelstr,
 			    sizeof(pr->pr_osrelease));
 
 #ifdef VIMAGE
 		/* Allocate a new vnet if specified. */
 		pr->pr_vnet = (pr_flags & PR_VNET)
 		    ? vnet_alloc() : ppr->pr_vnet;
 #endif
 		/*
 		 * Allocate a dedicated cpuset for each jail.
 		 * Unlike other initial settings, this may return an error.
 		 */
 		error = cpuset_create_root(ppr, &pr->pr_cpuset);
 		if (error)
 			goto done_deref;
 
 		mtx_lock(&pr->pr_mtx);
 		drflags |= PD_LOCKED;
 	} else {
 		/*
 		 * Grab a reference for existing prisons, to ensure they
 		 * continue to exist for the duration of the call.
 		 */
 		prison_hold(pr);
 		drflags |= PD_DEREF;
 #if defined(VIMAGE) && (defined(INET) || defined(INET6))
 		if ((pr->pr_flags & PR_VNET) &&
 		    (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
 			error = EINVAL;
 			vfs_opterror(opts,
 			    "vnet jails cannot have IP address restrictions");
 			goto done_deref;
 		}
 #endif
 #ifdef INET
 		if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
 			error = EINVAL;
 			vfs_opterror(opts,
 			    "ip4 cannot be changed after creation");
 			goto done_deref;
 		}
 #endif
 #ifdef INET6
 		if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
 			error = EINVAL;
 			vfs_opterror(opts,
 			    "ip6 cannot be changed after creation");
 			goto done_deref;
 		}
 #endif
 	}
 
 	/* Do final error checking before setting anything. */
 	if (gotslevel) {
 		if (slevel < ppr->pr_securelevel) {
 			error = EPERM;
 			goto done_deref;
 		}
 	}
 	if (gotchildmax) {
 		if (childmax >= ppr->pr_childmax) {
 			error = EPERM;
 			goto done_deref;
 		}
 	}
 	if (gotenforce) {
 		if (enforce < ppr->pr_enforce_statfs) {
 			error = EPERM;
 			goto done_deref;
 		}
 	}
 	if (gotrsnum) {
 		/*
 		 * devfs_rsnum is a uint16_t
 		 */
 		if (rsnum < 0 || rsnum > 65535) {
 			error = EINVAL;
 			goto done_deref;
 		}
 		/*
 		 * Nested jails always inherit parent's devfs ruleset
 		 */
 		if (jailed(td->td_ucred)) {
 			if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) {
 				error = EPERM;
 				goto done_deref;
 			} else
 				rsnum = ppr->pr_devfs_rsnum;
 		}
 	}
 #ifdef INET
 	if (ip4s > 0) {
 		if ((ppr->pr_flags & PR_IP4) &&
 		    !prison_ip_parent_match(ppr->pr_addrs[PR_INET], ip4,
 		    PR_INET)) {
 			error = EPERM;
 			goto done_deref;
 		}
 		if (!prison_ip_conflict_check(ppr, pr, ip4, PR_INET)) {
 			error = EADDRINUSE;
 			vfs_opterror(opts, "IPv4 addresses clash");
 			goto done_deref;
 		}
 	}
 #endif
 #ifdef INET6
 	if (ip6s > 0) {
 		if ((ppr->pr_flags & PR_IP6) &&
 		    !prison_ip_parent_match(ppr->pr_addrs[PR_INET6], ip6,
 		    PR_INET6)) {
 			error = EPERM;
 			goto done_deref;
 		}
 		if (!prison_ip_conflict_check(ppr, pr, ip6, PR_INET6)) {
 			error = EADDRINUSE;
 			vfs_opterror(opts, "IPv6 addresses clash");
 			goto done_deref;
 		}
 	}
 #endif
 	onamelen = namelen = 0;
 	if (namelc != NULL) {
 		/* Give a default name of the jid.  Also allow the name to be
 		 * explicitly the jid - but not any other number, and only in
 		 * normal form (no leading zero/etc).
 		 */
 		if (namelc[0] == '\0')
 			snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid);
 		else if ((strtoul(namelc, &p, 10) != jid ||
 			  namelc[0] < '1' || namelc[0] > '9') && *p == '\0') {
 			error = EINVAL;
 			vfs_opterror(opts,
 			    "name cannot be numeric (unless it is the jid)");
 			goto done_deref;
 		}
 		/*
 		 * Make sure the name isn't too long for the prison or its
 		 * children.
 		 */
 		pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
 		onamelen = strlen(pr->pr_name + pnamelen);
 		namelen = strlen(namelc);
 		if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) {
 			error = ENAMETOOLONG;
 			goto done_deref;
 		}
 		FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
 			if (strlen(tpr->pr_name) + (namelen - onamelen) >=
 			    sizeof(pr->pr_name)) {
 				error = ENAMETOOLONG;
 				goto done_deref;
 			}
 		}
 	}
 	pr_allow_diff = pr_allow & ~ppr->pr_allow;
 	if (pr_allow_diff & ~PR_ALLOW_DIFFERENCES) {
 		error = EPERM;
 		goto done_deref;
 	}
 
 	/*
 	 * Let modules check their parameters.  This requires unlocking and
 	 * then re-locking the prison, but this is still a valid state as long
 	 * as allprison_lock remains xlocked.
 	 */
 	mtx_unlock(&pr->pr_mtx);
 	drflags &= ~PD_LOCKED;
 	error = osd_jail_call(pr, PR_METHOD_CHECK, opts);
 	if (error != 0)
 		goto done_deref;
 	mtx_lock(&pr->pr_mtx);
 	drflags |= PD_LOCKED;
 
 	/* At this point, all valid parameters should have been noted. */
 	TAILQ_FOREACH(opt, opts, link) {
 		if (!opt->seen && strcmp(opt->name, "errmsg")) {
 			error = EINVAL;
 			vfs_opterror(opts, "unknown parameter: %s", opt->name);
 			goto done_deref;
 		}
 	}
 
 	/* Set the parameters of the prison. */
 #ifdef INET
 	redo_ip4 = false;
 	if (pr_flags & PR_IP4_USER) {
 		pr->pr_flags |= PR_IP4;
 		prison_ip_set(pr, PR_INET, ip4);
 		ip4 = NULL;
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
 #ifdef VIMAGE
 			if (tpr->pr_flags & PR_VNET) {
 				descend = 0;
 				continue;
 			}
 #endif
 			if (!prison_ip_restrict(tpr, PR_INET, NULL)) {
 				redo_ip4 = true;
 				descend = 0;
 			}
 		}
 	}
 #endif
 #ifdef INET6
 	redo_ip6 = false;
 	if (pr_flags & PR_IP6_USER) {
 		pr->pr_flags |= PR_IP6;
 		prison_ip_set(pr, PR_INET6, ip6);
 		ip6 = NULL;
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
 #ifdef VIMAGE
 			if (tpr->pr_flags & PR_VNET) {
 				descend = 0;
 				continue;
 			}
 #endif
 			if (!prison_ip_restrict(tpr, PR_INET6, NULL)) {
 				redo_ip6 = true;
 				descend = 0;
 			}
 		}
 	}
 #endif
 	if (gotslevel) {
 		pr->pr_securelevel = slevel;
 		/* Set all child jails to be at least this level. */
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
 			if (tpr->pr_securelevel < slevel)
 				tpr->pr_securelevel = slevel;
 	}
 	if (gotchildmax) {
 		pr->pr_childmax = childmax;
 		/* Set all child jails to under this limit. */
 		FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
 			if (tpr->pr_childmax > childmax - level)
 				tpr->pr_childmax = childmax > level
 				    ? childmax - level : 0;
 	}
 	if (gotenforce) {
 		pr->pr_enforce_statfs = enforce;
 		/* Pass this restriction on to the children. */
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
 			if (tpr->pr_enforce_statfs < enforce)
 				tpr->pr_enforce_statfs = enforce;
 	}
 	if (gotrsnum) {
 		pr->pr_devfs_rsnum = rsnum;
 		/* Pass this restriction on to the children. */
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
 			tpr->pr_devfs_rsnum = rsnum;
 	}
 	if (namelc != NULL) {
 		if (ppr == &prison0)
 			strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name));
 		else
 			snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
 			    ppr->pr_name, namelc);
 		/* Change this component of child names. */
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
 			bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
 			    strlen(tpr->pr_name + onamelen) + 1);
 			bcopy(pr->pr_name, tpr->pr_name, namelen);
 		}
 	}
 	if (path != NULL) {
 		/* Try to keep a real-rooted full pathname. */
 		strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
 		pr->pr_root = root;
 		root = NULL;
 	}
 	if (PR_HOST & ch_flags & ~pr_flags) {
 		if (pr->pr_flags & PR_HOST) {
 			/*
 			 * Copy the parent's host info.  As with pr_ip4 above,
 			 * the lack of a lock on the parent is not a problem;
 			 * it is always set with allprison_lock at least
 			 * shared, and is held exclusively here.
 			 */
 			strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
 			    sizeof(pr->pr_hostname));
 			strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
 			    sizeof(pr->pr_domainname));
 			strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
 			    sizeof(pr->pr_hostuuid));
 			pr->pr_hostid = pr->pr_parent->pr_hostid;
 		}
 	} else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
 		/* Set this prison, and any descendants without PR_HOST. */
 		if (host != NULL)
 			strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
 		if (domain != NULL)
 			strlcpy(pr->pr_domainname, domain, 
 			    sizeof(pr->pr_domainname));
 		if (uuid != NULL)
 			strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
 		if (gothid)
 			pr->pr_hostid = hid;
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
 			if (tpr->pr_flags & PR_HOST)
 				descend = 0;
 			else {
 				if (host != NULL)
 					strlcpy(tpr->pr_hostname,
 					    pr->pr_hostname,
 					    sizeof(tpr->pr_hostname));
 				if (domain != NULL)
 					strlcpy(tpr->pr_domainname, 
 					    pr->pr_domainname,
 					    sizeof(tpr->pr_domainname));
 				if (uuid != NULL)
 					strlcpy(tpr->pr_hostuuid,
 					    pr->pr_hostuuid,
 					    sizeof(tpr->pr_hostuuid));
 				if (gothid)
 					tpr->pr_hostid = hid;
 			}
 		}
 	}
 	pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
 	if ((tallow = ch_allow & ~pr_allow))
 		prison_set_allow_locked(pr, tallow, 0);
 	/*
 	 * Persistent prisons get an extra reference, and prisons losing their
 	 * persist flag lose that reference.
 	 */
 	born = !prison_isalive(pr);
 	if (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags)) {
 		if (pr_flags & PR_PERSIST) {
 			prison_hold(pr);
 			/*
 			 * This may make a dead prison alive again, but wait
 			 * to label it as such until after OSD calls have had
 			 * a chance to run (and perhaps to fail).
 			 */
 			refcount_acquire(&pr->pr_uref);
 		} else {
 			drflags |= PD_DEUREF;
 			prison_free_not_last(pr);
 		}
 	}
 	pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
 	mtx_unlock(&pr->pr_mtx);
 	drflags &= ~PD_LOCKED;
 	/*
 	 * Any errors past this point will need to de-persist newly created
 	 * prisons, as well as call remove methods.
 	 */
 	if (born)
 		drflags |= PD_KILL;
 
 #ifdef RACCT
 	if (racct_enable && created)
 		prison_racct_attach(pr);
 #endif
 
 	/* Locks may have prevented a complete restriction of child IP
 	 * addresses.  If so, allocate some more memory and try again.
 	 */
 #ifdef INET
 	while (redo_ip4) {
 		ip4s = pr->pr_addrs[PR_INET]->ips;
 		MPASS(ip4 == NULL);
 		ip4 = prison_ip_alloc(PR_INET, ip4s, M_WAITOK);
 		mtx_lock(&pr->pr_mtx);
 		redo_ip4 = false;
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
 #ifdef VIMAGE
 			if (tpr->pr_flags & PR_VNET) {
 				descend = 0;
 				continue;
 			}
 #endif
 			if (!prison_ip_restrict(tpr, PR_INET, &ip4))
 				redo_ip4 = true;
 		}
 		mtx_unlock(&pr->pr_mtx);
 	}
 #endif
 #ifdef INET6
 	while (redo_ip6) {
 		ip6s = pr->pr_addrs[PR_INET6]->ips;
 		MPASS(ip6 == NULL);
 		ip6 = prison_ip_alloc(PR_INET6, ip6s, M_WAITOK);
 		mtx_lock(&pr->pr_mtx);
 		redo_ip6 = false;
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
 #ifdef VIMAGE
 			if (tpr->pr_flags & PR_VNET) {
 				descend = 0;
 				continue;
 			}
 #endif
 			if (!prison_ip_restrict(tpr, PR_INET6, &ip6))
 				redo_ip6 = true;
 		}
 		mtx_unlock(&pr->pr_mtx);
 	}
 #endif
 
 	/* Let the modules do their work. */
 	if (born) {
 		error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
 		if (error)
 			goto done_deref;
 	}
 	error = osd_jail_call(pr, PR_METHOD_SET, opts);
 	if (error)
 		goto done_deref;
 
 	/*
 	 * A new prison is now ready to be seen; either it has gained a user
 	 * reference via persistence, or is about to gain one via attachment.
 	 */
 	if (born) {
 		drflags = prison_lock_xlock(pr, drflags);
 		pr->pr_state = PRISON_STATE_ALIVE;
 	}
 
 	/* Attach this process to the prison if requested. */
 	if (flags & JAIL_ATTACH) {
 		error = do_jail_attach(td, pr,
 		    prison_lock_xlock(pr, drflags & PD_LOCK_FLAGS));
 		drflags &= ~(PD_LOCKED | PD_LIST_XLOCKED);
 		if (error) {
 			vfs_opterror(opts, "attach failed");
 			goto done_deref;
 		}
 	}
 
 #ifdef RACCT
 	if (racct_enable && !created) {
 		if (drflags & PD_LOCKED) {
 			mtx_unlock(&pr->pr_mtx);
 			drflags &= ~PD_LOCKED;
 		}
 		if (drflags & PD_LIST_XLOCKED) {
 			sx_xunlock(&allprison_lock);
 			drflags &= ~PD_LIST_XLOCKED;
 		}
 		prison_racct_modify(pr);
 	}
 #endif
 
 	if (born && pr != &prison0 && (pr->pr_allow & PR_ALLOW_NFSD) != 0 &&
 	    (pr->pr_root->v_vflag & VV_ROOT) == 0)
 		printf("Warning jail jid=%d: mountd/nfsd requires a separate"
 		   " file system\n", pr->pr_id);
 
 	drflags &= ~PD_KILL;
 	td->td_retval[0] = pr->pr_id;
 
  done_deref:
 	/* Release any temporary prison holds and/or locks. */
 	if (pr != NULL)
 		prison_deref(pr, drflags);
 	else if (drflags & PD_LIST_SLOCKED)
 		sx_sunlock(&allprison_lock);
 	else if (drflags & PD_LIST_XLOCKED)
 		sx_xunlock(&allprison_lock);
 	if (root != NULL)
 		vrele(root);
  done_errmsg:
 	if (error) {
 		/* Write the error message back to userspace. */
 		if (vfs_getopt(opts, "errmsg", (void **)&errmsg,
 		    &errmsg_len) == 0 && errmsg_len > 0) {
 			errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
 			if (optuio->uio_segflg == UIO_SYSSPACE)
 				bcopy(errmsg,
 				    optuio->uio_iov[errmsg_pos].iov_base,
 				    errmsg_len);
 			else
 				(void)copyout(errmsg,
 				    optuio->uio_iov[errmsg_pos].iov_base,
 				    errmsg_len);
 		}
 	}
  done_free:
 #ifdef INET
 	prison_ip_free(ip4);
 #endif
 #ifdef INET6
 	prison_ip_free(ip6);
 #endif
 	if (g_path != NULL)
 		free(g_path, M_TEMP);
 	vfs_freeopts(opts);
 	return (error);
 }
 
 /*
  * Find the next available prison ID.  Return the ID on success, or zero
  * on failure.  Also set a pointer to the allprison list entry the prison
  * should be inserted before.
  *
  * The search starts at the ID following lastprid (wrapping after JAIL_MAX)
  * and walks the allprison list, which the loop below relies on being in
  * ascending pr_id order.  On success lastprid is updated to the returned
  * ID and *insprp is the entry to insert before, or NULL to append at the
  * tail.  On failure neither *insprp nor lastprid is written.
  */
 static int
 get_next_prid(struct prison **insprp)
 {
 	struct prison *inspr;
 	int jid, maxid;
 
 	/* First candidate: the ID after the last one used, in 1..JAIL_MAX. */
 	jid = lastprid % JAIL_MAX + 1;
 	if (TAILQ_EMPTY(&allprison) ||
 	    TAILQ_LAST(&allprison, prisonlist)->pr_id < jid) {
 		/*
 		 * A common case is for all jails to be implicitly numbered,
 		 * which means they'll go on the end of the list, at least
 		 * for the first JAIL_MAX times.
 		 */
 		inspr = NULL;
 	} else {
 		/*
 		 * Take two passes through the allprison list: first starting
 		 * with the proposed jid, then ending with it.
 		 */
 		/* maxid bounds the current pass; zero means "stop searching". */
 		for (maxid = JAIL_MAX; maxid != 0; ) {
 			TAILQ_FOREACH(inspr, &allprison, pr_list) {
 				/* Skip entries below the candidate ID. */
 				if (inspr->pr_id < jid)
 					continue;
 				if (inspr->pr_id > jid) {
 					/* Found an opening. */
 					maxid = 0;
 					break;
 				}
 				/* Candidate is taken; try the next ID. */
 				if (++jid > maxid) {
 					/*
 					 * Either this was already the second
 					 * pass (maxid == lastprid), or the
 					 * first pass started from ID 1
 					 * (lastprid == 0).
 					 */
 					if (lastprid == maxid || lastprid == 0)
 					{
 						/*
 						 * The entire legal range
 						 * has been traversed
 						 */
 						return 0;
 					}
 					/* Try again from the start. */
 					jid = 1;
 					maxid = lastprid;
 					break;
 				}
 			}
 			/* inspr is NULL only if the list walk ran to its end. */
 			if (inspr == NULL) {
 				/* Found room at the end of the list. */
 				break;
 			}
 		}
 	}
 	*insprp = inspr;
 	lastprid = jid;
 	return (jid);
 }
 
 /*
  * struct jail_get_args {
  *	struct iovec *iovp;
  *	unsigned int iovcnt;
  *	int flags;
  * };
  *
  * System call entry point: copy the caller's iovec array into the kernel,
  * let kern_jail_get() do the work, and on success copy the iovec array
  * back out to the caller.  Returns 0 or an errno value.
  */
 int
 sys_jail_get(struct thread *td, struct jail_get_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	/* Check that we have an even number of iovecs. */
 	if (uap->iovcnt & 1)
 		return (EINVAL);
 
 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_jail_get(td, auio, uap->flags);
 	/*
 	 * kern_jail_get() stores the resulting value lengths in the iov_len
 	 * fields of the kernel copy, so hand the whole array back on success.
 	 */
 	if (error == 0)
 		error = copyout(auio->uio_iov, uap->iovp,
-		    uap->iovcnt * sizeof (struct iovec));
-	free(auio, M_IOV);
+		    uap->iovcnt * sizeof(struct iovec));
+	freeuio(auio);
 	return (error);
 }
 
 /*
  * Look up one jail and report its parameters into the supplied options.
  *
  * optuio holds the option list; the value of the option at position i
  * lives in iovec 2*i+1.  The jail is chosen by the first usable selector
  * among "lastjid" (first jail with a larger ID), a non-zero "jid", or
  * "name", and is always searched for beneath the caller's own prison.
  * Jails that are not alive are only reported when JAIL_DYING is in flags.
  *
  * Every known parameter is offered to the option list; ENOENT from the
  * vfs_setopt*() calls is ignored (presumably it means the caller did not
  * ask for that parameter).  Module parameters are collected through
  * osd_jail_call(PR_METHOD_GET).  Any option left unseen afterwards is
  * rejected as unknown.  Finally the values and their lengths are written
  * back through optuio.
  *
  * Returns 0 or an errno value.  td->td_retval[0] is set to the jail ID as
  * soon as a jail has been found.  On error, an "errmsg" option supplied by
  * the caller is written back as well.
  */
 int
 kern_jail_get(struct thread *td, struct uio *optuio, int flags)
 {
 	struct bool_flags *bf;
 	struct jailsys_flags *jsf;
 	struct prison *pr, *mypr;
 	struct vfsopt *opt;
 	struct vfsoptlist *opts;
 	char *errmsg, *name;
 	int drflags, error, errmsg_len, errmsg_pos, i, jid, len, pos;
 	unsigned f;
 
 	if (flags & ~JAIL_GET_MASK)
 		return (EINVAL);
 
 	/* Get the parameter list. */
 	error = vfs_buildopts(optuio, &opts);
 	if (error)
 		return (error);
 	errmsg_pos = vfs_getopt_pos(opts, "errmsg");
 	mypr = td->td_ucred->cr_prison;
 	pr = NULL;
 
 	/*
 	 * Find the prison specified by one of: lastjid, jid, name.
 	 */
 	sx_slock(&allprison_lock);
 	drflags = PD_LIST_SLOCKED;
 	error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
 	if (error == 0) {
 		/* Pick the first visible jail whose ID is above lastjid. */
 		TAILQ_FOREACH(pr, &allprison, pr_list) {
 			if (pr->pr_id > jid &&
 			    ((flags & JAIL_DYING) || prison_isalive(pr)) &&
 			    prison_ischild(mypr, pr)) {
 				mtx_lock(&pr->pr_mtx);
 				drflags |= PD_LOCKED;
 				goto found_prison;
 			}
 		}
 		error = ENOENT;
 		vfs_opterror(opts, "no jail after %d", jid);
 		goto done;
 	} else if (error != ENOENT)
 		goto done;
 
 	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
 	if (error == 0) {
 		/* A jid of zero is not a selector; fall through to "name". */
 		if (jid != 0) {
 			/* prison_find_child() returns the prison locked. */
 			pr = prison_find_child(mypr, jid);
 			if (pr != NULL) {
 				drflags |= PD_LOCKED;
 				if (!(prison_isalive(pr) ||
 				    (flags & JAIL_DYING))) {
 					error = ENOENT;
 					vfs_opterror(opts, "jail %d is dying",
 					    jid);
 					goto done;
 				}
 				goto found_prison;
 			}
 			error = ENOENT;
 			vfs_opterror(opts, "jail %d not found", jid);
 			goto done;
 		}
 	} else if (error != ENOENT)
 		goto done;
 
 	error = vfs_getopt(opts, "name", (void **)&name, &len);
 	if (error == 0) {
 		/* The name must be a non-empty, NUL-terminated buffer. */
 		if (len == 0 || name[len - 1] != '\0') {
 			error = EINVAL;
 			goto done;
 		}
 		/* prison_find_name() returns the prison locked. */
 		pr = prison_find_name(mypr, name);
 		if (pr != NULL) {
 			drflags |= PD_LOCKED;
 			if (!(prison_isalive(pr) || (flags & JAIL_DYING))) {
 				error = ENOENT;
 				vfs_opterror(opts, "jail \"%s\" is dying",
 				    name);
 				goto done;
 			}
 			goto found_prison;
 		}
 		error = ENOENT;
 		vfs_opterror(opts, "jail \"%s\" not found", name);
 		goto done;
 	} else if (error != ENOENT)
 		goto done;
 
 	vfs_opterror(opts, "no jail specified");
 	error = ENOENT;
 	goto done;
 
  found_prison:
 	/* Get the parameters of the prison. */
 	prison_hold(pr);
 	drflags |= PD_DEREF;
 	td->td_retval[0] = pr->pr_id;
 	error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
 	if (error != 0 && error != ENOENT)
 		goto done;
 	/* A parent that is the caller's own prison is reported as 0. */
 	i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
 	error = vfs_setopt(opts, "parent", &i, sizeof(i));
 	if (error != 0 && error != ENOENT)
 		goto done;
 	error = vfs_setopts(opts, "name", prison_name(mypr, pr));
 	if (error != 0 && error != ENOENT)
 		goto done;
 	error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
 	    sizeof(pr->pr_cpuset->cs_id));
 	if (error != 0 && error != ENOENT)
 		goto done;
 	error = vfs_setopts(opts, "path", prison_path(mypr, pr));
 	if (error != 0 && error != ENOENT)
 		goto done;
 #ifdef INET
 	/*
 	 * The address list may be absent.  Do not form a member address
 	 * through a NULL pointer in that case; pass NULL with zero length.
 	 */
 	error = vfs_setopt_part(opts, "ip4.addr",
 	    pr->pr_addrs[PR_INET] != NULL ?
 	    pr->pr_addrs[PR_INET]->pr_ip : NULL,
 	    pr->pr_addrs[PR_INET] != NULL ? pr->pr_addrs[PR_INET]->ips *
 	    pr_families[PR_INET].size : 0);
 	if (error != 0 && error != ENOENT)
 		goto done;
 #endif
 #ifdef INET6
 	/* Same NULL handling as for the IPv4 list above. */
 	error = vfs_setopt_part(opts, "ip6.addr",
 	    pr->pr_addrs[PR_INET6] != NULL ?
 	    pr->pr_addrs[PR_INET6]->pr_ip : NULL,
 	    pr->pr_addrs[PR_INET6] != NULL ? pr->pr_addrs[PR_INET6]->ips *
 	    pr_families[PR_INET6].size : 0);
 	if (error != 0 && error != ENOENT)
 		goto done;
 #endif
 	error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
 	    sizeof(pr->pr_securelevel));
 	if (error != 0 && error != ENOENT)
 		goto done;
 	error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
 	    sizeof(pr->pr_childcount));
 	if (error != 0 && error != ENOENT)
 		goto done;
 	error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
 	    sizeof(pr->pr_childmax));
 	if (error != 0 && error != ENOENT)
 		goto done;
 	error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
 	if (error != 0 && error != ENOENT)
 		goto done;
 	error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
 	if (error != 0 && error != ENOENT)
 		goto done;
 	error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
 	if (error != 0 && error != ENOENT)
 		goto done;
 #ifdef COMPAT_FREEBSD32
 	/* ILP32 callers get the host ID as a 32-bit value. */
 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
 		uint32_t hid32 = pr->pr_hostid;
 
 		error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
 	} else
 #endif
 	error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
 	    sizeof(pr->pr_hostid));
 	if (error != 0 && error != ENOENT)
 		goto done;
 	error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
 	    sizeof(pr->pr_enforce_statfs));
 	if (error != 0 && error != ENOENT)
 		goto done;
 	error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum,
 	    sizeof(pr->pr_devfs_rsnum));
 	if (error != 0 && error != ENOENT)
 		goto done;
 	/* Each boolean flag is reported under its name and its "no" name. */
 	for (bf = pr_flag_bool;
 	     bf < pr_flag_bool + nitems(pr_flag_bool);
 	     bf++) {
 		i = (pr->pr_flags & bf->flag) ? 1 : 0;
 		error = vfs_setopt(opts, bf->name, &i, sizeof(i));
 		if (error != 0 && error != ENOENT)
 			goto done;
 		i = !i;
 		error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
 		if (error != 0 && error != ENOENT)
 			goto done;
 	}
 	/* Map each disable/new flag pair to a three-state jailsys value. */
 	for (jsf = pr_flag_jailsys;
 	     jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
 	     jsf++) {
 		f = pr->pr_flags & (jsf->disable | jsf->new);
 		i = (f != 0 && f == jsf->disable) ? JAIL_SYS_DISABLE
 		    : (f == jsf->new) ? JAIL_SYS_NEW
 		    : JAIL_SYS_INHERIT;
 		error = vfs_setopt(opts, jsf->name, &i, sizeof(i));
 		if (error != 0 && error != ENOENT)
 			goto done;
 	}
 	/* The allow table is walked until the first entry with a zero flag. */
 	for (bf = pr_flag_allow;
 	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
 		atomic_load_int(&bf->flag) != 0;
 	     bf++) {
 		i = (pr->pr_allow & bf->flag) ? 1 : 0;
 		error = vfs_setopt(opts, bf->name, &i, sizeof(i));
 		if (error != 0 && error != ENOENT)
 			goto done;
 		i = !i;
 		error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
 		if (error != 0 && error != ENOENT)
 			goto done;
 	}
 	i = !prison_isalive(pr);
 	error = vfs_setopt(opts, "dying", &i, sizeof(i));
 	if (error != 0 && error != ENOENT)
 		goto done;
 	i = !i;
 	error = vfs_setopt(opts, "nodying", &i, sizeof(i));
 	if (error != 0 && error != ENOENT)
 		goto done;
 	error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate,
 	    sizeof(pr->pr_osreldate));
 	if (error != 0 && error != ENOENT)
 		goto done;
 	error = vfs_setopts(opts, "osrelease", pr->pr_osrelease);
 	if (error != 0 && error != ENOENT)
 		goto done;
 
 	/* Get the module parameters. */
 	mtx_unlock(&pr->pr_mtx);
 	drflags &= ~PD_LOCKED;
 	error = osd_jail_call(pr, PR_METHOD_GET, opts);
 	if (error)
 		goto done;
 	/* The prison is not needed any more; drop the hold and the locks. */
 	prison_deref(pr, drflags);
 	pr = NULL;
 	drflags = 0;
 
 	/* By now, all parameters should have been noted. */
 	TAILQ_FOREACH(opt, opts, link) {
 		if (!opt->seen && strcmp(opt->name, "errmsg")) {
 			error = EINVAL;
 			vfs_opterror(opts, "unknown parameter: %s", opt->name);
 			goto done;
 		}
 	}
 
 	/* Write the fetched parameters back to userspace. */
 	error = 0;
 	TAILQ_FOREACH(opt, opts, link) {
 		if (opt->pos >= 0 && opt->pos != errmsg_pos) {
 			/* The value of option N is in iovec 2N+1. */
 			pos = 2 * opt->pos + 1;
 			optuio->uio_iov[pos].iov_len = opt->len;
 			if (opt->value != NULL) {
 				if (optuio->uio_segflg == UIO_SYSSPACE) {
 					bcopy(opt->value,
 					    optuio->uio_iov[pos].iov_base,
 					    opt->len);
 				} else {
 					error = copyout(opt->value,
 					    optuio->uio_iov[pos].iov_base,
 					    opt->len);
 					if (error)
 						break;
 				}
 			}
 		}
 	}
 
  done:
 	/* Release any temporary prison holds and/or locks. */
 	if (pr != NULL)
 		prison_deref(pr, drflags);
 	else if (drflags & PD_LIST_SLOCKED)
 		sx_sunlock(&allprison_lock);
 	if (error && errmsg_pos >= 0) {
 		/* Write the error message back to userspace. */
 		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
 		errmsg_pos = 2 * errmsg_pos + 1;
 		if (errmsg_len > 0) {
 			if (optuio->uio_segflg == UIO_SYSSPACE)
 				bcopy(errmsg,
 				    optuio->uio_iov[errmsg_pos].iov_base,
 				    errmsg_len);
 			else
 				(void)copyout(errmsg,
 				    optuio->uio_iov[errmsg_pos].iov_base,
 				    errmsg_len);
 		}
 	}
 	vfs_freeopts(opts);
 	return (error);
 }
 
 /*
  * struct jail_remove_args {
  *	int jid;
  * };
  *
  * System call: kill the jail with the given ID, which must lie beneath the
  * caller's own prison.  Requires PRIV_JAIL_REMOVE.  A jail that is already
  * dying is left alone and reported as success.
  */
 int
 sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
 {
 	struct prison *target;
 	int rv;
 
 	rv = priv_check(td, PRIV_JAIL_REMOVE);
 	if (rv != 0)
 		return (rv);
 
 	sx_xlock(&allprison_lock);
 	/* On success the prison comes back with its mutex held. */
 	target = prison_find_child(td->td_ucred->cr_prison, uap->jid);
 	if (target == NULL) {
 		sx_xunlock(&allprison_lock);
 		return (EINVAL);
 	}
 	if (prison_isalive(target)) {
 		/* Both held locks are handed over via the PD_* lock flags. */
 		prison_deref(target, PD_KILL | PD_LOCKED | PD_LIST_XLOCKED);
 		return (0);
 	}
 
 	/* Silently ignore already-dying prisons. */
 	mtx_unlock(&target->pr_mtx);
 	sx_xunlock(&allprison_lock);
 	return (0);
 }
 
 /*
  * struct jail_attach_args {
  *	int jid;
  * };
  *
  * System call: attach the calling process to the jail with the given ID,
  * which must lie beneath the caller's own prison.  Requires
  * PRIV_JAIL_ATTACH.  Fails with EINVAL if the jail is missing or not alive.
  */
 int
 sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
 {
 	struct prison *target;
 	int rv;
 
 	rv = priv_check(td, PRIV_JAIL_ATTACH);
 	if (rv != 0)
 		return (rv);
 
 	sx_slock(&allprison_lock);
 	/* On success the prison comes back with its mutex held. */
 	target = prison_find_child(td->td_ucred->cr_prison, uap->jid);
 	if (target != NULL) {
 		/* Only a live prison may be attached to. */
 		if (prison_isalive(target))
 			return (do_jail_attach(td, target,
 			    PD_LOCKED | PD_LIST_SLOCKED));
 		mtx_unlock(&target->pr_mtx);
 	}
 	sx_sunlock(&allprison_lock);
 	return (EINVAL);
 }
 
 /*
  * Attach the calling thread's process to prison pr.
  *
  * Entered with pr's mutex held and allprison_lock held (shared or
  * exclusive); drflags tells which PD_* lock flags describe that state.
  * Both locks are released before returning, on success and on failure.
  * Returns 0 or an errno value.
  */
 static int
 do_jail_attach(struct thread *td, struct prison *pr, int drflags)
 {
 	struct proc *p;
 	struct ucred *newcred, *oldcred;
 	int error;
 
 	mtx_assert(&pr->pr_mtx, MA_OWNED);
 	sx_assert(&allprison_lock, SX_LOCKED);
 	/* Only the lock-state bits of the caller's flags are honoured. */
 	drflags &= PD_LOCK_FLAGS;
 	/*
 	 * XXX: Note that there is a slight race here if two threads
 	 * in the same privileged process attempt to attach to two
 	 * different jails at the same time.  It is important for
 	 * user processes not to do this, or they might end up with
 	 * a process root from one prison, but attached to the jail
 	 * of another.
 	 */
 	/*
 	 * Take a reference and a user reference for the attaching process,
 	 * and record them in drflags so the prison_deref() calls below
 	 * account for them.
 	 */
 	prison_hold(pr);
 	refcount_acquire(&pr->pr_uref);
 	drflags |= PD_DEREF | PD_DEUREF;
 	mtx_unlock(&pr->pr_mtx);
 	drflags &= ~PD_LOCKED;
 
 	/* Let modules do whatever they need to prepare for attaching. */
 	error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
 	if (error) {
 		prison_deref(pr, drflags);
 		return (error);
 	}
 	sx_unlock(&allprison_lock);
 	drflags &= ~(PD_LIST_SLOCKED | PD_LIST_XLOCKED);
 
 	/*
 	 * Reparent the newly attached process to this jail.
 	 */
 	p = td->td_proc;
 	error = cpuset_setproc_update_set(p, pr->pr_cpuset);
 	if (error)
 		goto e_revert_osd;
 
 	/* Check, with the root vnode locked, that it can become the root. */
 	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
 	if ((error = change_dir(pr->pr_root, td)) != 0)
 		goto e_unlock;
 #ifdef MAC
 	if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
 		goto e_unlock;
 #endif
 	VOP_UNLOCK(pr->pr_root);
 	if ((error = pwd_chroot_chdir(td, pr->pr_root)))
 		goto e_revert_osd;
 
 	/* Install a copy of the credentials that points at the new prison. */
 	newcred = crget();
 	PROC_LOCK(p);
 	oldcred = crcopysafe(p, newcred);
 	newcred->cr_prison = pr;
 	proc_set_cred(p, newcred);
 	setsugid(p);
 #ifdef RACCT
 	racct_proc_ucred_changed(p, oldcred, newcred);
 	/* NOTE(review): presumably keeps newcred valid for the RCTL call. */
 	crhold(newcred);
 #endif
 	PROC_UNLOCK(p);
 #ifdef RCTL
 	rctl_proc_ucred_changed(p, newcred);
 	crfree(newcred);
 #endif
 	prison_proc_relink(oldcred->cr_prison, pr, p);
 	/*
 	 * drflags still carries PD_DEREF | PD_DEUREF here, now applied to
 	 * the prison the process has just left.
 	 */
 	prison_deref(oldcred->cr_prison, drflags);
 	crfree(oldcred);
 
 	/*
 	 * If the prison was killed while changing credentials, die along
 	 * with it.
 	 */
 	if (!prison_isalive(pr)) {
 		PROC_LOCK(p);
 		kern_psignal(p, SIGKILL);
 		PROC_UNLOCK(p);
 	}
 
 	return (0);
 
  e_unlock:
 	VOP_UNLOCK(pr->pr_root);
  e_revert_osd:
 	/* Tell modules this thread is still in its old jail after all. */
 	sx_slock(&allprison_lock);
 	drflags |= PD_LIST_SLOCKED;
 	(void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td);
 	/* Give back the references taken on pr and drop allprison_lock. */
 	prison_deref(pr, drflags);
 	return (error);
 }
 
 /*
  * Returns a locked prison instance, or NULL on failure.
  */
 struct prison *
 prison_find(int prid)
 {
 	struct prison *pr;
 
 	sx_assert(&allprison_lock, SX_LOCKED);
 	TAILQ_FOREACH(pr, &allprison, pr_list) {
 		if (pr->pr_id < prid)
 			continue;
 		if (pr->pr_id > prid)
 			break;
 		KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr));
 		mtx_lock(&pr->pr_mtx);
 		return (pr);
 	}
 	return (NULL);
 }
 
 /*
  * Look up a prison by ID among the descendants of mypr.  The caller must
  * hold allprison_lock.  Returns the prison with its mutex held, or NULL.
  */
 struct prison *
 prison_find_child(struct prison *mypr, int prid)
 {
 	struct prison *child;
 	int descend;
 
 	sx_assert(&allprison_lock, SX_LOCKED);
 	FOREACH_PRISON_DESCENDANT(mypr, child, descend) {
 		if (child->pr_id != prid)
 			continue;
 		KASSERT(prison_isvalid(child),
 		    ("Found invalid prison %p", child));
 		mtx_lock(&child->pr_mtx);
 		return (child);
 	}
 	return (NULL);
 }
 
 /*
  * Look up a prison by name among the descendants of mypr, where the name
  * is relative to mypr.  The caller must hold allprison_lock.  A live match
  * is returned at once; otherwise the last non-live match found, if any.
  * The returned prison has its mutex held.  Returns NULL when nothing
  * matches.
  */
 struct prison *
 prison_find_name(struct prison *mypr, const char *name)
 {
 	struct prison *cand, *dying;
 	size_t skip;
 	int descend;
 
 	sx_assert(&allprison_lock, SX_LOCKED);
 	/* Length of the "<mypr name>." prefix to skip in descendant names. */
 	skip = 0;
 	if (mypr != &prison0)
 		skip = strlen(mypr->pr_name) + 1;
 	dying = NULL;
 	FOREACH_PRISON_DESCENDANT(mypr, cand, descend) {
 		if (strcmp(cand->pr_name + skip, name) != 0)
 			continue;
 		KASSERT(prison_isvalid(cand),
 		    ("Found invalid prison %p", cand));
 		if (!prison_isalive(cand)) {
 			/* Remember it, but keep looking for a live one. */
 			dying = cand;
 			continue;
 		}
 		mtx_lock(&cand->pr_mtx);
 		return (cand);
 	}
 	/* No live match; fall back to a non-live one if there was any. */
 	if (dying == NULL)
 		return (NULL);
 	mtx_lock(&dying->pr_mtx);
 	return (dying);
 }
 
 /*
  * See if a prison has the specific flag set.  The prison should be locked,
  * unless checking for flags that are only set at jail creation (such as
  * PR_IP4 and PR_IP6), or only the single bit is examined, without regard
  * to any other prison data.
  */
 bool
 prison_flag(struct ucred *cred, unsigned flag)
 {
 
 	return ((cred->cr_prison->pr_flags & flag) != 0);
 }
 
 /*
  * See if a prison has the specific allow flag set.
  * The prison *should* be locked, or only a single bit is examined, without
  * regard to any other prison data.
  */
 bool
 prison_allow(struct ucred *cred, unsigned flag)
 {
 
 	return ((cred->cr_prison->pr_allow & flag) != 0);
 }
 
 /*
  * Hold a prison reference, by incrementing pr_ref.  It is generally
  * an error to hold a prison that does not already have a reference.
  * A prison record will remain valid as long as it has at least one
  * reference, and will not be removed as long as either the prison
  * mutex or the allprison lock is held (allprison_lock may be shared).
  *
  * This variant simply forwards to prison_hold(); the prison no longer
  * has to be locked for it.
  */
 void
 prison_hold_locked(struct prison *pr)
 {
 	prison_hold(pr);
 }
 
 /*
  * Take a reference on a prison by incrementing pr_ref.  With INVARIANTS
  * the reference is only taken if the count is not already zero, and a
  * zero count is asserted against.
  */
 void
 prison_hold(struct prison *pr)
 {
 #ifdef INVARIANTS
 	int acquired;
 
 	acquired = refcount_acquire_if_not_zero(&pr->pr_ref);
 	KASSERT(acquired,
 	    ("Trying to hold dead prison %p (jid=%d).", pr, pr->pr_id));
 #else
 	refcount_acquire(&pr->pr_ref);
 #endif
 }
 
 /*
  * Remove a prison reference.  If that was the last reference, the
  * prison will be removed (at a later time).
  *
  * The caller passes the prison with its mutex held; the mutex is released
  * here before the reference is dropped through prison_free().
  */
 void
 prison_free_locked(struct prison *pr)
 {
 	mtx_assert(&pr->pr_mtx, MA_OWNED);
 
 	/* The lock is not needed for the release; callers expect the unlock. */
 	mtx_unlock(&pr->pr_mtx);
 	prison_free(pr);
 }
 
 /*
  * Drop one reference on a prison.  If this would be the last reference,
  * it is not dropped here; the prison's task is queued instead so the
  * final release happens from the taskqueue thread.
  */
 void
 prison_free(struct prison *pr)
 {
 	KASSERT(refcount_load(&pr->pr_ref) > 0,
 	    ("Trying to free dead prison %p (jid=%d).",
 	     pr, pr->pr_id));
 
 	/* The common case: other references remain. */
 	if (refcount_release_if_not_last(&pr->pr_ref))
 		return;
 
 	/*
 	 * Don't remove the last reference in this context,
 	 * in case there are locks held.
 	 */
 	taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
 }
 
 /*
  * Drop one reference on a prison when the caller knows it is not the last
  * one.  With INVARIANTS that assumption is asserted.
  */
 static void
 prison_free_not_last(struct prison *pr)
 {
 #ifdef INVARIANTS
 	int was_last;
 
 	KASSERT(refcount_load(&pr->pr_ref) > 0,
 	    ("Trying to free dead prison %p (jid=%d).",
 	     pr, pr->pr_id));
 	was_last = refcount_release(&pr->pr_ref);
 	KASSERT(!was_last,
 	    ("prison_free_not_last freed last ref on prison %p (jid=%d).",
 	     pr, pr->pr_id));
 #else
 	refcount_release(&pr->pr_ref);
 #endif
 }
 
 /*
  * Hold a prison for user visibility, by incrementing pr_uref.
  * It is generally an error to hold a prison that isn't already
  * user-visible, except through the jail system calls.  It is also
  * an error to hold an invalid prison.  A prison record will remain
  * alive as long as it has at least one user reference, and will not
  * be set to the dying state until the prison mutex and allprison_lock
  * are both freed.
  *
  * With INVARIANTS the user reference is only taken if the count is not
  * already zero, and a zero count is asserted against.
  */
 void
 prison_proc_hold(struct prison *pr)
 {
 #ifdef INVARIANTS
 	int acquired;
 
 	acquired = refcount_acquire_if_not_zero(&pr->pr_uref);
 	KASSERT(acquired,
 	    ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
 #else
 	refcount_acquire(&pr->pr_uref);
 #endif
 }
 
 /*
  * Remove a prison user reference.  If it was the last reference, the
  * prison will be considered "dying", and may be removed once all of
  * its references are dropped.
  *
  * The last user reference is not dropped here.  Instead the prison is
  * flagged with PR_COMPLETE_PROC and its task is queued to finish the job.
  */
 void
 prison_proc_free(struct prison *pr)
 {
 	/*
 	 * Locking is only required when releasing the last reference.
 	 * This allows assurance that a locked prison will remain alive
 	 * until it is unlocked.
 	 */
 	KASSERT(refcount_load(&pr->pr_uref) > 0,
 	    ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
 
 	/* The common case: other user references remain. */
 	if (refcount_release_if_not_last(&pr->pr_uref))
 		return;
 
 	/*
 	 * Don't remove the last user reference in this context,
 	 * which is expected to be a process that is not only locked,
 	 * but also half dead.  Add a reference so any calls to
 	 * prison_free() won't re-submit the task.
 	 */
 	prison_hold(pr);
 	mtx_lock(&pr->pr_mtx);
 	KASSERT((pr->pr_flags & PR_COMPLETE_PROC) == 0,
 	    ("Redundant last reference in prison_proc_free (jid=%d)",
 	     pr->pr_id));
 	pr->pr_flags |= PR_COMPLETE_PROC;
 	mtx_unlock(&pr->pr_mtx);
 	taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
 }
 
 /*
  * Drop one user reference on a prison when the caller knows it is not the
  * last one.  With INVARIANTS that assumption is asserted.
  */
 static void
 prison_proc_free_not_last(struct prison *pr)
 {
 #ifdef INVARIANTS
 	int was_last;
 
 	KASSERT(refcount_load(&pr->pr_uref) > 0,
 	    ("Trying to free dead prison %p (jid=%d).",
 	     pr, pr->pr_id));
 	was_last = refcount_release(&pr->pr_uref);
 	KASSERT(!was_last,
 	    ("prison_proc_free_not_last freed last uref on prison %p (jid=%d).",
 	     pr, pr->pr_id));
 #else
 	refcount_release(&pr->pr_uref);
 #endif
 }
 
 /*
  * Put process p at the head of pr's process list.  The caller must hold
  * allproc_lock exclusively.
  */
 void
 prison_proc_link(struct prison *pr, struct proc *p)
 {
 	sx_assert(&allproc_lock, SA_XLOCKED);
 
 	LIST_INSERT_HEAD(&pr->pr_proclist, p, p_jaillist);
 }
 
 /*
  * Take process p off the per-prison process list it is linked on.  The
  * caller must hold allproc_lock exclusively.  The pr argument is not
  * consulted by the removal itself.
  */
 void
 prison_proc_unlink(struct prison *pr, struct proc *p)
 {
 	sx_assert(&allproc_lock, SA_XLOCKED);
 
 	LIST_REMOVE(p, p_jaillist);
 }
 
 /*
  * Move process p from the process list of opr to that of npr, holding
  * allproc_lock exclusively across both steps.
  */
 static void
 prison_proc_relink(struct prison *opr, struct prison *npr, struct proc *p)
 {
 	sx_xlock(&allproc_lock);
 	prison_proc_unlink(opr, p);
 	prison_proc_link(npr, p);
 	sx_xunlock(&allproc_lock);
 }
 
 /*
  * Task handler that finishes a prison_free or prison_proc_free call.
  */
 static void
 prison_complete(void *context, int pending)
 {
 	struct prison *pr;
 	int drflags;
 
 	pr = context;
 
 	/*
 	 * The task runs either to drop the last reference, or to drop the
 	 * last user reference together with the extra reference that
 	 * prison_proc_free took.  PR_COMPLETE_PROC tells the two apart.
 	 */
 	drflags = prison_lock_xlock(pr, PD_DEREF);
 	if ((pr->pr_flags & PR_COMPLETE_PROC) != 0) {
 		drflags |= PD_DEUREF;
 		pr->pr_flags &= ~PR_COMPLETE_PROC;
 	}
 	prison_deref(pr, drflags);
 }
 
 /*
  * prison_proc_iterate() callback: send SIGKILL to the process.
  */
 static void
 prison_kill_processes_cb(struct proc *p, void *arg __unused)
 {
 	kern_psignal(p, SIGKILL);
 }
 
 /*
  * Call cb(p, cbarg) for processes belonging to prison pr.  The callback
  * runs with the process locked and allproc_lock held shared.
  *
  * When pr has no child jails only pr's own process list is walked;
  * otherwise every process in the system is examined and the callback is
  * invoked for those whose prison is pr or one of its descendants.
  *
  * Note the iteration does not guarantee acting on all processes.
  * Most notably there may be fork or jail_attach in progress.
  */
 void
 prison_proc_iterate(struct prison *pr, void (*cb)(struct proc *, void *),
     void *cbarg)
 {
 	struct prison *ppr;
 	struct proc *p;
 
 	/* Fast path: no child jails, so pr's own list holds everything. */
 	if (atomic_load_int(&pr->pr_childcount) == 0) {
 		sx_slock(&allproc_lock);
 		LIST_FOREACH(p, &pr->pr_proclist, p_jaillist) {
 			/* Skip processes that are still being created. */
 			if (p->p_state == PRS_NEW)
 				continue;
 			PROC_LOCK(p);
 			cb(p, cbarg);
 			PROC_UNLOCK(p);
 		}
 		sx_sunlock(&allproc_lock);
 		if (atomic_load_int(&pr->pr_childcount) == 0)
 			return;
 		/*
 		 * Some jails popped up during the iteration, fall through to a
 		 * system-wide search.
 		 */
 	}
 
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		if (p->p_state != PRS_NEW && p->p_ucred != NULL) {
 			/*
 			 * Walk up from the process's prison towards prison0,
 			 * looking for pr among its ancestors (or itself).
 			 */
 			for (ppr = p->p_ucred->cr_prison;
 			    ppr != &prison0;
 			    ppr = ppr->pr_parent) {
 				if (ppr == pr) {
 					cb(p, cbarg);
 					break;
 				}
 			}
 		}
 		PROC_UNLOCK(p);
 	}
 	sx_sunlock(&allproc_lock);
 }
 
 /*
  * Remove a prison reference and/or user reference (usually).
  * This assumes context that allows sleeping (for allprison_lock),
  * with no non-sleeping locks held, except perhaps the prison itself.
  * If there are no more references, release and delist the prison.
  * On completion, the prison lock and the allprison lock are both
  * unlocked.
  *
  * The flags argument combines requested actions with the caller's
  * current locking state:
  *   PD_DEREF         drop a reference (pr_ref)
  *   PD_DEUREF        drop a user reference (pr_uref)
  *   PD_KILL          kill the prison and its descendants
  *   PD_LOCKED        pr->pr_mtx is held
  *   PD_LIST_SLOCKED  allprison_lock is held shared
  *   PD_LIST_XLOCKED  allprison_lock is held exclusive
  */
 static void
 prison_deref(struct prison *pr, int flags)
 {
 	struct prisonlist freeprison;
 	struct prison *killpr, *rpr, *ppr, *tpr;
 
 	/*
 	 * killpr remembers a killed prison whose processes still need to be
 	 * signalled; freeprison collects prisons that lost their last
 	 * reference so they can be torn down after the locks are released.
 	 */
 	killpr = NULL;
 	TAILQ_INIT(&freeprison);
 	/*
 	 * Release this prison as requested, which may cause its parent
 	 * to be released, and then maybe its grandparent, etc.
 	 */
 	for (;;) {
 		if (flags & PD_KILL) {
 			/* Kill the prison and its descendents. */
 			KASSERT(pr != &prison0,
 			    ("prison_deref trying to kill prison0"));
 			/*
 			 * Take a reference of our own if the caller did not
 			 * pass one in; it is dropped by the PD_DEREF step
 			 * below.
 			 */
 			if (!(flags & PD_DEREF)) {
 				prison_hold(pr);
 				flags |= PD_DEREF;
 			}
 			flags = prison_lock_xlock(pr, flags);
 			prison_deref_kill(pr, &freeprison);
 		}
 		if (flags & PD_DEUREF) {
 			/* Drop a user reference. */
 			KASSERT(refcount_load(&pr->pr_uref) > 0,
 			    ("prison_deref PD_DEUREF on a dead prison (jid=%d)",
 			     pr->pr_id));
 			/*
 			 * Only the release of the last user reference needs
 			 * the locks; any other release is done lock-free.
 			 */
 			if (!refcount_release_if_not_last(&pr->pr_uref)) {
 				if (!(flags & PD_DEREF)) {
 					prison_hold(pr);
 					flags |= PD_DEREF;
 				}
 				flags = prison_lock_xlock(pr, flags);
 				if (refcount_release(&pr->pr_uref) &&
 				    pr->pr_state == PRISON_STATE_ALIVE) {
 					/*
 					 * When the last user references goes,
 					 * this becomes a dying prison.
 					 */
 					KASSERT(
 					    refcount_load(&prison0.pr_uref) > 0,
 					    ("prison0 pr_uref=0"));
 					pr->pr_state = PRISON_STATE_DYING;
 					/*
 					 * prison_cleanup() asserts that the
 					 * prison mutex is not owned.
 					 */
 					mtx_unlock(&pr->pr_mtx);
 					flags &= ~PD_LOCKED;
 					prison_cleanup(pr);
 				}
 			}
 		}
 		if (flags & PD_KILL) {
 			/*
 			 * Any remaining user references are probably processes
 			 * that need to be killed, either in this prison or its
 			 * descendants.
 			 */
 			if (refcount_load(&pr->pr_uref) > 0)
 				killpr = pr;
 			/* Make sure the parent prison doesn't get killed. */
 			flags &= ~PD_KILL;
 		}
 		if (flags & PD_DEREF) {
 			/* Drop a reference. */
 			KASSERT(refcount_load(&pr->pr_ref) > 0,
 			    ("prison_deref PD_DEREF on a dead prison (jid=%d)",
 			     pr->pr_id));
 			if (!refcount_release_if_not_last(&pr->pr_ref)) {
 				flags = prison_lock_xlock(pr, flags);
 				if (refcount_release(&pr->pr_ref)) {
 					/*
 					 * When the last reference goes,
 					 * unlink the prison and set it aside.
 					 */
 					KASSERT(
 					    refcount_load(&pr->pr_uref) == 0,
 					    ("prison_deref: last ref, "
 					     "but still has %d urefs (jid=%d)",
 					     pr->pr_uref, pr->pr_id));
 					KASSERT(
 					    refcount_load(&prison0.pr_ref) != 0,
 					    ("prison0 pr_ref=0"));
 					pr->pr_state = PRISON_STATE_INVALID;
 					TAILQ_REMOVE(&allprison, pr, pr_list);
 					LIST_REMOVE(pr, pr_sibling);
 					TAILQ_INSERT_TAIL(&freeprison, pr,
 					    pr_list);
 					/* Every ancestor has one less descendant. */
 					for (ppr = pr->pr_parent;
 					     ppr != NULL;
 					     ppr = ppr->pr_parent)
 						ppr->pr_childcount--;
 					/*
 					 * Removing a prison frees references
 					 * from its parent.
 					 */
 					mtx_unlock(&pr->pr_mtx);
 					flags &= ~PD_LOCKED;
 					/*
 					 * Go around again for the parent,
 					 * dropping both kinds of reference.
 					 */
 					pr = pr->pr_parent;
 					flags |= PD_DEREF | PD_DEUREF;
 					continue;
 				}
 			}
 		}
 		break;
 	}
 
 	/* Release all the prison locks. */
 	if (flags & PD_LOCKED)
 		mtx_unlock(&pr->pr_mtx);
 	if (flags & PD_LIST_SLOCKED)
 		sx_sunlock(&allprison_lock);
 	else if (flags & PD_LIST_XLOCKED)
 		sx_xunlock(&allprison_lock);
 
 	/* Kill any processes attached to a killed prison. */
 	if (killpr != NULL)
 		prison_proc_iterate(killpr, prison_kill_processes_cb, NULL);
 
 	/*
 	 * Finish removing any unreferenced prisons, which couldn't happen
 	 * while allprison_lock was held (to avoid a LOR on vrele).
 	 */
 	TAILQ_FOREACH_SAFE(rpr, &freeprison, pr_list, tpr) {
 #ifdef VIMAGE
 		/* Destroy the vnet only if it differs from the parent's. */
 		if (rpr->pr_vnet != rpr->pr_parent->pr_vnet)
 			vnet_destroy(rpr->pr_vnet);
 #endif
 		if (rpr->pr_root != NULL)
 			vrele(rpr->pr_root);
 		mtx_destroy(&rpr->pr_mtx);
 #ifdef INET
 		prison_ip_free(rpr->pr_addrs[PR_INET]);
 #endif
 #ifdef INET6
 		prison_ip_free(rpr->pr_addrs[PR_INET6]);
 #endif
 		if (rpr->pr_cpuset != NULL)
 			cpuset_rel(rpr->pr_cpuset);
 		osd_jail_exit(rpr);
 #ifdef RACCT
 		if (racct_enable)
 			prison_racct_detach(rpr);
 #endif
 		TAILQ_REMOVE(&freeprison, rpr, pr_list);
 		free(rpr, M_PRISON);
 	}
 }
 
 /*
  * Kill the prison and its descendants.  Mark them as dying, clear the
  * persist flag, and call module remove methods.
  *
  * The function is entered with pr->pr_mtx held and returns with it held
  * again; the mutex is released while the descendants are processed.
  * Prisons that lose their last reference here are moved from allprison
  * onto the caller-supplied freeprison list.
  */
 static void
 prison_deref_kill(struct prison *pr, struct prisonlist *freeprison)
 {
 	struct prison *cpr, *ppr, *rpr;
 	bool descend;
 
 	/*
 	 * Unlike the descendants, the target prison can be killed
 	 * even if it is currently dying.  This is useful for failed
 	 * creation in jail_set(2).
 	 */
 	KASSERT(refcount_load(&pr->pr_ref) > 0,
 	    ("Trying to kill dead prison %p (jid=%d).",
 	     pr, pr->pr_id));
 	/*
 	 * Hold a temporary user reference for the duration of this
 	 * function; it is released at the very end.
 	 */
 	refcount_acquire(&pr->pr_uref);
 	pr->pr_state = PRISON_STATE_DYING;
 	mtx_unlock(&pr->pr_mtx);
 
 	/* rpr tracks a removed prison whose sibling unlink is still pending. */
 	rpr = NULL;
 	FOREACH_PRISON_DESCENDANT_PRE_POST(pr, cpr, descend) {
 		if (descend) {
 			/*
 			 * Pre-order visit: skip prisons that are not alive
 			 * (and everything below them); otherwise take both
 			 * kinds of reference and mark the prison as dying and
 			 * pending removal.
 			 */
 			if (!prison_isalive(cpr)) {
 				descend = false;
 				continue;
 			}
 			prison_hold(cpr);
 			prison_proc_hold(cpr);
 			mtx_lock(&cpr->pr_mtx);
 			cpr->pr_state = PRISON_STATE_DYING;
 			cpr->pr_flags |= PR_REMOVE;
 			mtx_unlock(&cpr->pr_mtx);
 			continue;
 		}
 		/* Post-order visit: only act on prisons marked above. */
 		if (!(cpr->pr_flags & PR_REMOVE))
 			continue;
 		prison_cleanup(cpr);
 		mtx_lock(&cpr->pr_mtx);
 		cpr->pr_flags &= ~PR_REMOVE;
 		/* Clearing PR_PERSIST drops the references that came with it. */
 		if (cpr->pr_flags & PR_PERSIST) {
 			cpr->pr_flags &= ~PR_PERSIST;
 			prison_proc_free_not_last(cpr);
 			prison_free_not_last(cpr);
 		}
 		/* Drop the references taken during the pre-order visit. */
 		(void)refcount_release(&cpr->pr_uref);
 		if (refcount_release(&cpr->pr_ref)) {
 			/*
 			 * When the last reference goes, unlink the prison
 			 * and set it aside for prison_deref() to handle.
 			 * Delay unlinking the sibling list to keep the loop
 			 * safe.
 			 */
 			if (rpr != NULL)
 				LIST_REMOVE(rpr, pr_sibling);
 			rpr = cpr;
 			rpr->pr_state = PRISON_STATE_INVALID;
 			TAILQ_REMOVE(&allprison, rpr, pr_list);
 			TAILQ_INSERT_TAIL(freeprison, rpr, pr_list);
 			/*
 			 * Removing a prison frees references from its parent.
 			 */
 			ppr = rpr->pr_parent;
 			prison_proc_free_not_last(ppr);
 			prison_free_not_last(ppr);
 			/* Every ancestor has one less descendant. */
 			for (; ppr != NULL; ppr = ppr->pr_parent)
 				ppr->pr_childcount--;
 		}
 		mtx_unlock(&cpr->pr_mtx);
 	}
 	/* Finish the sibling unlink that was deferred inside the loop. */
 	if (rpr != NULL)
 		LIST_REMOVE(rpr, pr_sibling);
 
 	/* Finally give the target prison itself the same treatment. */
 	prison_cleanup(pr);
 	mtx_lock(&pr->pr_mtx);
 	if (pr->pr_flags & PR_PERSIST) {
 		pr->pr_flags &= ~PR_PERSIST;
 		prison_proc_free_not_last(pr);
 		prison_free_not_last(pr);
 	}
 	/* Release the temporary user reference taken on entry. */
 	(void)refcount_release(&pr->pr_uref);
 }
 
 /*
  * Bring the locking state described by flags to "allprison_lock held
  * exclusively and the prison mutex held", and return the flags updated
  * to describe that state.
  */
 static int
 prison_lock_xlock(struct prison *pr, int flags)
 {
 
 	if ((flags & PD_LIST_XLOCKED) == 0) {
 		/*
 		 * allprison_lock has to be acquired or upgraded.  Either
 		 * may require that the prison mutex be dropped first.
 		 */
 		if ((flags & PD_LOCKED) != 0) {
 			mtx_unlock(&pr->pr_mtx);
 			flags &= ~PD_LOCKED;
 		}
 		if ((flags & PD_LIST_SLOCKED) == 0) {
 			sx_xlock(&allprison_lock);
 		} else {
 			/* Try an in-place upgrade; else drop and re-take. */
 			if (!sx_try_upgrade(&allprison_lock)) {
 				sx_sunlock(&allprison_lock);
 				sx_xlock(&allprison_lock);
 			}
 			flags &= ~PD_LIST_SLOCKED;
 		}
 		flags |= PD_LIST_XLOCKED;
 	}
 
 	/* Take the prison mutex unless it is already held. */
 	if ((flags & PD_LOCKED) == 0) {
 		mtx_lock(&pr->pr_mtx);
 		flags |= PD_LOCKED;
 	}
 
 	return (flags);
 }
 
 /*
  * Give back the resources of a prison that has begun dying, either
  * because its last user reference was dropped or because it was killed.
  * Must be called with allprison_lock held exclusively and the prison
  * mutex not owned.
  */
 static void
 prison_cleanup(struct prison *pr)
 {
 
 	sx_assert(&allprison_lock, SA_XLOCKED);
 	mtx_assert(&pr->pr_mtx, MA_NOTOWNED);
 
 	vfs_exjail_delete(pr);
 	shm_remove_prison(pr);
 	/* Run the PR_METHOD_REMOVE jail methods; the result is ignored. */
 	(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
 }
 
 /*
  * Turn a pr_allow permission bit on or off for the credential's prison.
  * Turning a bit off is a restriction, and is propagated to child jails.
  */
 void
 prison_set_allow(struct ucred *cred, unsigned flag, int enable)
 {
 	struct prison *pr = cred->cr_prison;
 
 	sx_slock(&allprison_lock);
 	mtx_lock(&pr->pr_mtx);
 
 	prison_set_allow_locked(pr, flag, enable);
 
 	mtx_unlock(&pr->pr_mtx);
 	sx_sunlock(&allprison_lock);
 }
 
 /*
  * Worker for prison_set_allow().  Setting a bit affects only pr;
  * clearing a bit also clears it in every descendant of pr.
  */
 static void
 prison_set_allow_locked(struct prison *pr, unsigned flag, int enable)
 {
 	struct prison *cpr;
 	int descend;
 
 	if (enable != 0) {
 		pr->pr_allow |= flag;
 		return;
 	}
 
 	pr->pr_allow &= ~flag;
 	FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend)
 		cpr->pr_allow &= ~flag;
 }
 
 /*
  * Check whether the credential's jail supports address family af.
  *
  * Returns 0 when the family is usable (including the case of a prison
  * that owns its vnet), or EAFNOSUPPORT when it is not.
  */
 int
 prison_check_af(struct ucred *cred, int af)
 {
 	struct prison *pr;
 	int error = 0;
 
 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
 
 	pr = cred->cr_prison;
 #ifdef VIMAGE
 	/* A prison that has its own network stack is not restricted. */
 	if (prison_owns_vnet(cred))
 		return (0);
 #endif
 
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		/* Unlocked pre-check; re-tested below under the mutex. */
 		if ((pr->pr_flags & PR_IP4) == 0)
 			break;
 		mtx_lock(&pr->pr_mtx);
 		if ((pr->pr_flags & PR_IP4) != 0 &&
 		    pr->pr_addrs[PR_INET] == NULL)
 			error = EAFNOSUPPORT;
 		mtx_unlock(&pr->pr_mtx);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		/* Unlocked pre-check; re-tested below under the mutex. */
 		if ((pr->pr_flags & PR_IP6) == 0)
 			break;
 		mtx_lock(&pr->pr_mtx);
 		if ((pr->pr_flags & PR_IP6) != 0 &&
 		    pr->pr_addrs[PR_INET6] == NULL)
 			error = EAFNOSUPPORT;
 		mtx_unlock(&pr->pr_mtx);
 		break;
 #endif
 	case AF_LOCAL:
 	case AF_ROUTE:
 	case AF_NETLINK:
 		/* These families are always permitted. */
 		break;
 	default:
 		/* Anything else requires PR_ALLOW_SOCKET_AF. */
 		if ((pr->pr_allow & PR_ALLOW_SOCKET_AF) == 0)
 			error = EAFNOSUPPORT;
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Check whether the address in sa belongs to the jail referenced by cred
  * (a wrapper around prison_check_ip[46]).
  *
  * Returns 0 if the jail does not restrict the address family or if the
  * address belongs to the jail, EADDRNOTAVAIL if the address does not
  * belong, or EAFNOSUPPORT if the jail does not allow the address family.
  * An IPv4 address is passed in network byte order.
  */
 int
 prison_if(struct ucred *cred, const struct sockaddr *sa)
 {
 	int error = 0;
 
 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
 	KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
 
 #ifdef VIMAGE
 	if (prison_owns_vnet(cred))
 		return (0);
 #endif
 
 	switch (sa->sa_family) {
 #ifdef INET
 	case AF_INET:
 		error = prison_check_ip4(cred,
 		    &((const struct sockaddr_in *)sa)->sin_addr);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		error = prison_check_ip6(cred,
 		    &((const struct sockaddr_in6 *)sa)->sin6_addr);
 		break;
 #endif
 	default:
 		/* Other families are allowed only with PR_ALLOW_SOCKET_AF. */
 		if ((cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF) == 0)
 			error = EAFNOSUPPORT;
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Decide whether jails permit the subject cred1 to act on cred2.
  * Returns 0 when both share a prison or when cred2's prison is a
  * descendant of cred1's, and ESRCH otherwise.
  */
 int
 prison_check(struct ucred *cred1, struct ucred *cred2)
 {
 
 	if (cred1->cr_prison == cred2->cr_prison)
 		return (0);
 	if (prison_ischild(cred1->cr_prison, cred2->cr_prison))
 		return (0);
 	return (ESRCH);
 }
 
 /*
  * Requirements for mountd/nfsd to run within a prison:
  * - It must be a vnet prison.
  * - PR_ALLOW_NFSD must be set on it.
  * - The prison's root directory (pr_root) must be a file system mount
  *   point, so that mountd can hang export information on it.
  * - The prison's enforce_statfs must be non-zero, so that mountd(8)
  *   can do exports.
  *
  * Returns true only when all of these hold.
  */
 bool
 prison_check_nfsd(struct ucred *cred)
 {
 
 	/* The tests short-circuit in the order listed above. */
 	return (!jailed_without_vnet(cred) &&
 	    prison_allow(cred, PR_ALLOW_NFSD) &&
 	    (cred->cr_prison->pr_root->v_vflag & VV_ROOT) != 0 &&
 	    cred->cr_prison->pr_enforce_statfs != 0);
 }
 
 /*
  * Return true if pr2 is a descendant of pr1 (pr1 appears somewhere in
  * pr2's chain of parents), otherwise false.  A prison is not considered
  * a child of itself.
  */
 bool
 prison_ischild(struct prison *pr1, struct prison *pr2)
 {
 	struct prison *ancestor;
 
 	ancestor = pr2->pr_parent;
 	while (ancestor != NULL) {
 		if (ancestor == pr1)
 			return (true);
 		ancestor = ancestor->pr_parent;
 	}
 	return (false);
 }
 
 /*
  * Return true if the prison is currently alive, i.e. its state is
  * PRISON_STATE_ALIVE.  A prison is alive if it holds user references and
  * it isn't being removed.
  */
 bool
 prison_isalive(const struct prison *pr)
 {
 
 	/* Not being alive is the unexpected case. */
 	return (!__predict_false(pr->pr_state != PRISON_STATE_ALIVE));
 }
 
 /*
  * Return true if the prison is currently valid: it has been fully
  * created and is not being destroyed.  Dying prisons still count as
  * valid.  Invalid prisons won't be found under normal circumstances, as
  * they're only put in that state by functions that have an exclusive
  * hold on allprison_lock.
  */
 bool
 prison_isvalid(struct prison *pr)
 {
 
 	/* Invalid means marked as such, or left with no references. */
 	return (!(__predict_false(pr->pr_state == PRISON_STATE_INVALID) ||
 	    __predict_false(refcount_load(&pr->pr_ref) == 0)));
 }
 
 /*
  * Return true if the credential is jailed and its jail does not own a
  * virtual network stack; return false in every other case.
  */
 bool
 jailed_without_vnet(struct ucred *cred)
 {
 
 #ifdef VIMAGE
 	return (jailed(cred) && !prison_owns_vnet(cred));
 #else
 	/* Without VIMAGE no jail can own a vnet. */
 	return (jailed(cred));
 #endif
 }
 
 /*
  * Copy the hostname that applies to the given credential into buf,
  * which is size bytes long.
  */
 void
 getcredhostname(struct ucred *cred, char *buf, size_t size)
 {
 	struct prison *pr;
 
 	/*
 	 * Passing a NULL credential is a shortcut for asking for the
 	 * physical system's hostname.
 	 */
 	if (cred == NULL)
 		pr = &prison0;
 	else
 		pr = cred->cr_prison;
 
 	mtx_lock(&pr->pr_mtx);
 	strlcpy(buf, pr->pr_hostname, size);
 	mtx_unlock(&pr->pr_mtx);
 }
 
 /*
  * Copy the domain name of the credential's prison into buf, which is
  * size bytes long.  The copy is made under the prison mutex.
  */
 void
 getcreddomainname(struct ucred *cred, char *buf, size_t size)
 {
 	struct prison *pr = cred->cr_prison;
 
 	mtx_lock(&pr->pr_mtx);
 	strlcpy(buf, pr->pr_domainname, size);
 	mtx_unlock(&pr->pr_mtx);
 }
 
 /*
  * Copy the host UUID of the credential's prison into buf, which is
  * size bytes long.  The copy is made under the prison mutex.
  */
 void
 getcredhostuuid(struct ucred *cred, char *buf, size_t size)
 {
 	struct prison *pr = cred->cr_prison;
 
 	mtx_lock(&pr->pr_mtx);
 	strlcpy(buf, pr->pr_hostuuid, size);
 	mtx_unlock(&pr->pr_mtx);
 }
 
 void
 getcredhostid(struct ucred *cred, unsigned long *hostid)
 {
 
 	mtx_lock(&cred->cr_prison->pr_mtx);
 	*hostid = cred->cr_prison->pr_hostid;
 	mtx_unlock(&cred->cr_prison->pr_mtx);
 }
 
 /*
  * Copy the name of the credential's prison into name, which is len
  * bytes long.  The copy is made under the prison mutex.
  */
 void
 getjailname(struct ucred *cred, char *name, size_t len)
 {
 	struct prison *pr = cred->cr_prison;
 
 	mtx_lock(&pr->pr_mtx);
 	strlcpy(name, pr->pr_name, len);
 	mtx_unlock(&pr->pr_mtx);
 }
 
 #ifdef VIMAGE
 /*
  * Determine whether the prison represented by cred owns
  * its vnet rather than having it inherited.
  *
  * Returns true in case the prison owns the vnet, false otherwise.
  */
 bool
 prison_owns_vnet(struct ucred *cred)
 {
 
 	/*
 	 * vnets cannot be added/removed after jail creation,
 	 * so no need to lock here.
 	 */
 	return ((cred->cr_prison->pr_flags & PR_VNET) != 0);
 }
 #endif
 
 /*
  * Determine whether the subject represented by cred can "see" the
  * status of a mount point.
  * Returns: 0 for permitted, ENOENT otherwise.
  * XXX: This function should be called cr_canseemount() and should be
  *      placed in kern_prot.c.
  */
 int
 prison_canseemount(struct ucred *cred, struct mount *mp)
 {
 	struct prison *pr = cred->cr_prison;
 	const char *mntpath;
 	size_t len;
 	char tail;
 
 	/* No enforcement, or the jail's own root mount: always visible. */
 	if (pr->pr_enforce_statfs == 0)
 		return (0);
 	if (pr->pr_root->v_mount == mp)
 		return (0);
 	/* Level 2 hides everything except the root mount. */
 	if (pr->pr_enforce_statfs == 2)
 		return (ENOENT);
 
 	/*
 	 * A jail whose chroot directory is "/" should be able to see all
 	 * mount points.  The check is ugly, but this is the only situation
 	 * in which the jail's directory ends with '/'.
 	 */
 	if (strcmp(pr->pr_path, "/") == 0)
 		return (0);
 
 	/* The mount point must lie under the jail's path. */
 	mntpath = mp->mnt_stat.f_mntonname;
 	len = strlen(pr->pr_path);
 	if (strncmp(pr->pr_path, mntpath, len) != 0)
 		return (ENOENT);
 
 	/*
 	 * Guard against a jail root of "/some/path" matching a mount point
 	 * of "/some/pathpath": the prefix must end on a path boundary.
 	 */
 	tail = mntpath[len];
 	return ((tail == '\0' || tail == '/') ? 0 : ENOENT);
 }
 
 /*
  * Rewrite sp->f_mntonname so that it reflects what the jail behind cred
  * is allowed to see of mount point mp: "[restricted]" for a mount the
  * jail cannot see, "/" for the jail's root mount, and otherwise the
  * mount path with the jail's path prefix stripped.
  */
 void
 prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
 {
 	char jpath[MAXPATHLEN];
 	struct prison *pr = cred->cr_prison;
 	size_t len;
 
 	if (pr->pr_enforce_statfs == 0)
 		return;
 
 	if (prison_canseemount(cred, mp) != 0) {
 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
 		strlcpy(sp->f_mntonname, "[restricted]",
 		    sizeof(sp->f_mntonname));
 		return;
 	}
 
 	if (pr->pr_root->v_mount == mp) {
 		/*
 		 * Wipe the buffer first so that nothing of the real path
 		 * is left behind.
 		 */
 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
 		sp->f_mntonname[0] = '/';
 		return;
 	}
 
 	/*
 	 * A jail whose chroot directory is "/" should be able to see all
 	 * mount points unmodified.
 	 */
 	if (strcmp(pr->pr_path, "/") == 0)
 		return;
 
 	/* Save the part of the mount path that follows the jail's path. */
 	len = strlen(pr->pr_path);
 	strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
 
 	/*
 	 * Wipe the buffer first so that nothing of the real path is left
 	 * behind.
 	 */
 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
 	if (jpath[0] != '\0') {
 		strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
 	} else {
 		/* Should never happen. */
 		sp->f_mntonname[0] = '/';
 	}
 }
 
 /*
  * Check whether permission for a specific privilege is granted within
  * jail.  We have a specific list of accepted privileges; the rest are
  * denied.
  *
  * Returns 0 if the privilege is permitted (always the case for a
  * credential that is not jailed), or EPERM if it is denied.
  */
 int
 prison_priv_check(struct ucred *cred, int priv)
 {
 	struct prison *pr;
 	int error;
 
 	/*
 	 * Some policies have custom handlers. This routine should not be
 	 * called for them. See priv_check_cred().
 	 */
 	switch (priv) {
 	case PRIV_VFS_LOOKUP:
 	case PRIV_VFS_GENERATION:
 		KASSERT(0, ("prison_priv_check instead of a custom handler "
 		    "called for %d\n", priv));
 	}
 
 	if (!jailed(cred))
 		return (0);
 
 #ifdef VIMAGE
 	/*
 	 * Privileges specific to prisons with a virtual network stack.
 	 * There might be a duplicate entry here in case the privilege
 	 * is only granted conditionally in the legacy jail case.
 	 */
 	switch (priv) {
 		/*
 		 * NFS-specific privileges.
 		 */
 	case PRIV_NFS_DAEMON:
 	case PRIV_VFS_GETFH:
 	case PRIV_VFS_MOUNT_EXPORTED:
 		if (!prison_check_nfsd(cred))
 			return (EPERM);
 		/*
 		 * FALLTHROUGH: when the nfsd check passes, these are
 		 * subject to the PR_VNET test at the end of this switch
 		 * like the privileges below.
 		 */
 #ifdef notyet
 	case PRIV_NFS_LOCKD:
 #endif
 		/*
 		 * Network stack privileges.
 		 */
 	case PRIV_NET_BRIDGE:
 	case PRIV_NET_GRE:
 	case PRIV_NET_BPF:
 	case PRIV_NET_RAW:		/* Dup, cond. in legacy jail case. */
 	case PRIV_NET_ROUTE:
 	case PRIV_NET_TAP:
 	case PRIV_NET_SETIFMTU:
 	case PRIV_NET_SETIFFLAGS:
 	case PRIV_NET_SETIFCAP:
 	case PRIV_NET_SETIFDESCR:
 	case PRIV_NET_SETIFNAME	:
 	case PRIV_NET_SETIFMETRIC:
 	case PRIV_NET_SETIFPHYS:
 	case PRIV_NET_SETIFMAC:
 	case PRIV_NET_SETLANPCP:
 	case PRIV_NET_ADDMULTI:
 	case PRIV_NET_DELMULTI:
 	case PRIV_NET_HWIOCTL:
 	case PRIV_NET_SETLLADDR:
 	case PRIV_NET_ADDIFGROUP:
 	case PRIV_NET_DELIFGROUP:
 	case PRIV_NET_IFCREATE:
 	case PRIV_NET_IFDESTROY:
 	case PRIV_NET_ADDIFADDR:
 	case PRIV_NET_DELIFADDR:
 	case PRIV_NET_LAGG:
 	case PRIV_NET_GIF:
 	case PRIV_NET_SETIFVNET:
 	case PRIV_NET_SETIFFIB:
 	case PRIV_NET_OVPN:
 	case PRIV_NET_ME:
 	case PRIV_NET_WG:
 
 		/*
 		 * 802.11-related privileges.
 		 */
 	case PRIV_NET80211_VAP_GETKEY:
 	case PRIV_NET80211_VAP_MANAGE:
 
 #ifdef notyet
 		/*
 		 * ATM privileges.
 		 */
 	case PRIV_NETATM_CFG:
 	case PRIV_NETATM_ADD:
 	case PRIV_NETATM_DEL:
 	case PRIV_NETATM_SET:
 
 		/*
 		 * Bluetooth privileges.
 		 */
 	case PRIV_NETBLUETOOTH_RAW:
 #endif
 
 		/*
 		 * Netgraph and netgraph module privileges.
 		 */
 	case PRIV_NETGRAPH_CONTROL:
 #ifdef notyet
 	case PRIV_NETGRAPH_TTY:
 #endif
 
 		/*
 		 * IPv4 and IPv6 privileges.
 		 */
 	case PRIV_NETINET_IPFW:
 	case PRIV_NETINET_DIVERT:
 	case PRIV_NETINET_PF:
 	case PRIV_NETINET_DUMMYNET:
 	case PRIV_NETINET_CARP:
 	case PRIV_NETINET_MROUTE:
 	case PRIV_NETINET_RAW:
 	case PRIV_NETINET_ADDRCTRL6:
 	case PRIV_NETINET_ND6:
 	case PRIV_NETINET_SCOPE6:
 	case PRIV_NETINET_ALIFETIME6:
 	case PRIV_NETINET_IPSEC:
 	case PRIV_NETINET_BINDANY:
 
 #ifdef notyet
 		/*
 		 * NCP privileges.
 		 */
 	case PRIV_NETNCP:
 
 		/*
 		 * SMB privileges.
 		 */
 	case PRIV_NETSMB:
 #endif
 
 	/*
 	 * No default: or deny here.
 	 * In case of no permit fall through to next switch().
 	 */
 		if (cred->cr_prison->pr_flags & PR_VNET)
 			return (0);
 	}
 #endif /* VIMAGE */
 
 	switch (priv) {
 		/*
 		 * Allow ktrace privileges for root in jail.
 		 */
 	case PRIV_KTRACE:
 
 #if 0
 		/*
 		 * Allow jailed processes to configure audit identity and
 		 * submit audit records (login, etc).  In the future we may
 		 * want to further refine the relationship between audit and
 		 * jail.
 		 */
 	case PRIV_AUDIT_GETAUDIT:
 	case PRIV_AUDIT_SETAUDIT:
 	case PRIV_AUDIT_SUBMIT:
 #endif
 
 		/*
 		 * Allow jailed processes to manipulate process UNIX
 		 * credentials in any way they see fit.
 		 */
 	case PRIV_CRED_SETUID:
 	case PRIV_CRED_SETEUID:
 	case PRIV_CRED_SETGID:
 	case PRIV_CRED_SETEGID:
 	case PRIV_CRED_SETGROUPS:
 	case PRIV_CRED_SETREUID:
 	case PRIV_CRED_SETREGID:
 	case PRIV_CRED_SETRESUID:
 	case PRIV_CRED_SETRESGID:
 
 		/*
 		 * Jail implements visibility constraints already, so allow
 		 * jailed root to override uid/gid-based constraints.
 		 */
 	case PRIV_SEEOTHERGIDS:
 	case PRIV_SEEOTHERUIDS:
 	case PRIV_SEEJAILPROC:
 
 		/*
 		 * Jail implements inter-process debugging limits already, so
 		 * allow jailed root various debugging privileges.
 		 */
 	case PRIV_DEBUG_DIFFCRED:
 	case PRIV_DEBUG_SUGID:
 	case PRIV_DEBUG_UNPRIV:
 
 		/*
 		 * Allow jail to set various resource limits and login
 		 * properties, and for now, exceed process resource limits.
 		 */
 	case PRIV_PROC_LIMIT:
 	case PRIV_PROC_SETLOGIN:
 	case PRIV_PROC_SETRLIMIT:
 
 		/*
 		 * System V and POSIX IPC privileges are granted in jail.
 		 */
 	case PRIV_IPC_READ:
 	case PRIV_IPC_WRITE:
 	case PRIV_IPC_ADMIN:
 	case PRIV_IPC_MSGSIZE:
 	case PRIV_MQ_ADMIN:
 
 		/*
 		 * Jail operations within a jail work on child jails.
 		 */
 	case PRIV_JAIL_ATTACH:
 	case PRIV_JAIL_SET:
 	case PRIV_JAIL_REMOVE:
 
 		/*
 		 * Jail implements its own inter-process limits, so allow
 		 * root processes in jail to change scheduling on other
 		 * processes in the same jail.  Likewise for signalling.
 		 */
 	case PRIV_SCHED_DIFFCRED:
 	case PRIV_SCHED_CPUSET:
 	case PRIV_SIGNAL_DIFFCRED:
 	case PRIV_SIGNAL_SUGID:
 
 		/*
 		 * Allow jailed processes to write to sysctls marked as jail
 		 * writable.
 		 */
 	case PRIV_SYSCTL_WRITEJAIL:
 
 		/*
 		 * Allow root in jail to manage a variety of quota
 		 * properties.  These should likely be conditional on a
 		 * configuration option.
 		 */
 	case PRIV_VFS_GETQUOTA:
 	case PRIV_VFS_SETQUOTA:
 
 		/*
 		 * Since Jail relies on chroot() to implement file system
 		 * protections, grant many VFS privileges to root in jail.
 		 * Be careful to exclude mount-related and NFS-related
 		 * privileges.
 		 */
 	case PRIV_VFS_READ:
 	case PRIV_VFS_WRITE:
 	case PRIV_VFS_ADMIN:
 	case PRIV_VFS_EXEC:
 	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
 	case PRIV_VFS_CHFLAGS_DEV:
 	case PRIV_VFS_CHOWN:
 	case PRIV_VFS_CHROOT:
 	case PRIV_VFS_RETAINSUGID:
 	case PRIV_VFS_FCHROOT:
 	case PRIV_VFS_LINK:
 	case PRIV_VFS_SETGID:
 	case PRIV_VFS_STAT:
 	case PRIV_VFS_STICKYFILE:
 
 		/*
 		 * As in the non-jail case, non-root users are expected to be
 		 * able to read kernel/physical memory (provided /dev/[k]mem
 		 * exists in the jail and they have permission to access it).
 		 */
 	case PRIV_KMEM_READ:
 		/* Every privilege listed above is granted unconditionally. */
 		return (0);
 
 		/*
 		 * Depending on the global setting, allow privilege of
 		 * setting system flags.
 		 */
 	case PRIV_VFS_SYSFLAGS:
 		if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
 			return (0);
 		else
 			return (EPERM);
 
 		/*
 		 * Depending on the global setting, allow privilege of
 		 * mounting/unmounting file systems.
 		 */
 	case PRIV_VFS_MOUNT:
 	case PRIV_VFS_UNMOUNT:
 	case PRIV_VFS_MOUNT_NONUSER:
 	case PRIV_VFS_MOUNT_OWNER:
 		pr = cred->cr_prison;
 		/* Both fields are read together under the prison lock. */
 		prison_lock(pr);
 		if (pr->pr_allow & PR_ALLOW_MOUNT && pr->pr_enforce_statfs < 2)
 			error = 0;
 		else
 			error = EPERM;
 		prison_unlock(pr);
 		return (error);
 
 		/*
 		 * Jails should hold no disposition on the PRIV_VFS_READ_DIR
 		 * policy.  priv_check_cred will not specifically allow it, and
 		 * we may want a MAC policy to allow it.
 		 */
 	case PRIV_VFS_READ_DIR:
 		return (0);
 
 		/*
 		 * Conditionally allow locking (unlocking) physical pages
 		 * in memory.
 		 */
 	case PRIV_VM_MLOCK:
 	case PRIV_VM_MUNLOCK:
 		if (cred->cr_prison->pr_allow & PR_ALLOW_MLOCK)
 			return (0);
 		else
 			return (EPERM);
 
 		/*
 		 * Conditionally allow jailed root to bind reserved ports.
 		 */
 	case PRIV_NETINET_RESERVEDPORT:
 		if (cred->cr_prison->pr_allow & PR_ALLOW_RESERVED_PORTS)
 			return (0);
 		else
 			return (EPERM);
 
 		/*
 		 * Allow jailed root to reuse in-use ports.
 		 */
 	case PRIV_NETINET_REUSEPORT:
 		return (0);
 
 		/*
 		 * Allow jailed root to set certain IPv4/6 (option) headers.
 		 */
 	case PRIV_NETINET_SETHDROPTS:
 		return (0);
 
 		/*
 		 * Conditionally allow creating raw sockets in jail.
 		 */
 	case PRIV_NETINET_RAW:
 		if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
 			return (0);
 		else
 			return (EPERM);
 
 		/*
 		 * Since jail implements its own visibility limits on netstat
 		 * sysctls, allow getcred.  This allows identd to work in
 		 * jail.
 		 */
 	case PRIV_NETINET_GETCRED:
 		return (0);
 
 		/*
 		 * Allow jailed root to set loginclass.
 		 */
 	case PRIV_PROC_SETLOGINCLASS:
 		return (0);
 
 		/*
 		 * Do not allow a process inside a jail to read the kernel
 		 * message buffer unless explicitly permitted.
 		 */
 	case PRIV_MSGBUF:
 		if (cred->cr_prison->pr_allow & PR_ALLOW_READ_MSGBUF)
 			return (0);
 		return (EPERM);
 
 	default:
 		/*
 		 * In all remaining cases, deny the privilege request.  This
 		 * includes almost all network privileges, many system
 		 * configuration privileges.
 		 */
 		return (EPERM);
 	}
 }
 
 /*
  * Return the portion of pr2's name that is relative to pr1.  If pr2 is
  * not a descendant of pr1, the whole name is returned.
  */
 
 char *
 prison_name(struct prison *pr1, struct prison *pr2)
 {
 	char *name;
 
 	/* Jails see themselves as "0" (if they see themselves at all). */
 	if (pr1 == pr2)
 		return ("0");
 
 	name = pr2->pr_name;
 	if (!prison_ischild(pr1, pr2))
 		return (name);
 
 	/*
 	 * pr1 isn't locked (and allprison_lock may not be either), so the
 	 * length of its name can't be relied upon.  The number of dots can
 	 * be, though: skip one dot-separated component of pr2's name for
 	 * each level between pr1 and prison0.
 	 */
 	while (pr1 != &prison0) {
 		name = strchr(name, '.') + 1;
 		pr1 = pr1->pr_parent;
 	}
 	return (name);
 }
 
 /*
  * Return the portion of pr2's path that is relative to pr1's path.  If
  * pr2's path does not lie under pr1's, the whole path is returned.
  */
 static char *
 prison_path(struct prison *pr1, struct prison *pr2)
 {
 	char *path1, *path2;
 	int len1;
 
 	path1 = pr1->pr_path;
 	path2 = pr2->pr_path;
 
 	/* Everything is already relative to a root of "/". */
 	if (strcmp(path1, "/") == 0)
 		return (path2);
 
 	len1 = strlen(path1);
 	if (strncmp(path1, path2, len1) != 0)
 		return (path2);
 
 	/* The prefix matched; see whether it ends on a path boundary. */
 	switch (path2[len1]) {
 	case '\0':
 		/* The two paths are identical. */
 		return ("/");
 	case '/':
 		return (path2 + len1);
 	default:
 		/* Only a partial component matched, e.g. "/a/b" vs "/a/bc". */
 		return (path2);
 	}
 }
 
 /*
  * Jail-related sysctls.
  *
  * This declares the "jail" node beneath _security; the jail sysctls
  * defined later in this file attach to it as _security_jail.
  */
 static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Jails");
 
 #if defined(INET) || defined(INET6)
 /*
  * Helper for sysctl_jail_list(): copy a prison's address array for
  * family af into memory that will later be passed to SYSCTL_OUT.
  *
  * *out is the destination buffer and *len its capacity in addresses.
  * When the buffer is too small it is reallocated, with the prison mutex
  * dropped around the sleeping allocation, and the check is then
  * repeated.  The prison mutex must be held on entry and is held on
  * return.
  */
 static void
 prison_ip_copyout(struct prison *pr, const pr_family_t af, void **out, int *len)
 {
 	const struct prison_ip *pip;
 	const size_t size = pr_families[af].size;
 
 	for (;;) {
 		mtx_assert(&pr->pr_mtx, MA_OWNED);
 		pip = pr->pr_addrs[af];
 		if (pip == NULL)
 			return;
 		if (*len >= pip->ips)
 			break;
 		/*
 		 * Grow the buffer.  The address list may change while the
 		 * mutex is released, so go around and look at it again.
 		 */
 		*len = pip->ips;
 		mtx_unlock(&pr->pr_mtx);
 		*out = realloc(*out, *len * size, M_TEMP, M_WAITOK);
 		mtx_lock(&pr->pr_mtx);
 	}
 	bcopy(pip->pr_ip, *out, pip->ips * size);
 }
 #endif
 
 /*
  * Handler for the security.jail.list sysctl.
  *
  * For each prison visited by FOREACH_PRISON_DESCENDANT() starting from
  * the calling thread's prison, emit a struct xprison followed by that
  * prison's IPv4 addresses and then its IPv6 addresses.  Returns 0 on
  * success or the first error reported by SYSCTL_OUT().
  */
 static int
 sysctl_jail_list(SYSCTL_HANDLER_ARGS)
 {
 	struct xprison *xp;
 	struct prison *pr, *cpr;
 #ifdef INET
 	/* Scratch buffer for IPv4 addresses and its capacity in entries. */
 	struct in_addr *ip4 = NULL;
 	int ip4s = 0;
 #endif
 #ifdef INET6
 	/* Scratch buffer for IPv6 addresses and its capacity in entries. */
 	struct in6_addr *ip6 = NULL;
 	int ip6s = 0;
 #endif
 	int descend, error;
 
 	xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
 	pr = req->td->td_ucred->cr_prison;
 	error = 0;
 	sx_slock(&allprison_lock);
 	FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
 		mtx_lock(&cpr->pr_mtx);
 #ifdef INET
 		prison_ip_copyout(cpr, PR_INET, (void **)&ip4, &ip4s);
 #endif
 #ifdef INET6
 		prison_ip_copyout(cpr, PR_INET6, (void **)&ip6, &ip6s);
 #endif
 		bzero(xp, sizeof(*xp));
 		xp->pr_version = XPRISON_VERSION;
 		xp->pr_id = cpr->pr_id;
 		xp->pr_state = cpr->pr_state;
 		strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
 		strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
 		strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
 		/*
 		 * Report this prison's own address counts, read while its
 		 * mutex is still held.  ip4s/ip6s are only the capacities of
 		 * the scratch buffers, which never shrink, so using them
 		 * would over-report for any prison with fewer addresses than
 		 * an earlier one and expose that earlier prison's stale
 		 * addresses.
 		 */
 #ifdef INET
 		xp->pr_ip4s = (cpr->pr_addrs[PR_INET] != NULL) ?
 		    cpr->pr_addrs[PR_INET]->ips : 0;
 #endif
 #ifdef INET6
 		xp->pr_ip6s = (cpr->pr_addrs[PR_INET6] != NULL) ?
 		    cpr->pr_addrs[PR_INET6]->ips : 0;
 #endif
 		mtx_unlock(&cpr->pr_mtx);
 		error = SYSCTL_OUT(req, xp, sizeof(*xp));
 		if (error)
 			break;
 #ifdef INET
 		if (xp->pr_ip4s > 0) {
 			error = SYSCTL_OUT(req, ip4,
 			    xp->pr_ip4s * sizeof(struct in_addr));
 			if (error)
 				break;
 		}
 #endif
 #ifdef INET6
 		if (xp->pr_ip6s > 0) {
 			error = SYSCTL_OUT(req, ip6,
 			    xp->pr_ip6s * sizeof(struct in6_addr));
 			if (error)
 				break;
 		}
 #endif
 	}
 	sx_sunlock(&allprison_lock);
 	free(xp, M_TEMP);
 #ifdef INET
 	free(ip4, M_TEMP);
 #endif
 #ifdef INET6
 	free(ip6, M_TEMP);
 #endif
 	return (error);
 }
 
 SYSCTL_OID(_security_jail, OID_AUTO, list,
     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_jail_list, "S", "List of active jails");
 
 /*
  * Handler for security.jail.jailed: report the result of jailed() for
  * the requesting thread's credential as an int.
  */
 static int
 sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
 {
 	int injail = jailed(req->td->td_ucred);
 
 	return (SYSCTL_OUT(req, &injail, sizeof(injail)));
 }
 
 SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_jail_jailed, "I", "Process in jail?");
 
 /*
  * Handler for security.jail.vnet: report 1 when the requesting credential
  * is jailed and its prison owns a vnet, 0 otherwise.  Without VIMAGE the
  * answer is always 0.
  */
 static int
 sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)
 {
 	int havevnet = 0;
 #ifdef VIMAGE
 	struct ucred *cred = req->td->td_ucred;
 
 	if (jailed(cred) && prison_owns_vnet(cred))
 		havevnet = 1;
 #endif
 
 	return (SYSCTL_OUT(req, &havevnet, sizeof(havevnet)));
 }
 
 SYSCTL_PROC(_security_jail, OID_AUTO, vnet,
     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_jail_vnet, "I", "Jail owns vnet?");
 
 #if defined(INET) || defined(INET6)
 /*
  * Read-write security.jail.jail_max_af_ips knob backed directly by the
  * jail_max_af_ips variable; only present when at least one of INET or
  * INET6 is configured.
  */
 SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
     &jail_max_af_ips, 0,
     "Number of IP addresses a jail may have at most per address family (deprecated)");
 #endif
 
 /*
  * Default parameters for jail(2) compatibility.  For historical reasons,
  * the sysctl names have varying similarity to the parameter names.  Prisons
  * just see their own parameters, and can't change them.
  */
 static int
 sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
 {
 	int error, i;
 
 	/* Get the current flag value, and convert it to a boolean. */
 	if (req->td->td_ucred->cr_prison == &prison0) {
 		mtx_lock(&prison0.pr_mtx);
 		i = (jail_default_allow & arg2) != 0;
 		mtx_unlock(&prison0.pr_mtx);
 	} else
 		i = prison_allow(req->td->td_ucred, arg2);
 
 	if (arg1 != NULL)
 		i = !i;
 	error = sysctl_handle_int(oidp, &i, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	i = i ? arg2 : 0;
 	if (arg1 != NULL)
 		i ^= arg2;
 	/*
 	 * The sysctls don't have CTLFLAGS_PRISON, so assume prison0
 	 * for writing.
 	 */
 	mtx_lock(&prison0.pr_mtx);
 	jail_default_allow = (jail_default_allow & ~arg2) | i;
 	mtx_unlock(&prison0.pr_mtx);
 	return (0);
 }
 
 /*
  * Legacy security.jail.* permission toggles.  Each one is served by
  * sysctl_jail_default_allow() with the corresponding PR_ALLOW_* bit as
  * arg2.  socket_unixiproute_only passes a non-NULL arg1, so the handler
  * presents the negation of PR_ALLOW_SOCKET_AF.
  */
 SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
     "Processes in jail can set their hostnames (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
     "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
     "Processes in jail can use System V IPC primitives (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
     "Prison root can create raw sockets (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
     "Processes in jail can alter system file flags (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
     "Processes in jail can mount/unmount jail-friendly file systems (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, mlock_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_MLOCK, sysctl_jail_default_allow, "I",
     "Processes in jail can lock/unlock physical pages in memory");
 
 static int
 sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
 {
 	struct prison *pr;
 	int level, error;
 
 	pr = req->td->td_ucred->cr_prison;
 	level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
 	error = sysctl_handle_int(oidp, &level, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	*(int *)arg1 = level;
 	return (0);
 }
 
 SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
     sysctl_jail_default_level, "I",
     "Processes in jail cannot see all mounted file systems (deprecated)");
 
 SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset,
     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
     &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum),
     sysctl_jail_default_level, "I",
     "Ruleset for the devfs filesystem in jail (deprecated)");
 
 /*
  * Nodes to describe jail parameters.  Maximum length of string parameters
  * is returned in the string itself, and the other parameters exist merely
  * to make themselves and their types known.
  */
 SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Jail parameters");
 
 /*
  * Handler shared by the security.jail.param.* description nodes.  Numeric
  * parameters report a zero of the matching width, string parameters report
  * arg2 formatted as a decimal string, and struct parameters report arg2 as
  * a size_t.  Any other type produces no output and returns 0.
  */
 int
 sysctl_jail_param(SYSCTL_HANDLER_ARGS)
 {
 	char numbuf[12];
 	size_t structsz;
 	long lzero;
 	int izero;
 
 	switch (oidp->oid_kind & CTLTYPE) {
 	case CTLTYPE_LONG:
 	case CTLTYPE_ULONG:
 #ifdef SCTL_MASK32
 		/* With SCTL_MASK32 set, answer with an int-sized zero. */
 		if (req->flags & SCTL_MASK32) {
 			izero = 0;
 			return (SYSCTL_OUT(req, &izero, sizeof(izero)));
 		}
 #endif
 		lzero = 0;
 		return (SYSCTL_OUT(req, &lzero, sizeof(lzero)));
 	case CTLTYPE_INT:
 	case CTLTYPE_UINT:
 		izero = 0;
 		return (SYSCTL_OUT(req, &izero, sizeof(izero)));
 	case CTLTYPE_STRING:
 		snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2);
 		return (sysctl_handle_string(oidp, numbuf, sizeof(numbuf),
 		    req));
 	case CTLTYPE_STRUCT:
 		structsz = (size_t)arg2;
 		return (SYSCTL_OUT(req, &structsz, sizeof(structsz)));
 	default:
 		break;
 	}
 	return (0);
 }
 
 /*
  * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at
  * jail creation time but cannot be changed in an existing jail.
  */
 /* Parameters declared directly under the param node (empty prefix). */
 SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
 SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
 SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
 SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
 SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
     "I", "Jail secure level");
 SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I",
     "Jail value for kern.osreldate and uname -K");
 SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN,
     "Jail value for kern.osrelease and uname -r");
 SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
     "I", "Jail cannot see all mounted file systems");
 SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW,
     "I", "Ruleset for in-jail devfs mounts");
 SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail persistence");
 #ifdef VIMAGE
 /* Only declared when the kernel is built with VIMAGE. */
 SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
     "E,jailsys", "Virtual network stack");
 #endif
 SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
     "B", "Jail is in the process of shutting down");
 
 /* children.*: current and maximum number of child jails. */
 SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
 SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
     "I", "Current number of child jails");
 SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
     "I", "Maximum number of child jails");
 
 /* host.*: per-jail host identity (hostname, domainname, UUID, host ID). */
 SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
 SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
     "Jail hostname");
 SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
     "Jail NIS domainname");
 SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
     "Jail host UUID");
 SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
     "LU", "Jail host ID");
 
 /* cpuset.*: read-only ID of the jail's cpuset. */
 SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
 SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");
 
 /*
  * ip4.* and ip6.*: per-family address parameters, each compiled in only
  * when the corresponding address family is configured.  The addr entries
  * are struct parameters whose element size is that of one address.
  */
 #ifdef INET
 SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
     "Jail IPv4 address virtualization");
 SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
     "S,in_addr,a", "Jail IPv4 addresses");
 SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Do (not) use IPv4 source address selection rather than the "
     "primary jail IPv4 address.");
 #endif
 #ifdef INET6
 SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
     "Jail IPv6 address virtualization");
 SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
     "S,in6_addr,a", "Jail IPv6 addresses");
 SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Do (not) use IPv6 source address selection rather than the "
     "primary jail IPv6 address.");
 #endif
 
 /*
  * allow.*: statically declared boolean permission parameters.  Further
  * allow.* entries can be created at run time by prison_add_allow().
  */
 SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
 SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may set hostname");
 SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may use SYSV IPC");
 SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may create raw sockets");
 SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may alter system file flags");
 SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may set file quotas");
 SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
 SYSCTL_JAIL_PARAM(_allow, mlock, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may lock (unlock) physical pages in memory");
 SYSCTL_JAIL_PARAM(_allow, reserved_ports, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may bind sockets to reserved ports");
 SYSCTL_JAIL_PARAM(_allow, read_msgbuf, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may read the kernel message buffer");
 SYSCTL_JAIL_PARAM(_allow, unprivileged_proc_debug, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Unprivileged processes may use process debugging facilities");
 SYSCTL_JAIL_PARAM(_allow, suser, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Processes in jail with uid 0 have privilege");
 #ifdef VIMAGE
 /* Only declared when the kernel is built with VIMAGE. */
 SYSCTL_JAIL_PARAM(_allow, nfsd, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Mountd/nfsd may run in the jail");
 #endif
 
 /* allow.mount.*: the node itself also carries a general mount flag. */
 SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags");
 SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may mount/unmount jail-friendly file systems in general");
 
 /*
  * Add a dynamic parameter allow.<name>, or allow.<prefix>.<name>.  Return
  * its associated bit in the pr_allow bitmask, or zero if the parameter was
  * not created.
  *
  * prefix may be NULL, in which case the parameter lives directly under
  * "allow".  prefix_descr and descr are the sysctl descriptions for the
  * prefix node and the parameter.  If a parameter of the same name is
  * already recorded in pr_flag_allow, its existing bit is returned and
  * nothing new is created.
  */
 unsigned
 prison_add_allow(const char *prefix, const char *name, const char *prefix_descr,
     const char *descr)
 {
 	struct bool_flags *bf;
 	struct sysctl_oid *parent;
 	char *allow_name, *allow_noname, *allowed;
 #ifndef NO_SYSCTL_DESCR
 	char *descr_deprecated;
 #endif
 	u_int allow_flag;
 
 	/*
 	 * Build the positive ("allow.x") and negated ("allow.nox") names.
 	 * The second asprintf() is only attempted if the first succeeded.
 	 */
 	if (prefix
 	    ? asprintf(&allow_name, M_PRISON, "allow.%s.%s", prefix, name)
 		< 0 ||
 	      asprintf(&allow_noname, M_PRISON, "allow.%s.no%s", prefix, name)
 		< 0
 	    : asprintf(&allow_name, M_PRISON, "allow.%s", name) < 0 ||
 	      asprintf(&allow_noname, M_PRISON, "allow.no%s", name) < 0) {
 		free(allow_name, M_PRISON);
 		return 0;
 	}
 
 	/*
 	 * See if this parameter has already been added, i.e. a module was
 	 * previously loaded/unloaded.
 	 */
 	mtx_lock(&prison0.pr_mtx);
 	for (bf = pr_flag_allow;
 	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
 		atomic_load_int(&bf->flag) != 0;
 	     bf++) {
 		if (strcmp(bf->name, allow_name) == 0) {
 			allow_flag = bf->flag;
 			goto no_add;
 		}
 	}
 
 	/*
 	 * Find a free bit in pr_allow_all, failing if there are none
 	 * (which shouldn't happen as long as we keep track of how many
 	 * potential dynamic flags exist).
 	 */
 	for (allow_flag = 1;; allow_flag <<= 1) {
 		/* The shift ran off the top: every bit is taken. */
 		if (allow_flag == 0)
 			goto no_add;
 		if ((pr_allow_all & allow_flag) == 0)
 			break;
 	}
 
 	/* Note the parameter in the next open slot in pr_flag_allow. */
 	for (bf = pr_flag_allow; ; bf++) {
 		if (bf == pr_flag_allow + nitems(pr_flag_allow)) {
 			/* This should never happen, but is not fatal. */
 			allow_flag = 0;
 			goto no_add;
 		}
 		if (atomic_load_int(&bf->flag) == 0)
 			break;
 	}
 	/* From here on the name strings belong to the table entry. */
 	bf->name = allow_name;
 	bf->noname = allow_noname;
 	pr_allow_all |= allow_flag;
 	/*
 	 * prison0 always has permission for the new parameter.
 	 * Other jails must have it granted to them.
 	 */
 	prison0.pr_allow |= allow_flag;
 	/* The flag indicates a valid entry, so make sure it is set last. */
 	atomic_store_rel_int(&bf->flag, allow_flag);
 	mtx_unlock(&prison0.pr_mtx);
 
 	/*
 	 * Create sysctls for the parameter, and the back-compat global
 	 * permission.
 	 */
 	parent = prefix
 	    ? SYSCTL_ADD_NODE(NULL,
 		  SYSCTL_CHILDREN(&sysctl___security_jail_param_allow),
 		  OID_AUTO, prefix, CTLFLAG_MPSAFE, 0, prefix_descr)
 	    : &sysctl___security_jail_param_allow;
 	(void)SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(parent), OID_AUTO,
 	    name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
 	    NULL, 0, sysctl_jail_param, "B", descr);
 	/*
 	 * The legacy security.jail.<name>_allowed sysctl is best effort: it
 	 * is skipped if its name cannot be allocated, and the temporary
 	 * name/description strings are freed once it has been added.
 	 */
 	if ((prefix
 	     ? asprintf(&allowed, M_TEMP, "%s_%s_allowed", prefix, name)
 	     : asprintf(&allowed, M_TEMP, "%s_allowed", name)) >= 0) {
 #ifndef NO_SYSCTL_DESCR
 		(void)asprintf(&descr_deprecated, M_TEMP, "%s (deprecated)",
 		    descr);
 #endif
 		(void)SYSCTL_ADD_PROC(NULL,
 		    SYSCTL_CHILDREN(&sysctl___security_jail), OID_AUTO, allowed,
 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, allow_flag,
 		    sysctl_jail_default_allow, "I", descr_deprecated);
 #ifndef NO_SYSCTL_DESCR
 		free(descr_deprecated, M_TEMP);
 #endif
 		free(allowed, M_TEMP);
 	}
 	return allow_flag;
 
  no_add:
 	/*
 	 * Nothing was recorded: drop the lock and the unused name strings.
 	 * allow_flag is either the existing entry's bit or zero.
 	 */
 	mtx_unlock(&prison0.pr_mtx);
 	free(allow_name, M_PRISON);
 	free(allow_noname, M_PRISON);
 	return allow_flag;
 }
 
 /*
  * The VFS system will register jail-aware filesystems here.  They each get
  * a parameter allow.mount.xxxfs and a flag to check when a jailed user
  * attempts to mount.
  */
 void
 prison_add_vfs(struct vfsconf *vfsp)
 {
 	/* Stays NULL when sysctl descriptions are compiled out. */
 	char *descr = NULL;
 
 #ifndef NO_SYSCTL_DESCR
 	(void)asprintf(&descr, M_TEMP, "Jail may mount the %s file system",
 	    vfsp->vfc_name);
 #endif
 	/* Register allow.mount.<fsname> and remember its permission bit. */
 	vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name,
 	    NULL, descr);
 #ifndef NO_SYSCTL_DESCR
 	free(descr, M_TEMP);
 #endif
 }
 
 #ifdef RACCT
 void
 prison_racct_foreach(void (*callback)(struct racct *racct,
     void *arg2, void *arg3), void (*pre)(void), void (*post)(void),
     void *arg2, void *arg3)
 {
 	struct prison_racct *prr;
 
 	ASSERT_RACCT_ENABLED();
 
 	sx_slock(&allprison_lock);
 	if (pre != NULL)
 		(pre)();
 	LIST_FOREACH(prr, &allprison_racct, prr_next)
 		(callback)(prr->prr_racct, arg2, arg3);
 	if (post != NULL)
 		(post)();
 	sx_sunlock(&allprison_lock);
 }
 
 /*
  * Look up the prison_racct called name, creating it if it does not exist.
  * An existing entry gains a reference; a new entry starts with one.
  * Returns NULL for an empty name or one of MAXHOSTNAMELEN or more
  * characters.  The caller must hold allprison_lock exclusively.
  */
 static struct prison_racct *
 prison_racct_find_locked(const char *name)
 {
 	struct prison_racct *prr;
 
 	ASSERT_RACCT_ENABLED();
 	sx_assert(&allprison_lock, SA_XLOCKED);
 
 	if (*name == '\0' || strlen(name) >= MAXHOSTNAMELEN)
 		return (NULL);
 
 	/* Reuse an existing entry with a matching name. */
 	LIST_FOREACH(prr, &allprison_racct, prr_next) {
 		if (strcmp(name, prr->prr_name) == 0) {
 			prison_racct_hold(prr);
 			return (prr);
 		}
 	}
 
 	/* No match: allocate, initialise and publish a new entry. */
 	prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK);
 	racct_create(&prr->prr_racct);
 
 	refcount_init(&prr->prr_refcount, 1);
 	strcpy(prr->prr_name, name);
 	LIST_INSERT_HEAD(&allprison_racct, prr, prr_next);
 
 	return (prr);
 }
 
 /*
  * Wrapper around prison_racct_find_locked() that takes allprison_lock
  * exclusively for the duration of the lookup.
  */
 struct prison_racct *
 prison_racct_find(const char *name)
 {
 	struct prison_racct *found;
 
 	ASSERT_RACCT_ENABLED();
 
 	sx_xlock(&allprison_lock);
 	found = prison_racct_find_locked(name);
 	sx_xunlock(&allprison_lock);
 
 	return (found);
 }
 
 /* Take an additional reference on prr. */
 void
 prison_racct_hold(struct prison_racct *prr)
 {
 
 	ASSERT_RACCT_ENABLED();
 
 	refcount_acquire(&prr->prr_refcount);
 }
 
 /*
  * Drop one reference on prr.  When the last reference goes away, destroy
  * its racct, unlink it from allprison_racct and free it.  The caller must
  * hold allprison_lock exclusively.
  */
 static void
 prison_racct_free_locked(struct prison_racct *prr)
 {
 
 	ASSERT_RACCT_ENABLED();
 	sx_assert(&allprison_lock, SA_XLOCKED);
 
 	if (!refcount_release(&prr->prr_refcount))
 		return;
 
 	racct_destroy(&prr->prr_racct);
 	LIST_REMOVE(prr, prr_next);
 	free(prr, M_PRISON_RACCT);
 }
 
 /*
  * Drop one reference on prr.  allprison_lock must not be held on entry;
  * it is only taken (exclusively) when this may be the final reference.
  */
 void
 prison_racct_free(struct prison_racct *prr)
 {
 
 	ASSERT_RACCT_ENABLED();
 	sx_assert(&allprison_lock, SA_UNLOCKED);
 
 	/* Fast path: not the last reference, no lock required. */
 	if (!refcount_release_if_not_last(&prr->prr_refcount)) {
 		sx_xlock(&allprison_lock);
 		prison_racct_free_locked(prr);
 		sx_xunlock(&allprison_lock);
 	}
 }
 
 /*
  * Point pr->pr_prison_racct at the prison_racct named after pr->pr_name,
  * looking it up or creating it via prison_racct_find_locked().  The caller
  * must hold allprison_lock exclusively.
  */
 static void
 prison_racct_attach(struct prison *pr)
 {
 	struct prison_racct *prr;
 
 	ASSERT_RACCT_ENABLED();
 	sx_assert(&allprison_lock, SA_XLOCKED);
 
 	prr = prison_racct_find_locked(pr->pr_name);
 	/* The lookup returns NULL only for an empty or over-long name. */
 	KASSERT(prr != NULL, ("cannot find prison_racct"));
 
 	pr->pr_prison_racct = prr;
 }
 
 /*
  * Handle jail renaming.  From the racct point of view, renaming means
  * moving from one prison_racct to another.
  *
  * Takes allproc_lock (shared) and then allprison_lock (exclusive), and
  * returns with both released.  Does nothing if the jail's name still
  * matches the name of its current prison_racct.
  */
 static void
 prison_racct_modify(struct prison *pr)
 {
 #ifdef RCTL
 	struct proc *p;
 	struct ucred *cred;
 #endif
 	struct prison_racct *oldprr;
 
 	ASSERT_RACCT_ENABLED();
 
 	sx_slock(&allproc_lock);
 	sx_xlock(&allprison_lock);
 
 	/* Name unchanged: nothing to move. */
 	if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) {
 		sx_xunlock(&allprison_lock);
 		sx_sunlock(&allproc_lock);
 		return;
 	}
 
 	/* Remember the old accounting object, then attach under the new name. */
 	oldprr = pr->pr_prison_racct;
 	pr->pr_prison_racct = NULL;
 
 	prison_racct_attach(pr);
 
 	/*
 	 * Move resource utilisation records.
 	 */
 	racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct);
 
 #ifdef RCTL
 	/*
 	 * Force rctl to reattach rules to processes.
 	 */
 	FOREACH_PROC_IN_SYSTEM(p) {
 		/* Hold the credential so it stays valid after PROC_UNLOCK. */
 		PROC_LOCK(p);
 		cred = crhold(p->p_ucred);
 		PROC_UNLOCK(p);
 		rctl_proc_ucred_changed(p, cred);
 		crfree(cred);
 	}
 #endif
 
 	sx_sunlock(&allproc_lock);
 	/* Drop the jail's reference on the old prison_racct. */
 	prison_racct_free_locked(oldprr);
 	sx_xunlock(&allprison_lock);
 }
 
 /*
  * Release the prison's reference on its prison_racct, if it has one, and
  * clear the pointer.  allprison_lock must not be held by the caller.
  */
 static void
 prison_racct_detach(struct prison *pr)
 {
 
 	ASSERT_RACCT_ENABLED();
 	sx_assert(&allprison_lock, SA_UNLOCKED);
 
 	if (pr->pr_prison_racct != NULL) {
 		prison_racct_free(pr->pr_prison_racct);
 		pr->pr_prison_racct = NULL;
 	}
 }
 #endif /* RACCT */
 
 #ifdef DDB
 
 /*
  * Print the fields of one prison to the debugger console: identity,
  * reference counts, state, flags, permissions, host information and,
  * when configured, its IPv4/IPv6 addresses.
  */
 static void
 db_show_prison(struct prison *pr)
 {
 	struct bool_flags *bf;
 	struct jailsys_flags *jsf;
 #if defined(INET) || defined(INET6)
 	int ii;
 	struct prison_ip *pip;
 #endif
 	unsigned f;
 #ifdef INET
 	char ip4buf[INET_ADDRSTRLEN];
 #endif
 #ifdef INET6
 	char ip6buf[INET6_ADDRSTRLEN];
 #endif
 
 	db_printf("prison %p:\n", pr);
 	db_printf(" jid             = %d\n", pr->pr_id);
 	db_printf(" name            = %s\n", pr->pr_name);
 	db_printf(" parent          = %p\n", pr->pr_parent);
 	db_printf(" ref             = %d\n", pr->pr_ref);
 	db_printf(" uref            = %d\n", pr->pr_uref);
 	db_printf(" state           = %s\n",
 	    pr->pr_state == PRISON_STATE_ALIVE ? "alive" :
 	    pr->pr_state == PRISON_STATE_DYING ? "dying" :
 	    "invalid");
 	db_printf(" path            = %s\n", pr->pr_path);
 	db_printf(" cpuset          = %d\n", pr->pr_cpuset
 	    ? pr->pr_cpuset->cs_id : -1);
 #ifdef VIMAGE
 	db_printf(" vnet            = %p\n", pr->pr_vnet);
 #endif
 	db_printf(" root            = %p\n", pr->pr_root);
 	db_printf(" securelevel     = %d\n", pr->pr_securelevel);
 	db_printf(" devfs_rsnum     = %d\n", pr->pr_devfs_rsnum);
 	db_printf(" children.max    = %d\n", pr->pr_childmax);
 	db_printf(" children.cur    = %d\n", pr->pr_childcount);
 	db_printf(" child           = %p\n", LIST_FIRST(&pr->pr_children));
 	db_printf(" sibling         = %p\n", LIST_NEXT(pr, pr_sibling));
 	/* Raw flag word followed by the names of the boolean flags set. */
 	db_printf(" flags           = 0x%x", pr->pr_flags);
 	for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++)
 		if (pr->pr_flags & bf->flag)
 			db_printf(" %s", bf->name);
 	/*
 	 * Terminate the flags line, as is done for the allow line below;
 	 * otherwise the first jailsys entry is appended to it.
 	 */
 	db_printf("\n");
 	/* One line per jailsys setting: disable, new, or inherit. */
 	for (jsf = pr_flag_jailsys;
 	     jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
 	     jsf++) {
 		f = pr->pr_flags & (jsf->disable | jsf->new);
 		db_printf(" %-16s= %s\n", jsf->name,
 		    (f != 0 && f == jsf->disable) ? "disable"
 		    : (f == jsf->new) ? "new"
 		    : "inherit");
 	}
 	/* Raw allow mask followed by the names of the permissions granted. */
 	db_printf(" allow           = 0x%x", pr->pr_allow);
 	for (bf = pr_flag_allow;
 	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
 		atomic_load_int(&bf->flag) != 0;
 	     bf++)
 		if (pr->pr_allow & bf->flag)
 			db_printf(" %s", bf->name);
 	db_printf("\n");
 	db_printf(" enforce_statfs  = %d\n", pr->pr_enforce_statfs);
 	db_printf(" host.hostname   = %s\n", pr->pr_hostname);
 	db_printf(" host.domainname = %s\n", pr->pr_domainname);
 	db_printf(" host.hostuuid   = %s\n", pr->pr_hostuuid);
 	db_printf(" host.hostid     = %lu\n", pr->pr_hostid);
 #ifdef INET
 	if ((pip = pr->pr_addrs[PR_INET]) != NULL) {
 		db_printf(" ip4s            = %d\n", pip->ips);
 		/* Only the first address line carries the label. */
 		for (ii = 0; ii < pip->ips; ii++)
 			db_printf(" %s %s\n",
 			    ii == 0 ? "ip4.addr        =" : "                 ",
 			    inet_ntoa_r(
 			    *(const struct in_addr *)PR_IP(pip, PR_INET, ii),
 			    ip4buf));
 	}
 #endif
 #ifdef INET6
 	if ((pip = pr->pr_addrs[PR_INET6]) != NULL) {
 		db_printf(" ip6s            = %d\n", pip->ips);
 		/* Only the first address line carries the label. */
 		for (ii = 0; ii < pip->ips; ii++)
 			db_printf(" %s %s\n",
 			    ii == 0 ? "ip6.addr        =" : "                 ",
 			    ip6_sprintf(ip6buf,
 			    (const struct in6_addr *)PR_IP(pip, PR_INET6, ii)));
 	}
 #endif
 }
 
 /*
  * DDB "show prison" command.  Without an argument every prison is shown;
  * with one, the argument is taken as a jail ID (0 meaning prison0) and,
  * failing a match, as the address of a struct prison.
  */
 DB_SHOW_COMMAND(prison, db_show_prison_command)
 {
 	struct prison *pr;
 
 	if (have_addr) {
 		if (addr == 0) {
 			pr = &prison0;
 		} else {
 			/* Look for a prison with the ID and with references. */
 			TAILQ_FOREACH(pr, &allprison, pr_list) {
 				if (pr->pr_id == addr && pr->pr_ref > 0)
 					break;
 			}
 			if (pr == NULL) {
 				/* Look again, without requiring a reference. */
 				TAILQ_FOREACH(pr, &allprison, pr_list) {
 					if (pr->pr_id == addr)
 						break;
 				}
 			}
 			if (pr == NULL) {
 				/* Assume address points to a valid prison. */
 				pr = (struct prison *)addr;
 			}
 		}
 		db_show_prison(pr);
 		return;
 	}
 
 	/*
 	 * Show all prisons in the list, and prison0 which is not
 	 * listed.
 	 */
 	db_show_prison(&prison0);
 	if (db_pager_quit)
 		return;
 	TAILQ_FOREACH(pr, &allprison, pr_list) {
 		db_show_prison(pr);
 		if (db_pager_quit)
 			break;
 	}
 }
 
 #endif /* DDB */
diff --git a/sys/kern/kern_ktrace.c b/sys/kern/kern_ktrace.c
index 01158c2c238f..75b81457402b 100644
--- a/sys/kern/kern_ktrace.c
+++ b/sys/kern/kern_ktrace.c
@@ -1,1399 +1,1399 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.
  * Copyright (c) 2005 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_ktrace.c	8.2 (Berkeley) 9/23/93
  */
 
 #include <sys/cdefs.h>
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/systm.h>
 #include <sys/fcntl.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <sys/socket.h>
 #include <sys/stat.h>
 #include <sys/ktrace.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/syslog.h>
 #include <sys/sysproto.h>
 
 #include <security/mac/mac_framework.h>
 
 /*
  * The ktrace facility allows the tracing of certain key events in user space
  * processes, such as system calls, signal delivery, context switches, and
  * user generated events using utrace(2).  It works by streaming event
  * records and data to a vnode associated with the process using the
  * ktrace(2) system call.  In general, records can be written directly from
  * the context that generates the event.  One important exception to this is
  * during a context switch, where sleeping is not permitted.  To handle this
  * case, trace events are generated using in-kernel ktr_request records, and
  * then delivered to disk at a convenient moment -- either immediately, the
  * next traceable event, at system call return, or at process exit.
  *
  * When dealing with multiple threads or processes writing to the same event
  * log, ordering guarantees are weak: specifically, if an event has multiple
  * records (i.e., system call enter and return), they may be interlaced with
  * records from another event.  Process and thread ID information is provided
  * in the record, and user applications can de-interlace events if required.
  */
 
 static MALLOC_DEFINE(M_KTRACE, "KTRACE", "KTRACE");
 
 #ifdef KTRACE
 
 FEATURE(ktrace, "Kernel support for system-call tracing");
 
 #ifndef KTRACE_REQUEST_POOL
 #define	KTRACE_REQUEST_POOL	100
 #endif
 
 /*
  * In-kernel trace request: a record header plus the fixed-size payload
  * for one of the record types below.  Unused requests are kept on the
  * ktr_free list, linked through ktr_list.
  */
 struct ktr_request {
 	struct	ktr_header ktr_header;	/* filled in when the request is obtained */
 	void	*ktr_buffer;		/* reset to NULL on allocation; presumably a
 					   variable-length payload -- confirm */
 	union {
 		struct	ktr_proc_ctor ktr_proc_ctor;
 		struct	ktr_cap_fail ktr_cap_fail;
 		struct	ktr_syscall ktr_syscall;
 		struct	ktr_sysret ktr_sysret;
 		struct	ktr_genio ktr_genio;
 		struct	ktr_psig ktr_psig;
 		struct	ktr_csw ktr_csw;
 		struct	ktr_fault ktr_fault;
 		struct	ktr_faultend ktr_faultend;
 		struct  ktr_struct_array ktr_struct_array;
 	} ktr_data;			/* per-type fixed-size payload */
 	STAILQ_ENTRY(ktr_request) ktr_list;	/* list linkage */
 };
 
 /*
  * Fixed payload length for each record type, indexed by the KTR_* type
  * constant.  A length of 0 means the type has no fixed-size part here;
  * KTR_SYSCALL counts only the part of struct ktr_syscall that precedes
  * ktr_args.
  */
 static const int data_lengths[] = {
 	[KTR_SYSCALL] = offsetof(struct ktr_syscall, ktr_args),
 	[KTR_SYSRET] = sizeof(struct ktr_sysret),
 	[KTR_NAMEI] = 0,
 	[KTR_GENIO] = sizeof(struct ktr_genio),
 	[KTR_PSIG] = sizeof(struct ktr_psig),
 	[KTR_CSW] = sizeof(struct ktr_csw),
 	[KTR_USER] = 0,
 	[KTR_STRUCT] = 0,
 	[KTR_SYSCTL] = 0,
 	[KTR_PROCCTOR] = sizeof(struct ktr_proc_ctor),
 	[KTR_PROCDTOR] = 0,
 	[KTR_CAPFAIL] = sizeof(struct ktr_cap_fail),
 	[KTR_FAULT] = sizeof(struct ktr_fault),
 	[KTR_FAULTEND] = sizeof(struct ktr_faultend),
 	[KTR_STRUCT_ARRAY] = sizeof(struct ktr_struct_array),
 };
 
 static STAILQ_HEAD(, ktr_request) ktr_free;
 
 static SYSCTL_NODE(_kern, OID_AUTO, ktrace, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "KTRACE options");
 
 static u_int ktr_requestpool = KTRACE_REQUEST_POOL;
 TUNABLE_INT("kern.ktrace.request_pool", &ktr_requestpool);
 
 u_int ktr_geniosize = PAGE_SIZE;
 SYSCTL_UINT(_kern_ktrace, OID_AUTO, genio_size, CTLFLAG_RWTUN, &ktr_geniosize,
     0, "Maximum size of genio event payload");
 
 /*
  * Optionally refrain from signalling the traced process, in whose context
  * the ktr record is written.  The limit is taken from the process that
  * set up ktrace, so killing the traced process is not completely fair.
  */
 int ktr_filesize_limit_signal = 0;
 SYSCTL_INT(_kern_ktrace, OID_AUTO, filesize_limit_signal, CTLFLAG_RWTUN,
     &ktr_filesize_limit_signal, 0,
     "Send SIGXFSZ to the traced process when the log size limit is exceeded");
 
 static int print_message = 1;
 static struct mtx ktrace_mtx;
 static struct sx ktrace_sx;
 
 /*
  * Parameters describing where trace output goes.
  * NOTE(review): field meanings below are inferred from names and types;
  * confirm against the code that fills this structure in.
  */
 struct ktr_io_params {
 	struct vnode	*vp;	/* presumably the trace output vnode */
 	struct ucred	*cr;	/* presumably the credential used for writes */
 	off_t		lim;	/* presumably the output file size limit */
 	u_int		refs;	/* presumably a reference count */
 };
 
 static void ktrace_init(void *dummy);
 static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS);
 static u_int ktrace_resize_pool(u_int oldsize, u_int newsize);
 static struct ktr_request *ktr_getrequest_entered(struct thread *td, int type);
 static struct ktr_request *ktr_getrequest(int type);
 static void ktr_submitrequest(struct thread *td, struct ktr_request *req);
 static struct ktr_io_params *ktr_freeproc(struct proc *p);
 static void ktr_freerequest(struct ktr_request *req);
 static void ktr_freerequest_locked(struct ktr_request *req);
 static void ktr_writerequest(struct thread *td, struct ktr_request *req);
 static int ktrcanset(struct thread *,struct proc *);
 static int ktrsetchildren(struct thread *, struct proc *, int, int,
     struct ktr_io_params *);
 static int ktrops(struct thread *, struct proc *, int, int,
     struct ktr_io_params *);
 static void ktrprocctor_entered(struct thread *, struct proc *);
 
 /*
  * ktrace itself generates events, such as context switches, which we do not
  * wish to trace.  Maintain a flag, TDP_INKTRACE, on each thread to determine
  * whether or not it is in a region where tracing of events should be
  * suppressed.
  */
 /* Mark td as being inside ktrace; asserts that it was not already. */
 static void
 ktrace_enter(struct thread *td)
 {
 
 	KASSERT(!(td->td_pflags & TDP_INKTRACE), ("ktrace_enter: flag set"));
 	td->td_pflags |= TDP_INKTRACE;
 }
 
 /* Clear td's TDP_INKTRACE flag; asserts that the flag was set. */
 static void
 ktrace_exit(struct thread *td)
 {
 
 	KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_exit: flag not set"));
 	td->td_pflags &= ~TDP_INKTRACE;
 }
 
 /* Assert that td currently has TDP_INKTRACE set; changes nothing. */
 static void
 ktrace_assert(struct thread *td)
 {
 
 	KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_assert: flag not set"));
 }
 
 /*
  * AST handler registered for TDA_KTRACE by ktrace_init(); it just runs
  * KTRUSERRET() on the thread.  The tda argument is unused.
  */
 static void
 ast_ktrace(struct thread *td, int tda __unused)
 {
 	KTRUSERRET(td);
 }
 
 /*
  * Boot-time initialisation: set up the ktrace locks, fill the free list
  * with ktr_requestpool zeroed requests and register the ktrace AST
  * handler.  The dummy argument is unused.
  */
 static void
 ktrace_init(void *dummy)
 {
 	struct ktr_request *req;
 	int i;
 
 	mtx_init(&ktrace_mtx, "ktrace", NULL, MTX_DEF | MTX_QUIET);
 	sx_init(&ktrace_sx, "ktrace_sx");
 
 	STAILQ_INIT(&ktr_free);
 	i = 0;
 	while (i < ktr_requestpool) {
 		req = malloc(sizeof(*req), M_KTRACE, M_WAITOK | M_ZERO);
 		STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
 		i++;
 	}
 
 	ast_register(TDA_KTRACE, ASTR_ASTF_REQUIRED, 0, ast_ktrace);
 }
 SYSINIT(ktrace_init, SI_SUB_KTRACE, SI_ORDER_ANY, ktrace_init, NULL);
 
 static int
 sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS)
 {
 	struct thread *td;
 	u_int newsize, oldsize, wantsize;
 	int error;
 
 	/* Handle easy read-only case first to avoid warnings from GCC. */
 	if (!req->newptr) {
 		oldsize = ktr_requestpool;
 		return (SYSCTL_OUT(req, &oldsize, sizeof(u_int)));
 	}
 
 	error = SYSCTL_IN(req, &wantsize, sizeof(u_int));
 	if (error)
 		return (error);
 	td = curthread;
 	ktrace_enter(td);
 	oldsize = ktr_requestpool;
 	newsize = ktrace_resize_pool(oldsize, wantsize);
 	ktrace_exit(td);
 	error = SYSCTL_OUT(req, &oldsize, sizeof(u_int));
 	if (error)
 		return (error);
 	if (wantsize > oldsize && newsize < wantsize)
 		return (ENOSPC);
 	return (0);
 }
 SYSCTL_PROC(_kern_ktrace, OID_AUTO, request_pool,
     CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &ktr_requestpool, 0,
     sysctl_kern_ktrace_request_pool, "IU",
     "Pool buffer size for ktrace(1)");
 
 /*
  * Resize the pool of free requests from oldsize towards newsize and
  * return the resulting ktr_requestpool.  Shrinking frees only requests
  * currently on the free list, so it may stop short of newsize.  Also
  * re-arms the "out of request objects" console message.
  */
 static u_int
 ktrace_resize_pool(u_int oldsize, u_int newsize)
 {
 	STAILQ_HEAD(, ktr_request) grown;
 	struct ktr_request *req;
 	int delta;
 
 	print_message = 1;
 	delta = newsize - oldsize;
 	if (delta == 0)
 		return (ktr_requestpool);
 
 	if (delta > 0) {
 		/*
 		 * Grow pool up to newsize.  Allocate onto a private list
 		 * first so that malloc() runs without ktrace_mtx held.
 		 */
 		STAILQ_INIT(&grown);
 		for (; delta > 0; delta--) {
 			req = malloc(sizeof(struct ktr_request), M_KTRACE,
 			    M_WAITOK | M_ZERO);
 			STAILQ_INSERT_HEAD(&grown, req, ktr_list);
 		}
 		mtx_lock(&ktrace_mtx);
 		STAILQ_CONCAT(&ktr_free, &grown);
 		ktr_requestpool += (newsize - oldsize);
 	} else {
 		mtx_lock(&ktrace_mtx);
 		/* Shrink pool down to newsize if possible. */
 		for (; delta < 0; delta++) {
 			req = STAILQ_FIRST(&ktr_free);
 			if (req == NULL)
 				break;
 			STAILQ_REMOVE_HEAD(&ktr_free, ktr_list);
 			ktr_requestpool--;
 			free(req, M_KTRACE);
 		}
 	}
 	mtx_unlock(&ktrace_mtx);
 	return (ktr_requestpool);
 }
 
 /* ktr_getrequest() assumes that ktr_comm[] is the same size as td_name[]. */
 CTASSERT(sizeof(((struct ktr_header *)NULL)->ktr_comm) ==
     (sizeof((struct thread *)NULL)->td_name));
 
 /*
  * Take a request object off the free list and fill in its header for an
  * event of the given type on thread td.  The caller has already done
  * ktrace_enter().
  *
  * Returns NULL if the trace point is not enabled for td (KTRCHECK) or if
  * the free list is empty.  In the latter case KTRFAC_DROP is set on the
  * process so that the next successful request is tagged with KTR_DROP.
  */
 static struct ktr_request *
 ktr_getrequest_entered(struct thread *td, int type)
 {
 	struct ktr_request *req;
 	struct proc *p = td->td_proc;
 	int pm;
 
 	mtx_lock(&ktrace_mtx);
 	if (!KTRCHECK(td, type)) {
 		mtx_unlock(&ktrace_mtx);
 		return (NULL);
 	}
 	req = STAILQ_FIRST(&ktr_free);
 	if (req != NULL) {
 		STAILQ_REMOVE_HEAD(&ktr_free, ktr_list);
 		req->ktr_header.ktr_type = type;
 		/* Report, once, that earlier records were dropped. */
 		if (p->p_traceflag & KTRFAC_DROP) {
 			req->ktr_header.ktr_type |= KTR_DROP;
 			p->p_traceflag &= ~KTRFAC_DROP;
 		}
 		/* The rest of the header is filled in without the mutex. */
 		mtx_unlock(&ktrace_mtx);
 		nanotime(&req->ktr_header.ktr_time);
 		req->ktr_header.ktr_type |= KTR_VERSIONED;
 		req->ktr_header.ktr_pid = p->p_pid;
 		req->ktr_header.ktr_tid = td->td_tid;
 		req->ktr_header.ktr_cpu = PCPU_GET(cpuid);
 		req->ktr_header.ktr_version = KTR_VERSION1;
 		/* Sizes are checked equal by the CTASSERT above. */
 		bcopy(td->td_name, req->ktr_header.ktr_comm,
 		    sizeof(req->ktr_header.ktr_comm));
 		req->ktr_buffer = NULL;
 		req->ktr_header.ktr_len = 0;
 	} else {
 		/* Pool exhausted: remember the drop and warn at most once. */
 		p->p_traceflag |= KTRFAC_DROP;
 		pm = print_message;
 		print_message = 0;
 		mtx_unlock(&ktrace_mtx);
 		if (pm)
 			printf("Out of ktrace request objects.\n");
 	}
 	return (req);
 }
 
 /*
  * Enter ktrace on the current thread and obtain a request of the given
  * type.  On success the thread stays "entered"; the caller leaves via
  * ktr_submitrequest() or an explicit ktrace_exit().  On failure the ktrace
  * section is left here and NULL is returned.
  */
 static struct ktr_request *
 ktr_getrequest(int type)
 {
 	struct thread *self;
 	struct ktr_request *rec;
 
 	self = curthread;
 	ktrace_enter(self);
 	rec = ktr_getrequest_entered(self, type);
 	if (rec != NULL)
 		return (rec);
 
 	/* Nothing to hand back, so undo the ktrace_enter() right away. */
 	ktrace_exit(self);
 	return (NULL);
 }
 
 /*
  * Some trace generation environments don't permit direct access to VFS,
  * such as during a context switch where sleeping is not allowed.  Under these
  * circumstances, queue a request to the thread to be written asynchronously
  * later.
  */
 static void
 ktr_enqueuerequest(struct thread *td, struct ktr_request *req)
 {
 
 	mtx_lock(&ktrace_mtx);
 	STAILQ_INSERT_TAIL(&td->td_proc->p_ktr, req, ktr_list);
 	mtx_unlock(&ktrace_mtx);
 	ast_sched(td, TDA_KTRACE);
 }
 
 /*
  * Drain any pending ktrace records from the per-thread queue to disk.  This
  * is used both internally before committing other records, and also on
  * system call return.  We drain all the ones we can find at the time when
  * drain is requested, but don't keep draining after that as those events
  * may be approximately "after" the current event.
  */
 static void
 ktr_drain(struct thread *td)
 {
 	struct ktr_request *queued_req;
 	STAILQ_HEAD(, ktr_request) local_queue;
 
 	ktrace_assert(td);
 	sx_assert(&ktrace_sx, SX_XLOCKED);
 
 	STAILQ_INIT(&local_queue);
 
 	if (!STAILQ_EMPTY(&td->td_proc->p_ktr)) {
 		mtx_lock(&ktrace_mtx);
 		STAILQ_CONCAT(&local_queue, &td->td_proc->p_ktr);
 		mtx_unlock(&ktrace_mtx);
 
 		while ((queued_req = STAILQ_FIRST(&local_queue))) {
 			STAILQ_REMOVE_HEAD(&local_queue, ktr_list);
 			ktr_writerequest(td, queued_req);
 			ktr_freerequest(queued_req);
 		}
 	}
 }
 
 /*
  * Submit a trace record for immediate commit to disk -- to be used only
  * where entering VFS is OK.  First drain any pending records that may have
  * been cached in the thread.
  *
  * The request is returned to the free pool and the ktrace section opened
  * by the caller is left (ktrace_exit()) before returning.
  */
 static void
 ktr_submitrequest(struct thread *td, struct ktr_request *req)
 {
 
 	ktrace_assert(td);
 
 	/* ktrace_sx is held across the drain and the write of this record. */
 	sx_xlock(&ktrace_sx);
 	ktr_drain(td);
 	ktr_writerequest(td, req);
 	ktr_freerequest(req);
 	sx_xunlock(&ktrace_sx);
 	ktrace_exit(td);
 }
 
 /*
  * Return a request to the free pool, taking ktrace_mtx around the
  * locked variant.
  */
 static void
 ktr_freerequest(struct ktr_request *req)
 {
 
 	mtx_lock(&ktrace_mtx);
 	ktr_freerequest_locked(req);
 	mtx_unlock(&ktrace_mtx);
 }
 
 /*
  * Release a request's attached data buffer, if any, and put the request
  * back on the head of the free list.  ktrace_mtx must be held.
  */
 static void
 ktr_freerequest_locked(struct ktr_request *req)
 {
 
 	mtx_assert(&ktrace_mtx, MA_OWNED);
 	if (req->ktr_buffer != NULL)
 		free(req->ktr_buffer, M_KTRACE);
 	STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
 }
 
 /*
  * Take an additional reference on a set of I/O parameters.  ktrace_mtx
  * must be held.
  */
 static void
 ktr_io_params_ref(struct ktr_io_params *kiop)
 {
 	mtx_assert(&ktrace_mtx, MA_OWNED);
 	kiop->refs++;
 }
 
 /*
  * Drop one reference on kiop (NULL is accepted and yields NULL).
  * ktrace_mtx must be held.  Returns kiop when this was the last
  * reference, so the caller can hand it to ktr_io_params_free(), and NULL
  * while other references remain.
  */
 static struct ktr_io_params *
 ktr_io_params_rele(struct ktr_io_params *kiop)
 {
 	mtx_assert(&ktrace_mtx, MA_OWNED);
 	if (kiop == NULL)
 		return (NULL);
 	KASSERT(kiop->refs > 0, ("kiop ref == 0 %p", kiop));
 
 	kiop->refs--;
 	if (kiop->refs != 0)
 		return (NULL);
 	return (kiop);
 }
 
 /*
  * Destroy a set of I/O parameters whose reference count has reached zero:
  * close the trace vnode, drop the credential reference and free the
  * structure.  A NULL argument is a no-op, so the result of
  * ktr_io_params_rele() can be passed straight in.
  */
 void
 ktr_io_params_free(struct ktr_io_params *kiop)
 {
 	if (kiop == NULL)
 		return;
 
 	MPASS(kiop->refs == 0);
 	vn_close(kiop->vp, FWRITE, kiop->cr, curthread);
 	crfree(kiop->cr);
 	free(kiop, M_KTRACE);
 }
 
 /*
  * Allocate I/O parameters for tracing to vnode vp on behalf of td.  The
  * result holds vp, a new reference on td's credentials, td's current
  * RLIMIT_FSIZE, and an initial reference count of one.
  */
 static struct ktr_io_params *
 ktr_io_params_alloc(struct thread *td, struct vnode *vp)
 {
 	struct ktr_io_params *kiop;
 
 	kiop = malloc(sizeof(struct ktr_io_params), M_KTRACE, M_WAITOK);
 
 	kiop->vp = vp;
 	kiop->cr = crhold(td->td_ucred);
 	kiop->lim = lim_cur(td, RLIMIT_FSIZE);
 	kiop->refs = 1;
 
 	return (kiop);
 }
 
 /*
  * Disable tracing for a process and release all associated resources.
  * The process lock and ktrace_mtx must both be held.
  *
  * The process's reference on its I/O parameters is dropped.  If that was
  * the last reference the parameters are returned and the caller is
  * responsible for passing them to ktr_io_params_free(); otherwise NULL is
  * returned.  Any records still queued on the process are discarded.
  */
 static struct ktr_io_params *
 ktr_freeproc(struct proc *p)
 {
 	struct ktr_io_params *kiop;
 	struct ktr_request *req;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	mtx_assert(&ktrace_mtx, MA_OWNED);
 	kiop = ktr_io_params_rele(p->p_ktrioparms);
 	p->p_ktrioparms = NULL;
 	p->p_traceflag = 0;
 	/* Pending asynchronous records are recycled without being written. */
 	while ((req = STAILQ_FIRST(&p->p_ktr)) != NULL) {
 		STAILQ_REMOVE_HEAD(&p->p_ktr, ktr_list);
 		ktr_freerequest_locked(req);
 	}
 	return (kiop);
 }
 
 /*
  * Return the vnode the process is currently tracing to, or NULL when the
  * process has no I/O parameters.  When ref is true a reference is taken
  * on the vnode with vrefact() before it is returned.  The process lock
  * must be held.
  */
 struct vnode *
 ktr_get_tracevp(struct proc *p, bool ref)
 {
 	struct ktr_io_params *kiop;
 	struct vnode *tracevp;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	kiop = p->p_ktrioparms;
 	if (kiop == NULL)
 		return (NULL);
 
 	tracevp = kiop->vp;
 	if (ref)
 		vrefact(tracevp);
 	return (tracevp);
 }
 
 /*
  * Emit a KTR_SYSCALL record carrying the system call code, the argument
  * count and a copy of the argument array.  The record is written
  * immediately through ktr_submitrequest().  Does nothing when the current
  * thread is already inside ktrace.
  */
 void
 ktrsyscall(int code, int narg, syscallarg_t args[])
 {
 	struct ktr_request *req;
 	struct ktr_syscall *ktp;
 	size_t buflen;
 	char *buf = NULL;
 
 	if (__predict_false(curthread->td_pflags & TDP_INKTRACE))
 		return;
 
 	/*
 	 * Copy the arguments before acquiring a request.
 	 * NOTE(review): the size is computed with sizeof(register_t) while
 	 * args is syscallarg_t[]; presumably the two types have the same
 	 * size here -- confirm.
 	 */
 	buflen = sizeof(register_t) * narg;
 	if (buflen > 0) {
 		buf = malloc(buflen, M_KTRACE, M_WAITOK);
 		bcopy(args, buf, buflen);
 	}
 	req = ktr_getrequest(KTR_SYSCALL);
 	if (req == NULL) {
 		if (buf != NULL)
 			free(buf, M_KTRACE);
 		return;
 	}
 	ktp = &req->ktr_data.ktr_syscall;
 	ktp->ktr_code = code;
 	ktp->ktr_narg = narg;
 	if (buflen > 0) {
 		/* The request now owns buf; it is freed with the request. */
 		req->ktr_header.ktr_len = buflen;
 		req->ktr_buffer = buf;
 	}
 	ktr_submitrequest(curthread, req);
 }
 
 /*
  * Emit a KTR_SYSRET record with the system call code, its error and its
  * return value.  The return value is recorded as 0 when error is non-zero.
  * Does nothing when the current thread is already inside ktrace.
  */
 void
 ktrsysret(int code, int error, register_t retval)
 {
 	struct ktr_request *req;
 	struct ktr_sysret *ktp;
 
 	if (__predict_false(curthread->td_pflags & TDP_INKTRACE))
 		return;
 
 	req = ktr_getrequest(KTR_SYSRET);
 	if (req == NULL)
 		return;
 	ktp = &req->ktr_data.ktr_sysret;
 	ktp->ktr_code = code;
 	ktp->ktr_error = error;
 	ktp->ktr_retval = ((error == 0) ? retval: 0);		/* what about val2 ? */
 	ktr_submitrequest(curthread, req);
 }
 
 /*
  * When a setuid process execs, disable tracing.
  *
  * Tracing is left in place, and NULL is returned, when the process is not
  * being traced or when the credentials that enabled tracing hold
  * PRIV_DEBUG_DIFFCRED.  Otherwise tracing is torn down with
  * ktr_freeproc() and its result is returned: the detached I/O parameters
  * if the last reference was dropped (for the caller to release with
  * ktr_io_params_free()), or NULL.  The process lock must be held.
  *
  * XXX: We toss any pending asynchronous records.
  */
 struct ktr_io_params *
 ktrprocexec(struct proc *p)
 {
 	struct ktr_io_params *kiop;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	kiop = p->p_ktrioparms;
 	/*
 	 * priv_check_cred() returns 0 when the privilege is granted, so only
 	 * a privileged tracer may keep tracing across the exec.
 	 */
 	if (kiop == NULL ||
 	    priv_check_cred(kiop->cr, PRIV_DEBUG_DIFFCRED) == 0)
 		return (NULL);
 
 	mtx_lock(&ktrace_mtx);
 	kiop = ktr_freeproc(p);
 	mtx_unlock(&ktrace_mtx);
 	return (kiop);
 }
 
 /*
  * When a process exits, drain per-process asynchronous trace records
  * and disable tracing.
  *
  * A KTR_PROCDTOR record is queued first (when that trace point is enabled
  * and a request is available) so that it is written out by the drain.
  */
 void
 ktrprocexit(struct thread *td)
 {
 	struct ktr_request *req;
 	struct proc *p;
 	struct ktr_io_params *kiop;
 
 	p = td->td_proc;
 	/* Unlocked check: nothing to do for an untraced process. */
 	if (p->p_traceflag == 0)
 		return;
 
 	ktrace_enter(td);
 	req = ktr_getrequest_entered(td, KTR_PROCDTOR);
 	if (req != NULL)
 		ktr_enqueuerequest(td, req);
 	sx_xlock(&ktrace_sx);
 	ktr_drain(td);
 	sx_xunlock(&ktrace_sx);
 	PROC_LOCK(p);
 	mtx_lock(&ktrace_mtx);
 	kiop = ktr_freeproc(p);
 	mtx_unlock(&ktrace_mtx);
 	PROC_UNLOCK(p);
 	/* Freed only after both locks have been dropped. */
 	ktr_io_params_free(kiop);
 	ktrace_exit(td);
 }
 
 /*
  * Queue a KTR_PROCCTOR record, carrying the sv_flags of the process's
  * sysent, on process p.  The request is obtained for and queued via the
  * first thread of p, while td is the calling thread, which must already
  * be inside ktrace.
  */
 static void
 ktrprocctor_entered(struct thread *td, struct proc *p)
 {
 	struct ktr_proc_ctor *ktp;
 	struct ktr_request *req;
 	struct thread *td2;
 
 	ktrace_assert(td);
 	td2 = FIRST_THREAD_IN_PROC(p);
 	req = ktr_getrequest_entered(td2, KTR_PROCCTOR);
 	if (req == NULL)
 		return;
 	ktp = &req->ktr_data.ktr_proc_ctor;
 	ktp->sv_flags = p->p_sysent->sv_flags;
 	ktr_enqueuerequest(td2, req);
 }
 
 /*
  * Queue a KTR_PROCCTOR record for process p if any trace facility in
  * KTRFAC_MASK is enabled on it.  The current thread enters ktrace for
  * the duration of the call.
  */
 void
 ktrprocctor(struct proc *p)
 {
 	struct thread *self;
 
 	if ((p->p_traceflag & KTRFAC_MASK) != 0) {
 		self = curthread;
 		ktrace_enter(self);
 		ktrprocctor_entered(self, p);
 		ktrace_exit(self);
 	}
 }
 
 /*
  * When a process forks, enable tracing in the new process if needed.
  *
  * p2 must start out untraced.  When the parent p1 has KTRFAC_INHERIT set,
  * p2 receives a copy of its trace flags and shares its I/O parameters,
  * with an extra reference taken on them.
  */
 void
 ktrprocfork(struct proc *p1, struct proc *p2)
 {
 
 	MPASS(p2->p_ktrioparms == NULL);
 	MPASS(p2->p_traceflag == 0);
 
 	/* Unlocked check: an untraced parent has nothing to pass on. */
 	if (p1->p_traceflag == 0)
 		return;
 
 	PROC_LOCK(p1);
 	mtx_lock(&ktrace_mtx);
 	if (p1->p_traceflag & KTRFAC_INHERIT) {
 		p2->p_traceflag = p1->p_traceflag;
 		if ((p2->p_ktrioparms = p1->p_ktrioparms) != NULL)
 			p1->p_ktrioparms->refs++;
 	}
 	mtx_unlock(&ktrace_mtx);
 	PROC_UNLOCK(p1);
 
 	/* Records a KTR_PROCCTOR for p2 if tracing is now enabled on it. */
 	ktrprocctor(p2);
 }
 
 /*
  * When a thread returns, drain any asynchronous records generated by the
  * system call.  The drain runs inside a ktrace section with ktrace_sx
  * held exclusively, as ktr_drain() requires.
  */
 void
 ktruserret(struct thread *td)
 {
 
 	ktrace_enter(td);
 	sx_xlock(&ktrace_sx);
 	ktr_drain(td);
 	sx_xunlock(&ktrace_sx);
 	ktrace_exit(td);
 }
 
 /*
  * Emit a KTR_NAMEI record whose payload is the bytes of path, without
  * the terminating NUL.  An empty path produces a record with no payload.
  * The record is written immediately.
  */
 void
 ktrnamei(const char *path)
 {
 	struct ktr_request *req;
 	int namelen;
 	char *buf = NULL;
 
 	/* Copy the path before acquiring a request. */
 	namelen = strlen(path);
 	if (namelen > 0) {
 		buf = malloc(namelen, M_KTRACE, M_WAITOK);
 		bcopy(path, buf, namelen);
 	}
 	req = ktr_getrequest(KTR_NAMEI);
 	if (req == NULL) {
 		if (buf != NULL)
 			free(buf, M_KTRACE);
 		return;
 	}
 	if (namelen > 0) {
 		/* The request now owns buf; it is freed with the request. */
 		req->ktr_header.ktr_len = namelen;
 		req->ktr_buffer = buf;
 	}
 	ktr_submitrequest(curthread, req);
 }
 
 /*
  * Emit a KTR_SYSCTL record containing the name of the given MIB.  The
  * name is obtained by prefixing the MIB with {0, 1} and issuing an
  * in-kernel sysctl request.  Nothing is recorded if that lookup fails or
  * no request object is available.
  */
 void
 ktrsysctl(int *name, u_int namelen)
 {
 	struct ktr_request *req;
 	u_int mib[CTL_MAXNAME + 2];
 	char *mibname;
 	size_t mibnamelen;
 	int error;
 
 	/* Lookup name of mib. */
 	KASSERT(namelen <= CTL_MAXNAME, ("sysctl MIB too long"));
 	mib[0] = 0;
 	mib[1] = 1;
 	bcopy(name, mib + 2, namelen * sizeof(*name));
 	mibnamelen = 128;
 	mibname = malloc(mibnamelen, M_KTRACE, M_WAITOK);
 	/*
 	 * mibnamelen is passed both as the old-length argument and as the
 	 * retval pointer; its value after the call is used as the record
 	 * length below.
 	 */
 	error = kernel_sysctl(curthread, mib, namelen + 2, mibname, &mibnamelen,
 	    NULL, 0, &mibnamelen, 0);
 	if (error) {
 		free(mibname, M_KTRACE);
 		return;
 	}
 	req = ktr_getrequest(KTR_SYSCTL);
 	if (req == NULL) {
 		free(mibname, M_KTRACE);
 		return;
 	}
 	/* The request now owns mibname; it is freed with the request. */
 	req->ktr_header.ktr_len = mibnamelen;
 	req->ktr_buffer = mibname;
 	ktr_submitrequest(curthread, req);
 }
 
 /*
  * Emit a KTR_GENIO record for an I/O operation on descriptor fd: the
  * direction rw plus up to ktr_geniosize bytes of the transferred data,
  * copied out of uio.
  *
  * The uio passed in is always released by this function, on every path.
  * Nothing is recorded when error is non-zero.
  */
 void
 ktrgenio(int fd, enum uio_rw rw, struct uio *uio, int error)
 {
 	struct ktr_request *req;
 	struct ktr_genio *ktg;
 	int datalen;
 	char *buf;
 
 	if (error) {
-		free(uio, M_IOV);
+		freeuio(uio);
 		return;
 	}
 	/* Rewind the uio and copy its contents into a kernel buffer. */
 	uio->uio_offset = 0;
 	uio->uio_rw = UIO_WRITE;
 	datalen = MIN(uio->uio_resid, ktr_geniosize);
 	buf = malloc(datalen, M_KTRACE, M_WAITOK);
 	error = uiomove(buf, datalen, uio);
-	free(uio, M_IOV);
+	freeuio(uio);
 	if (error) {
 		free(buf, M_KTRACE);
 		return;
 	}
 	req = ktr_getrequest(KTR_GENIO);
 	if (req == NULL) {
 		free(buf, M_KTRACE);
 		return;
 	}
 	ktg = &req->ktr_data.ktr_genio;
 	ktg->ktr_fd = fd;
 	ktg->ktr_rw = rw;
 	/* The request now owns buf; it is freed with the request. */
 	req->ktr_header.ktr_len = datalen;
 	req->ktr_buffer = buf;
 	ktr_submitrequest(curthread, req);
 }
 
 /*
  * Queue a KTR_PSIG record describing a signal: its number, action, mask
  * and code.  The record is queued for asynchronous writing rather than
  * written here, and the ktrace section opened by ktr_getrequest() is left
  * before returning.
  */
 void
 ktrpsig(int sig, sig_t action, sigset_t *mask, int code)
 {
 	struct thread *td = curthread;
 	struct ktr_request *req;
 	struct ktr_psig	*kp;
 
 	req = ktr_getrequest(KTR_PSIG);
 	if (req == NULL)
 		return;
 	kp = &req->ktr_data.ktr_psig;
 	/* The signal number is narrowed to a char in the record. */
 	kp->signo = (char)sig;
 	kp->action = action;
 	kp->mask = *mask;
 	kp->code = code;
 	ktr_enqueuerequest(td, req);
 	ktrace_exit(td);
 }
 
 /*
  * Queue a KTR_CSW (context switch) record with the out and user flags
  * and the wait message.  A NULL wmesg is recorded as an all-zero string.
  * The record is queued for asynchronous writing.  Does nothing when the
  * current thread is already inside ktrace.
  */
 void
 ktrcsw(int out, int user, const char *wmesg)
 {
 	struct thread *td = curthread;
 	struct ktr_request *req;
 	struct ktr_csw *kc;
 
 	if (__predict_false(curthread->td_pflags & TDP_INKTRACE))
 		return;
 
 	req = ktr_getrequest(KTR_CSW);
 	if (req == NULL)
 		return;
 	kc = &req->ktr_data.ktr_csw;
 	kc->out = out;
 	kc->user = user;
 	if (wmesg != NULL)
 		strlcpy(kc->wmesg, wmesg, sizeof(kc->wmesg));
 	else
 		bzero(kc->wmesg, sizeof(kc->wmesg));
 	ktr_enqueuerequest(td, req);
 	ktrace_exit(td);
 }
 
 /*
  * Emit a KTR_STRUCT record.  The payload is the NUL-terminated name
  * followed immediately by datalen bytes copied from data.  A NULL data
  * pointer is treated as zero-length data.  The record is written
  * immediately.  Does nothing when the current thread is already inside
  * ktrace.
  */
 void
 ktrstruct(const char *name, const void *data, size_t datalen)
 {
 	struct ktr_request *req;
 	char *buf;
 	size_t buflen, namelen;
 
 	if (__predict_false(curthread->td_pflags & TDP_INKTRACE))
 		return;
 
 	if (data == NULL)
 		datalen = 0;
 	/* namelen includes the terminating NUL. */
 	namelen = strlen(name) + 1;
 	buflen = namelen + datalen;
 	buf = malloc(buflen, M_KTRACE, M_WAITOK);
 	strcpy(buf, name);
 	bcopy(data, buf + namelen, datalen);
 	if ((req = ktr_getrequest(KTR_STRUCT)) == NULL) {
 		free(buf, M_KTRACE);
 		return;
 	}
 	/* The request now owns buf; it is freed with the request. */
 	req->ktr_buffer = buf;
 	req->ktr_header.ktr_len = buflen;
 	ktr_submitrequest(curthread, req);
 }
 
 /*
  * Variant of ktrstruct() that records the structure only when the
  * operation that produced it succeeded (error == 0).
  */
 void
 ktrstruct_error(const char *name, const void *data, size_t datalen, int error)
 {
 
 	if (error != 0)
 		return;
 	ktrstruct(name, data, datalen);
 }
 
 /*
  * Emit a KTR_STRUCT_ARRAY record for an array of num_items structures of
  * struct_size bytes each.  The payload is the NUL-terminated name followed
  * by the array data, which is read from kernel memory when seg is
  * UIO_SYSSPACE and copied in with copyin() otherwise.  A negative
  * num_items or a failed copyin() records nothing; a NULL data pointer
  * records the name only.  Does nothing when the current thread is already
  * inside ktrace.
  */
 void
 ktrstructarray(const char *name, enum uio_seg seg, const void *data,
     int num_items, size_t struct_size)
 {
 	struct ktr_request *req;
 	struct ktr_struct_array *ksa;
 	char *buf;
 	size_t buflen, datalen, namelen;
 	int max_items;
 
 	if (__predict_false(curthread->td_pflags & TDP_INKTRACE))
 		return;
 	if (num_items < 0)
 		return;
 
 	/* Trim array length to genio size. */
 	/*
 	 * NOTE(review): struct_size is used as a divisor; callers presumably
 	 * never pass 0 -- confirm.
 	 */
 	max_items = ktr_geniosize / struct_size;
 	if (num_items > max_items) {
 		/* A single item larger than the limit is still recorded. */
 		if (max_items == 0)
 			num_items = 1;
 		else
 			num_items = max_items;
 	}
 	datalen = num_items * struct_size;
 
 	if (data == NULL)
 		datalen = 0;
 
 	/* namelen includes the terminating NUL. */
 	namelen = strlen(name) + 1;
 	buflen = namelen + datalen;
 	buf = malloc(buflen, M_KTRACE, M_WAITOK);
 	strcpy(buf, name);
 	if (seg == UIO_SYSSPACE)
 		bcopy(data, buf + namelen, datalen);
 	else {
 		if (copyin(data, buf + namelen, datalen) != 0) {
 			free(buf, M_KTRACE);
 			return;
 		}
 	}
 	if ((req = ktr_getrequest(KTR_STRUCT_ARRAY)) == NULL) {
 		free(buf, M_KTRACE);
 		return;
 	}
 	ksa = &req->ktr_data.ktr_struct_array;
 	ksa->struct_size = struct_size;
 	/* The request now owns buf; it is freed with the request. */
 	req->ktr_buffer = buf;
 	req->ktr_header.ktr_len = buflen;
 	ktr_submitrequest(curthread, req);
 }
 
 /*
  * Queue a KTR_CAPFAIL record with the failure type and the needed and
  * held capability rights.  A NULL rights pointer is recorded as a rights
  * set initialised by cap_rights_init() with no rights listed.  The record
  * is queued for asynchronous writing.  Does nothing when the current
  * thread is already inside ktrace.
  */
 void
 ktrcapfail(enum ktr_cap_fail_type type, const cap_rights_t *needed,
     const cap_rights_t *held)
 {
 	struct thread *td = curthread;
 	struct ktr_request *req;
 	struct ktr_cap_fail *kcf;
 
 	if (__predict_false(curthread->td_pflags & TDP_INKTRACE))
 		return;
 
 	req = ktr_getrequest(KTR_CAPFAIL);
 	if (req == NULL)
 		return;
 	kcf = &req->ktr_data.ktr_cap_fail;
 	kcf->cap_type = type;
 	if (needed != NULL)
 		kcf->cap_needed = *needed;
 	else
 		cap_rights_init(&kcf->cap_needed);
 	if (held != NULL)
 		kcf->cap_held = *held;
 	else
 		cap_rights_init(&kcf->cap_held);
 	ktr_enqueuerequest(td, req);
 	ktrace_exit(td);
 }
 
 /*
  * Queue a KTR_FAULT record with the faulting address and fault type.  The
  * record is queued for asynchronous writing.  Does nothing when the
  * current thread is already inside ktrace.
  */
 void
 ktrfault(vm_offset_t vaddr, int type)
 {
 	struct thread *td = curthread;
 	struct ktr_request *req;
 	struct ktr_fault *kf;
 
 	if (__predict_false(curthread->td_pflags & TDP_INKTRACE))
 		return;
 
 	req = ktr_getrequest(KTR_FAULT);
 	if (req == NULL)
 		return;
 	kf = &req->ktr_data.ktr_fault;
 	kf->vaddr = vaddr;
 	kf->type = type;
 	ktr_enqueuerequest(td, req);
 	ktrace_exit(td);
 }
 
 /*
  * Queue a KTR_FAULTEND record with the result of fault handling.  The
  * record is queued for asynchronous writing.  Does nothing when the
  * current thread is already inside ktrace.
  */
 void
 ktrfaultend(int result)
 {
 	struct thread *td = curthread;
 	struct ktr_request *req;
 	struct ktr_faultend *kf;
 
 	if (__predict_false(curthread->td_pflags & TDP_INKTRACE))
 		return;
 
 	req = ktr_getrequest(KTR_FAULTEND);
 	if (req == NULL)
 		return;
 	kf = &req->ktr_data.ktr_faultend;
 	kf->result = result;
 	ktr_enqueuerequest(td, req);
 	ktrace_exit(td);
 }
 #endif /* KTRACE */
 
 /* Interface and common routines */
 
 #ifndef _SYS_SYSPROTO_H_
 /* Arguments of the ktrace system call, as consumed by sys_ktrace(). */
 struct ktrace_args {
 	char	*fname;		/* user-space path of the trace file */
 	int	ops;		/* operation (KTROP()) and KTRFLAG_DESCEND */
 	int	facs;		/* trace facilities to set or clear */
 	int	pid;		/* process id; negative: process group id */
 };
 #endif
 /* ARGSUSED */
 /*
  * ktrace system call: enable or disable tracing on a process, a process
  * group, or (with KTRFLAG_DESCEND) a process and its descendants, or clear
  * all uses of a trace file (KTROP_CLEARFILE).
  *
  * Returns 0 on success, EINVAL when no facilities were given, the
  * vn_open() error or EACCES (not a regular file) for the trace file,
  * ESRCH when the target does not exist, and EPERM when no target could be
  * changed.  Without KTRACE compiled in, returns ENOSYS.
  */
 int
 sys_ktrace(struct thread *td, struct ktrace_args *uap)
 {
 #ifdef KTRACE
 	struct vnode *vp = NULL;
 	struct proc *p;
 	struct pgrp *pg;
 	/* KTRFAC_ROOT cannot be requested by the caller; ktrops() sets it. */
 	int facs = uap->facs & ~KTRFAC_ROOT;
 	int ops = KTROP(uap->ops);
 	int descend = uap->ops & KTRFLAG_DESCEND;
 	int ret = 0;
 	int flags, error = 0;
 	struct nameidata nd;
 	struct ktr_io_params *kiop, *old_kiop;
 
 	/*
 	 * Need something to (un)trace.
 	 */
 	if (ops != KTROP_CLEARFILE && facs == 0)
 		return (EINVAL);
 
 	kiop = NULL;
 	if (ops != KTROP_CLEAR) {
 		/*
 		 * an operation which requires a file argument.
 		 */
 		NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->fname);
 		flags = FREAD | FWRITE | O_NOFOLLOW;
 		error = vn_open(&nd, &flags, 0, NULL);
 		if (error)
 			return (error);
 		NDFREE_PNBUF(&nd);
 		vp = nd.ni_vp;
 		VOP_UNLOCK(vp);
 		if (vp->v_type != VREG) {
 			(void)vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
 			return (EACCES);
 		}
 		/* kiop starts with one reference, dropped at "done". */
 		kiop = ktr_io_params_alloc(td, vp);
 	}
 
 	/*
 	 * Clear all uses of the tracefile.
 	 */
 	ktrace_enter(td);
 	if (ops == KTROP_CLEARFILE) {
 restart:
 		sx_slock(&allproc_lock);
 		FOREACH_PROC_IN_SYSTEM(p) {
 			old_kiop = NULL;
 			PROC_LOCK(p);
 			if (p->p_ktrioparms != NULL &&
 			    p->p_ktrioparms->vp == vp) {
 				if (ktrcanset(td, p)) {
 					mtx_lock(&ktrace_mtx);
 					old_kiop = ktr_freeproc(p);
 					mtx_unlock(&ktrace_mtx);
 				} else
 					error = EPERM;
 			}
 			PROC_UNLOCK(p);
 			/*
 			 * The parameters are freed only after allproc_lock
 			 * is dropped, so the scan restarts from the top.
 			 */
 			if (old_kiop != NULL) {
 				sx_sunlock(&allproc_lock);
 				ktr_io_params_free(old_kiop);
 				goto restart;
 			}
 		}
 		sx_sunlock(&allproc_lock);
 		goto done;
 	}
 	/*
 	 * do it
 	 */
 	sx_slock(&proctree_lock);
 	if (uap->pid < 0) {
 		/*
 		 * by process group
 		 */
 		pg = pgfind(-uap->pid);
 		if (pg == NULL) {
 			sx_sunlock(&proctree_lock);
 			error = ESRCH;
 			goto done;
 		}
 
 		/*
 		 * ktrops() may call vrele(). Lock pg_members
 		 * by the proctree_lock rather than pg_mtx.
 		 */
 		PGRP_UNLOCK(pg);
 		if (LIST_EMPTY(&pg->pg_members)) {
 			sx_sunlock(&proctree_lock);
 			error = ESRCH;
 			goto done;
 		}
 		LIST_FOREACH(p, &pg->pg_members, p_pglist) {
 			/* ktrops() unlocks each process it is given. */
 			PROC_LOCK(p);
 			if (descend)
 				ret |= ktrsetchildren(td, p, ops, facs, kiop);
 			else
 				ret |= ktrops(td, p, ops, facs, kiop);
 		}
 	} else {
 		/*
 		 * by pid
 		 */
 		/* presumably pfind() returns the process locked -- confirm */
 		p = pfind(uap->pid);
 		if (p == NULL) {
 			error = ESRCH;
 			sx_sunlock(&proctree_lock);
 			goto done;
 		}
 		if (descend)
 			ret |= ktrsetchildren(td, p, ops, facs, kiop);
 		else
 			ret |= ktrops(td, p, ops, facs, kiop);
 	}
 	sx_sunlock(&proctree_lock);
 	/* ret stays 0 only if no target process could be changed. */
 	if (!ret)
 		error = EPERM;
 done:
 	if (kiop != NULL) {
 		/* Drop the reference taken by ktr_io_params_alloc(). */
 		mtx_lock(&ktrace_mtx);
 		kiop = ktr_io_params_rele(kiop);
 		mtx_unlock(&ktrace_mtx);
 		ktr_io_params_free(kiop);
 	}
 	ktrace_exit(td);
 	return (error);
 #else /* !KTRACE */
 	return (ENOSYS);
 #endif /* KTRACE */
 }
 
 /* ARGSUSED */
 /*
  * utrace system call: record uap->len bytes of user-supplied data as a
  * KTR_USER record.
  *
  * Returns 0 without doing anything when the KTR_USER trace point is not
  * enabled, EINVAL when len exceeds KTR_USER_MAXLEN, the copyin() error,
  * or ENOMEM when no request object could be obtained.  Without KTRACE
  * compiled in, returns ENOSYS.
  */
 int
 sys_utrace(struct thread *td, struct utrace_args *uap)
 {
 
 #ifdef KTRACE
 	struct ktr_request *req;
 	void *cp;
 	int error;
 
 	if (!KTRPOINT(td, KTR_USER))
 		return (0);
 	if (uap->len > KTR_USER_MAXLEN)
 		return (EINVAL);
 	cp = malloc(uap->len, M_KTRACE, M_WAITOK);
 	error = copyin(uap->addr, cp, uap->len);
 	if (error) {
 		free(cp, M_KTRACE);
 		return (error);
 	}
 	req = ktr_getrequest(KTR_USER);
 	if (req == NULL) {
 		free(cp, M_KTRACE);
 		return (ENOMEM);
 	}
 	/* The request now owns cp; it is freed with the request. */
 	req->ktr_buffer = cp;
 	req->ktr_header.ktr_len = uap->len;
 	ktr_submitrequest(td, req);
 	return (0);
 #else /* !KTRACE */
 	return (ENOSYS);
 #endif /* KTRACE */
 }
 
 #ifdef KTRACE
 /*
  * Apply a KTROP_SET or KTROP_CLEAR operation for the facilities in facs
  * to process p on behalf of td.  new_kiop supplies the trace file for
  * KTROP_SET.
  *
  * p must be locked on entry and is unlocked on every return path.
  * Returns 1 when the operation was applied (or the process is exiting),
  * and 0 when td is not permitted to change or see p, or when p is still
  * being created and the operation is KTROP_SET.
  */
 static int
 ktrops(struct thread *td, struct proc *p, int ops, int facs,
     struct ktr_io_params *new_kiop)
 {
 	struct ktr_io_params *old_kiop;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if (!ktrcanset(td, p)) {
 		PROC_UNLOCK(p);
 		return (0);
 	}
 	if ((ops == KTROP_SET && p->p_state == PRS_NEW) ||
 	    p_cansee(td, p) != 0) {
 		/*
 		 * Disallow setting trace points if the process is being born.
 		 * This avoids races with trace point inheritance in
 		 * ktrprocfork().
 		 */
 		PROC_UNLOCK(p);
 		return (0);
 	}
 	if ((p->p_flag & P_WEXIT) != 0) {
 		/*
 		 * There's nothing to do if the process is exiting, but avoid
 		 * signaling an error.
 		 */
 		PROC_UNLOCK(p);
 		return (1);
 	}
 	old_kiop = NULL;
 	mtx_lock(&ktrace_mtx);
 	if (ops == KTROP_SET) {
 		if (p->p_ktrioparms != NULL &&
 		    p->p_ktrioparms->vp != new_kiop->vp) {
 			/* if trace file already in use, relinquish below */
 			old_kiop = ktr_io_params_rele(p->p_ktrioparms);
 			p->p_ktrioparms = NULL;
 		}
 		if (p->p_ktrioparms == NULL) {
 			p->p_ktrioparms = new_kiop;
 			ktr_io_params_ref(new_kiop);
 		}
 		p->p_traceflag |= facs;
 		/* Mark tracing as set by a privileged caller. */
 		if (priv_check(td, PRIV_KTRACE) == 0)
 			p->p_traceflag |= KTRFAC_ROOT;
 	} else {
 		/* KTROP_CLEAR */
 		if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0)
 			/* no more tracing */
 			old_kiop = ktr_freeproc(p);
 	}
 	mtx_unlock(&ktrace_mtx);
 	if ((p->p_traceflag & KTRFAC_MASK) != 0)
 		ktrprocctor_entered(td, p);
 	PROC_UNLOCK(p);
 	/* Freed after the locks are dropped; NULL is a no-op. */
 	ktr_io_params_free(old_kiop);
 
 	return (1);
 }
 
 /*
  * Apply ktrops() to process top and to every process in the tree below
  * it, walking the tree iteratively.  Returns the OR of the individual
  * ktrops() results, i.e. non-zero if at least one process was changed.
  *
  * top must be locked on entry and proctree_lock must be held.  Each
  * process is locked here just before the ktrops() call that unlocks it.
  */
 static int
 ktrsetchildren(struct thread *td, struct proc *top, int ops, int facs,
     struct ktr_io_params *new_kiop)
 {
 	struct proc *p;
 	int ret = 0;
 
 	p = top;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sx_assert(&proctree_lock, SX_LOCKED);
 	for (;;) {
 		ret |= ktrops(td, p, ops, facs, new_kiop);
 		/*
 		 * If this process has children, descend to them next,
 		 * otherwise do any siblings, and if done with this level,
 		 * follow back up the tree (but not past top).
 		 */
 		if (!LIST_EMPTY(&p->p_children))
 			p = LIST_FIRST(&p->p_children);
 		else for (;;) {
 			if (p == top)
 				return (ret);
 			if (LIST_NEXT(p, p_sibling)) {
 				p = LIST_NEXT(p, p_sibling);
 				break;
 			}
 			p = p->p_pptr;
 		}
 		/* Lock the next process for the ktrops() call above. */
 		PROC_LOCK(p);
 	}
 	/*NOTREACHED*/
 }
 
 /*
  * Write one trace record to the trace file of td's process.
  *
  * The record is written as up to three pieces in one appending write: the
  * header, the fixed-size per-type data (if the type has any), and the
  * variable-length buffer (if any).  The request itself is not freed here.
  * If the process has no trace file the record is silently dropped.  On a
  * write error, tracing to this vnode is stopped for the process.
  */
 static void
 ktr_writerequest(struct thread *td, struct ktr_request *req)
 {
 	struct ktr_io_params *kiop, *kiop1;
 	struct ktr_header *kth;
 	struct vnode *vp;
 	struct proc *p;
 	struct ucred *cred;
 	struct uio auio;
 	struct iovec aiov[3];
 	struct mount *mp;
 	off_t lim;
 	int datalen, buflen;
 	int error;
 
 	p = td->td_proc;
 
 	/*
 	 * We reference the kiop for use in I/O in case ktrace is
 	 * disabled on the process as we write out the request.
 	 */
 	mtx_lock(&ktrace_mtx);
 	kiop = p->p_ktrioparms;
 
 	/*
 	 * If kiop is NULL, it has been cleared out from under this
 	 * request, so just drop it.
 	 */
 	if (kiop == NULL) {
 		mtx_unlock(&ktrace_mtx);
 		return;
 	}
 
 	ktr_io_params_ref(kiop);
 	vp = kiop->vp;
 	cred = kiop->cr;
 	lim = kiop->lim;
 
 	KASSERT(cred != NULL, ("ktr_writerequest: cred == NULL"));
 	mtx_unlock(&ktrace_mtx);
 
 	kth = &req->ktr_header;
 	KASSERT(((u_short)kth->ktr_type & ~KTR_TYPE) < nitems(data_lengths),
 	    ("data_lengths array overflow"));
 	/* Size of the fixed per-type data that follows the header. */
 	datalen = data_lengths[(u_short)kth->ktr_type & ~KTR_TYPE];
 	/* On entry ktr_len covers only the variable-length buffer. */
 	buflen = kth->ktr_len;
 	auio.uio_iov = &aiov[0];
 	auio.uio_offset = 0;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_WRITE;
 	aiov[0].iov_base = (caddr_t)kth;
 	aiov[0].iov_len = sizeof(struct ktr_header);
 	auio.uio_resid = sizeof(struct ktr_header);
 	auio.uio_iovcnt = 1;
 	auio.uio_td = td;
 	if (datalen != 0) {
 		aiov[1].iov_base = (caddr_t)&req->ktr_data;
 		aiov[1].iov_len = datalen;
 		auio.uio_resid += datalen;
 		auio.uio_iovcnt++;
 		/* The written ktr_len covers fixed data plus buffer. */
 		kth->ktr_len += datalen;
 	}
 	if (buflen != 0) {
 		KASSERT(req->ktr_buffer != NULL, ("ktrace: nothing to write"));
 		aiov[auio.uio_iovcnt].iov_base = req->ktr_buffer;
 		aiov[auio.uio_iovcnt].iov_len = buflen;
 		auio.uio_resid += buflen;
 		auio.uio_iovcnt++;
 	}
 
 	vn_start_write(vp, &mp, V_WAIT);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	/* File size limit captured when tracing was enabled. */
 	td->td_ktr_io_lim = lim;
 #ifdef MAC
 	error = mac_vnode_check_write(cred, NOCRED, vp);
 	if (error == 0)
 #endif
 		error = VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, cred);
 	VOP_UNLOCK(vp);
 	vn_finished_write(mp);
 	if (error == 0) {
 		/* Drop the I/O reference taken above. */
 		mtx_lock(&ktrace_mtx);
 		kiop = ktr_io_params_rele(kiop);
 		mtx_unlock(&ktrace_mtx);
 		ktr_io_params_free(kiop);
 		return;
 	}
 
 	/*
 	 * If error encountered, give up tracing on this vnode on this
 	 * process.  Other processes might still be suitable for
 	 * writes to this vnode.
 	 */
 	log(LOG_NOTICE,
 	    "ktrace write failed, errno %d, tracing stopped for pid %d\n",
 	    error, p->p_pid);
 
 	kiop1 = NULL;
 	PROC_LOCK(p);
 	mtx_lock(&ktrace_mtx);
 	/* Only tear down if the process still traces to the same vnode. */
 	if (p->p_ktrioparms != NULL && p->p_ktrioparms->vp == vp)
 		kiop1 = ktr_freeproc(p);
 	kiop = ktr_io_params_rele(kiop);
 	mtx_unlock(&ktrace_mtx);
 	PROC_UNLOCK(p);
 	/* Both are freed after the locks are dropped; NULL is a no-op. */
 	ktr_io_params_free(kiop1);
 	ktr_io_params_free(kiop);
 }
 
 /*
  * Decide whether the caller may change the ktrace state of targetp; the
  * target's process lock must be held.  Returns 1 if permitted and 0
  * otherwise.
  *
  * Essentially, the target can't possess any more permissions than the
  * caller.  KTRFAC_ROOT signifies that root previously set the tracing
  * status on the target process, and so only a caller holding PRIV_KTRACE
  * may change it further.  The caller must also pass p_candebug().
  */
 static int
 ktrcanset(struct thread *td, struct proc *targetp)
 {
 	int root_owned;
 
 	PROC_LOCK_ASSERT(targetp, MA_OWNED);
 
 	root_owned = (targetp->p_traceflag & KTRFAC_ROOT) != 0;
 	if (root_owned && priv_check(td, PRIV_KTRACE) != 0)
 		return (0);
 
 	return (p_candebug(td, targetp) == 0 ? 1 : 0);
 }
 
 #endif /* KTRACE */
diff --git a/sys/kern/kern_sendfile.c b/sys/kern/kern_sendfile.c
index f6f6b57adfc7..323e7fcde07b 100644
--- a/sys/kern/kern_sendfile.c
+++ b/sys/kern/kern_sendfile.c
@@ -1,1373 +1,1373 @@
 /*-
  * Copyright (c) 2013-2015 Gleb Smirnoff <glebius@FreeBSD.org>
  * Copyright (c) 1998, David Greenman. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 #include "opt_kern_tls.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/ktls.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/rwlock.h>
 #include <sys/sf_buf.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/vnode.h>
 
 #include <net/vnet.h>
 #include <netinet/in.h>
 #include <netinet/tcp.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_log_buf.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 
 static MALLOC_DEFINE(M_SENDFILE, "sendfile", "sendfile dynamic memory");
 
 #define	EXT_FLAG_SYNC		EXT_FLAG_VENDOR1
 #define	EXT_FLAG_NOCACHE	EXT_FLAG_VENDOR2
 #define	EXT_FLAG_CACHE_LAST	EXT_FLAG_VENDOR3
 
 /*
  * Structure describing a single sendfile(2) I/O, which may consist of
  * several underlying pager I/Os.
  *
  * The syscall context allocates the structure and initializes 'nios'
  * to 1.  As sendfile_swapin() runs through pages and starts asynchronous
  * paging operations, it increments 'nios'.
  *
  * Every I/O completion calls sendfile_iodone(), which decrements the 'nios',
  * and the syscall also calls sendfile_iodone() after allocating all mbufs,
  * linking them and sending to socket.  Whoever reaches zero 'nios' is
  * responsible for calling the protocol's pr_ready method on the socket, to
  * notify it of the readiness of the data.
  */
 struct sf_io {
 	/* Count of outstanding I/Os plus one for the syscall context. */
 	volatile u_int	nios;
 	/* Last non-zero error reported to sendfile_iodone(). */
 	u_int		error;
 	/* Number of valid entries in pa[]; may be trimmed by the syscall. */
 	int		npages;
 	/* Socket the data is queued on; set only when I/Os are pending. */
 	struct socket	*so;
 	/* First mbuf that carries a page, or NULL if nothing was sent. */
 	struct mbuf	*m;
 	/* VM object backing the file being sent. */
 	vm_object_t	obj;
 	/* Page index within obj that corresponds to pa[0]. */
 	vm_pindex_t	pindex0;
 #ifdef KERN_TLS
 	/* Borrowed KTLS session pointer; no separate reference is held. */
 	struct ktls_session *tls;
 #endif
 	/* Wired pages for this request, npages entries. */
 	vm_page_t	pa[];
 };
 
 /*
  * Structure used to track requests with SF_SYNC flag.
  */
 struct sendfile_sync {
 	struct mtx	mtx;		/* protects count and waiting */
 	struct cv	cv;		/* signalled when count drops to zero */
 	unsigned	count;		/* mbufs still referencing this struct */
 	bool		waiting;	/* false once the waiter gave up */
 };
 
 /*
  * Tear down and free an SF_SYNC tracking structure.  No mbuf may still
  * reference it (count must be zero).  Both callers in this file invoke
  * this with sfs->mtx held and do not unlock it afterwards.
  */
 static void
 sendfile_sync_destroy(struct sendfile_sync *sfs)
 {
 	KASSERT(sfs->count == 0, ("sendfile sync %p still busy", sfs));
 
 	cv_destroy(&sfs->cv);
 	mtx_destroy(&sfs->mtx);
 	free(sfs, M_SENDFILE);
 }
 
 /*
  * Drop one mbuf reference on an SF_SYNC tracking structure.  When the
  * last reference goes away, either wake the sendfile() caller waiting on
  * the condition variable or, if that caller already stopped waiting,
  * destroy the structure here.
  */
 static void
 sendfile_sync_signal(struct sendfile_sync *sfs)
 {
 	mtx_lock(&sfs->mtx);
 	KASSERT(sfs->count > 0, ("sendfile sync %p not busy", sfs));
 
 	sfs->count--;
 	if (sfs->count != 0) {
 		/* Other mbufs still reference the structure. */
 		mtx_unlock(&sfs->mtx);
 		return;
 	}
 
 	if (sfs->waiting) {
 		cv_signal(&sfs->cv);
 		mtx_unlock(&sfs->mtx);
 		return;
 	}
 
 	/* The sendfile() waiter was interrupted by a signal. */
 	sendfile_sync_destroy(sfs);
 }
 
 /* One counter per uint64_t-sized slot of struct sfstat. */
 counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];
 
 /*
  * Allocate the sendfile statistics counters.  Registered below to run at
  * SI_SUB_MBUF / SI_ORDER_FIRST; the argument is unused.
  */
 static void
 sfstat_init(const void *unused)
 {
 
 	COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t),
 	    M_WAITOK);
 }
 SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL);
 
 static int
 sfstat_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct sfstat s;
 
 	COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t));
 	if (req->newptr)
 		COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t));
 	return (SYSCTL_OUT(req, &s, sizeof(s)));
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat,
     CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
     sfstat_sysctl, "I",
     "sendfile statistics");
 
 /*
  * External-storage free routine for sendfile mbufs backed by an sf_buf
  * (EXT_SFBUF).  Releases the sf_buf mapping and the page behind it, asking
  * for the page to be freed if possible when EXT_FLAG_NOCACHE is set, and
  * then drops the SF_SYNC reference if the mbuf carries one.
  */
 static void
 sendfile_free_mext(struct mbuf *m)
 {
 	struct sendfile_sync *sync;
 	struct sf_buf *sfb;
 	vm_page_t page;
 	int relflags;
 
 	KASSERT(m->m_flags & M_EXT && m->m_ext.ext_type == EXT_SFBUF,
 	    ("%s: m %p !M_EXT or !EXT_SFBUF", __func__, m));
 
 	sfb = m->m_ext.ext_arg1;
 	page = sf_buf_page(sfb);
 	if ((m->m_ext.ext_flags & EXT_FLAG_NOCACHE) != 0)
 		relflags = VPR_TRYFREE;
 	else
 		relflags = 0;
 
 	sf_buf_free(sfb);
 	vm_page_release(page, relflags);
 
 	if ((m->m_ext.ext_flags & EXT_FLAG_SYNC) == 0)
 		return;
 
 	/* For sf_buf mbufs the sync structure lives in ext_arg2. */
 	sync = m->m_ext.ext_arg2;
 	sendfile_sync_signal(sync);
 }
 
 /*
  * External-storage free routine for unmapped (M_EXTPG) sendfile mbufs.
  * Releases every page attached to the mbuf.  With EXT_FLAG_NOCACHE the
  * pages are released with VPR_TRYFREE, except that EXT_FLAG_CACHE_LAST
  * exempts the final page.  Afterwards drops the SF_SYNC reference if the
  * mbuf carries one.
  */
 static void
 sendfile_free_mext_pg(struct mbuf *m)
 {
 	struct sendfile_sync *sync;
 	int base_flags, idx, last, relflags;
 	bool keep_last;
 
 	M_ASSERTEXTPG(m);
 
 	keep_last = (m->m_ext.ext_flags & EXT_FLAG_CACHE_LAST) != 0;
 	if ((m->m_ext.ext_flags & EXT_FLAG_NOCACHE) != 0)
 		base_flags = VPR_TRYFREE;
 	else
 		base_flags = 0;
 
 	last = m->m_epg_npgs - 1;
 	for (idx = 0; idx < m->m_epg_npgs; idx++) {
 		relflags = (keep_last && idx == last) ? 0 : base_flags;
 		vm_page_release(PHYS_TO_VM_PAGE(m->m_epg_pa[idx]), relflags);
 	}
 
 	if ((m->m_ext.ext_flags & EXT_FLAG_SYNC) == 0)
 		return;
 
 	/* For unmapped mbufs the sync structure lives in ext_arg1. */
 	sync = m->m_ext.ext_arg1;
 	sendfile_sync_signal(sync);
 }
 
 /*
  * Compute how many bytes of a transfer land in page i of n, for a
  * transfer of len bytes starting at file offset off.  The first page holds
  * whatever fits between off and the end of its page (capped at len), the
  * last page holds the partial tail if the transfer does not end on a page
  * boundary, and every other page is full.
  */
 static inline off_t
 xfsize(int i, int n, off_t off, off_t len)
 {
 	off_t head, tail;
 
 	if (i == 0) {
 		head = PAGE_SIZE - (off & PAGE_MASK);
 		return (omin(head, len));
 	}
 
 	if (i == n - 1) {
 		tail = (off + len) & PAGE_MASK;
 		if (tail > 0)
 			return (tail);
 	}
 
 	return (PAGE_SIZE);
 }
 
 /*
  * Return the offset within the object at which the data for the i-th page
  * of a transfer starting at off begins: off itself for the first page,
  * and the page-aligned offset of the page for all later ones.
  */
 static inline vm_ooffset_t
 vmoff(int i, off_t off)
 {
 	vm_ooffset_t res;
 
 	if (i != 0)
 		res = trunc_page(off + i * PAGE_SIZE);
 	else
 		res = (vm_ooffset_t)off;
 
 	return (res);
 }
 
 /*
  * Shrink *space after a page or sf_buf allocation failure.  'old' is the
  * number of pages originally planned, 'new' the number that were actually
  * set up.  The byte count of every page in [new, old) is subtracted from
  * *space, as if the socket buffer had had less room to begin with.
  */
 static inline void
 fixspace(int old, int new, off_t off, int *space)
 {
 	int left;
 
 	KASSERT(old > new, ("%s: old %d new %d", __func__, old, new));
 
 	left = *space;
 
 	/* The last page may be partial; account for it first. */
 	left -= xfsize(old - 1, old, off, left);
 	old--;
 
 	if (new != old) {
 		/* The first page may be partial as well. */
 		if (new == 0) {
 			left -= xfsize(0, old, off, left);
 			new++;
 		}
 
 		/* Everything in between is a full page. */
 		left -= (old - new) * PAGE_SIZE;
 
 		KASSERT(left >= 0, ("%s: space went backwards", __func__));
 	}
 
 	*space = left;
 }
 
 /*
  * Sleep until every in-flight I/O on sfio has completed, i.e. until the
  * 'nios' count is back to 1.  Pages must not be unwired while an I/O is
  * still running on them.  wmesg is the wait message passed to pause().
  */
 static void
 sendfile_iowait(struct sf_io *sfio, const char *wmesg)
 {
 
 	for (;;) {
 		if (atomic_load_int(&sfio->nios) == 1)
 			break;
 		pause(wmesg, 1);
 	}
 }
 
 /*
  * I/O completion callback.
  *
  * Called once for every pager I/O started by sendfile_swapin(), and also
  * directly by the syscall path (with pa == NULL and count == 0) to drop
  * its own reference on the sf_io.
  *
  *	arg	the struct sf_io describing the request
  *	pa	pages of the completed run, or NULL
  *	count	number of entries in pa
  *	error	non-zero if the I/O (or the subsequent pr_send) failed
  *
  * Each call releases one 'nios' reference.  Only the call that releases
  * the last one proceeds past the refcount check: it ends the object's
  * paging-in-progress state, then either marks the queued mbufs ready,
  * hands them to KTLS for software encryption, or aborts the socket on
  * error, and finally frees the sf_io.
  */
 static void
 sendfile_iodone(void *arg, vm_page_t *pa, int count, int error)
 {
 	struct sf_io *sfio = arg;
 	struct socket *so;
 	int i;
 
 	if (error != 0)
 		sfio->error = error;
 
 	/*
 	 * Restore the valid page pointers.  They are already
 	 * unbusied, but still wired.
 	 *
 	 * XXXKIB since pages are only wired, and we do not
 	 * own the object lock, other users might have
 	 * invalidated them in meantime.  Similarly, after we
 	 * unbusied the swapped-in pages, they can become
 	 * invalid under us.
 	 */
 	MPASS(count == 0 || pa[0] != bogus_page);
 	for (i = 0; i < count; i++) {
 		if (pa[i] == bogus_page) {
 			sfio->pa[(pa[0]->pindex - sfio->pindex0) + i] =
 			    pa[i] = vm_page_relookup(sfio->obj,
 			    pa[0]->pindex + i);
 			KASSERT(pa[i] != NULL,
 			    ("%s: page %p[%d] disappeared",
 			    __func__, pa, i));
 		} else {
 			vm_page_xunbusy_unchecked(pa[i]);
 		}
 	}
 
 	/* Not the last reference: somebody else will finish the request. */
 	if (!refcount_release(&sfio->nios))
 		return;
 
 #ifdef INVARIANTS
 	/*
 	 * Sanity-check that the pages form a contiguous wired run of one
 	 * object.  The loop starts at 1, so the 'i == 0' test below never
 	 * fires and pa[0] itself is not checked for wiring.
 	 */
 	for (i = 1; i < sfio->npages; i++) {
 		if (sfio->pa[i] == NULL)
 			break;
 		KASSERT(vm_page_wired(sfio->pa[i]),
 		    ("sfio %p page %d %p not wired", sfio, i, sfio->pa[i]));
 		if (i == 0)
 			continue;
 		KASSERT(sfio->pa[0]->object == sfio->pa[i]->object,
 		    ("sfio %p page %d %p wrong owner %p %p", sfio, i,
 		    sfio->pa[i], sfio->pa[0]->object, sfio->pa[i]->object));
 		KASSERT(sfio->pa[0]->pindex + i == sfio->pa[i]->pindex,
 		    ("sfio %p page %d %p wrong index %jx %jx", sfio, i,
 		    sfio->pa[i], (uintmax_t)sfio->pa[0]->pindex,
 		    (uintmax_t)sfio->pa[i]->pindex));
 	}
 #endif
 
 	vm_object_pip_wakeup(sfio->obj);
 
 	if (sfio->m == NULL) {
 		/*
 		 * Either I/O operation failed, or we failed to allocate
 		 * buffers, or we bailed out on first busy page, or we
 		 * succeeded filling the request without any I/Os. Anyway,
 		 * pru_send hadn't been executed - nothing had been sent
 		 * to the socket yet.
 		 */
 		MPASS((curthread->td_pflags & TDP_KTHREAD) == 0);
 		free(sfio, M_SENDFILE);
 		return;
 	}
 
 #if defined(KERN_TLS) && defined(INVARIANTS)
 	if ((sfio->m->m_flags & M_EXTPG) != 0)
 		KASSERT(sfio->tls == sfio->m->m_epg_tls,
 		    ("TLS session mismatch"));
 	else
 		KASSERT(sfio->tls == NULL,
 		    ("non-ext_pgs mbuf with TLS session"));
 #endif
 	so = sfio->so;
 	CURVNET_SET(so->so_vnet);
 	if (__predict_false(sfio->error)) {
 		/*
 		 * I/O operation failed.  The state of data in the socket
 		 * is now inconsistent, and all we can do is to tear
 		 * it down. Protocol abort method would tear down protocol
 		 * state, free all ready mbufs and detach not ready ones.
 		 * We will free the mbufs corresponding to this I/O manually.
 		 *
 		 * The socket would be marked with EIO and made available
 		 * for read, so that application receives EIO on next
 		 * syscall and eventually closes the socket.
 		 */
 		so->so_proto->pr_abort(so);
 		so->so_error = EIO;
 
 		mb_free_notready(sfio->m, sfio->npages);
 #ifdef KERN_TLS
 	} else if (sfio->tls != NULL && sfio->tls->mode == TCP_TLS_MODE_SW) {
 		/*
 		 * I/O operation is complete, but we still need to
 		 * encrypt.  We cannot do this in the interrupt thread
 		 * of the disk controller, so forward the mbufs to a
 		 * different thread.
 		 *
 		 * Donate the socket reference held by sfio to the
 		 * enqueued work rather than explicitly invoking soref();
 		 * this is why sorele() below is skipped.
 		 */
 		ktls_enqueue(sfio->m, so, sfio->npages);
 		goto out_with_ref;
 #endif
 	} else
 		(void)so->so_proto->pr_ready(so, sfio->m, sfio->npages);
 
 	sorele(so);
 #ifdef KERN_TLS
 out_with_ref:
 #endif
 	CURVNET_RESTORE();
 	free(sfio, M_SENDFILE);
 }
 
 /*
  * Iterate through pages vector and request paging for non-valid pages.
  *
  *	obj	VM object the pages belong to
  *	sfio	request descriptor; sfio->pa and sfio->npages describe the
  *		page vector, sfio->pindex0 is set here
  *	nios	out: number of asynchronous pager I/Os that were started
  *	off	file offset of the first byte to send
  *	len	number of bytes to send
  *	rhpages	readahead pages allowed after the last required page
  *	flags	sendfile flags; SF_NODISKIO makes the page grab non-sleeping
  *
  * Returns 0 on success.  If the pager refuses an I/O, waits for the I/Os
  * already started, unwires every page, clears the vector and returns EIO.
  * Pages that could not be grabbed are left as NULL entries in the vector.
  */
 static int
 sendfile_swapin(vm_object_t obj, struct sf_io *sfio, int *nios, off_t off,
     off_t len, int rhpages, int flags)
 {
 	vm_page_t *pa;
 	int a, count, count1, grabbed, i, j, npages, rv;
 
 	pa = sfio->pa;
 	npages = sfio->npages;
 	*nios = 0;
 	flags = (flags & SF_NODISKIO) ? VM_ALLOC_NOWAIT : 0;
 	sfio->pindex0 = OFF_TO_IDX(off);
 
 	/*
 	 * First grab all the pages and wire them.  Note that we grab
 	 * only required pages.  Readahead pages are dealt with later.
 	 */
 	grabbed = vm_page_grab_pages_unlocked(obj, OFF_TO_IDX(off),
 	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | flags, pa, npages);
 	if (grabbed < npages) {
 		/* Note: this 'i' shadows the function-scope 'i'. */
 		for (int i = grabbed; i < npages; i++)
 			pa[i] = NULL;
 		npages = grabbed;
 		rhpages = 0;
 	}
 
 	for (i = 0; i < npages;) {
 		/* Skip valid pages. */
 		if (vm_page_is_valid(pa[i], vmoff(i, off) & PAGE_MASK,
 		    xfsize(i, npages, off, len))) {
 			vm_page_xunbusy(pa[i]);
 			SFSTAT_INC(sf_pages_valid);
 			i++;
 			continue;
 		}
 
 		/*
 		 * Next page is invalid.  Check if it belongs to pager.  It
 		 * may not be there, which is a regular situation for shmem
 		 * pager.  For vnode pager this happens only in case of
 		 * a sparse file.
 		 *
 		 * Important feature of vm_pager_has_page() is the hint
 		 * stored in 'a', about how many pages we can pagein after
 		 * this page in a single I/O.
 		 */
 		VM_OBJECT_RLOCK(obj);
 		if (!vm_pager_has_page(obj, OFF_TO_IDX(vmoff(i, off)), NULL,
 		    &a)) {
 			VM_OBJECT_RUNLOCK(obj);
 			/* No backing store: supply a zero-filled page. */
 			pmap_zero_page(pa[i]);
 			vm_page_valid(pa[i]);
 			MPASS(pa[i]->dirty == 0);
 			vm_page_xunbusy(pa[i]);
 			i++;
 			continue;
 		}
 		VM_OBJECT_RUNLOCK(obj);
 
 		/*
 		 * We want to pagein as many pages as possible, limited only
 		 * by the 'a' hint and actual request.
 		 */
 		count = min(a + 1, npages - i);
 
 		/*
 		 * We should not pagein into a valid page because
 		 * there might be still unfinished write tracked by
 		 * e.g. a buffer, thus we substitute any valid pages
 		 * with the bogus one.
 		 *
 		 * We must not leave around xbusy pages which are not
 		 * part of the run passed to vm_pager_getpages(),
 		 * otherwise pager might deadlock waiting for the busy
 		 * status of the page, e.g. if it constitutes the
 		 * buffer needed to validate other page.
 		 *
 		 * First trim the end of the run consisting of the
 		 * valid pages, then replace the rest of the valid
 		 * with bogus.
 		 */
 		count1 = count;
 		for (j = i + count - 1; j > i; j--) {
 			if (vm_page_is_valid(pa[j], vmoff(j, off) & PAGE_MASK,
 			    xfsize(j, npages, off, len))) {
 				vm_page_xunbusy(pa[j]);
 				SFSTAT_INC(sf_pages_valid);
 				count--;
 			} else {
 				break;
 			}
 		}
 
 		/*
 		 * The last page in the run pa[i + count - 1] is
 		 * guaranteed to be invalid by the trim above, so it
 		 * is not replaced with bogus, thus -1 in the loop end
 		 * condition.
 		 */
 		MPASS(pa[i + count - 1]->valid != VM_PAGE_BITS_ALL);
 		for (j = i + 1; j < i + count - 1; j++) {
 			if (vm_page_is_valid(pa[j], vmoff(j, off) & PAGE_MASK,
 			    xfsize(j, npages, off, len))) {
 				vm_page_xunbusy(pa[j]);
 				SFSTAT_INC(sf_pages_valid);
 				SFSTAT_INC(sf_pages_bogus);
 				pa[j] = bogus_page;
 			}
 		}
 
 		/* Account for the I/O before it can possibly complete. */
 		refcount_acquire(&sfio->nios);
 		rv = vm_pager_get_pages_async(obj, pa + i, count, NULL,
 		    i + count == npages ? &rhpages : NULL,
 		    &sendfile_iodone, sfio);
 		if (__predict_false(rv != VM_PAGER_OK)) {
 			sendfile_iowait(sfio, "sferrio");
 
 			/*
 			 * Do remaining pages recovery before returning EIO.
 			 * Pages from 0 to npages are wired.
 			 * Pages from (i + count1) to npages are busied.
 			 */
 			for (j = 0; j < npages; j++) {
 				if (j >= i + count1)
 					vm_page_xunbusy(pa[j]);
 				KASSERT(pa[j] != NULL && pa[j] != bogus_page,
 				    ("%s: page %p[%d] I/O recovery failure",
 				    __func__, pa, j));
 				vm_page_unwire(pa[j], PQ_INACTIVE);
 				pa[j] = NULL;
 			}
 			return (EIO);
 		}
 
 		SFSTAT_INC(sf_iocnt);
 		SFSTAT_ADD(sf_pages_read, count);
 		if (i + count == npages)
 			SFSTAT_ADD(sf_rhpages_read, rhpages);
 
 		/* Skip the whole run, including the trimmed valid tail. */
 		i += count1;
 		(*nios)++;
 	}
 
 	if (*nios == 0 && npages != 0)
 		SFSTAT_INC(sf_noiocnt);
 
 	return (0);
 }
 
 /*
  * Resolve the file being sent into its backing VM object.
  *
  * Accepts DTYPE_VNODE files that are regular (VREG) and have a VM object,
  * and DTYPE_SHM files; anything else yields EINVAL.  A dead object
  * (OBJ_DEAD) yields EBADF.  On success a reference is taken on the object
  * and *obj_res, *vp_res and *shmfd_res are filled in (vp or shmfd is NULL
  * depending on the file type).  *obj_size receives the object size and
  * *bsize the mount's f_iosize for vnodes, 0 otherwise.  The vnode, if
  * any, is locked shared only for the duration of this function.
  */
 static int
 sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res,
     struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size,
     int *bsize)
 {
 	vm_object_t obj;
 	struct vnode *vp;
 	struct shmfd *shmfd;
 	int error;
 
 	error = 0;
 	vp = *vp_res = NULL;
 	obj = NULL;
 	shmfd = *shmfd_res = NULL;
 	*bsize = 0;
 
 	/*
 	 * The file descriptor must be a regular file and have a
 	 * backing VM object.
 	 */
 	if (fp->f_type == DTYPE_VNODE) {
 		vp = fp->f_vnode;
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		if (vp->v_type != VREG) {
 			error = EINVAL;
 			goto out;
 		}
 		*bsize = vp->v_mount->mnt_stat.f_iosize;
 		obj = vp->v_object;
 		if (obj == NULL) {
 			error = EINVAL;
 			goto out;
 		}
 
 		/*
 		 * Use the pager size when available to simplify synchronization
 		 * with filesystems, which otherwise must atomically update both
 		 * the vnode pager size and file size.
 		 */
 		if (obj->type == OBJT_VNODE) {
 			VM_OBJECT_RLOCK(obj);
 			*obj_size = obj->un_pager.vnp.vnp_size;
 		} else {
 			error = vn_getsize_locked(vp, obj_size, td->td_ucred);
 			if (error != 0)
 				goto out;
 			VM_OBJECT_RLOCK(obj);
 		}
 	} else if (fp->f_type == DTYPE_SHM) {
 		shmfd = fp->f_data;
 		obj = shmfd->shm_object;
 		VM_OBJECT_RLOCK(obj);
 		*obj_size = shmfd->shm_size;
 	} else {
 		error = EINVAL;
 		goto out;
 	}
 
 	/* Every path reaching this point holds the object read lock. */
 	if ((obj->flags & OBJ_DEAD) != 0) {
 		VM_OBJECT_RUNLOCK(obj);
 		error = EBADF;
 		goto out;
 	}
 
 	/*
 	 * Temporarily increase the backing VM object's reference
 	 * count so that a forced reclamation of its vnode does not
 	 * immediately destroy it.
 	 */
 	vm_object_reference_locked(obj);
 	VM_OBJECT_RUNLOCK(obj);
 	*obj_res = obj;
 	*vp_res = vp;
 	*shmfd_res = shmfd;
 
 out:
 	if (vp != NULL)
 		VOP_UNLOCK(vp);
 	return (error);
 }
 
 /*
  * Look up descriptor s as a socket suitable for sendfile().  On return
  * *sock_fp holds the file reference obtained from getsock() and *so the
  * socket; both are NULL if the lookup itself failed.  Note that *so is
  * also set when the socket is subsequently rejected with EINVAL, so the
  * caller can still drop the file reference.
  */
 static int
 sendfile_getsock(struct thread *td, int s, struct file **sock_fp,
     struct socket **so)
 {
 	struct socket *sock;
 	int error;
 
 	*sock_fp = NULL;
 	*so = NULL;
 
 	error = getsock(td, s, &cap_send_rights, sock_fp);
 	if (error != 0)
 		return (error);
 
 	sock = (*sock_fp)->f_data;
 	*so = sock;
 
 	/*
 	 * Only stream sockets are supported.  SCTP one-to-one style
 	 * sockets currently don't work with sendfile() either, so they
 	 * are rejected with EINVAL as well.
 	 */
 	if (sock->so_type != SOCK_STREAM ||
 	    sock->so_proto->pr_protocol == IPPROTO_SCTP)
 		return (EINVAL);
 
 	return (0);
 }
 
 /*
  * Send the contents of a file (vnode or shared memory object) over a
  * stream socket, optionally preceded by headers and followed by trailers.
  *
  *	fp	file to send from
  *	sockfd	descriptor of the destination socket
  *	hdr_uio	optional data to send before the file contents, or NULL
  *	trl_uio	optional data to send after the file contents, or NULL
  *	offset	starting offset within the file
  *	nbytes	number of bytes to send; 0 means up to the end of the object
  *	sent	if not NULL, receives the number of bytes queued, also on
  *		error
  *	flags	SF_* flags (SF_SYNC, SF_NOCACHE, SF_NODISKIO,
  *		SF_USER_READAHEAD and the readahead count)
  *	td	calling thread
  *
  * Returns 0 or an errno value; ERESTART is converted to EINTR.
  */
 int
 vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
     struct thread *td)
 {
 	struct file *sock_fp;
 	struct vnode *vp;
 	struct vm_object *obj;
 	vm_page_t pga;
 	struct socket *so;
 #ifdef KERN_TLS
 	struct ktls_session *tls;
 #endif
 	struct mbuf *m, *mh, *mhtail;
 	struct sf_buf *sf;
 	struct shmfd *shmfd;
 	struct sendfile_sync *sfs;
 	struct vattr va;
 	off_t off, sbytes, rem, obj_size, nobj_size;
 	int bsize, error, ext_pgs_idx, hdrlen, max_pgs, softerr;
 #ifdef KERN_TLS
 	int tls_enq_cnt;
 #endif
 	bool use_ext_pgs;
 
 	obj = NULL;
 	so = NULL;
 	m = mh = NULL;
 	sfs = NULL;
 #ifdef KERN_TLS
 	tls = NULL;
 #endif
 	hdrlen = sbytes = 0;
 	softerr = 0;
 	use_ext_pgs = false;
 
 	error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize);
 	if (error != 0)
 		return (error);
 
 	error = sendfile_getsock(td, sockfd, &sock_fp, &so);
 	if (error != 0)
 		goto out;
 
 #ifdef MAC
 	error = mac_socket_check_send(td->td_ucred, so);
 	if (error != 0)
 		goto out;
 #endif
 
 	SFSTAT_INC(sf_syscalls);
 	SFSTAT_ADD(sf_rhpages_requested, SF_READAHEAD(flags));
 
 	if (flags & SF_SYNC) {
 		sfs = malloc(sizeof(*sfs), M_SENDFILE, M_WAITOK | M_ZERO);
 		mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
 		cv_init(&sfs->cv, "sendfile");
 		sfs->waiting = true;
 	}
 
 	rem = nbytes ? omin(nbytes, obj_size - offset) : obj_size - offset;
 
 	/*
 	 * Protect against multiple writers to the socket.
 	 *
 	 * XXXRW: Historically this has assumed non-interruptibility, so now
 	 * we implement that, but possibly shouldn't.
 	 */
 	error = SOCK_IO_SEND_LOCK(so, SBL_WAIT | SBL_NOINTR);
 	if (error != 0)
 		goto out;
 #ifdef KERN_TLS
 	tls = ktls_hold(so->so_snd.sb_tls_info);
 #endif
 
 	/*
 	 * Loop through the pages of the file, starting with the requested
 	 * offset. Get a file page (do I/O if necessary), map the file page
 	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
 	 * it on the socket.
 	 * This is done in two loops.  The inner loop turns as many pages
 	 * as it can, up to available socket buffer space, without blocking
 	 * into mbufs to have it bulk delivered into the socket send buffer.
 	 * The outer loop checks the state and available space of the socket
 	 * and takes care of the overall progress.
 	 */
 	for (off = offset; rem > 0; ) {
 		struct sf_io *sfio;
 		vm_page_t *pa;
 		struct mbuf *m0, *mtail;
 		int nios, space, npages, rhpages;
 
 		mtail = NULL;
 		/*
 		 * Check the socket state for ongoing connection,
 		 * no errors and space in socket buffer.
 		 * If space is low allow for the remainder of the
 		 * file to be processed if it fits the socket buffer.
 		 * Otherwise block in waiting for sufficient space
 		 * to proceed, or if the socket is nonblocking, return
 		 * to userland with EAGAIN while reporting how far
 		 * we've come.
 		 * We wait until the socket buffer has significant free
 		 * space to do bulk sends.  This makes good use of file
 		 * system read ahead and allows packet segmentation
 		 * offloading hardware to take over lots of work.  If
 		 * we were not careful here we would send off only one
 		 * sfbuf at a time.
 		 */
 		SOCKBUF_LOCK(&so->so_snd);
 		if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
 			so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
 retry_space:
 		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 			error = EPIPE;
 			SOCKBUF_UNLOCK(&so->so_snd);
 			goto done;
 		} else if (so->so_error) {
 			error = so->so_error;
 			so->so_error = 0;
 			SOCKBUF_UNLOCK(&so->so_snd);
 			goto done;
 		}
 		if ((so->so_state & SS_ISCONNECTED) == 0) {
 			SOCKBUF_UNLOCK(&so->so_snd);
 			error = ENOTCONN;
 			goto done;
 		}
 
 		space = sbspace(&so->so_snd);
 		if (space < rem &&
 		    (space <= 0 ||
 		     space < so->so_snd.sb_lowat)) {
 			if (so->so_state & SS_NBIO) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				error = EAGAIN;
 				goto done;
 			}
 			/*
 			 * sbwait drops the lock while sleeping.
 			 * When we loop back to retry_space the
 			 * state may have changed and we retest
 			 * for it.
 			 */
 			error = sbwait(so, SO_SND);
 			/*
 			 * An error from sbwait usually indicates that we've
 			 * been interrupted by a signal. If we've sent anything
 			 * then return bytes sent, otherwise return the error.
 			 */
 			if (error != 0) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				goto done;
 			}
 			goto retry_space;
 		}
 		SOCKBUF_UNLOCK(&so->so_snd);
 
 		/*
 		 * At the beginning of the first loop check if any headers
 		 * are specified and copy them into mbufs.  Reduce space in
 		 * the socket buffer by the size of the header mbuf chain.
 		 * Clear hdr_uio here and hdrlen at the end of the first loop.
 		 */
 		if (hdr_uio != NULL && hdr_uio->uio_resid > 0) {
 			hdr_uio->uio_td = td;
 			hdr_uio->uio_rw = UIO_WRITE;
 #ifdef KERN_TLS
 			if (tls != NULL)
 				mh = m_uiotombuf(hdr_uio, M_WAITOK, space,
 				    tls->params.max_frame_len, M_EXTPG);
 			else
 #endif
 				mh = m_uiotombuf(hdr_uio, M_WAITOK,
 				    space, 0, 0);
 			hdrlen = m_length(mh, &mhtail);
 			space -= hdrlen;
 			/*
 			 * If header consumed all the socket buffer space,
 			 * don't waste CPU cycles and jump to the end.
 			 */
 			if (space == 0) {
 				sfio = NULL;
 				nios = 0;
 				goto prepend_header;
 			}
 			hdr_uio = NULL;
 		}
 
 		if (vp != NULL) {
 			error = vn_lock(vp, LK_SHARED);
 			if (error != 0)
 				goto done;
 
 			/*
 			 * Check to see if the file size has changed.
 			 */
 			if (obj->type == OBJT_VNODE) {
 				VM_OBJECT_RLOCK(obj);
 				nobj_size = obj->un_pager.vnp.vnp_size;
 				VM_OBJECT_RUNLOCK(obj);
 			} else {
 				error = VOP_GETATTR(vp, &va, td->td_ucred);
 				if (error != 0) {
 					VOP_UNLOCK(vp);
 					goto done;
 				}
 				nobj_size = va.va_size;
 			}
 			if (off >= nobj_size) {
 				VOP_UNLOCK(vp);
 				goto done;
 			}
 			if (nobj_size != obj_size) {
 				obj_size = nobj_size;
 				rem = nbytes ? omin(nbytes + offset, obj_size) :
 				    obj_size;
 				rem -= off;
 			}
 		}
 
 		if (space > rem)
 			space = rem;
 		else if (space > PAGE_SIZE) {
 			/*
 			 * Use page boundaries when possible for large
 			 * requests.
 			 */
 			if (off & PAGE_MASK)
 				space -= (PAGE_SIZE - (off & PAGE_MASK));
 			space = trunc_page(space);
 			if (off & PAGE_MASK)
 				space += (PAGE_SIZE - (off & PAGE_MASK));
 		}
 
 		npages = howmany(space + (off & PAGE_MASK), PAGE_SIZE);
 
 		/*
 		 * Calculate maximum allowed number of pages for readahead
 		 * at this iteration.  If SF_USER_READAHEAD was set, we don't
 		 * do any heuristics and use exactly the value supplied by
 		 * application.  Otherwise, we allow readahead up to "rem".
 		 * If application wants more, let it be, but there is no
 		 * reason to go above maxphys.  Also check against "obj_size",
 		 * since vm_pager_has_page() can hint beyond EOF.
 		 */
 		if (flags & SF_USER_READAHEAD) {
 			rhpages = SF_READAHEAD(flags);
 		} else {
 			rhpages = howmany(rem + (off & PAGE_MASK), PAGE_SIZE) -
 			    npages;
 			rhpages += SF_READAHEAD(flags);
 		}
 		rhpages = min(howmany(maxphys, PAGE_SIZE), rhpages);
 		rhpages = min(howmany(obj_size - trunc_page(off), PAGE_SIZE) -
 		    npages, rhpages);
 
 		sfio = malloc(sizeof(struct sf_io) +
 		    npages * sizeof(vm_page_t), M_SENDFILE, M_WAITOK);
 		refcount_init(&sfio->nios, 1);
 		sfio->obj = obj;
 		sfio->error = 0;
 		sfio->m = NULL;
 		sfio->npages = npages;
 #ifdef KERN_TLS
 		/*
 		 * This doesn't use ktls_hold() because sfio->m will
 		 * also have a reference on 'tls' that will be valid
 		 * for all of sfio's lifetime.
 		 */
 		sfio->tls = tls;
 #endif
 		vm_object_pip_add(obj, 1);
 		error = sendfile_swapin(obj, sfio, &nios, off, space, rhpages,
 		    flags);
 		if (error != 0) {
 			if (vp != NULL)
 				VOP_UNLOCK(vp);
 			sendfile_iodone(sfio, NULL, 0, error);
 			goto done;
 		}
 
 		/*
 		 * Loop and construct maximum sized mbuf chain to be bulk
 		 * dumped into socket buffer.
 		 */
 		pa = sfio->pa;
 
 		/*
 		 * Use unmapped mbufs if enabled for TCP.  Unmapped
 		 * bufs are restricted to TCP as that is what has been
 		 * tested.  In particular, unmapped mbufs have not
 		 * been tested with UNIX-domain sockets.
 		 *
 		 * TLS frames always require unmapped mbufs.
 		 */
 		if ((mb_use_ext_pgs &&
 		    so->so_proto->pr_protocol == IPPROTO_TCP)
 #ifdef KERN_TLS
 		    || tls != NULL
 #endif
 		    ) {
 			use_ext_pgs = true;
 #ifdef KERN_TLS
 			if (tls != NULL)
 				max_pgs = num_pages(tls->params.max_frame_len);
 			else
 #endif
 				max_pgs = MBUF_PEXT_MAX_PGS;
 
 			/* Start at last index, to wrap on first use. */
 			ext_pgs_idx = max_pgs - 1;
 		}
 
 		for (int i = 0; i < npages; i++) {
 			/*
 			 * If a page wasn't grabbed successfully, then
 			 * trim the array. Can happen only with SF_NODISKIO.
 			 */
 			if (pa[i] == NULL) {
 				SFSTAT_INC(sf_busy);
 				fixspace(npages, i, off, &space);
 				sfio->npages = i;
 				softerr = EBUSY;
 				break;
 			}
 			pga = pa[i];
 			if (pga == bogus_page)
 				pga = vm_page_relookup(obj, sfio->pindex0 + i);
 
 			if (use_ext_pgs) {
 				off_t xfs;
 
 				ext_pgs_idx++;
 				if (ext_pgs_idx == max_pgs) {
 					m0 = mb_alloc_ext_pgs(M_WAITOK,
 					    sendfile_free_mext_pg);
 
 					if (flags & SF_NOCACHE) {
 						m0->m_ext.ext_flags |=
 						    EXT_FLAG_NOCACHE;
 
 						/*
 						 * See comment below regarding
 						 * ignoring SF_NOCACHE for the
 						 * last page.
 						 */
 						if ((npages - i <= max_pgs) &&
 						    ((off + space) & PAGE_MASK) &&
 						    (rem > space || rhpages > 0))
 							m0->m_ext.ext_flags |=
 							    EXT_FLAG_CACHE_LAST;
 					}
 					if (sfs != NULL) {
 						m0->m_ext.ext_flags |=
 						    EXT_FLAG_SYNC;
 						m0->m_ext.ext_arg1 = sfs;
 						mtx_lock(&sfs->mtx);
 						sfs->count++;
 						mtx_unlock(&sfs->mtx);
 					}
 					ext_pgs_idx = 0;
 
 					/* Append to mbuf chain. */
 					if (mtail != NULL)
 						mtail->m_next = m0;
 					else
 						m = m0;
 					mtail = m0;
 					m0->m_epg_1st_off =
 					    vmoff(i, off) & PAGE_MASK;
 				}
 				if (nios) {
 					mtail->m_flags |= M_NOTREADY;
 					m0->m_epg_nrdy++;
 				}
 
 				m0->m_epg_pa[ext_pgs_idx] = VM_PAGE_TO_PHYS(pga);
 				m0->m_epg_npgs++;
 				xfs = xfsize(i, npages, off, space);
 				m0->m_epg_last_len = xfs;
 				MBUF_EXT_PGS_ASSERT_SANITY(m0);
 				mtail->m_len += xfs;
 				mtail->m_ext.ext_size += PAGE_SIZE;
 				continue;
 			}
 
 			/*
 			 * Get a sendfile buf.  When allocating the
 			 * first buffer for mbuf chain, we usually
 			 * wait as long as necessary, but this wait
 			 * can be interrupted.  For consequent
 			 * buffers, do not sleep, since several
 			 * threads might exhaust the buffers and then
 			 * deadlock.
 			 */
 			sf = sf_buf_alloc(pga,
 			    m != NULL ? SFB_NOWAIT : SFB_CATCH);
 			if (sf == NULL) {
 				SFSTAT_INC(sf_allocfail);
 				sendfile_iowait(sfio, "sfnosf");
 				for (int j = i; j < npages; j++) {
 					vm_page_unwire(pa[j], PQ_INACTIVE);
 					pa[j] = NULL;
 				}
 				if (m == NULL)
 					softerr = ENOBUFS;
 				fixspace(npages, i, off, &space);
 				sfio->npages = i;
 				break;
 			}
 
 			m0 = m_get(M_WAITOK, MT_DATA);
 			m0->m_ext.ext_buf = (char *)sf_buf_kva(sf);
 			m0->m_ext.ext_size = PAGE_SIZE;
 			m0->m_ext.ext_arg1 = sf;
 			m0->m_ext.ext_type = EXT_SFBUF;
 			m0->m_ext.ext_flags = EXT_FLAG_EMBREF;
 			m0->m_ext.ext_free = sendfile_free_mext;
 			/*
 			 * SF_NOCACHE sets the page as being freed upon send.
 			 * However, we ignore it for the last page in 'space',
 			 * if the page is truncated, and we got more data to
 			 * send (rem > space), or if we have readahead
 			 * configured (rhpages > 0).
 			 */
 			if ((flags & SF_NOCACHE) &&
 			    (i != npages - 1 ||
 			    !((off + space) & PAGE_MASK) ||
 			    !(rem > space || rhpages > 0)))
 				m0->m_ext.ext_flags |= EXT_FLAG_NOCACHE;
 			if (sfs != NULL) {
 				m0->m_ext.ext_flags |= EXT_FLAG_SYNC;
 				m0->m_ext.ext_arg2 = sfs;
 				mtx_lock(&sfs->mtx);
 				sfs->count++;
 				mtx_unlock(&sfs->mtx);
 			}
 			m0->m_ext.ext_count = 1;
 			m0->m_flags |= (M_EXT | M_RDONLY);
 			if (nios)
 				m0->m_flags |= M_NOTREADY;
 			m0->m_data = (char *)sf_buf_kva(sf) +
 			    (vmoff(i, off) & PAGE_MASK);
 			m0->m_len = xfsize(i, npages, off, space);
 
 			/* Append to mbuf chain. */
 			if (mtail != NULL)
 				mtail->m_next = m0;
 			else
 				m = m0;
 			mtail = m0;
 		}
 
 		if (vp != NULL)
 			VOP_UNLOCK(vp);
 
 		/* Keep track of bytes processed. */
 		off += space;
 		rem -= space;
 
 		/*
 		 * Prepend header, if any.  Save pointer to first mbuf
 		 * with a page.
 		 */
 		if (hdrlen) {
 prepend_header:
 			m0 = mhtail->m_next = m;
 			m = mh;
 			mh = NULL;
 		} else
 			m0 = m;
 
 		if (m == NULL) {
 			KASSERT(softerr, ("%s: m NULL, no error", __func__));
 			error = softerr;
 			sendfile_iodone(sfio, NULL, 0, 0);
 			goto done;
 		}
 
 		/* Add the buffer chain to the socket buffer. */
 		KASSERT(m_length(m, NULL) == space + hdrlen,
 		    ("%s: mlen %u space %d hdrlen %d",
 		    __func__, m_length(m, NULL), space, hdrlen));
 
 		CURVNET_SET(so->so_vnet);
 #ifdef KERN_TLS
 		if (tls != NULL)
 			ktls_frame(m, tls, &tls_enq_cnt, TLS_RLTYPE_APP);
 #endif
 		if (nios == 0) {
 			/*
 			 * If sendfile_swapin() didn't initiate any I/Os,
 			 * which happens if all data is cached in VM, or if
 			 * the header consumed all socket buffer space and
 			 * sfio is NULL, then we can send data right now
 			 * without the PRUS_NOTREADY flag.
 			 */
 			if (sfio != NULL)
 				sendfile_iodone(sfio, NULL, 0, 0);
 #ifdef KERN_TLS
 			if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) {
 				error = so->so_proto->pr_send(so,
 				    PRUS_NOTREADY, m, NULL, NULL, td);
 				if (error != 0) {
 					m_freem(m);
 				} else {
 					soref(so);
 					ktls_enqueue(m, so, tls_enq_cnt);
 				}
 			} else
 #endif
 				error = so->so_proto->pr_send(so, 0, m, NULL,
 				    NULL, td);
 		} else {
 			/*
 			 * I/Os are pending: queue the data as not ready and
 			 * let the last sendfile_iodone() mark it ready.  The
 			 * socket reference taken here is released there.
 			 */
 			sfio->so = so;
 			sfio->m = m0;
 			soref(so);
 			error = so->so_proto->pr_send(so, PRUS_NOTREADY, m,
 			    NULL, NULL, td);
 			sendfile_iodone(sfio, NULL, 0, error);
 		}
 #ifdef TCP_REQUEST_TRK
 		if (so->so_proto->pr_protocol == IPPROTO_TCP) {
 			/* log the sendfile call to the TCP log, if enabled */
 			tcp_log_sendfile(so, offset, nbytes, flags);
 		}
 #endif
 		CURVNET_RESTORE();
 
 		m = NULL;
 		if (error)
 			goto done;
 		sbytes += space + hdrlen;
 		if (hdrlen)
 			hdrlen = 0;
 		if (softerr) {
 			error = softerr;
 			goto done;
 		}
 	}
 
 	/*
 	 * Send trailers. Wimp out and use writev(2).
 	 */
 	if (trl_uio != NULL) {
 		SOCK_IO_SEND_UNLOCK(so);
 		error = kern_writev(td, sockfd, trl_uio);
 		if (error == 0)
 			sbytes += td->td_retval[0];
 		goto out;
 	}
 
 done:
 	SOCK_IO_SEND_UNLOCK(so);
 out:
 	/*
 	 * If there was no error we have to clear td->td_retval[0]
 	 * because it may have been set by writev.
 	 */
 	if (error == 0) {
 		td->td_retval[0] = 0;
 	}
 	if (sent != NULL) {
 		(*sent) = sbytes;
 	}
 	if (obj != NULL)
 		vm_object_deallocate(obj);
 	if (so)
 		fdrop(sock_fp, td);
 	if (m)
 		m_freem(m);
 	if (mh)
 		m_freem(mh);
 
 	/*
 	 * SF_SYNC: wait until every mbuf referencing sfs has been freed.
 	 * If the wait returns with references still outstanding, clear
 	 * 'waiting' so that the last sendfile_sync_signal() destroys sfs.
 	 */
 	if (sfs != NULL) {
 		mtx_lock(&sfs->mtx);
 		if (sfs->count != 0)
 			error = cv_wait_sig(&sfs->cv, &sfs->mtx);
 		if (sfs->count == 0) {
 			sendfile_sync_destroy(sfs);
 		} else {
 			sfs->waiting = false;
 			mtx_unlock(&sfs->mtx);
 		}
 	}
 #ifdef KERN_TLS
 	if (tls != NULL)
 		ktls_free(tls);
 #endif
 
 	if (error == ERESTART)
 		error = EINTR;
 
 	return (error);
 }
 
 /*
  * Common implementation behind sys_sendfile() and freebsd4_sendfile().
  *
  * td     - calling thread.
  * uap    - system call arguments (fd, s, offset, nbytes, hdtr, sbytes,
  *          flags).  uap->nbytes may be modified when 'compat' is set.
  * compat - non-zero selects the FreeBSD < 5.0 meaning of nbytes, where
  *          the count also covered the header; only honoured when the
  *          kernel is built with COMPAT_FREEBSD4.
  *
  * Returns 0 on success, EINVAL for a negative offset, or the error
  * reported by copyin(), copyinuio(), fget_read() or fo_sendfile().
  * Once fo_sendfile() has been called, the byte count is copied out to
  * uap->sbytes (when non-NULL) whether or not it reported an error.
  */
 static int
 sendfile(struct thread *td, struct sendfile_args *uap, int compat)
 {
 	struct sf_hdtr hdtr;
 	struct uio *hdr_uio, *trl_uio;
 	struct file *fp;
 	off_t sbytes;
 	int error;
 
 	/*
 	 * File offset must be positive.  If it goes beyond EOF
 	 * we send only the header/trailer and no payload data.
 	 */
 	if (uap->offset < 0)
 		return (EINVAL);
 
 	sbytes = 0;
 	/* Start out NULL so the cleanup at 'out' is valid on every path. */
 	hdr_uio = trl_uio = NULL;
 
 	/* Fetch the optional header/trailer description from userspace. */
 	if (uap->hdtr != NULL) {
 		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
 		if (error != 0)
 			goto out;
 		if (hdtr.headers != NULL) {
 			error = copyinuio(hdtr.headers, hdtr.hdr_cnt,
 			    &hdr_uio);
 			if (error != 0)
 				goto out;
 #ifdef COMPAT_FREEBSD4
 			/*
 			 * In FreeBSD < 5.0 the nbytes to send also included
 			 * the header.  If compat is specified subtract the
 			 * header size from nbytes.
 			 */
 			if (compat) {
 				/* Clamp at zero rather than wrapping around. */
 				if (uap->nbytes > hdr_uio->uio_resid)
 					uap->nbytes -= hdr_uio->uio_resid;
 				else
 					uap->nbytes = 0;
 			}
 #endif
 		}
 		if (hdtr.trailers != NULL) {
 			error = copyinuio(hdtr.trailers, hdtr.trl_cnt,
 			    &trl_uio);
 			if (error != 0)
 				goto out;
 		}
 	}
 
 	AUDIT_ARG_FD(uap->fd);
 
 	/*
 	 * sendfile(2) can start at any offset within a file so we require
 	 * CAP_READ+CAP_SEEK = CAP_PREAD.
 	 */
 	if ((error = fget_read(td, uap->fd, &cap_pread_rights, &fp)) != 0)
 		goto out;
 
 	error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset,
 	    uap->nbytes, &sbytes, uap->flags, td);
 	fdrop(fp, td);
 
 	/*
 	 * Report the byte count even if the transfer failed.  A copyout()
 	 * failure is deliberately ignored so that it does not replace the
 	 * result of fo_sendfile().
 	 */
 	if (uap->sbytes != NULL)
 		(void)copyout(&sbytes, uap->sbytes, sizeof(off_t));
 
 out:
-	free(hdr_uio, M_IOV);
-	free(trl_uio, M_IOV);
+	freeuio(hdr_uio);
+	freeuio(trl_uio);
 	return (error);
 }
 
 /*
  * sendfile(2) system call entry point.
  *
  * int sendfile(int fd, int s, off_t offset, size_t nbytes,
  *       struct sf_hdtr *hdtr, off_t *sbytes, int flags)
  *
  * Transmit the file referenced by 'fd', beginning at 'offset', over the
  * socket 's'.  At most 'nbytes' bytes of the file are sent; nbytes == 0
  * means "until EOF".  A header and/or trailer may be added to the socket
  * output through 'hdtr'.  When 'sbytes' is given, the total number of
  * bytes sent is stored in *sbytes.
  */
 int
 sys_sendfile(struct thread *td, struct sendfile_args *uap)
 {
 	int error;
 
 	/* compat == 0: no FreeBSD 4 adjustment of nbytes. */
 	error = sendfile(td, uap, 0);
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD4
 int
 freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
 {
 	struct sendfile_args args;
 
 	args.fd = uap->fd;
 	args.s = uap->s;
 	args.offset = uap->offset;
 	args.nbytes = uap->nbytes;
 	args.hdtr = uap->hdtr;
 	args.sbytes = uap->sbytes;
 	args.flags = uap->flags;
 
 	return (sendfile(td, &args, 1));
 }
 #endif /* COMPAT_FREEBSD4 */
diff --git a/sys/kern/subr_prf.c b/sys/kern/subr_prf.c
index f2be2f0dc664..60040a085162 100644
--- a/sys/kern/subr_prf.c
+++ b/sys/kern/subr_prf.c
@@ -1,1335 +1,1335 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1986, 1988, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)subr_prf.c	8.3 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 #ifdef _KERNEL
 #include "opt_ddb.h"
 #include "opt_printf.h"
 #endif  /* _KERNEL */
 
 #include <sys/param.h>
 #ifdef _KERNEL
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/kdb.h>
 #include <sys/mutex.h>
 #include <sys/sx.h>
 #include <sys/kernel.h>
 #include <sys/msgbuf.h>
 #include <sys/malloc.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/stddef.h>
 #include <sys/sysctl.h>
 #include <sys/tslog.h>
 #include <sys/tty.h>
 #include <sys/syslog.h>
 #include <sys/cons.h>
 #include <sys/uio.h>
 #else /* !_KERNEL */
 #include <errno.h>
 #endif
 #include <sys/ctype.h>
 #include <sys/sbuf.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 /*
  * Note that stdarg.h and the ANSI style va_start macro is used for both
  * ANSI and traditional C compilers.
  */
 #ifdef _KERNEL
 #include <machine/stdarg.h>
 #else
 #include <stdarg.h>
 #endif
 
 /*
  * This is needed for sbuf_putbuf() when compiled into userland.  Due to the
  * shared nature of this file, it's the only place to put it.
  */
 #ifndef _KERNEL
 #include <stdio.h>
 #endif
 
 #ifdef _KERNEL
 
 #define TOCONS	0x01
 #define TOTTY	0x02
 #define TOLOG	0x04
 
 /* Max number conversion buffer length: a u_quad_t in base 2, plus NUL byte. */
 #define MAXNBUF	(sizeof(intmax_t) * NBBY + 1)
 
 /*
  * Per-call output state handed to putchar() through kvprintf()'s opaque
  * argument.
  */
 struct putchar_arg {
 	int	flags;		/* TOCONS / TOTTY / TOLOG destination bits */
 	int	pri;		/* priority passed on to the log routines */
 	struct	tty *tty;	/* terminal written when TOTTY is set */
 	char	*p_bufr;	/* staging buffer, NULL for unbuffered output */
 	size_t	n_bufr;		/* size of p_bufr */
 	char	*p_next;	/* next free position in p_bufr */
 	size_t	remain;		/* space left in p_bufr */
 };
 
 /* Output cursor used by snprintf_func() on behalf of vsnprintf(). */
 struct snprintf_arg {
 	char	*str;		/* next position to write */
 	size_t	remain;		/* space left, terminating NUL included */
 };
 
 extern	int log_open;
 
 static void  msglogchar(int c, int pri);
 static void  msglogstr(char *str, int pri, int filter_cr);
 static void  prf_putbuf(char *bufr, int flags, int pri);
 static void  putchar(int ch, void *arg);
 static char *ksprintn(char *nbuf, uintmax_t num, int base, int *len, int upper);
 static void  snprintf_func(int ch, void *arg);
 
 static bool msgbufmapped;		/* Set when safe to use msgbuf */
 int msgbuftrigger;
 struct msgbuf *msgbufp;
 
 #ifndef BOOT_TAG_SZ
 #define	BOOT_TAG_SZ	32
 #endif
 #ifndef BOOT_TAG
 /* Tag used to mark the start of a boot in dmesg */
 #define	BOOT_TAG	"---<<BOOT>>---"
 #endif
 
 static char current_boot_tag[BOOT_TAG_SZ + 1] = BOOT_TAG;
 SYSCTL_STRING(_kern, OID_AUTO, boot_tag, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
     current_boot_tag, 0, "Tag added to dmesg at start of boot");
 
 static int log_console_output = 1;
 SYSCTL_INT(_kern, OID_AUTO, log_console_output, CTLFLAG_RWTUN,
     &log_console_output, 0, "Duplicate console output to the syslog");
 
 /*
  * See the comment in log_console() below for more explanation of this.
  */
 static int log_console_add_linefeed;
 SYSCTL_INT(_kern, OID_AUTO, log_console_add_linefeed, CTLFLAG_RWTUN,
     &log_console_add_linefeed, 0, "log_console() adds extra newlines");
 
 static int always_console_output;
 SYSCTL_INT(_kern, OID_AUTO, always_console_output, CTLFLAG_RWTUN,
     &always_console_output, 0, "Always output to console despite TIOCCONS");
 
 /*
  * Report, at LOG_ERR priority, that the system table named by 'tab'
  * is full.
  */
 void
 tablefull(const char *tab)
 {
 	log(LOG_ERR, "%s: table is full\n", tab);
 }
 
 /*
  * Uprintf prints to the controlling terminal for the current process.
  *
  * Returns the character count reported by the formatter, or 0 when
  * nothing is printed: the caller is an idle thread, the process has no
  * controlling terminal (P_CONTROLT clear), or the session has no tty.
  * Output from the init process is sent through vprintf() instead.
  */
 int
 uprintf(const char *fmt, ...)
 {
 	va_list ap;
 	struct putchar_arg pca;
 	struct proc *p;
 	struct thread *td;
 	int retval;
 
 	td = curthread;
 	if (TD_IS_IDLETHREAD(td))
 		return (0);
 
 	if (td->td_proc == initproc) {
 		/* Produce output when we fail to load /sbin/init: */
 		va_start(ap, fmt);
 		retval = vprintf(fmt, ap);
 		va_end(ap);
 		return (retval);
 	}
 
 	/*
 	 * proctree_lock is held (shared) from here until the tty lock has
 	 * been taken below.
 	 */
 	sx_slock(&proctree_lock);
 	p = td->td_proc;
 	PROC_LOCK(p);
 	if ((p->p_flag & P_CONTROLT) == 0) {
 		PROC_UNLOCK(p);
 		sx_sunlock(&proctree_lock);
 		return (0);
 	}
 	/* Sample the session's tty pointer under the session lock. */
 	SESS_LOCK(p->p_session);
 	pca.tty = p->p_session->s_ttyp;
 	SESS_UNLOCK(p->p_session);
 	PROC_UNLOCK(p);
 	if (pca.tty == NULL) {
 		sx_sunlock(&proctree_lock);
 		return (0);
 	}
 	/* Terminal only, unbuffered; pca.pri is not used without TOLOG/TOCONS. */
 	pca.flags = TOTTY;
 	pca.p_bufr = NULL;
 	va_start(ap, fmt);
 	/* Take the tty lock before letting go of proctree_lock. */
 	tty_lock(pca.tty);
 	sx_sunlock(&proctree_lock);
 	retval = kvprintf(fmt, putchar, &pca, 10, ap);
 	tty_unlock(pca.tty);
 	va_end(ap);
 	return (retval);
 }
 
 /*
  * Variadic front end for vtprintf(): format a message for the controlling
  * terminal of process 'p', and possibly for the log as well, depending
  * on 'pri'.
  */
 void
 tprintf(struct proc *p, int pri, const char *fmt, ...)
 {
 	va_list args;
 
 	va_start(args, fmt);
 	vtprintf(p, pri, fmt, args);
 	va_end(args);
 }
 
 /*
  * Format a message to the controlling terminal of process 'p' and/or
  * the log.
  *
  * p   - target process; may be NULL, in which case no terminal is used.
  * pri - log priority; any value other than -1 also sends the message to
  *       the log (TOLOG).
  * fmt, ap - format string and arguments for kvprintf().
  *
  * The terminal is used only when the process has P_CONTROLT set, its
  * session has a tty vnode, and tty_checkoutq() accepts the tty.
  */
 void
 vtprintf(struct proc *p, int pri, const char *fmt, va_list ap)
 {
 	struct tty *tp = NULL;
 	int flags = 0;
 	struct putchar_arg pca;
 	struct session *sess = NULL;
 
 	sx_slock(&proctree_lock);
 	if (pri != -1)
 		flags |= TOLOG;
 	if (p != NULL) {
 		PROC_LOCK(p);
 		if (p->p_flag & P_CONTROLT && p->p_session->s_ttyvp) {
 			/*
 			 * Hold the session so it can still be used after
 			 * the process lock is dropped; released below.
 			 */
 			sess = p->p_session;
 			sess_hold(sess);
 			PROC_UNLOCK(p);
 			tp = sess->s_ttyp;
 			if (tp != NULL && tty_checkoutq(tp))
 				flags |= TOTTY;
 			else
 				tp = NULL;
 		} else
 			PROC_UNLOCK(p);
 	}
 	pca.pri = pri;
 	pca.tty = tp;
 	pca.flags = flags;
 	/* Unbuffered: characters go straight to their destinations. */
 	pca.p_bufr = NULL;
 	/* Take the tty lock before letting go of proctree_lock. */
 	if (pca.tty != NULL)
 		tty_lock(pca.tty);
 	sx_sunlock(&proctree_lock);
 	kvprintf(fmt, putchar, &pca, 10, ap);
 	if (pca.tty != NULL)
 		tty_unlock(pca.tty);
 	if (sess != NULL)
 		sess_release(sess);
 	msgbuftrigger = 1;
 }
 
 /*
  * Shared worker for vprintf() and vlog(): format 'fmt' to the
  * destinations selected by 'flags' (TOCONS and/or TOLOG) at priority
  * 'level'.  With PRINTF_BUFR_SIZE defined, output is staged in an
  * on-stack buffer and whatever is left over is flushed at the end.
  * Returns the character count reported by kvprintf().
  */
 static int
 _vprintf(int level, int flags, const char *fmt, va_list ap)
 {
 #ifdef PRINTF_BUFR_SIZE
 	char linebuf[PRINTF_BUFR_SIZE];
 #endif
 	struct putchar_arg sink;
 	int nchars;
 
 	TSENTER();
 	sink.flags = flags;
 	sink.pri = level;
 	sink.tty = NULL;
 #ifdef PRINTF_BUFR_SIZE
 	linebuf[0] = '\0';
 	sink.p_bufr = linebuf;
 	sink.n_bufr = sizeof(linebuf);
 	sink.p_next = linebuf;
 	sink.remain = sizeof(linebuf);
 #else
 	/* Don't buffer console output. */
 	sink.p_bufr = NULL;
 #endif
 
 	nchars = kvprintf(fmt, putchar, &sink, 10, ap);
 
 #ifdef PRINTF_BUFR_SIZE
 	/* Push out a partial line still sitting in the buffer. */
 	if (linebuf[0] != '\0')
 		prf_putbuf(linebuf, flags, level);
 #endif
 
 	TSEXIT();
 	return (nchars);
 }
 
 /*
  * Write a formatted message to the log buffer at priority 'level'.
  * Guaranteed not to sleep, so it may be called from interrupt routines.
  * While no process is reading the log yet, the message also goes to
  * the console.
  */
 void
 log(int level, const char *fmt, ...)
 {
 	va_list args;
 
 	va_start(args, fmt);
 	vlog(level, fmt, args);
 	va_end(args);
 }
 
 /*
  * va_list form of log(): always writes to the log, adds the console
  * while log_open is zero, then sets msgbuftrigger.
  */
 void
 vlog(int level, const char *fmt, va_list ap)
 {
 	int dest;
 
 	dest = TOLOG;
 	if (!log_open)
 		dest |= TOCONS;
 	(void)_vprintf(level, dest, fmt, ap);
 	msgbuftrigger = 1;
 }
 
 #define CONSCHUNK 128
 
 /*
  * Duplicate the console write described by 'uio' into the kernel message
  * buffer at priority LOG_INFO | LOG_CONSOLE.  Does nothing when the
  * kern.log_console_output sysctl is zero.  The data is moved through a
  * temporary CONSCHUNK-byte buffer, at most CONSCHUNK - 1 bytes at a time
  * so that each chunk can be NUL-terminated.
  */
 void
 log_console(struct uio *uio)
 {
 	int c, error, nl;
 	char *consbuffer;
 	int pri;
 
 	if (!log_console_output)
 		return;
 
 	pri = LOG_INFO | LOG_CONSOLE;
 	/*
 	 * Work on a clone; uiomove() below then advances the copy
 	 * (presumably leaving the caller's uio untouched -- depends on
 	 * cloneuio() semantics).
 	 */
 	uio = cloneuio(uio);
 	consbuffer = malloc(CONSCHUNK, M_TEMP, M_WAITOK);
 
 	nl = 0;
 	while (uio->uio_resid > 0) {
 		/*
 		 * NOTE(review): imin() operates on ints; if uio_resid is
 		 * wider than int, confirm a very large residual cannot be
 		 * narrowed to 0 here (c == 0 would index consbuffer[-1]
 		 * below and never make progress).
 		 */
 		c = imin(uio->uio_resid, CONSCHUNK - 1);
 		error = uiomove(consbuffer, c, uio);
 		if (error != 0)
 			break;
 		/* Make sure we're NUL-terminated */
 		consbuffer[c] = '\0';
 		/* Remember whether the most recent chunk ended a line. */
 		if (consbuffer[c - 1] == '\n')
 			nl = 1;
 		else
 			nl = 0;
 		msglogstr(consbuffer, pri, /*filter_cr*/ 1);
 	}
 	/*
 	 * The previous behavior in log_console() is preserved when
 	 * log_console_add_linefeed is non-zero.  For that behavior, if an
 	 * individual console write came in that was not terminated with a
 	 * line feed, it would add a line feed.
 	 *
 	 * This results in different data in the message buffer than
 	 * appears on the system console (which doesn't add extra line feed
 	 * characters).
 	 *
 	 * A number of programs and rc scripts write a line feed, or a period
 	 * and a line feed when they have completed their operation.  On
 	 * the console, this looks seamless, but when displayed with
 	 * 'dmesg -a', you wind up with output that looks like this:
 	 *
 	 * Updating motd:
 	 * .
 	 *
 	 * On the console, it looks like this:
 	 * Updating motd:.
 	 *
 	 * We could add logic to detect that situation, or just not insert
 	 * the extra newlines.  Set the kern.log_console_add_linefeed
 	 * sysctl/tunable variable to get the old behavior.
 	 */
 	if (!nl && log_console_add_linefeed) {
 		consbuffer[0] = '\n';
 		consbuffer[1] = '\0';
 		msglogstr(consbuffer, pri, /*filter_cr*/ 1);
 	}
 	msgbuftrigger = 1;
-	free(uio, M_IOV);
+	freeuio(uio);
 	free(consbuffer, M_TEMP);
 }
 
 /*
  * Kernel printf(): variadic wrapper around vprintf().  Returns the
  * value vprintf() returns.
  */
 int
 printf(const char *fmt, ...)
 {
 	va_list args;
 	int nchars;
 
 	va_start(args, fmt);
 	nchars = vprintf(fmt, args);
 	va_end(args);
 	return (nchars);
 }
 
 /*
  * Format a message to both the console and the log with no priority
  * tag (-1).  msgbuftrigger is set afterwards unless the kernel has
  * panicked.  Returns the character count from _vprintf().
  */
 int
 vprintf(const char *fmt, va_list ap)
 {
 	int nchars;
 
 	nchars = _vprintf(-1, TOCONS | TOLOG, fmt, ap);
 	if (KERNEL_PANICKED())
 		return (nchars);
 
 	msgbuftrigger = 1;
 	return (nchars);
 }
 
 /*
  * Deliver one character to the destinations selected by 'flags': the
  * message buffer for TOLOG, and for TOCONS the console message buffer
  * and/or the console device.
  */
 static void
 prf_putchar(int c, int flags, int pri)
 {
 	if ((flags & TOLOG) != 0)
 		msglogchar(c, pri);
 
 	if ((flags & TOCONS) == 0)
 		return;
 
 	/* A tty has taken over the console: queue the character for it. */
 	if ((!KERNEL_PANICKED()) && (constty != NULL))
 		msgbuf_addchar(&consmsgbuf, c);
 
 	/* Real console, unless redirected and not forced on. */
 	if ((constty == NULL) || always_console_output)
 		cnputc(c);
 }
 
 /*
  * String counterpart of prf_putchar(): deliver the NUL-terminated
  * 'bufr' to the log (TOLOG) and/or console (TOCONS) destinations.
  */
 static void
 prf_putbuf(char *bufr, int flags, int pri)
 {
 	if ((flags & TOLOG) != 0)
 		msglogstr(bufr, pri, /*filter_cr*/1);
 
 	if ((flags & TOCONS) == 0)
 		return;
 
 	/* A tty has taken over the console: queue the string for it. */
 	if ((!KERNEL_PANICKED()) && (constty != NULL))
 		msgbuf_addstr(&consmsgbuf, -1, bufr, /*filter_cr*/ 0);
 
 	/* Real console, unless redirected and not forced on. */
 	if ((constty == NULL) || always_console_output)
 		cnputs(bufr);
 }
 
 /*
  * Add one character to the output described by 'ap'.  Without a staging
  * buffer the character is emitted immediately; otherwise it is appended
  * to the buffer, which is flushed on newline or when it is nearly full.
  */
 static void
 putbuf(int c, struct putchar_arg *ap)
 {
 	/* No staging buffer: hand the character straight on. */
 	if (ap->p_bufr == NULL) {
 		prf_putchar(c, ap->flags, ap->pri);
 		return;
 	}
 
 	/* Append the character and keep the buffer NUL-terminated. */
 	ap->p_next[0] = c;
 	ap->p_next[1] = '\0';
 	ap->p_next++;
 	ap->remain--;
 
 	/* Flush at end of line or when only two bytes are left. */
 	if (c == '\n' || ap->remain == 2) {
 		prf_putbuf(ap->p_bufr, ap->flags, ap->pri);
 
 		ap->remain = ap->n_bufr;
 		ap->p_next = ap->p_bufr;
 		ap->p_bufr[0] = '\0';
 	}
 
 	/*
 	 * Characters arrive one at a time, so the flush above always
 	 * triggers no later than remain == 2.  Ending up here with less
 	 * room means the buffer was too small to begin with, e.g. a
 	 * PRINTF_BUFR_SIZE of 1 or something similarly silly.
 	 */
 	KASSERT(ap->remain > 2, ("Bad buffer logic, remain = %zd",
 	    ap->remain));
 }
 
 /*
  * kvprintf() output callback: send one character to the user's terminal
  * and/or the console and log.  Console-bound characters also end up in
  * the message buffer so they can be inspected later.  'arg' points to
  * the struct putchar_arg describing the destinations.
  */
 static void
 putchar(int c, void *arg)
 {
 	struct putchar_arg *pca = arg;
 	struct tty *term = pca->tty;
 	const int dest = pca->flags;
 
 	/* While the debugger is active, bypass everything but the console. */
 	if (kdb_active) {
 		if (c != '\0')
 			cnputc(c);
 		return;
 	}
 
 	/* The tty code is not used after a panic. */
 	if ((dest & TOTTY) != 0 && term != NULL && !KERNEL_PANICKED())
 		tty_putchar(term, c);
 
 	if ((dest & (TOCONS | TOLOG)) != 0 && c != '\0')
 		putbuf(c, pca);
 }
 
 /*
  * Reduced sprintf(3): format into 'buf' with no bound on the output
  * length, NUL-terminate it, and return the number of characters written
  * (terminator excluded).
  */
 int
 sprintf(char *buf, const char *cfmt, ...)
 {
 	va_list args;
 	int len;
 
 	va_start(args, cfmt);
 	len = kvprintf(cfmt, NULL, (void *)buf, 10, args);
 	va_end(args);
 
 	/* kvprintf() does not terminate the string itself. */
 	buf[len] = '\0';
 	return (len);
 }
 
 /*
  * Reduced vsprintf(3): va_list form of sprintf().  The output length is
  * not bounded; the result is NUL-terminated and its length returned.
  */
 int
 vsprintf(char *buf, const char *cfmt, va_list ap)
 {
 	int len;
 
 	len = kvprintf(cfmt, NULL, (void *)buf, 10, ap);
 	/* kvprintf() does not terminate the string itself. */
 	buf[len] = '\0';
 	return (len);
 }
 
 /*
  * Reduced snprintf(3): variadic wrapper around vsnprintf().
  */
 int
 snprintf(char *str, size_t size, const char *format, ...)
 {
 	va_list args;
 	int total;
 
 	va_start(args, format);
 	total = vsnprintf(str, size, format, args);
 	va_end(args);
 	return (total);
 }
 
 /*
  * Reduced vsnprintf(3): format at most size - 1 characters into 'str'
  * and NUL-terminate when size is non-zero.  The return value is the
  * count produced by kvprintf(), i.e. the untruncated length.
  */
 int
 vsnprintf(char *str, size_t size, const char *format, va_list ap)
 {
 	struct snprintf_arg out = { .str = str, .remain = size };
 	int total;
 
 	total = kvprintf(format, snprintf_func, &out, 10, ap);
 	/* snprintf_func() always leaves one byte free for the terminator. */
 	if (out.remain > 0)
 		*out.str = '\0';
 	return (total);
 }
 
 /*
  * Kernel variant of vsnprintf(3) that also takes the radix handed to
  * kvprintf() (used by the %r conversion).
  */
 int
 vsnrprintf(char *str, size_t size, int radix, const char *format, va_list ap)
 {
 	struct snprintf_arg out = { .str = str, .remain = size };
 	int total;
 
 	total = kvprintf(format, snprintf_func, &out, radix, ap);
 	/* snprintf_func() always leaves one byte free for the terminator. */
 	if (out.remain > 0)
 		*out.str = '\0';
 	return (total);
 }
 
 /*
  * kvprintf() output callback for the bounded formatters: store 'ch'
  * only while at least one further byte stays free for the terminating
  * NUL; characters beyond that are dropped.
  */
 static void
 snprintf_func(int ch, void *arg)
 {
 	struct snprintf_arg *const out = arg;
 
 	if (out->remain < 2)
 		return;
 
 	*out->str = ch;
 	out->str++;
 	out->remain--;
 }
 
 /*
  * Convert 'num' to ASCII digits in 'base' (<= 36), stored in reverse
  * order after a leading NUL in 'nbuf'.  The return value points at the
  * last byte written, which is the first character of the number; callers
  * print by walking backwards until they reach the NUL.  When 'lenp' is
  * given it receives the digit count.  'nbuf' must provide at least
  * MAXNBUF bytes.
  */
 static char *
 ksprintn(char *nbuf, uintmax_t num, int base, int *lenp, int upper)
 {
 	char *cur;
 	char digit;
 
 	cur = nbuf;
 	*cur = '\0';
 	do {
 		digit = hex2ascii(num % base);
 		if (upper)
 			digit = toupper(digit);
 		*++cur = digit;
 		num /= base;
 	} while (num != 0);
 
 	if (lenp != NULL)
 		*lenp = cur - nbuf;
 	return (cur);
 }
 
 /*
  * Scaled down version of printf(3).
  *
  * fmt   - format string; a NULL pointer is replaced by "(fmt null)\n".
  * func  - per-character output callback, or NULL to store the characters
  *         into the buffer 'arg' points to (no terminating NUL is added).
  * arg   - opaque argument for 'func', or the destination buffer.
  * radix - base used by the %r conversion; values outside 2..36 fall
  *         back to 10.
  * ap    - arguments consumed by the conversions.
  *
  * Returns the number of characters produced.
  *
  * Two additional formats:
  *
  * The format %b is supported to decode error registers.
  * Its usage is:
  *
  *	printf("reg=%b\n", regval, "<base><arg>*");
  *
  * where <base> is the output base expressed as a control character, e.g.
  * \10 gives octal; \20 gives hex.  Each arg is a sequence of characters,
  * the first of which gives the bit number to be inspected (origin 1), and
  * the next characters (up to a control character, i.e. a character <= 32),
  * give the name of the register.  Thus:
  *
  *	kvprintf("reg=%b\n", 3, "\10\2BITTWO\1BITONE");
  *
  * would produce output:
  *
  *	reg=3<BITTWO,BITONE>
  *
  * XXX:  %D  -- Hexdump, takes pointer and separator string:
  *		("%6D", ptr, ":")    -> XX:XX:XX:XX:XX:XX
  *		("%*D", len, ptr, " ") -> XX XX XX XX ...
  */
 int
 kvprintf(char const *fmt, void (*func)(int, void*), void *arg, int radix, va_list ap)
 {
 #define PCHAR(c) {int cc=(c); if (func) (*func)(cc,arg); else *d++ = cc; retval++; }
 	char nbuf[MAXNBUF];
 	char *d;
 	const char *p, *percent, *q;
 	u_char *up;
 	int ch, n;
 	uintmax_t num;
 	int base, lflag, qflag, tmp, width, ladjust, sharpflag, neg, sign, dot;
 	int cflag, hflag, jflag, tflag, zflag;
 	int bconv, dwidth, upper;
 	char padc;
 	int stop = 0, retval = 0;
 
 	num = 0;
 	q = NULL;
 	if (!func)
 		d = (char *) arg;
 	else
 		d = NULL;
 
 	if (fmt == NULL)
 		fmt = "(fmt null)\n";
 
 	if (radix < 2 || radix > 36)
 		radix = 10;
 
 	for (;;) {
 		padc = ' ';
 		width = 0;
 		/*
 		 * Copy literal text up to the next conversion.  Once 'stop'
 		 * is set, the rest of the format is copied verbatim.
 		 */
 		while ((ch = (u_char)*fmt++) != '%' || stop) {
 			if (ch == '\0')
 				return (retval);
 			PCHAR(ch);
 		}
 		percent = fmt - 1;
 		qflag = 0; lflag = 0; ladjust = 0; sharpflag = 0; neg = 0;
 		sign = 0; dot = 0; bconv = 0; dwidth = 0; upper = 0;
 		cflag = 0; hflag = 0; jflag = 0; tflag = 0; zflag = 0;
 reswitch:	switch (ch = (u_char)*fmt++) {
 		case '.':
 			dot = 1;
 			goto reswitch;
 		case '#':
 			sharpflag = 1;
 			goto reswitch;
 		case '+':
 			sign = 1;
 			goto reswitch;
 		case '-':
 			ladjust = 1;
 			goto reswitch;
 		case '%':
 			PCHAR(ch);
 			break;
 		case '*':
 			if (!dot) {
 				width = va_arg(ap, int);
 				/* A negative width means left adjustment. */
 				if (width < 0) {
 					ladjust = !ladjust;
 					width = -width;
 				}
 			} else {
 				dwidth = va_arg(ap, int);
 			}
 			goto reswitch;
 		case '0':
 			if (!dot) {
 				padc = '0';
 				goto reswitch;
 			}
 			/* FALLTHROUGH */
 		case '1': case '2': case '3': case '4':
 		case '5': case '6': case '7': case '8': case '9':
 			for (n = 0;; ++fmt) {
 				n = n * 10 + ch - '0';
 				ch = *fmt;
 				if (ch < '0' || ch > '9')
 					break;
 			}
 			if (dot)
 				dwidth = n;
 			else
 				width = n;
 			goto reswitch;
 		case 'b':
 			ladjust = 1;
 			bconv = 1;
 			goto handle_nosign;
 		case 'c':
 			width -= 1;
 
 			if (!ladjust && width > 0)
 				while (width--)
 					PCHAR(padc);
 			PCHAR(va_arg(ap, int));
 			if (ladjust && width > 0)
 				while (width--)
 					PCHAR(padc);
 			break;
 		case 'D':
 			up = va_arg(ap, u_char *);
 			p = va_arg(ap, char *);
 			/* Default to dumping 16 bytes. */
 			if (!width)
 				width = 16;
 			while(width--) {
 				PCHAR(hex2ascii(*up >> 4));
 				PCHAR(hex2ascii(*up & 0x0f));
 				up++;
 				/* Separator between bytes, not after the last. */
 				if (width)
 					for (q=p;*q;q++)
 						PCHAR(*q);
 			}
 			break;
 		case 'd':
 		case 'i':
 			base = 10;
 			sign = 1;
 			goto handle_sign;
 		case 'h':
 			if (hflag) {
 				hflag = 0;
 				cflag = 1;
 			} else
 				hflag = 1;
 			goto reswitch;
 		case 'j':
 			jflag = 1;
 			goto reswitch;
 		case 'l':
 			if (lflag) {
 				lflag = 0;
 				qflag = 1;
 			} else
 				lflag = 1;
 			goto reswitch;
 		case 'n':
 			/*
 			 * We do not support %n in kernel, but consume the
 			 * argument.
 			 */
 			if (jflag)
 				(void)va_arg(ap, intmax_t *);
 			else if (qflag)
 				(void)va_arg(ap, quad_t *);
 			else if (lflag)
 				(void)va_arg(ap, long *);
 			else if (zflag)
 				(void)va_arg(ap, size_t *);
 			else if (hflag)
 				(void)va_arg(ap, short *);
 			else if (cflag)
 				(void)va_arg(ap, char *);
 			else
 				(void)va_arg(ap, int *);
 			break;
 		case 'o':
 			base = 8;
 			goto handle_nosign;
 		case 'p':
 			base = 16;
 			sharpflag = (width == 0);
 			sign = 0;
 			num = (uintptr_t)va_arg(ap, void *);
 			goto number;
 		case 'q':
 			qflag = 1;
 			goto reswitch;
 		case 'r':
 			base = radix;
 			if (sign)
 				goto handle_sign;
 			goto handle_nosign;
 		case 's':
 			p = va_arg(ap, char *);
 			if (p == NULL)
 				p = "(null)";
 			if (!dot)
 				n = strlen (p);
 			else
 				for (n = 0; n < dwidth && p[n]; n++)
 					continue;
 
 			width -= n;
 
 			if (!ladjust && width > 0)
 				while (width--)
 					PCHAR(padc);
 			while (n--)
 				PCHAR(*p++);
 			if (ladjust && width > 0)
 				while (width--)
 					PCHAR(padc);
 			break;
 		case 't':
 			tflag = 1;
 			goto reswitch;
 		case 'u':
 			base = 10;
 			goto handle_nosign;
 		case 'X':
 			upper = 1;
 			/* FALLTHROUGH */
 		case 'x':
 			base = 16;
 			goto handle_nosign;
 		case 'y':
 			base = 16;
 			sign = 1;
 			goto handle_sign;
 		case 'z':
 			zflag = 1;
 			goto reswitch;
 handle_nosign:
 			sign = 0;
 			if (jflag)
 				num = va_arg(ap, uintmax_t);
 			else if (qflag)
 				num = va_arg(ap, u_quad_t);
 			else if (tflag)
 				num = va_arg(ap, ptrdiff_t);
 			else if (lflag)
 				num = va_arg(ap, u_long);
 			else if (zflag)
 				num = va_arg(ap, size_t);
 			else if (hflag)
 				num = (u_short)va_arg(ap, int);
 			else if (cflag)
 				num = (u_char)va_arg(ap, int);
 			else
 				num = va_arg(ap, u_int);
 			if (bconv) {
 				/* First byte of the %b description is the base. */
 				q = va_arg(ap, char *);
 				base = *q++;
 			}
 			goto number;
 handle_sign:
 			if (jflag)
 				num = va_arg(ap, intmax_t);
 			else if (qflag)
 				num = va_arg(ap, quad_t);
 			else if (tflag)
 				num = va_arg(ap, ptrdiff_t);
 			else if (lflag)
 				num = va_arg(ap, long);
 			else if (zflag)
 				num = va_arg(ap, ssize_t);
 			else if (hflag)
 				num = (short)va_arg(ap, int);
 			else if (cflag)
 				num = (char)va_arg(ap, int);
 			else
 				num = va_arg(ap, int);
 number:
 			if (sign && (intmax_t)num < 0) {
 				neg = 1;
 				/*
 				 * Negate in unsigned arithmetic: this yields
 				 * the magnitude for every value, including
 				 * INTMAX_MIN, where a signed negation would
 				 * overflow.
 				 */
 				num = -num;
 			}
 			p = ksprintn(nbuf, num, base, &n, upper);
 			tmp = 0;
 			if (sharpflag && num != 0) {
 				if (base == 8)
 					tmp++;
 				else if (base == 16)
 					tmp += 2;
 			}
 			if (neg)
 				tmp++;
 
 			if (!ladjust && padc == '0')
 				dwidth = width - tmp;
 			width -= tmp + imax(dwidth, n);
 			dwidth -= n;
 			if (!ladjust)
 				while (width-- > 0)
 					PCHAR(' ');
 			if (neg)
 				PCHAR('-');
 			if (sharpflag && num != 0) {
 				if (base == 8) {
 					PCHAR('0');
 				} else if (base == 16) {
 					PCHAR('0');
 					PCHAR('x');
 				}
 			}
 			while (dwidth-- > 0)
 				PCHAR('0');
 
 			/* ksprintn() stored the digits in reverse order. */
 			while (*p)
 				PCHAR(*p--);
 
 			if (bconv && num != 0) {
 				/* %b conversion flag format. */
 				tmp = retval;
 				while (*q) {
 					n = *q++;
 					/*
 					 * Build the mask in uintmax_t so that
 					 * bit numbers above 31 are tested
 					 * correctly instead of shifting an
 					 * int out of range.
 					 */
 					if (num & ((uintmax_t)1 << (n - 1))) {
 						PCHAR(retval != tmp ?
 						    ',' : '<');
 						for (; (n = *q) > ' '; ++q)
 							PCHAR(n);
 					} else
 						for (; *q > ' '; ++q)
 							continue;
 				}
 				if (retval != tmp) {
 					PCHAR('>');
 					width -= retval - tmp;
 				}
 			}
 
 			if (ladjust)
 				while (width-- > 0)
 					PCHAR(' ');
 
 			break;
 		default:
 			/* Unknown conversion: echo it as it was written. */
 			while (percent < fmt)
 				PCHAR(*percent++);
 			/*
 			 * Since we ignore a formatting argument it is no
 			 * longer safe to obey the remaining formatting
 			 * arguments as the arguments will no longer match
 			 * the format specs.
 			 */
 			stop = 1;
 			break;
 		}
 	}
 #undef PCHAR
 }
 
 /*
  * Append one character to the message buffer at priority 'pri'.  NUL
  * and carriage return are dropped.  When the priority differs from that
  * of the previous character (and is not -1), a "<pri>" tag is written
  * first, preceded by a newline if the previous line was left unfinished.
  */
 static void
 msglogchar(int c, int pri)
 {
 	static int lastpri = -1;
 	static int dangling;
 	char nbuf[MAXNBUF];
 	char *digit;
 
 	if (!msgbufmapped || c == '\0' || c == '\r')
 		return;
 
 	if (pri != -1 && pri != lastpri) {
 		if (dangling) {
 			msgbuf_addchar(msgbufp, '\n');
 			dangling = 0;
 		}
 		msgbuf_addchar(msgbufp, '<');
 		/* ksprintn() yields the digits back to front. */
 		digit = ksprintn(nbuf, (uintmax_t)pri, 10, NULL, 0);
 		while (*digit != '\0')
 			msgbuf_addchar(msgbufp, *digit--);
 		msgbuf_addchar(msgbufp, '>');
 		lastpri = pri;
 	}
 
 	msgbuf_addchar(msgbufp, c);
 
 	/* A newline completes the line and forces a fresh tag next time. */
 	dangling = (c != '\n');
 	if (c == '\n')
 		lastpri = -1;
 }
 
 /*
  * Append a string to the message buffer at priority 'pri'; a no-op
  * until the message buffer has been mapped.
  */
 static void
 msglogstr(char *str, int pri, int filter_cr)
 {
 	if (msgbufmapped)
 		msgbuf_addstr(msgbufp, pri, str, filter_cr);
 }
 
 void
 msgbufinit(void *ptr, int size)
 {
 	char *cp;
 	static struct msgbuf *oldp = NULL;
 	bool print_boot_tag;
 
 	TSENTER();
 	size -= sizeof(*msgbufp);
 	cp = (char *)ptr;
 	print_boot_tag = !msgbufmapped;
 	/* Attempt to fetch kern.boot_tag tunable on first mapping */
 	if (!msgbufmapped)
 		TUNABLE_STR_FETCH("kern.boot_tag", current_boot_tag,
 		    sizeof(current_boot_tag));
 	msgbufp = (struct msgbuf *)(cp + size);
 	msgbuf_reinit(msgbufp, cp, size);
 	if (msgbufmapped && oldp != msgbufp)
 		msgbuf_copy(oldp, msgbufp);
 	msgbufmapped = true;
 	if (print_boot_tag && *current_boot_tag != '\0')
 		printf("%s\n", current_boot_tag);
 	oldp = msgbufp;
 	TSEXIT();
 }
 
 /* Sysctls for accessing/clearing the msgbuf */
 
 /*
  * Handler for kern.msgbuf: copy the contents of the kernel message
  * buffer out to the requester, followed by a terminating NUL.  The
  * caller must pass priv_check(PRIV_MSGBUF).  The buffer is read in
  * sizeof(buf)-byte chunks; msgbuf_lock is held only around each
  * msgbuf_peekbytes() call and is dropped before data is handed to the
  * sysctl layer.
  */
 static int
 sysctl_kern_msgbuf(SYSCTL_HANDLER_ARGS)
 {
 	char buf[128], *bp;
 	u_int seq;
 	int error, len;
 	bool wrap;
 
 	error = priv_check(req->td, PRIV_MSGBUF);
 	if (error)
 		return (error);
 
 	/* Read the whole buffer, one chunk at a time. */
 	mtx_lock(&msgbuf_lock);
 	/* Zero-length peek: only initialises 'seq' to the starting position. */
 	msgbuf_peekbytes(msgbufp, NULL, 0, &seq);
 	wrap = (seq != 0);
 	for (;;) {
 		/* Invariant: msgbuf_lock is held at the top of each iteration. */
 		len = msgbuf_peekbytes(msgbufp, buf, sizeof(buf), &seq);
 		mtx_unlock(&msgbuf_lock);
 		if (len == 0)
 			return (SYSCTL_OUT(req, "", 1)); /* add nulterm */
 		if (wrap) {
 			/* Skip the first line, as it is probably incomplete. */
 			bp = memchr(buf, '\n', len);
 			if (bp == NULL) {
 				/* No newline yet: discard this chunk entirely. */
 				mtx_lock(&msgbuf_lock);
 				continue;
 			}
 			wrap = false;
 			bp++;
 			len -= bp - buf;
 			if (len == 0) {
 				/* The newline was the last byte of the chunk. */
 				mtx_lock(&msgbuf_lock);
 				continue;
 			}
 		} else
 			bp = buf;
 		error = sysctl_handle_opaque(oidp, bp, len, req);
 		if (error)
 			return (error);
 
 		mtx_lock(&msgbuf_lock);
 	}
 }
 
 SYSCTL_PROC(_kern, OID_AUTO, msgbuf,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
     NULL, 0, sysctl_kern_msgbuf, "A", "Contents of kernel message buffer");
 
 static int msgbuf_clearflag;
 
 /*
  * Handler for kern.msgbuf_clear: a successful write of any value clears
  * the kernel message buffer; the backing flag is reset to 0 afterwards.
  */
 static int
 sysctl_kern_msgbuf_clear(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 
 	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
 	/* Nothing more to do on failure or for a plain read. */
 	if (error || !req->newptr)
 		return (error);
 
 	mtx_lock(&msgbuf_lock);
 	msgbuf_clear(msgbufp);
 	mtx_unlock(&msgbuf_lock);
 	msgbuf_clearflag = 0;
 	return (0);
 }
 
 SYSCTL_PROC(_kern, OID_AUTO, msgbuf_clear,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE | CTLFLAG_MPSAFE,
     &msgbuf_clearflag, 0, sysctl_kern_msgbuf_clear, "I",
     "Clear kernel message buffer");
 
 #ifdef DDB
 
 /*
  * DDB "show msgbuf": print the message buffer header fields, then
  * msg_size characters starting at the read sequence number, stopping
  * early if the pager is quit.
  */
 DB_SHOW_COMMAND_FLAGS(msgbuf, db_show_msgbuf, DB_CMD_MEMSAFE)
 {
 	int off, pos;
 
 	if (!msgbufmapped) {
 		db_printf("msgbuf not mapped yet\n");
 		return;
 	}
 
 	db_printf("msgbufp = %p\n", msgbufp);
 	db_printf("magic = %x, size = %d, r= %u, w = %u, ptr = %p, cksum= %u\n",
 	    msgbufp->msg_magic, msgbufp->msg_size, msgbufp->msg_rseq,
 	    msgbufp->msg_wseq, msgbufp->msg_ptr, msgbufp->msg_cksum);
 
 	off = 0;
 	while (off < msgbufp->msg_size && !db_pager_quit) {
 		pos = MSGBUF_SEQ_TO_POS(msgbufp, off + msgbufp->msg_rseq);
 		db_printf("%c", msgbufp->msg_ptr[pos]);
 		off++;
 	}
 	db_printf("\n");
 }
 
 #endif /* DDB */
 
 /*
  * Print 'length' bytes starting at 'ptr' as a hex dump via printf().
  *
  * hdr   - optional prefix printed at the start of every row.
  * flags - HD_COLUMN_MASK bits give the bytes per row (default 16),
  *         HD_DELIM_MASK bits give the byte delimiter (default ' '), and
  *         HD_OMIT_COUNT / HD_OMIT_HEX / HD_OMIT_CHARS suppress the
  *         offset, hex and character columns respectively.
  */
 void
 hexdump(const void *ptr, int length, const char *hdr, int flags)
 {
 	const unsigned char *bytes = ptr;
 	int row, col, idx;
 	int cols;
 	char delim;
 
 	delim = ' ';
 	if ((flags & HD_DELIM_MASK) != 0)
 		delim = (flags & HD_DELIM_MASK) >> 8;
 
 	cols = 16;
 	if ((flags & HD_COLUMN_MASK) != 0)
 		cols = flags & HD_COLUMN_MASK;
 
 	for (row = 0; row < length; row += cols) {
 		if (hdr != NULL)
 			printf("%s", hdr);
 
 		if ((flags & HD_OMIT_COUNT) == 0)
 			printf("%04x  ", row);
 
 		if ((flags & HD_OMIT_HEX) == 0) {
 			for (col = 0; col < cols; col++) {
 				idx = row + col;
 				/* Pad a short final row to keep columns aligned. */
 				if (idx >= length)
 					printf("   ");
 				else
 					printf("%c%02x", delim, bytes[idx]);
 			}
 		}
 
 		if ((flags & HD_OMIT_CHARS) == 0) {
 			printf("  |");
 			for (col = 0; col < cols; col++) {
 				idx = row + col;
 				if (idx >= length)
 					printf(" ");
 				else if (bytes[idx] < ' ' || bytes[idx] > '~')
 					printf(".");
 				else
 					printf("%c", bytes[idx]);
 			}
 			printf("|");
 		}
 		printf("\n");
 	}
 }
 #endif /* _KERNEL */
 
 /*
  * Same output as hexdump(), but appended to the sbuf 'sb' instead of
  * being printed.
  *
  * hdr   - optional prefix added at the start of every row.
  * flags - HD_COLUMN_MASK bits give the bytes per row (default 16),
  *         HD_DELIM_MASK bits give the byte delimiter (default ' '), and
  *         HD_OMIT_COUNT / HD_OMIT_HEX / HD_OMIT_CHARS suppress the
  *         offset, hex and character columns respectively.
  */
 void
 sbuf_hexdump(struct sbuf *sb, const void *ptr, int length, const char *hdr,
 	     int flags)
 {
 	const unsigned char *bytes = ptr;
 	int row, col, idx;
 	int cols;
 	char delim;
 
 	delim = ' ';
 	if ((flags & HD_DELIM_MASK) != 0)
 		delim = (flags & HD_DELIM_MASK) >> 8;
 
 	cols = 16;
 	if ((flags & HD_COLUMN_MASK) != 0)
 		cols = flags & HD_COLUMN_MASK;
 
 	for (row = 0; row < length; row += cols) {
 		if (hdr != NULL)
 			sbuf_printf(sb, "%s", hdr);
 
 		if ((flags & HD_OMIT_COUNT) == 0)
 			sbuf_printf(sb, "%04x  ", row);
 
 		if ((flags & HD_OMIT_HEX) == 0) {
 			for (col = 0; col < cols; col++) {
 				idx = row + col;
 				/* Pad a short final row to keep columns aligned. */
 				if (idx >= length)
 					sbuf_printf(sb, "   ");
 				else
 					sbuf_printf(sb, "%c%02x", delim,
 					    bytes[idx]);
 			}
 		}
 
 		if ((flags & HD_OMIT_CHARS) == 0) {
 			sbuf_printf(sb, "  |");
 			for (col = 0; col < cols; col++) {
 				idx = row + col;
 				if (idx >= length)
 					sbuf_printf(sb, " ");
 				else if (bytes[idx] < ' ' || bytes[idx] > '~')
 					sbuf_printf(sb, ".");
 				else
 					sbuf_printf(sb, "%c", bytes[idx]);
 			}
 			sbuf_printf(sb, "|");
 		}
 		sbuf_printf(sb, "\n");
 	}
 }
 
 #ifdef _KERNEL
 /*
  * Log 'msg' for the current thread at LOG_INFO while the budget in
  * '*counter' lasts.  Each logged message atomically takes one unit from
  * the counter; the message that uses up the last unit is suffixed with
  * " - not logging anymore".  Nothing is logged once the counter is zero.
  */
 void
 counted_warning(unsigned *counter, const char *msg)
 {
 	struct thread *td;
 	unsigned left;
 
 	/* Claim one unit; retry if another CPU changed the counter. */
 	do {
 		left = *counter;
 		if (left == 0)
 			return;
 	} while (!atomic_cmpset_int(counter, left, left - 1));
 
 	td = curthread;
 	log(LOG_INFO, "pid %d (%s) %s%s\n",
 	    td->td_proc->p_pid, td->td_name, msg,
 	    left > 1 ? "" : " - not logging anymore");
 }
 #endif
 
 /*
  * Print the contents of 'sb': to the console and log in the kernel, to
  * standard output with printf(3) when built for userland.
  */
 void
 sbuf_putbuf(struct sbuf *sb)
 {
 #ifdef _KERNEL
 	prf_putbuf(sbuf_data(sb), TOLOG | TOCONS, -1);
 #else
 	printf("%s", sbuf_data(sb));
 #endif
 }
 
 /*
  * sbuf drain callback that prints the drained data.
  *
  * arg  - optional size_t accumulator; the number of bytes printed is
  *        added to it when non-NULL.
  * data - bytes to print; need not be NUL-terminated.
  * len  - number of bytes at 'data'.
  *
  * In the kernel the data goes to the console and log and 'len' is
  * returned.  In userland the result of printf(3) is returned, or -errno
  * if printf(3) fails.
  */
 int
 sbuf_printf_drain(void *arg, const char *data, int len)
 {
 	size_t *total = arg;
 	int r;
 #ifdef _KERNEL
 	char *buf;
 	char saved;
 
 	/*
 	 * Writing data[len] is allowed because an extra byte is always
 	 * reserved for the terminating NUL byte.  The byte is saved and
 	 * restored because we might be flushing a record, and there may
 	 * be valid data after the buffer.
 	 */
 	buf = __DECONST(char *, data);
 	saved = buf[len];
 	buf[len] = '\0';
 
 	prf_putbuf(buf, TOLOG | TOCONS, -1);
 
 	buf[len] = saved;
 	r = len;
 
 #else /* !_KERNEL */
 
 	r = printf("%.*s", len, data);
 	if (r < 0)
 		return (-errno);
 
 #endif
 
 	if (total != NULL)
 		*total += r;
 
 	return (r);
 }
diff --git a/sys/kern/subr_uio.c b/sys/kern/subr_uio.c
index 7a7fe9b0f4f9..d836c0ed295e 100644
--- a/sys/kern/subr_uio.c
+++ b/sys/kern/subr_uio.c
@@ -1,517 +1,539 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Copyright (c) 2014 The FreeBSD Foundation
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_subr.c	8.3 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mman.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_map.h>
 
 #include <machine/bus.h>
 
 SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, UIO_MAXIOV,
 	"Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)");
 
 static int uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault);
 
 /*
  * copyin() bracketed by vm_fault_disable_pagefaults() and
  * vm_fault_enable_pagefaults().  The saved state returned by the disable
  * call is handed back to the enable call.  Returns copyin()'s result.
  */
 int
 copyin_nofault(const void *udaddr, void *kaddr, size_t len)
 {
 	int error, save;
 
 	save = vm_fault_disable_pagefaults();
 	error = copyin(udaddr, kaddr, len);
 	vm_fault_enable_pagefaults(save);
 	return (error);
 }
 
 /*
  * copyout() bracketed by vm_fault_disable_pagefaults() and
  * vm_fault_enable_pagefaults().  The saved state returned by the disable
  * call is handed back to the enable call.  Returns copyout()'s result.
  */
 int
 copyout_nofault(const void *kaddr, void *udaddr, size_t len)
 {
 	int error, save;
 
 	save = vm_fault_disable_pagefaults();
 	error = copyout(kaddr, udaddr, len);
 	vm_fault_enable_pagefaults(save);
 	return (error);
 }
 
 #define	PHYS_PAGE_COUNT(len)	(howmany(len, PAGE_SIZE) + 1)
 
 /*
  * Copy "len" bytes from the kernel buffer "src" to physical address "dst".
  *
  * A one-segment UIO_SYSSPACE / UIO_WRITE uio is built over src, the
  * vm_page for each of PHYS_PAGE_COUNT(len) pages starting at dst is looked
  * up with PHYS_TO_VM_PAGE(), and the transfer is done by
  * uiomove_fromphys(), whose result is returned.
  *
  * The page array is a variable-length array on the stack sized from len.
  */
 int
 physcopyin(void *src, vm_paddr_t dst, size_t len)
 {
 	vm_page_t m[PHYS_PAGE_COUNT(len)];
 	struct iovec iov[1];
 	struct uio uio;
 	int i;
 
 	iov[0].iov_base = src;
 	iov[0].iov_len = len;
 	uio.uio_iov = iov;
 	uio.uio_iovcnt = 1;
 	uio.uio_offset = 0;
 	uio.uio_resid = len;
 	uio.uio_segflg = UIO_SYSSPACE;
 	uio.uio_rw = UIO_WRITE;
 	for (i = 0; i < PHYS_PAGE_COUNT(len); i++, dst += PAGE_SIZE)
 		m[i] = PHYS_TO_VM_PAGE(dst);
 	/*
 	 * dst was only advanced by whole pages, so dst & PAGE_MASK is still
 	 * the offset of the original address within its first page.
 	 */
 	return (uiomove_fromphys(m, dst & PAGE_MASK, len, &uio));
 }
 
 /*
  * Copy "len" bytes from physical address "src" to the kernel buffer "dst".
  *
  * A one-segment UIO_SYSSPACE / UIO_READ uio is built over dst, the
  * vm_page for each of PHYS_PAGE_COUNT(len) pages starting at src is looked
  * up with PHYS_TO_VM_PAGE(), and the transfer is done by
  * uiomove_fromphys(), whose result is returned.
  *
  * The page array is a variable-length array on the stack sized from len.
  */
 int
 physcopyout(vm_paddr_t src, void *dst, size_t len)
 {
 	vm_page_t m[PHYS_PAGE_COUNT(len)];
 	struct iovec iov[1];
 	struct uio uio;
 	int i;
 
 	iov[0].iov_base = dst;
 	iov[0].iov_len = len;
 	uio.uio_iov = iov;
 	uio.uio_iovcnt = 1;
 	uio.uio_offset = 0;
 	uio.uio_resid = len;
 	uio.uio_segflg = UIO_SYSSPACE;
 	uio.uio_rw = UIO_READ;
 	for (i = 0; i < PHYS_PAGE_COUNT(len); i++, src += PAGE_SIZE)
 		m[i] = PHYS_TO_VM_PAGE(src);
 	/*
 	 * src was only advanced by whole pages, so src & PAGE_MASK is still
 	 * the offset of the original address within its first page.
 	 */
 	return (uiomove_fromphys(m, src & PAGE_MASK, len, &uio));
 }
 
 #undef PHYS_PAGE_COUNT
 
 /*
  * Copy "len" bytes to physical address "dst" from a segment list "src",
  * starting "offset" bytes into the list.
  *
  * Each segment's ds_addr is cast to a pointer and passed to physcopyin()
  * as the source buffer.  Copying stops at the first physcopyin() error,
  * which is returned; otherwise 0 is returned.
  *
  * NOTE(review): neither loop bounds the number of segments walked, so the
  * caller presumably guarantees that offset + len lies within the list --
  * confirm against callers.
  */
 int
 physcopyin_vlist(bus_dma_segment_t *src, off_t offset, vm_paddr_t dst,
     size_t len)
 {
 	size_t seg_len;
 	int error;
 
 	error = 0;
 	/* Skip whole segments that lie entirely before the start offset. */
 	while (offset >= src->ds_len) {
 		offset -= src->ds_len;
 		src++;
 	}
 
 	while (len > 0 && error == 0) {
 		seg_len = MIN(src->ds_len - offset, len);
 		error = physcopyin((void *)(uintptr_t)(src->ds_addr + offset),
 		    dst, seg_len);
 		/* Only the first segment copied starts at a non-zero offset. */
 		offset = 0;
 		src++;
 		len -= seg_len;
 		dst += seg_len;
 	}
 
 	return (error);
 }
 
 /*
  * Copy "len" bytes from physical address "src" into a segment list "dst",
  * starting "offset" bytes into the list.
  *
  * Each segment's ds_addr is cast to a pointer and passed to physcopyout()
  * as the destination buffer.  Copying stops at the first physcopyout()
  * error, which is returned; otherwise 0 is returned.
  *
  * NOTE(review): neither loop bounds the number of segments walked, so the
  * caller presumably guarantees that offset + len lies within the list --
  * confirm against callers.
  */
 int
 physcopyout_vlist(vm_paddr_t src, bus_dma_segment_t *dst, off_t offset,
     size_t len)
 {
 	size_t seg_len;
 	int error;
 
 	error = 0;
 	/* Skip whole segments that lie entirely before the start offset. */
 	while (offset >= dst->ds_len) {
 		offset -= dst->ds_len;
 		dst++;
 	}
 
 	while (len > 0 && error == 0) {
 		seg_len = MIN(dst->ds_len - offset, len);
 		error = physcopyout(src, (void *)(uintptr_t)(dst->ds_addr +
 		    offset), seg_len);
 		/* Only the first segment copied starts at a non-zero offset. */
 		offset = 0;
 		dst++;
 		len -= seg_len;
 		src += seg_len;
 	}
 
 	return (error);
 }
 
 /*
  * Move up to "n" bytes between the kernel buffer "cp" and "uio".  This is
  * uiomove_faultflag() with nofault == 0.
  */
 int
 uiomove(void *cp, int n, struct uio *uio)
 {
 
 	return (uiomove_faultflag(cp, n, uio, 0));
 }
 
 /*
  * Variant of uiomove() that calls uiomove_faultflag() with nofault == 1,
  * so the transfer fails if a non-spurious page fault occurs.
  */
 int
 uiomove_nofault(void *cp, int n, struct uio *uio)
 {
 
 	return (uiomove_faultflag(cp, n, uio, 1));
 }
 
 /*
  * Common implementation of uiomove() and uiomove_nofault().
  *
  * Moves up to "n" bytes between the kernel buffer "cp" and the iovecs of
  * "uio", stopping early when uio_resid reaches zero.  For UIO_READ data
  * flows from cp into the iovecs; for UIO_WRITE it flows from the iovecs
  * into cp.  UIO_USERSPACE segments use copyout()/copyin(), UIO_SYSSPACE
  * segments use bcopy(), and UIO_NOCOPY only advances the uio.
  *
  * "nofault" is only permitted for UIO_USERSPACE uios (asserted otherwise).
  *
  * Returns 0, or the error from copyin()/copyout().  On error the uio is
  * not advanced for the chunk that failed.
  */
 static int
 uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault)
 {
 	struct iovec *iov;
 	size_t cnt;
 	int error, newflags, save;
 
 	save = error = 0;
 
 	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
 	    ("uiomove: mode"));
 	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
 	    ("uiomove proc"));
 	KASSERT(uio->uio_resid >= 0,
 	    ("%s: uio %p resid underflow", __func__, uio));
 
 	if (uio->uio_segflg == UIO_USERSPACE) {
 		newflags = TDP_DEADLKTREAT;
 		if (nofault) {
 			/*
 			 * Fail if a non-spurious page fault occurs.
 			 */
 			newflags |= TDP_NOFAULTING | TDP_RESETSPUR;
 		} else {
 			WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 			    "Calling uiomove()");
 		}
 		save = curthread_pflags_set(newflags);
 	} else {
 		KASSERT(nofault == 0, ("uiomove: nofault"));
 	}
 
 	while (n > 0 && uio->uio_resid) {
 		KASSERT(uio->uio_iovcnt > 0,
 		    ("%s: uio %p iovcnt underflow", __func__, uio));
 
 		iov = uio->uio_iov;
 		cnt = iov->iov_len;
 		/* Step past an exhausted iovec without moving any data. */
 		if (cnt == 0) {
 			uio->uio_iov++;
 			uio->uio_iovcnt--;
 			continue;
 		}
 		/* Clamp this chunk to what the caller still wants moved. */
 		if (cnt > n)
 			cnt = n;
 
 		switch (uio->uio_segflg) {
 		case UIO_USERSPACE:
 			maybe_yield();
 			if (uio->uio_rw == UIO_READ)
 				error = copyout(cp, iov->iov_base, cnt);
 			else
 				error = copyin(iov->iov_base, cp, cnt);
 			if (error)
 				goto out;
 			break;
 
 		case UIO_SYSSPACE:
 			if (uio->uio_rw == UIO_READ)
 				bcopy(cp, iov->iov_base, cnt);
 			else
 				bcopy(iov->iov_base, cp, cnt);
 			break;
 		case UIO_NOCOPY:
 			break;
 		}
 		/* Account for the chunk in the iovec, the uio and the caller. */
 		iov->iov_base = (char *)iov->iov_base + cnt;
 		iov->iov_len -= cnt;
 		uio->uio_resid -= cnt;
 		uio->uio_offset += cnt;
 		cp = (char *)cp + cnt;
 		n -= cnt;
 	}
 out:
 	/* save is 0 unless curthread_pflags_set() above returned non-zero. */
 	if (save)
 		curthread_pflags_restore(save);
 	return (error);
 }
 
 /*
  * Wrapper for uiomove() that validates the arguments against a known-good
  * kernel buffer.  Currently, uiomove accepts a signed (n) argument, which
  * is almost definitely a bad thing, so we catch that here as well.  We
  * return a runtime failure, but it might be desirable to generate a runtime
  * assertion failure instead.
  *
  * The transfer starts uio->uio_offset bytes into "buf" and covers at most
  * the remaining buflen - offset bytes.
  *
  * Returns EINVAL for a negative offset or resid, for an offset that does
  * not survive conversion to size_t, or for a remaining length above
  * IOSIZE_MAX.  Returns 0 without moving data when buflen <= 0 or the
  * offset is at or past the end of the buffer.  Otherwise returns the
  * result of uiomove().
  */
 int
 uiomove_frombuf(void *buf, int buflen, struct uio *uio)
 {
 	size_t offset, n;
 
 	if (uio->uio_offset < 0 || uio->uio_resid < 0 ||
 	    (offset = uio->uio_offset) != uio->uio_offset)
 		return (EINVAL);
 	if (buflen <= 0 || offset >= buflen)
 		return (0);
 	if ((n = buflen - offset) > IOSIZE_MAX)
 		return (EINVAL);
 	return (uiomove((char *)buf + offset, n, uio));
 }
 
 /*
  * Give next character to user as result of read.
  *
  * Stores the byte "c" at the current position of "uio" and advances the
  * uio by one byte.  Panics if the uio has no iovecs or no residual space.
  * Returns EFAULT if subyte() fails for a UIO_USERSPACE uio, otherwise 0.
  * For UIO_NOCOPY nothing is stored but the uio is still advanced.
  */
 int
 ureadc(int c, struct uio *uio)
 {
 	struct iovec *iov;
 	char *iov_base;
 
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 	    "Calling ureadc()");
 
 again:
 	if (uio->uio_iovcnt == 0 || uio->uio_resid == 0)
 		panic("ureadc");
 	iov = uio->uio_iov;
 	/* Step past exhausted iovecs until one with room is found. */
 	if (iov->iov_len == 0) {
 		uio->uio_iovcnt--;
 		uio->uio_iov++;
 		goto again;
 	}
 	switch (uio->uio_segflg) {
 	case UIO_USERSPACE:
 		if (subyte(iov->iov_base, c) < 0)
 			return (EFAULT);
 		break;
 
 	case UIO_SYSSPACE:
 		iov_base = iov->iov_base;
 		*iov_base = c;
 		break;
 
 	case UIO_NOCOPY:
 		break;
 	}
 	iov->iov_base = (char *)iov->iov_base + 1;
 	iov->iov_len--;
 	uio->uio_resid--;
 	uio->uio_offset++;
 	return (0);
 }
 
 /*
  * Copy an array of "iovcnt" iovecs from "iovp" (via copyin()) into a newly
  * allocated M_IOV buffer returned through *iov.
  *
  * "error" is the value to return when iovcnt exceeds UIO_MAXIOV; no
  * allocation is made in that case.  If copyin() fails the buffer is freed
  * and its error is returned.  *iov is NULL on every failure path.  On
  * success 0 is returned and *iov points to the allocated array.
  */
 int
 copyiniov(const struct iovec *iovp, u_int iovcnt, struct iovec **iov, int error)
 {
 	u_int iovlen;
 
 	*iov = NULL;
 	if (iovcnt > UIO_MAXIOV)
 		return (error);
 	iovlen = iovcnt * sizeof(struct iovec);
 	*iov = malloc(iovlen, M_IOV, M_WAITOK);
 	error = copyin(iovp, *iov, iovlen);
 	if (error) {
 		free(*iov, M_IOV);
 		*iov = NULL;
 	}
 	return (error);
 }
 
 /*
  * Build a UIO_USERSPACE uio from an array of "iovcnt" iovecs copied in
  * from "iovp".
  *
  * Returns EINVAL if iovcnt exceeds UIO_MAXIOV or if the sum of the iovec
  * lengths would exceed IOSIZE_MAX, or the copyin() error if the array
  * cannot be copied.  On failure *uiop is NULL and nothing remains
  * allocated.  On success 0 is returned and *uiop points to the new uio,
  * with uio_offset set to -1 and uio_resid set to the total length.
  * uio_rw and uio_td are not set here.
  */
 int
 copyinuio(const struct iovec *iovp, u_int iovcnt, struct uio **uiop)
 {
 	struct iovec *iov;
 	struct uio *uio;
 	u_int iovlen;
 	int error, i;
 
 	*uiop = NULL;
 	if (iovcnt > UIO_MAXIOV)
 		return (EINVAL);
 	iovlen = iovcnt * sizeof(struct iovec);
-	uio = malloc(iovlen + sizeof(*uio), M_IOV, M_WAITOK);
-	iov = (struct iovec *)(uio + 1);
+	uio = allocuio(iovcnt);
+	iov = uio->uio_iov;
 	error = copyin(iovp, iov, iovlen);
-	if (error) {
-		free(uio, M_IOV);
+	if (error != 0) {
+		freeuio(uio);
 		return (error);
 	}
-	uio->uio_iov = iov;
 	uio->uio_iovcnt = iovcnt;
 	uio->uio_segflg = UIO_USERSPACE;
 	uio->uio_offset = -1;
 	uio->uio_resid = 0;
 	/* Sum the lengths, rejecting a total that would pass IOSIZE_MAX. */
 	for (i = 0; i < iovcnt; i++) {
 		if (iov->iov_len > IOSIZE_MAX - uio->uio_resid) {
-			free(uio, M_IOV);
+			freeuio(uio);
 			return (EINVAL);
 		}
 		uio->uio_resid += iov->iov_len;
 		iov++;
 	}
 	*uiop = uio;
 	return (0);
 }
 
 /*
  * Allocate a struct uio with room for "iovcnt" iovecs in one M_IOV
  * allocation.  The iovec array sits immediately after the uio header and
  * uio_iov is set to point at it.  No other uio field is initialized.
  * iovcnt must not exceed UIO_MAXIOV (asserted).
  */
+struct uio *
+allocuio(u_int iovcnt)
+{
+	struct uio *uio;
+	int iovlen;
+
+	KASSERT(iovcnt <= UIO_MAXIOV,
+	    ("Requested %u iovecs exceed UIO_MAXIOV", iovcnt));
+	iovlen = iovcnt * sizeof(struct iovec);
+	uio = malloc(iovlen + sizeof(*uio), M_IOV, M_WAITOK);
+	uio->uio_iov = (struct iovec *)(uio + 1);
+
+	return (uio);
+}
+
 /*
  * Release a uio by freeing it to M_IOV, the malloc type that allocuio()
  * allocates from.
  */
+void
+freeuio(struct uio *uio)
+{
+	free(uio, M_IOV);
+}
+
 /*
  * Return a newly allocated copy of "uiop", including a private copy of
  * its uio_iovcnt iovecs.  The data buffers that the iovecs point at are
  * not duplicated.
  */
 struct uio *
 cloneuio(struct uio *uiop)
 {
+	struct iovec *iov;
 	struct uio *uio;
 	int iovlen;
 
 	iovlen = uiop->uio_iovcnt * sizeof(struct iovec);
-	uio = malloc(iovlen + sizeof(*uio), M_IOV, M_WAITOK);
+	uio = allocuio(uiop->uio_iovcnt);
+	iov = uio->uio_iov;
 	*uio = *uiop;
-	uio->uio_iov = (struct iovec *)(uio + 1);
+	uio->uio_iov = iov;
 	bcopy(uiop->uio_iov, uio->uio_iov, iovlen);
 	return (uio);
 }
 
 /*
  * Map some anonymous memory in user space of size sz, rounded up to the page
  * boundary.
  *
  * The requested address, written to *addr before the mapping call, is the
  * page-rounded sum of the process's vm_daddr and its RLIMIT_DATA maximum.
  * The mapping is private, anonymous, readable and writable.
  *
  * Returns EINVAL if sz rounds to zero, otherwise the result of
  * vm_mmap_object().
  */
 int
 copyout_map(struct thread *td, vm_offset_t *addr, size_t sz)
 {
 	struct vmspace *vms;
 	int error;
 	vm_size_t size;
 
 	vms = td->td_proc->p_vmspace;
 
 	/*
 	 * Map somewhere after heap in process memory.
 	 */
 	*addr = round_page((vm_offset_t)vms->vm_daddr +
 	    lim_max(td, RLIMIT_DATA));
 
 	/* round size up to page boundary */
 	size = (vm_size_t)round_page(sz);
 	if (size == 0)
 		return (EINVAL);
 	error = vm_mmap_object(&vms->vm_map, addr, size, VM_PROT_READ |
 	    VM_PROT_WRITE, VM_PROT_ALL, MAP_PRIVATE | MAP_ANON, NULL, 0,
 	    FALSE, td);
 	return (error);
 }
 
 /*
  * Unmap memory in user space.
  *
  * Removes [addr, addr + round_page(sz)) from the address map of td's
  * process.  A zero sz is a no-op that returns 0.  Returns EINVAL if
  * vm_map_remove() does not report KERN_SUCCESS, otherwise 0.
  */
 int
 copyout_unmap(struct thread *td, vm_offset_t addr, size_t sz)
 {
 	vm_map_t map;
 	vm_size_t size;
 
 	if (sz == 0)
 		return (0);
 
 	map = &td->td_proc->p_vmspace->vm_map;
 	size = (vm_size_t)round_page(sz);
 
 	if (vm_map_remove(map, addr, addr + size) != KERN_SUCCESS)
 		return (EINVAL);
 
 	return (0);
 }
 
 /*
  * Fetch a 32-bit word from "addr" using fueword32().  Returns -1 when
  * fueword32() reports -1, otherwise the fetched value.  A fetched value of
  * -1 therefore cannot be told apart from a failure.
  */
 int32_t
 fuword32(volatile const void *addr)
 {
 	int rv;
 	int32_t val;
 
 	rv = fueword32(addr, &val);
 	return (rv == -1 ? -1 : val);
 }
 
 #ifdef _LP64
 /*
  * Fetch a 64-bit word from "addr" using fueword64().  Returns -1 when
  * fueword64() reports -1, otherwise the fetched value.  A fetched value of
  * -1 therefore cannot be told apart from a failure.
  */
 int64_t
 fuword64(volatile const void *addr)
 {
 	int rv;
 	int64_t val;
 
 	rv = fueword64(addr, &val);
 	return (rv == -1 ? -1 : val);
 }
 #endif /* _LP64 */
 
 /*
  * Fetch a long from "addr" using fueword().  Returns -1 when fueword()
  * reports -1, otherwise the fetched value.  A fetched value of -1
  * therefore cannot be told apart from a failure.
  */
 long
 fuword(volatile const void *addr)
 {
 	long val;
 	int rv;
 
 	rv = fueword(addr, &val);
 	return (rv == -1 ? -1 : val);
 }
 
 /*
  * Compare-and-set wrapper around casueword32().  Returns the value that
  * casueword32() stored in "val", or -1 (converted to uint32_t) when
  * casueword32() reports -1.
  */
 uint32_t
 casuword32(volatile uint32_t *addr, uint32_t old, uint32_t new)
 {
 	int rv;
 	uint32_t val;
 
 	rv = casueword32(addr, old, &val, new);
 	return (rv == -1 ? -1 : val);
 }
 
 /*
  * Compare-and-set wrapper around casueword().  Returns the value that
  * casueword() stored in "val", or -1 (converted to u_long) when
  * casueword() reports -1.
  */
 u_long
 casuword(volatile u_long *addr, u_long old, u_long new)
 {
 	int rv;
 	u_long val;
 
 	rv = casueword(addr, old, &val, new);
 	return (rv == -1 ? -1 : val);
 }
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
index c999bd60f95d..b569a9f808d5 100644
--- a/sys/kern/sys_generic.c
+++ b/sys/kern/sys_generic.c
@@ -1,2170 +1,2170 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 #include "opt_capsicum.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/capsicum.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/lock.h>
 #include <sys/proc.h>
 #include <sys/signalvar.h>
 #include <sys/socketvar.h>
 #include <sys/uio.h>
 #include <sys/eventfd.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/poll.h>
 #include <sys/resourcevar.h>
 #include <sys/selinfo.h>
 #include <sys/sleepqueue.h>
 #include <sys/specialfd.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/vnode.h>
 #include <sys/unistd.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/condvar.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <security/audit/audit.h>
 
 /*
  * The following macro defines how many bytes will be allocated from
  * the stack instead of memory allocated when passing the IOCTL data
  * structures from userspace and to the kernel. Some IOCTLs having
  * small data structures are used very frequently and this small
  * buffer on the stack gives a significant speedup improvement for
  * those requests. The value of this define should be greater or equal
  * to 64 bytes and should also be power of two. The data structure is
  * currently hard-aligned to a 8-byte boundary on the stack. This
  * should currently be sufficient for all supported platforms.
  */
 #define	SYS_IOCTL_SMALL_SIZE	128	/* bytes */
 #define	SYS_IOCTL_SMALL_ALIGN	8	/* bytes */
 
 #ifdef __LP64__
 static int iosize_max_clamp = 0;
 SYSCTL_INT(_debug, OID_AUTO, iosize_max_clamp, CTLFLAG_RW,
     &iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX");
 static int devfs_iosize_max_clamp = 1;
 SYSCTL_INT(_debug, OID_AUTO, devfs_iosize_max_clamp, CTLFLAG_RW,
     &devfs_iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX for devices");
 #endif
 
 /*
  * Assert that the return value of read(2) and write(2) syscalls fits
  * into a register.  If not, an architecture will need to provide the
  * usermode wrappers to reconstruct the result.
  */
 CTASSERT(sizeof(register_t) >= sizeof(size_t));
 
 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
 
 static int	pollout(struct thread *, struct pollfd *, struct pollfd *,
 		    u_int);
 static int	pollscan(struct thread *, struct pollfd *, u_int);
 static int	pollrescan(struct thread *);
 static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
 static int	selrescan(struct thread *, fd_mask **, fd_mask **);
 static void	selfdalloc(struct thread *, void *);
 static void	selfdfree(struct seltd *, struct selfd *);
 static int	dofileread(struct thread *, int, struct file *, struct uio *,
 		    off_t, int);
 static int	dofilewrite(struct thread *, int, struct file *, struct uio *,
 		    off_t, int);
 static void	doselwakeup(struct selinfo *, int);
 static void	seltdinit(struct thread *);
 static int	seltdwait(struct thread *, sbintime_t, sbintime_t);
 static void	seltdclear(struct thread *);
 
 /*
  * One seltd per-thread allocated on demand as needed.
  *
  * Locking key for the field annotations below:
  *	t - protected by st_mtx
  *	k - Only accessed by curthread or read-only
  */
 struct seltd {
 	STAILQ_HEAD(, selfd)	st_selq;	/* (k) List of selfds. */
 	struct selfd		*st_free1;	/* (k) free fd for read set. */
 	struct selfd		*st_free2;	/* (k) free fd for write set. */
 	struct mtx		st_mtx;		/* Protects struct seltd */
 	struct cv		st_wait;	/* (t) Wait channel. */
 	int			st_flags;	/* (t) SELTD_ flags. */
 };
 
 #define	SELTD_PENDING	0x0001			/* We have pending events. */
 #define	SELTD_RESCAN	0x0002			/* Doing a rescan. */
 
 /*
  * One selfd allocated per-thread per-file-descriptor.
  *
  * Locking key for the field annotations below:
  *	f - protected by sf_mtx
  *	k - Only accessed by curthread or read-only
  */
 struct selfd {
 	STAILQ_ENTRY(selfd)	sf_link;	/* (k) fds owned by this td. */
 	TAILQ_ENTRY(selfd)	sf_threads;	/* (f) fds on this selinfo. */
 	struct selinfo		*sf_si;		/* (f) selinfo when linked. */
 	struct mtx		*sf_mtx;	/* Pointer to selinfo mtx. */
 	struct seltd		*sf_td;		/* (k) owning seltd. */
 	void			*sf_cookie;	/* (k) fd or pollfd. */
 };
 
 MALLOC_DEFINE(M_SELFD, "selfd", "selfd");
 static struct mtx_pool *mtxpool_select;
 
 #ifdef __LP64__
 /*
  * Maximum I/O size for devices: INT_MAX when the
  * debug.devfs_iosize_max_clamp sysctl is non-zero or the current process
  * has SV_ILP32 set, otherwise SSIZE_MAX.
  */
 size_t
 devfs_iosize_max(void)
 {
 
 	return (devfs_iosize_max_clamp || SV_CURPROC_FLAG(SV_ILP32) ?
 	    INT_MAX : SSIZE_MAX);
 }
 
 /*
  * Maximum I/O size: INT_MAX when the debug.iosize_max_clamp sysctl is
  * non-zero or the current process has SV_ILP32 set, otherwise SSIZE_MAX.
  */
 size_t
 iosize_max(void)
 {
 
 	return (iosize_max_clamp || SV_CURPROC_FLAG(SV_ILP32) ?
 	    INT_MAX : SSIZE_MAX);
 }
 #endif
 
 #ifndef _SYS_SYSPROTO_H_
 struct read_args {
 	int	fd;
 	void	*buf;
 	size_t	nbyte;
 };
 #endif
 /*
  * read(2) entry point.  Rejects nbyte > IOSIZE_MAX with EINVAL, wraps the
  * user buffer in a single-iovec UIO_USERSPACE uio on the stack and passes
  * it to kern_readv(), whose result is returned.
  */
 int
 sys_read(struct thread *td, struct read_args *uap)
 {
 	struct uio auio;
 	struct iovec aiov;
 	int error;
 
 	if (uap->nbyte > IOSIZE_MAX)
 		return (EINVAL);
 	aiov.iov_base = uap->buf;
 	aiov.iov_len = uap->nbyte;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_resid = uap->nbyte;
 	auio.uio_segflg = UIO_USERSPACE;
 	error = kern_readv(td, uap->fd, &auio);
 	return (error);
 }
 
 /*
  * Positioned read system call
  */
 #ifndef _SYS_SYSPROTO_H_
 struct pread_args {
 	int	fd;
 	void	*buf;
 	size_t	nbyte;
 	int	pad;
 	off_t	offset;
 };
 #endif
 /* pread(2) entry point: unpacks the arguments and calls kern_pread(). */
 int
 sys_pread(struct thread *td, struct pread_args *uap)
 {
 
 	return (kern_pread(td, uap->fd, uap->buf, uap->nbyte, uap->offset));
 }
 
 /*
  * Positioned read of "nbyte" bytes into the user buffer "buf" from "fd"
  * at "offset".  Rejects nbyte > IOSIZE_MAX with EINVAL, wraps the buffer
  * in a single-iovec UIO_USERSPACE uio on the stack and passes it to
  * kern_preadv(), whose result is returned.
  */
 int
 kern_pread(struct thread *td, int fd, void *buf, size_t nbyte, off_t offset)
 {
 	struct uio auio;
 	struct iovec aiov;
 	int error;
 
 	if (nbyte > IOSIZE_MAX)
 		return (EINVAL);
 	aiov.iov_base = buf;
 	aiov.iov_len = nbyte;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_resid = nbyte;
 	auio.uio_segflg = UIO_USERSPACE;
 	error = kern_preadv(td, fd, &auio, offset);
 	return (error);
 }
 
 #if defined(COMPAT_FREEBSD6)
 /*
  * COMPAT_FREEBSD6 pread entry point: unpacks the arguments and calls
  * kern_pread().
  */
 int
 freebsd6_pread(struct thread *td, struct freebsd6_pread_args *uap)
 {
 
 	return (kern_pread(td, uap->fd, uap->buf, uap->nbyte, uap->offset));
 }
 #endif
 
 /*
  * Scatter read system call.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct readv_args {
 	int	fd;
 	struct	iovec *iovp;
 	u_int	iovcnt;
 };
 #endif
 /*
  * readv(2) entry point.  Copies in the iovec array with copyinuio(),
  * performs the read with kern_readv() and releases the uio before
  * returning.  Returns the copyinuio() error, or kern_readv()'s result.
  */
 int
 sys_readv(struct thread *td, struct readv_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_readv(td, uap->fd, auio);
-	free(auio, M_IOV);
+	freeuio(auio);
 	return (error);
 }
 
 /*
  * Read into "auio" from descriptor "fd".  The file is looked up with
  * fget_read() and cap_read_rights, dofileread() is called with offset -1
  * and no flags, and the file reference is dropped afterwards.  Returns
  * the fget_read() error, or dofileread()'s result.
  */
 int
 kern_readv(struct thread *td, int fd, struct uio *auio)
 {
 	struct file *fp;
 	int error;
 
 	error = fget_read(td, fd, &cap_read_rights, &fp);
 	if (error)
 		return (error);
 	error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Scatter positioned read system call.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct preadv_args {
 	int	fd;
 	struct	iovec *iovp;
 	u_int	iovcnt;
 	off_t	offset;
 };
 #endif
 /*
  * preadv(2) entry point.  Copies in the iovec array with copyinuio(),
  * performs the read with kern_preadv() and releases the uio before
  * returning.  Returns the copyinuio() error, or kern_preadv()'s result.
  */
 int
 sys_preadv(struct thread *td, struct preadv_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_preadv(td, uap->fd, auio, uap->offset);
-	free(auio, M_IOV);
+	freeuio(auio);
 	return (error);
 }
 
 /*
  * Positioned read into "auio" from descriptor "fd" at "offset".
  *
  * Returns the fget_read() error if the lookup (with cap_pread_rights)
  * fails, ESPIPE if the file's ops lack DFLAG_SEEKABLE, and EINVAL for a
  * negative offset unless the file's vnode is a VCHR.  Otherwise returns
  * the result of dofileread() with FOF_OFFSET.
  */
 int
 kern_preadv(struct thread *td, int fd, struct uio *auio, off_t offset)
 {
 	struct file *fp;
 	int error;
 
 	error = fget_read(td, fd, &cap_pread_rights, &fp);
 	if (error)
 		return (error);
 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
 		error = ESPIPE;
 	else if (offset < 0 &&
 	    (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR))
 		error = EINVAL;
 	else
 		error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Common code for readv and preadv that reads data in
  * from a file using the passed in uio, offset, and flags.
  *
  * Fills in uio_rw, uio_offset and uio_td before calling fo_read().  The
  * number of bytes transferred is stored in td->td_retval[0].  An
  * ERESTART, EINTR or EWOULDBLOCK error is turned into success when some
  * data was transferred before it occurred.
  */
 static int
 dofileread(struct thread *td, int fd, struct file *fp, struct uio *auio,
     off_t offset, int flags)
 {
 	ssize_t cnt;
 	int error;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
 
 	AUDIT_ARG_FD(fd);
 
 	/* Finish zero length reads right here */
 	if (auio->uio_resid == 0) {
 		td->td_retval[0] = 0;
 		return (0);
 	}
 	auio->uio_rw = UIO_READ;
 	auio->uio_offset = offset;
 	auio->uio_td = td;
 #ifdef KTRACE
 	/* Snapshot the uio before fo_read() can consume it. */
 	if (KTRPOINT(td, KTR_GENIO)) 
 		ktruio = cloneuio(auio);
 #endif
 	cnt = auio->uio_resid;
 	if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
 		if (auio->uio_resid != cnt && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
 	}
 	/* cnt becomes the number of bytes actually transferred. */
 	cnt -= auio->uio_resid;
 #ifdef KTRACE
 	/* NOTE(review): ktrgenio() presumably takes ownership of ktruio. */
 	if (ktruio != NULL) {
 		ktruio->uio_resid = cnt;
 		ktrgenio(fd, UIO_READ, ktruio, error);
 	}
 #endif
 	td->td_retval[0] = cnt;
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct write_args {
 	int	fd;
 	const void *buf;
 	size_t	nbyte;
 };
 #endif
 /*
  * write(2) entry point.  Rejects nbyte > IOSIZE_MAX with EINVAL, wraps the
  * user buffer in a single-iovec UIO_USERSPACE uio on the stack and passes
  * it to kern_writev(), whose result is returned.
  */
 int
 sys_write(struct thread *td, struct write_args *uap)
 {
 	struct uio auio;
 	struct iovec aiov;
 	int error;
 
 	if (uap->nbyte > IOSIZE_MAX)
 		return (EINVAL);
 	/* The cast drops the const qualifier; iov_base is a plain void *. */
 	aiov.iov_base = (void *)(uintptr_t)uap->buf;
 	aiov.iov_len = uap->nbyte;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_resid = uap->nbyte;
 	auio.uio_segflg = UIO_USERSPACE;
 	error = kern_writev(td, uap->fd, &auio);
 	return (error);
 }
 
 /*
  * Positioned write system call.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct pwrite_args {
 	int	fd;
 	const void *buf;
 	size_t	nbyte;
 	int	pad;
 	off_t	offset;
 };
 #endif
 /* pwrite(2) entry point: unpacks the arguments and calls kern_pwrite(). */
 int
 sys_pwrite(struct thread *td, struct pwrite_args *uap)
 {
 
 	return (kern_pwrite(td, uap->fd, uap->buf, uap->nbyte, uap->offset));
 }
 
 /*
  * Positioned write of "nbyte" bytes from the user buffer "buf" to "fd" at
  * "offset".  Rejects nbyte > IOSIZE_MAX with EINVAL, wraps the buffer in
  * a single-iovec UIO_USERSPACE uio on the stack and passes it to
  * kern_pwritev(), whose result is returned.
  */
 int
 kern_pwrite(struct thread *td, int fd, const void *buf, size_t nbyte,
     off_t offset)
 {
 	struct uio auio;
 	struct iovec aiov;
 	int error;
 
 	if (nbyte > IOSIZE_MAX)
 		return (EINVAL);
 	/* The cast drops the const qualifier; iov_base is a plain void *. */
 	aiov.iov_base = (void *)(uintptr_t)buf;
 	aiov.iov_len = nbyte;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_resid = nbyte;
 	auio.uio_segflg = UIO_USERSPACE;
 	error = kern_pwritev(td, fd, &auio, offset);
 	return (error);
 }
 
 #if defined(COMPAT_FREEBSD6)
 /*
  * COMPAT_FREEBSD6 pwrite entry point: unpacks the arguments and calls
  * kern_pwrite().
  */
 int
 freebsd6_pwrite(struct thread *td, struct freebsd6_pwrite_args *uap)
 {
 
 	return (kern_pwrite(td, uap->fd, uap->buf, uap->nbyte, uap->offset));
 }
 #endif
 
 /*
  * Gather write system call.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct writev_args {
 	int	fd;
 	struct	iovec *iovp;
 	u_int	iovcnt;
 };
 #endif
 /*
  * writev(2) entry point.  Copies in the iovec array with copyinuio(),
  * performs the write with kern_writev() and releases the uio before
  * returning.  Returns the copyinuio() error, or kern_writev()'s result.
  */
 int
 sys_writev(struct thread *td, struct writev_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_writev(td, uap->fd, auio);
-	free(auio, M_IOV);
+	freeuio(auio);
 	return (error);
 }
 
 /*
  * Write "auio" to descriptor "fd".  The file is looked up with
  * fget_write() and cap_write_rights, dofilewrite() is called with offset
  * -1 and no flags, and the file reference is dropped afterwards.  Returns
  * the fget_write() error, or dofilewrite()'s result.
  */
 int
 kern_writev(struct thread *td, int fd, struct uio *auio)
 {
 	struct file *fp;
 	int error;
 
 	error = fget_write(td, fd, &cap_write_rights, &fp);
 	if (error)
 		return (error);
 	error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Gather positioned write system call.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct pwritev_args {
 	int	fd;
 	struct	iovec *iovp;
 	u_int	iovcnt;
 	off_t	offset;
 };
 #endif
 /*
  * pwritev(2) entry point.  Copies in the iovec array with copyinuio(),
  * performs the write with kern_pwritev() and releases the uio before
  * returning.  Returns the copyinuio() error, or kern_pwritev()'s result.
  */
 int
 sys_pwritev(struct thread *td, struct pwritev_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_pwritev(td, uap->fd, auio, uap->offset);
-	free(auio, M_IOV);
+	freeuio(auio);
 	return (error);
 }
 
 /*
  * Positioned write of "auio" to descriptor "fd" at "offset".
  *
  * Returns the fget_write() error if the lookup (with cap_pwrite_rights)
  * fails, ESPIPE if the file's ops lack DFLAG_SEEKABLE, and EINVAL for a
  * negative offset unless the file's vnode is a VCHR.  Otherwise returns
  * the result of dofilewrite() with FOF_OFFSET.
  */
 int
 kern_pwritev(struct thread *td, int fd, struct uio *auio, off_t offset)
 {
 	struct file *fp;
 	int error;
 
 	error = fget_write(td, fd, &cap_pwrite_rights, &fp);
 	if (error)
 		return (error);
 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
 		error = ESPIPE;
 	else if (offset < 0 &&
 	    (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR))
 		error = EINVAL;
 	else
 		error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Common code for writev and pwritev that writes data to
  * a file using the passed in uio, offset, and flags.
  *
  * Fills in uio_rw, uio_td and uio_offset before calling fo_write().  The
  * number of bytes transferred is stored in td->td_retval[0].  For files
  * that are not sockets, an ERESTART, EINTR or EWOULDBLOCK error is turned
  * into success when some data was transferred, and EPIPE additionally
  * sends SIGPIPE to the calling thread.
  */
 static int
 dofilewrite(struct thread *td, int fd, struct file *fp, struct uio *auio,
     off_t offset, int flags)
 {
 	ssize_t cnt;
 	int error;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
 
 	AUDIT_ARG_FD(fd);
 	auio->uio_rw = UIO_WRITE;
 	auio->uio_td = td;
 	auio->uio_offset = offset;
 #ifdef KTRACE
 	/* Snapshot the uio before fo_write() can consume it. */
 	if (KTRPOINT(td, KTR_GENIO))
 		ktruio = cloneuio(auio);
 #endif
 	cnt = auio->uio_resid;
 	error = fo_write(fp, auio, td->td_ucred, flags, td);
 	/*
 	 * Socket layer is responsible for special error handling,
 	 * see sousrsend().
 	 */
 	if (error != 0 && fp->f_type != DTYPE_SOCKET) {
 		if (auio->uio_resid != cnt && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
 		if (error == EPIPE) {
 			PROC_LOCK(td->td_proc);
 			tdsignal(td, SIGPIPE);
 			PROC_UNLOCK(td->td_proc);
 		}
 	}
 	/* cnt becomes the number of bytes actually transferred. */
 	cnt -= auio->uio_resid;
 #ifdef KTRACE
 	/* NOTE(review): ktrgenio() presumably takes ownership of ktruio. */
 	if (ktruio != NULL) {
 		ktruio->uio_resid = cnt;
 		ktrgenio(fd, UIO_WRITE, ktruio, error);
 	}
 #endif
 	td->td_retval[0] = cnt;
 	return (error);
 }
 
 /*
  * Truncate a file given a file descriptor.
  *
  * Can't use fget_write() here, since must return EINVAL and not EBADF if the
  * descriptor isn't writable.
  *
  * Returns EINVAL for a negative length or a descriptor not open for
  * writing, the fget() error if the lookup fails, and otherwise the result
  * of fo_truncate().
  */
 int
 kern_ftruncate(struct thread *td, int fd, off_t length)
 {
 	struct file *fp;
 	int error;
 
 	AUDIT_ARG_FD(fd);
 	if (length < 0)
 		return (EINVAL);
 	error = fget(td, fd, &cap_ftruncate_rights, &fp);
 	if (error)
 		return (error);
 	AUDIT_ARG_FILE(td->td_proc, fp);
 	if (!(fp->f_flag & FWRITE)) {
 		fdrop(fp, td);
 		return (EINVAL);
 	}
 	error = fo_truncate(fp, length, td->td_ucred, td);
 	fdrop(fp, td);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ftruncate_args {
 	int	fd;
 	int	pad;
 	off_t	length;
 };
 #endif
 /*
  * ftruncate(2) entry point: unpacks the arguments and calls
  * kern_ftruncate().
  */
 int
 sys_ftruncate(struct thread *td, struct ftruncate_args *uap)
 {
 
 	return (kern_ftruncate(td, uap->fd, uap->length));
 }
 
 #if defined(COMPAT_43)
 #ifndef _SYS_SYSPROTO_H_
 struct oftruncate_args {
 	int	fd;
 	long	length;
 };
 #endif
 /*
  * COMPAT_43 ftruncate entry point, whose length argument is a long:
  * unpacks the arguments and calls kern_ftruncate().
  */
 int
 oftruncate(struct thread *td, struct oftruncate_args *uap)
 {
 
 	return (kern_ftruncate(td, uap->fd, uap->length));
 }
 #endif /* COMPAT_43 */
 
 #ifndef _SYS_SYSPROTO_H_
 struct ioctl_args {
 	int	fd;
 	u_long	com;
 	caddr_t	data;
 };
 #endif
 /*
  * ioctl(2) entry point.
  *
  * Decodes the argument size and direction bits from the command word,
  * stages the argument in a kernel buffer (an on-stack buffer for sizes up
  * to SYS_IOCTL_SMALL_SIZE, otherwise an M_IOCTLOPS allocation), calls
  * kern_ioctl(), and copies the result back out for IOC_OUT commands.
  *
  * Returns ENOTTY for a command word whose size or direction bits are
  * inconsistent, the copyin()/copyout() error if a copy fails, and
  * otherwise the result of kern_ioctl().
  */
 /* ARGSUSED */
 int
 sys_ioctl(struct thread *td, struct ioctl_args *uap)
 {
 	u_char smalldata[SYS_IOCTL_SMALL_SIZE] __aligned(SYS_IOCTL_SMALL_ALIGN);
 	uint32_t com;
 	int arg, error;
 	u_int size;
 	caddr_t data;
 
 #ifdef INVARIANTS
 	if (uap->com > 0xffffffff) {
 		printf(
 		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
 		    td->td_proc->p_pid, td->td_name, uap->com);
 	}
 #endif
 	/* Only the low 32 bits of the command are used from here on. */
 	com = (uint32_t)uap->com;
 
 	/*
 	 * Interpret high order word to find amount of data to be
 	 * copied to/from the user's address space.
 	 */
 	size = IOCPARM_LEN(com);
 	if ((size > IOCPARM_MAX) ||
 	    ((com & (IOC_VOID  | IOC_IN | IOC_OUT)) == 0) ||
 #if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
 	    ((com & IOC_OUT) && size == 0) ||
 #else
 	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
 #endif
 	    ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
 		return (ENOTTY);
 
 	if (size > 0) {
 		if (com & IOC_VOID) {
 			/*
 			 * Integer argument.  The pointer value itself is
 			 * the argument; size is zeroed so the later copies
 			 * are zero-length and the free below is skipped.
 			 */
 			arg = (intptr_t)uap->data;
 			data = (void *)&arg;
 			size = 0;
 		} else {
 			if (size > SYS_IOCTL_SMALL_SIZE)
 				data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
 			else
 				data = smalldata;
 		}
 	} else
 		/* No payload: hand kern_ioctl() the address of the argument. */
 		data = (void *)&uap->data;
 	if (com & IOC_IN) {
 		error = copyin(uap->data, data, (u_int)size);
 		if (error != 0)
 			goto out;
 	} else if (com & IOC_OUT) {
 		/*
 		 * Zero the buffer so the user always
 		 * gets back something deterministic.
 		 */
 		bzero(data, size);
 	}
 
 	error = kern_ioctl(td, uap->fd, com, data);
 
 	if (error == 0 && (com & IOC_OUT))
 		error = copyout(data, uap->data, (u_int)size);
 
 out:
 	/* data was heap-allocated only when size exceeds the small buffer. */
 	if (size > SYS_IOCTL_SMALL_SIZE)
 		free(data, M_IOCTLOPS);
 	return (error);
 }
 
 /*
  * Perform ioctl "com" on descriptor "fd" with the kernel-space argument
  * "data".
  *
  * FIONCLEX and FIOCLEX are handled here by clearing or setting UF_EXCLOSE
  * on the descriptor table entry under the exclusive filedesc lock.
  * FIONBIO and FIOASYNC update the corresponding f_flag bit and are then
  * still passed to fo_ioctl().  All other commands go straight to
  * fo_ioctl().
  *
  * Returns EBADF if the descriptor cannot be obtained or is open for
  * neither reading nor writing, the cap_ioctl_check() or fget() error when
  * those fail, and otherwise the result of fo_ioctl() (or 0 for
  * FIONCLEX/FIOCLEX).
  */
 int
 kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
 {
 	struct file *fp;
 	struct filedesc *fdp;
 	int error, tmp, locked;
 
 	AUDIT_ARG_FD(fd);
 	AUDIT_ARG_CMD(com);
 
 	fdp = td->td_proc->p_fd;
 
 	/* "locked" records which filedesc lock is held, for release at out. */
 	switch (com) {
 	case FIONCLEX:
 	case FIOCLEX:
 		FILEDESC_XLOCK(fdp);
 		locked = LA_XLOCKED;
 		break;
 	default:
 #ifdef CAPABILITIES
 		FILEDESC_SLOCK(fdp);
 		locked = LA_SLOCKED;
 #else
 		locked = LA_UNLOCKED;
 #endif
 		break;
 	}
 
 #ifdef CAPABILITIES
 	if ((fp = fget_noref(fdp, fd)) == NULL) {
 		error = EBADF;
 		goto out;
 	}
 	if ((error = cap_ioctl_check(fdp, fd, com)) != 0) {
 		fp = NULL;	/* fhold() was not called yet */
 		goto out;
 	}
 	if (!fhold(fp)) {
 		error = EBADF;
 		fp = NULL;
 		goto out;
 	}
 	/* With a reference held the shared lock is no longer needed. */
 	if (locked == LA_SLOCKED) {
 		FILEDESC_SUNLOCK(fdp);
 		locked = LA_UNLOCKED;
 	}
 #else
 	error = fget(td, fd, &cap_ioctl_rights, &fp);
 	if (error != 0) {
 		fp = NULL;
 		goto out;
 	}
 #endif
 	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
 		error = EBADF;
 		goto out;
 	}
 
 	switch (com) {
 	case FIONCLEX:
 		fdp->fd_ofiles[fd].fde_flags &= ~UF_EXCLOSE;
 		goto out;
 	case FIOCLEX:
 		fdp->fd_ofiles[fd].fde_flags |= UF_EXCLOSE;
 		goto out;
 	case FIONBIO:
 		if ((tmp = *(int *)data))
 			atomic_set_int(&fp->f_flag, FNONBLOCK);
 		else
 			atomic_clear_int(&fp->f_flag, FNONBLOCK);
 		/* fo_ioctl() is given the local copy of the argument. */
 		data = (void *)&tmp;
 		break;
 	case FIOASYNC:
 		if ((tmp = *(int *)data))
 			atomic_set_int(&fp->f_flag, FASYNC);
 		else
 			atomic_clear_int(&fp->f_flag, FASYNC);
 		/* fo_ioctl() is given the local copy of the argument. */
 		data = (void *)&tmp;
 		break;
 	}
 
 	error = fo_ioctl(fp, com, data, td->td_ucred, td);
 out:
 	switch (locked) {
 	case LA_XLOCKED:
 		FILEDESC_XUNLOCK(fdp);
 		break;
 #ifdef CAPABILITIES
 	case LA_SLOCKED:
 		FILEDESC_SUNLOCK(fdp);
 		break;
 #endif
 	default:
 		FILEDESC_UNLOCK_ASSERT(fdp);
 		break;
 	}
 	/* fp is NULL on every path where no reference was acquired. */
 	if (fp != NULL)
 		fdrop(fp, td);
 	return (error);
 }
 
 /*
  * posix_fallocate(2) system call entry point.
  *
  * Forwards the arguments to kern_posix_fallocate() and passes the result
  * through kern_posix_error() to form the syscall return value.
  */
 int
 sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
 {
 
 	return (kern_posix_error(td,
 	    kern_posix_fallocate(td, uap->fd, uap->offset, uap->len)));
 }
 
 /*
  * Back end of posix_fallocate(2): ask the file behind fd to allocate
  * storage for the byte range [offset, offset + len).
  *
  * Returns 0 on success, otherwise an errno value:
  *	EINVAL	offset is negative or len is not positive
  *	EFBIG	offset + len would exceed OFF_MAX
  *	ESPIPE	the file's ops do not have DFLAG_SEEKABLE set
  *	EBADF	the file is not open for writing
  * or whatever fget() or fo_fallocate() report.
  */
 int
 kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
 {
 	struct file *fp;
 	int error;
 
 	/* Audit the descriptor once, before any argument can be rejected. */
 	AUDIT_ARG_FD(fd);
 	if (offset < 0 || len <= 0)
 		return (EINVAL);
 	/* Check for wrap. */
 	if (offset > OFF_MAX - len)
 		return (EFBIG);
 	error = fget(td, fd, &cap_pwrite_rights, &fp);
 	if (error != 0)
 		return (error);
 	AUDIT_ARG_FILE(td->td_proc, fp);
 	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
 		error = ESPIPE;
 		goto out;
 	}
 	if ((fp->f_flag & FWRITE) == 0) {
 		error = EBADF;
 		goto out;
 	}
 
 	error = fo_fallocate(fp, offset, len, td);
 out:
 	/* Drop the reference obtained by fget(). */
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 sys_fspacectl(struct thread *td, struct fspacectl_args *uap)
 {
 	struct spacectl_range rqsr, rmsr;
 	int error, cerror;
 
 	error = copyin(uap->rqsr, &rqsr, sizeof(rqsr));
 	if (error != 0)
 		return (error);
 
 	error = kern_fspacectl(td, uap->fd, uap->cmd, &rqsr, uap->flags,
 	    &rmsr);
 	if (uap->rmsr != NULL) {
 		cerror = copyout(&rmsr, uap->rmsr, sizeof(rmsr));
 		if (error == 0)
 			error = cerror;
 	}
 	return (error);
 }
 
 /*
  * Back end of fspacectl(2).
  *
  * Only SPACECTL_DEALLOC is accepted.  rqsr describes the requested range.
  * If rmsrp is not NULL it receives the range handed back by fo_fspacectl(),
  * or a plain copy of *rqsr when the call fails before the file operation
  * is reached.
  *
  * Returns 0 or an errno value: EINVAL for a NULL rqsr, an unsupported
  * command, a bad range or unsupported flags; ESPIPE when the file's ops do
  * not have DFLAG_SEEKABLE set; EBADF when the file is not open for
  * writing; otherwise whatever fget_write() or fo_fspacectl() return.
  */
 int
 kern_fspacectl(struct thread *td, int fd, int cmd,
     const struct spacectl_range *rqsr, int flags, struct spacectl_range *rmsrp)
 {
 	struct file *fp;
 	struct spacectl_range rmsr;
 	int error;
 
 	AUDIT_ARG_FD(fd);
 	AUDIT_ARG_CMD(cmd);
 	AUDIT_ARG_FFLAGS(flags);
 
 	if (rqsr == NULL)
 		return (EINVAL);
 	/*
 	 * Start from the requested range so that *rmsrp is well defined on
 	 * every error return below.
 	 */
 	rmsr = *rqsr;
 	if (rmsrp != NULL)
 		*rmsrp = rmsr;
 
 	/* Reject bad commands, ranges that are empty or wrap, and bad flags. */
 	if (cmd != SPACECTL_DEALLOC ||
 	    rqsr->r_offset < 0 || rqsr->r_len <= 0 ||
 	    rqsr->r_offset > OFF_MAX - rqsr->r_len ||
 	    (flags & ~SPACECTL_F_SUPPORTED) != 0)
 		return (EINVAL);
 
 	error = fget_write(td, fd, &cap_pwrite_rights, &fp);
 	if (error != 0)
 		return (error);
 	AUDIT_ARG_FILE(td->td_proc, fp);
 	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
 		error = ESPIPE;
 		goto out;
 	}
 	if ((fp->f_flag & FWRITE) == 0) {
 		error = EBADF;
 		goto out;
 	}
 
 	/* The file operation updates rmsr's offset and length in place. */
 	error = fo_fspacectl(fp, cmd, &rmsr.r_offset, &rmsr.r_len, flags,
 	    td->td_ucred, td);
 	/* fspacectl is not restarted after signals if the file is modified. */
 	if (rmsr.r_len != rqsr->r_len && (error == ERESTART ||
 	    error == EINTR || error == EWOULDBLOCK))
 		error = 0;
 	if (rmsrp != NULL)
 		*rmsrp = rmsr;
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 kern_specialfd(struct thread *td, int type, void *arg)
 {
 	struct file *fp;
 	struct specialfd_eventfd *ae;
 	int error, fd, fflags;
 
 	fflags = 0;
 	error = falloc_noinstall(td, &fp);
 	if (error != 0)
 		return (error);
 
 	switch (type) {
 	case SPECIALFD_EVENTFD:
 		ae = arg;
 		if ((ae->flags & EFD_CLOEXEC) != 0)
 			fflags |= O_CLOEXEC;
 		error = eventfd_create_file(td, fp, ae->initval, ae->flags);
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 
 	if (error == 0)
 		error = finstall(td, fp, &fd, fflags, NULL);
 	fdrop(fp, td);
 	if (error == 0)
 		td->td_retval[0] = fd;
 	return (error);
 }
 
 int
 sys___specialfd(struct thread *td, struct __specialfd_args *args)
 {
 	struct specialfd_eventfd ae;
 	int error;
 
 	switch (args->type) {
 	case SPECIALFD_EVENTFD:
 		if (args->len != sizeof(struct specialfd_eventfd)) {
 			error = EINVAL;
 			break;
 		}
 		error = copyin(args->req, &ae, sizeof(ae));
 		if (error != 0)
 			break;
 		if ((ae.flags & ~(EFD_CLOEXEC | EFD_NONBLOCK |
 		    EFD_SEMAPHORE)) != 0) {
 			error = EINVAL;
 			break;
 		}
 		error = kern_specialfd(td, args->type, &ae);
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	return (error);
 }
 
 /*
  * Poll result for objects that have no real poll support.
  *
  * Plain read/write requests are reported as ready.  A request for anything
  * outside POLLSTANDARD yields POLLNVAL, which gives clients a reliable way
  * to find out whether extended functionality is present without
  * hard-coding knowledge of specific filesystem implementations.
  */
 int
 poll_no_poll(int events)
 {
 	const int basic = POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM;
 
 	if ((events & ~POLLSTANDARD) != 0)
 		return (POLLNVAL);
 	return (events & basic);
 }
 
 int
 sys_pselect(struct thread *td, struct pselect_args *uap)
 {
 	struct timespec ts;
 	struct timeval tv, *tvp;
 	sigset_t set, *uset;
 	int error;
 
 	if (uap->ts != NULL) {
 		error = copyin(uap->ts, &ts, sizeof(ts));
 		if (error != 0)
 		    return (error);
 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
 		tvp = &tv;
 	} else
 		tvp = NULL;
 	if (uap->sm != NULL) {
 		error = copyin(uap->sm, &set, sizeof(set));
 		if (error != 0)
 			return (error);
 		uset = &set;
 	} else
 		uset = NULL;
 	return (kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
 	    uset, NFDBITS));
 }
 
 /*
  * Back end of pselect(2).
  *
  * When uset is not NULL it is installed as the thread's signal mask for
  * the duration of the call; the previous mask is saved in
  * td->td_oldsigmask.  The actual work is done by kern_select().
  */
 int
 kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex,
     struct timeval *tvp, sigset_t *uset, int abi_nfdbits)
 {
 	int error;
 
 	if (uset != NULL) {
 		error = kern_sigprocmask(td, SIG_SETMASK, uset,
 		    &td->td_oldsigmask, 0);
 		if (error != 0)
 			return (error);
 		/*
 		 * Flag the saved mask and schedule an AST so that ast() runs
 		 * on return to usermode, clears TDP_OLDMASK and restores the
 		 * old signal mask.
 		 */
 		td->td_pflags |= TDP_OLDMASK;
 		ast_sched(td, TDA_SIGSUSPEND);
 	}
 	return (kern_select(td, nd, in, ou, ex, tvp, abi_nfdbits));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct select_args {
 	int	nd;
 	fd_set	*in, *ou, *ex;
 	struct	timeval *tv;
 };
 #endif
 /*
  * select(2) system call entry point.
  *
  * Copies in the optional timeout and defers to kern_select() with the
  * native NFDBITS.
  */
 int
 sys_select(struct thread *td, struct select_args *uap)
 {
 	struct timeval tv, *tvp;
 	int error;
 
 	tvp = NULL;
 	if (uap->tv != NULL) {
 		error = copyin(uap->tv, &tv, sizeof(tv));
 		if (error != 0)
 			return (error);
 		tvp = &tv;
 	}
 
 	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
 	    NFDBITS));
 }
 
 /*
  * In the unlikely case when the user specified n greater than the last
  * open file descriptor, check that no bits are set after the last
  * valid fd.  We must return EBADF if any is set.
  *
  * There are applications that rely on the behaviour.
  *
  * fd_in is a user-space fd_set pointer (it may be NULL, in which case
  * there is nothing to check).  nd is the descriptor count after the caller
  * clamped it to fd_nfiles; ndu is the count the user originally passed.
  * abi_nfdbits is the width in bits of one fd_set word in the caller's ABI.
  *
  * Returns 0 if no bit in [nd, ndu) is set, EBADF if one is, and EFAULT if
  * the set cannot be read.
  */
 static int
 select_check_badfd(fd_set *fd_in, int nd, int ndu, int abi_nfdbits)
 {
 	char *addr, *oaddr;
 	int b, i, res;
 	uint8_t bits;
 
 	if (nd >= ndu || fd_in == NULL)
 		return (0);
 
 	oaddr = NULL;
 	bits = 0; /* silence gcc */
 	for (i = nd; i < ndu; i++) {
 		/* b is the index of the byte holding bit i. */
 		b = i / NBBY;
 #if BYTE_ORDER == LITTLE_ENDIAN
 		addr = (char *)fd_in + b;
 #else
 		/*
 		 * On big-endian machines the byte order inside each word is
 		 * reversed, so mirror b within its word; the word size
 		 * depends on the caller's ABI.
 		 */
 		addr = (char *)fd_in;
 		if (abi_nfdbits == NFDBITS) {
 			addr += rounddown(b, sizeof(fd_mask)) +
 			    sizeof(fd_mask) - 1 - b % sizeof(fd_mask);
 		} else {
 			addr += rounddown(b, sizeof(uint32_t)) +
 			    sizeof(uint32_t) - 1 - b % sizeof(uint32_t);
 		}
 #endif
 		/* Fetch a byte from user space only when the address changes. */
 		if (addr != oaddr) {
 			res = fubyte(addr);
 			if (res == -1)
 				return (EFAULT);
 			oaddr = addr;
 			bits = res;
 		}
 		if ((bits & (1 << (i % NBBY))) != 0)
 			return (EBADF);
 	}
 	return (0);
 }
 
 /*
  * Common implementation of select(2) and pselect(2).
  *
  * nd is the number of descriptors to examine.  fd_in, fd_ou and fd_ex are
  * user-space fd_set pointers, any of which may be NULL.  tvp, if not NULL,
  * points at a kernel copy of the timeout.  abi_nfdbits is the width in
  * bits of one fd_set word in the caller's ABI (NFDBITS for native callers).
  *
  * The ready count is left in td->td_retval[0] by selscan()/selrescan() and
  * the result sets are copied back to user space when no error occurred.
  * Returns 0 or an errno value; ERESTART is turned into EINTR and
  * EWOULDBLOCK (timeout) into 0.
  */
 int
 kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
     fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits)
 {
 	struct filedesc *fdp;
 	/*
 	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
 	 * infds with the new FD_SETSIZE of 1024, and more than enough for
 	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
 	 * of 256.
 	 */
 	fd_mask s_selbits[howmany(2048, NFDBITS)];
 	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
 	struct timeval rtv;
 	sbintime_t asbt, precision, rsbt;
 	u_int nbufbytes, ncpbytes, ncpubytes, nfdbits;
 	int error, lf, ndu;
 
 	if (nd < 0)
 		return (EINVAL);
 	/*
 	 * Clamp nd to the size of the descriptor table, remembering the
 	 * caller's value in ndu for the bad-descriptor check below.
 	 */
 	fdp = td->td_proc->p_fd;
 	ndu = nd;
 	lf = fdp->fd_nfiles;
 	if (nd > lf)
 		nd = lf;
 
 	error = select_check_badfd(fd_in, nd, ndu, abi_nfdbits);
 	if (error != 0)
 		return (error);
 	error = select_check_badfd(fd_ou, nd, ndu, abi_nfdbits);
 	if (error != 0)
 		return (error);
 	error = select_check_badfd(fd_ex, nd, ndu, abi_nfdbits);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Allocate just enough bits for the non-null fd_sets.  Use the
 	 * preallocated auto buffer if possible.  Each non-null set needs an
 	 * input and an output copy, hence 2 * ncpbytes.
 	 */
 	nfdbits = roundup(nd, NFDBITS);
 	ncpbytes = nfdbits / NBBY;
 	ncpubytes = roundup(nd, abi_nfdbits) / NBBY;
 	nbufbytes = 0;
 	if (fd_in != NULL)
 		nbufbytes += 2 * ncpbytes;
 	if (fd_ou != NULL)
 		nbufbytes += 2 * ncpbytes;
 	if (fd_ex != NULL)
 		nbufbytes += 2 * ncpbytes;
 	if (nbufbytes <= sizeof s_selbits)
 		selbits = &s_selbits[0];
 	else
 		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
 
 	/*
 	 * Assign pointers into the bit buffers and fetch the input bits.
 	 * Put the output buffers together so that they can be bzeroed
 	 * together.
 	 */
 	sbp = selbits;
 #define	getbits(name, x) \
 	do {								\
 		if (name == NULL) {					\
 			ibits[x] = NULL;				\
 			obits[x] = NULL;				\
 		} else {						\
 			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
 			obits[x] = sbp;					\
 			sbp += ncpbytes / sizeof *sbp;			\
 			error = copyin(name, ibits[x], ncpubytes);	\
 			if (error != 0)					\
 				goto done;				\
 			if (ncpbytes != ncpubytes)			\
 				bzero((char *)ibits[x] + ncpubytes,	\
 				    ncpbytes - ncpubytes);		\
 		}							\
 	} while (0)
 	getbits(fd_in, 0);
 	getbits(fd_ou, 1);
 	getbits(fd_ex, 2);
 #undef	getbits
 
 #if BYTE_ORDER == BIG_ENDIAN && defined(__LP64__)
 	/*
 	 * XXX: swizzle_fdset assumes that if abi_nfdbits != NFDBITS,
 	 * we are running under 32-bit emulation. This should be more
 	 * generic.
 	 */
 #define swizzle_fdset(bits)						\
 	if (abi_nfdbits != NFDBITS && bits != NULL) {			\
 		int i;							\
 		for (i = 0; i < ncpbytes / sizeof *sbp; i++)		\
 			bits[i] = (bits[i] >> 32) | (bits[i] << 32);	\
 	}
 #else
 #define swizzle_fdset(bits)
 #endif
 
 	/* Make sure the bit order makes it through an ABI transition */
 	swizzle_fdset(ibits[0]);
 	swizzle_fdset(ibits[1]);
 	swizzle_fdset(ibits[2]);
 
 	/* The output sets occupy the first half of the buffer. */
 	if (nbufbytes != 0)
 		bzero(selbits, nbufbytes / 2);
 
 	/*
 	 * Turn the timeout into an absolute deadline for seltdwait():
 	 * 0 means do not sleep at all, -1 means sleep without a timeout.
 	 */
 	precision = 0;
 	if (tvp != NULL) {
 		rtv = *tvp;
 		if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
 		    rtv.tv_usec >= 1000000) {
 			error = EINVAL;
 			goto done;
 		}
 		if (!timevalisset(&rtv))
 			asbt = 0;
 		else if (rtv.tv_sec <= INT32_MAX) {
 			rsbt = tvtosbt(rtv);
 			precision = rsbt;
 			precision >>= tc_precexp;
 			if (TIMESEL(&asbt, rsbt))
 				asbt += tc_tick_sbt;
 			if (asbt <= SBT_MAX - rsbt)
 				asbt += rsbt;
 			else
 				asbt = -1;
 		} else
 			asbt = -1;
 	} else
 		asbt = -1;
 	seltdinit(td);
 	/* Iterate until the timeout expires or descriptors become ready. */
 	for (;;) {
 		error = selscan(td, ibits, obits, nd);
 		if (error || td->td_retval[0] != 0)
 			break;
 		error = seltdwait(td, asbt, precision);
 		if (error)
 			break;
 		error = selrescan(td, ibits, obits);
 		if (error || td->td_retval[0] != 0)
 			break;
 	}
 	seltdclear(td);
 
 done:
 	/* select is not restarted after signals... */
 	if (error == ERESTART)
 		error = EINTR;
 	if (error == EWOULDBLOCK)
 		error = 0;
 
 #define	putbits(name, x) \
 	if (name && (error2 = copyout(obits[x], name, ncpubytes))) \
 		error = error2;
 	if (error == 0) {
 		int error2;
 
 		/*
 		 * Swizzle the bit order back, if necessary.  This is done
 		 * only on the success path: when getbits() bailed out to
 		 * "done" the later obits[] slots were never assigned, and
 		 * the results are not copied out on error anyway.
 		 */
 		swizzle_fdset(obits[0]);
 		swizzle_fdset(obits[1]);
 		swizzle_fdset(obits[2]);
 
 		putbits(fd_in, 0);
 		putbits(fd_ou, 1);
 		putbits(fd_ex, 2);
 	}
 #undef putbits
 #undef swizzle_fdset
 	if (selbits != &s_selbits[0])
 		free(selbits, M_SELECT);
 
 	return (error);
 }
 /*
  * Convert a select bit set to poll flags.
  *
  * Indexed by set number, in the order used by kern_select(): 0 for the
  * read set, 1 for the write set, 2 for the exception set.
  *
  * The backend always returns POLLHUP/POLLERR if appropriate and we
  * return this as a set bit in any set.
  */
 static const int select_flags[3] = {
 	[0] = POLLRDNORM | POLLHUP | POLLERR,
 	[1] = POLLWRNORM | POLLHUP | POLLERR,
 	[2] = POLLRDBAND | POLLERR
 };
 
 /*
  * Work out which fo_poll events to request for one descriptor.
  *
  * idx and bit locate the descriptor in the fd_mask arrays.  For every
  * non-NULL input set that has the bit set, the matching select_flags entry
  * is OR-ed into the result.  Returns 0 if the descriptor is in no set.
  */
 static __inline int
 selflags(fd_mask **ibits, int idx, fd_mask bit)
 {
 	int set, wanted;
 
 	wanted = 0;
 	for (set = 0; set < 3; set++) {
 		if (ibits[set] != NULL && (ibits[set][idx] & bit) != 0)
 			wanted |= select_flags[set];
 	}
 	return (wanted);
 }
 
 /*
  * Record fired events in the output sets.
  *
  * For each set whose select_flags entry intersects events and in which the
  * descriptor (idx, bit) was originally requested, set the output bit.
  * Returns the number of output bits newly set by this call.
  */
 static __inline int
 selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events)
 {
 	int newly, set;
 
 	newly = 0;
 	for (set = 0; set < 3; set++) {
 		if ((events & select_flags[set]) == 0 || ibits[set] == NULL ||
 		    (ibits[set][idx] & bit) == 0)
 			continue;
 		/*
 		 * XXX Skip bits that are already set.  A socket calls
 		 * selrecord() twice for each poll() call, resulting in two
 		 * selfds per real fd, so selrescan() ends up calling
 		 * selsetbits() twice for it.
 		 */
 		if ((obits[set][idx] & bit) == 0) {
 			obits[set][idx] |= bit;
 			newly++;
 		}
 	}
 
 	return (newly);
 }
 
 /*
  * Traverse the list of fds attached to this thread's seltd and check for
  * completion.
  *
  * Every selfd on the queue is released.  Descriptors whose selfd had its
  * sf_si cleared are polled again and their result bits recorded in obits.
  * On success the number of newly set bits is stored in td->td_retval[0]
  * and the seltd flags are reset; a descriptor lookup failure is returned
  * to the caller immediately.
  */
 static int
 selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits)
 {
 	struct filedesc *fdp;
 	struct selinfo *si;
 	struct seltd *stp;
 	struct selfd *sfp;
 	struct selfd *sfn;
 	struct file *fp;
 	fd_mask bit;
 	int fd, ev, n, idx;
 	int error;
 	bool only_user;
 
 	fdp = td->td_proc->p_fd;
 	stp = td->td_sel;
 	n = 0;
 	only_user = FILEDESC_IS_ONLY_USER(fdp);
 	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
 		/* The cookie is the descriptor number stored by selscan(). */
 		fd = (int)(uintptr_t)sfp->sf_cookie;
 		/* Sample sf_si before the selfd is freed. */
 		si = sfp->sf_si;
 		selfdfree(stp, sfp);
 		/* If the selinfo wasn't cleared the event didn't fire. */
 		if (si != NULL)
 			continue;
 		if (only_user)
 			error = fget_only_user(fdp, fd, &cap_event_rights, &fp);
 		else
 			error = fget_unlocked(td, fd, &cap_event_rights, &fp);
 		if (__predict_false(error != 0))
 			return (error);
 		/* Locate the descriptor's word and bit in the fd_mask arrays. */
 		idx = fd / NFDBITS;
 		bit = (fd_mask)1 << (fd % NFDBITS);
 		ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td);
 		/* Release the file the same way it was obtained. */
 		if (only_user)
 			fput_only_user(fdp, fp);
 		else
 			fdrop(fp, td);
 		if (ev != 0)
 			n += selsetbits(ibits, obits, idx, bit, ev);
 	}
 	stp->st_flags = 0;
 	td->td_retval[0] = n;
 	return (0);
 }
 
 /*
  * Perform the initial filedescriptor scan and register ourselves with
  * each selinfo.
  *
  * Descriptors 0 .. nfd - 1 that appear in at least one input set are
  * polled; result bits are recorded in obits.  On success the number of
  * bits set is stored in td->td_retval[0]; a descriptor lookup failure is
  * returned to the caller immediately.
  */
 static int
 selscan(struct thread *td, fd_mask **ibits, fd_mask **obits, int nfd)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 	fd_mask bit;
 	int ev, flags, end, fd;
 	int n, idx;
 	int error;
 	bool only_user;
 
 	fdp = td->td_proc->p_fd;
 	n = 0;
 	only_user = FILEDESC_IS_ONLY_USER(fdp);
 	/* Outer loop: one fd_mask word per iteration. */
 	for (idx = 0, fd = 0; fd < nfd; idx++) {
 		end = imin(fd + NFDBITS, nfd);
 		/* Inner loop: one bit (descriptor) of that word at a time. */
 		for (bit = 1; fd < end; bit <<= 1, fd++) {
 			/* Compute the list of events we're interested in. */
 			flags = selflags(ibits, idx, bit);
 			if (flags == 0)
 				continue;
 			if (only_user)
 				error = fget_only_user(fdp, fd, &cap_event_rights, &fp);
 			else
 				error = fget_unlocked(td, fd, &cap_event_rights, &fp);
 			if (__predict_false(error != 0))
 				return (error);
 			/*
 			 * Preallocate selfds tagged with the descriptor
 			 * number before polling.
 			 */
 			selfdalloc(td, (void *)(uintptr_t)fd);
 			ev = fo_poll(fp, flags, td->td_ucred, td);
 			/* Release the file the same way it was obtained. */
 			if (only_user)
 				fput_only_user(fdp, fp);
 			else
 				fdrop(fp, td);
 			if (ev != 0)
 				n += selsetbits(ibits, obits, idx, bit, ev);
 		}
 	}
 
 	td->td_retval[0] = n;
 	return (0);
 }
 
 /*
  * poll(2) system call entry point.
  *
  * A timeout of INFTIM means wait without a timeout; any other negative
  * value is rejected with EINVAL.  Otherwise the millisecond timeout is
  * converted to a timespec for kern_poll().
  */
 int
 sys_poll(struct thread *td, struct poll_args *uap)
 {
 	struct timespec ts;
 
 	if (uap->timeout == INFTIM)
 		return (kern_poll(td, uap->fds, uap->nfds, NULL, NULL));
 	if (uap->timeout < 0)
 		return (EINVAL);
 
 	ts.tv_sec = uap->timeout / 1000;
 	ts.tv_nsec = (uap->timeout % 1000) * 1000000;
 	return (kern_poll(td, uap->fds, uap->nfds, &ts, NULL));
 }
 
 /*
  * Poll an array of pollfd structures that already lives in the kernel.
  *
  * kfds points to an array in the kernel holding nfds entries; the revents
  * fields are filled in place.  tsp, if not NULL, is a relative timeout
  * (a zero timespec means do not sleep); NULL means wait without a timeout.
  * uset, if not NULL, is installed as the signal mask for the duration of
  * the call.
  *
  * Returns 0 or an errno value; ERESTART is turned into EINTR and
  * EWOULDBLOCK (timeout) into 0.  The ready count is left in
  * td->td_retval[0] by pollscan()/pollrescan().
  */
 int
 kern_poll_kfds(struct thread *td, struct pollfd *kfds, u_int nfds,
     struct timespec *tsp, sigset_t *uset)
 {
 	sbintime_t sbt, precision, tmp;
 	struct timespec ts;
 	int error;
 
 	precision = 0;
 	if (tsp != NULL) {
 		if (!timespecvalid_interval(tsp))
 			return (EINVAL);
 		if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
 			sbt = 0;
 		else {
 			ts = *tsp;
 			/*
 			 * Clamp very long timeouts to INT32_MAX / 2 seconds
 			 * (presumably so that the absolute deadline computed
 			 * below cannot overflow).
 			 */
 			if (ts.tv_sec > INT32_MAX / 2)
 				ts.tv_sec = INT32_MAX / 2;
 			tmp = tstosbt(ts);
 			precision = tmp;
 			precision >>= tc_precexp;
 			if (TIMESEL(&sbt, tmp))
 				sbt += tc_tick_sbt;
 			sbt += tmp;
 		}
 	} else
 		sbt = -1;
 
 	if (uset != NULL) {
 		error = kern_sigprocmask(td, SIG_SETMASK, uset,
 		    &td->td_oldsigmask, 0);
 		if (error)
 			return (error);
 		td->td_pflags |= TDP_OLDMASK;
 		/*
 		 * Make sure that ast() is called on return to
 		 * usermode and TDP_OLDMASK is cleared, restoring old
 		 * sigmask.
 		 */
 		ast_sched(td, TDA_SIGSUSPEND);
 	}
 
 	seltdinit(td);
 	/* Iterate until the timeout expires or descriptors become ready. */
 	for (;;) {
 		error = pollscan(td, kfds, nfds);
 		if (error || td->td_retval[0] != 0)
 			break;
 		error = seltdwait(td, sbt, precision);
 		if (error)
 			break;
 		error = pollrescan(td);
 		if (error || td->td_retval[0] != 0)
 			break;
 	}
 	seltdclear(td);
 
 	/* poll is not restarted after signals... */
 	if (error == ERESTART)
 		error = EINTR;
 	if (error == EWOULDBLOCK)
 		error = 0;
 	return (error);
 }
 
 int
 sys_ppoll(struct thread *td, struct ppoll_args *uap)
 {
 	struct timespec ts, *tsp;
 	sigset_t set, *ssp;
 	int error;
 
 	if (uap->ts != NULL) {
 		error = copyin(uap->ts, &ts, sizeof(ts));
 		if (error)
 			return (error);
 		tsp = &ts;
 	} else
 		tsp = NULL;
 	if (uap->set != NULL) {
 		error = copyin(uap->set, &set, sizeof(set));
 		if (error)
 			return (error);
 		ssp = &set;
 	} else
 		ssp = NULL;
 	return (kern_poll(td, uap->fds, uap->nfds, tsp, ssp));
 }
 
 /*
  * Poll an array of pollfd structures supplied by user space.
  *
  * ufds points to an array in user space holding nfds entries.  The array
  * is copied into the kernel (onto the stack when it fits, otherwise into a
  * temporary allocation), polled with kern_poll_kfds() and the revents
  * fields are copied back out with pollout().
  *
  * Returns EINVAL when kern_poll_maxfds() rejects nfds, otherwise 0 or the
  * first error encountered.
  */
 int
 kern_poll(struct thread *td, struct pollfd *ufds, u_int nfds,
     struct timespec *tsp, sigset_t *set)
 {
 	struct pollfd stackfds[32];
 	struct pollfd *kfds;
 	bool onheap;
 	int error;
 
 	if (kern_poll_maxfds(nfds))
 		return (EINVAL);
 
 	onheap = nfds > nitems(stackfds);
 	if (onheap)
 		kfds = mallocarray(nfds, sizeof(*kfds), M_TEMP, M_WAITOK);
 	else
 		kfds = stackfds;
 
 	error = copyin(ufds, kfds, nfds * sizeof(*kfds));
 	if (error == 0)
 		error = kern_poll_kfds(td, kfds, nfds, tsp, set);
 	if (error == 0)
 		error = pollout(td, kfds, ufds, nfds);
 
 	if (onheap)
 		free(kfds, M_TEMP);
 	return (error);
 }
 
 /*
  * Decide whether nfds is too large a pollfd array for poll(2).
  *
  * This is kinda bogus.  We have fd limits, but that is not really related
  * to the size of the pollfd array.  A process is always allowed at least
  * FD_SETSIZE entries and at least as many as maxfilesperproc, so nfds is
  * rejected (true is returned) only when it exceeds both.  We want to be
  * reasonably safe, but not overly restrictive.
  */
 bool
 kern_poll_maxfds(u_int nfds)
 {
 	bool over_proc_limit, over_setsize;
 
 	over_proc_limit = nfds > maxfilesperproc;
 	over_setsize = nfds > FD_SETSIZE;
 	return (over_proc_limit && over_setsize);
 }
 
 /*
  * Traverse the list of pollfds attached to this thread's seltd and check
  * for completion.
  *
  * Every selfd on the queue is released.  Entries whose selfd had its sf_si
  * cleared are polled again and their revents updated in place; an entry
  * whose descriptor can no longer be resolved is reported as POLLNVAL.
  * The number of entries with non-zero revents found by this pass is stored
  * in td->td_retval[0].  Always returns 0.
  */
 static int
 pollrescan(struct thread *td)
 {
 	struct seltd *stp;
 	struct selfd *sfp;
 	struct selfd *sfn;
 	struct selinfo *si;
 	struct filedesc *fdp;
 	struct file *fp;
 	struct pollfd *fd;
 	int n, error;
 	bool only_user;
 
 	n = 0;
 	fdp = td->td_proc->p_fd;
 	stp = td->td_sel;
 	only_user = FILEDESC_IS_ONLY_USER(fdp);
 	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
 		/* The cookie is the pollfd entry stored by pollscan(). */
 		fd = (struct pollfd *)sfp->sf_cookie;
 		/* Sample sf_si before the selfd is freed. */
 		si = sfp->sf_si;
 		selfdfree(stp, sfp);
 		/* If the selinfo wasn't cleared the event didn't fire. */
 		if (si != NULL)
 			continue;
 		if (only_user)
 			error = fget_only_user(fdp, fd->fd, &cap_event_rights, &fp);
 		else
 			error = fget_unlocked(td, fd->fd, &cap_event_rights, &fp);
 		if (__predict_false(error != 0)) {
 			fd->revents = POLLNVAL;
 			n++;
 			continue;
 		}
 		/*
 		 * Note: backend also returns POLLHUP and
 		 * POLLERR if appropriate.
 		 */
 		fd->revents = fo_poll(fp, fd->events, td->td_ucred, td);
 		if (only_user)
 			fput_only_user(fdp, fp);
 		else
 			fdrop(fp, td);
 		/*
 		 * POSIX requires POLLOUT to be never
 		 * set simultaneously with POLLHUP.  Apply the same
 		 * filtering as the initial scan in pollscan().
 		 */
 		if ((fd->revents & POLLHUP) != 0)
 			fd->revents &= ~POLLOUT;
 		if (fd->revents != 0)
 			n++;
 	}
 	stp->st_flags = 0;
 	td->td_retval[0] = n;
 	return (0);
 }
 
 /*
  * Copy the revents field of each of the nfd kernel pollfd entries in fds
  * to the matching user-space entry in ufds.
  *
  * On success the number of entries with non-zero revents is stored in
  * td->td_retval[0] and 0 is returned.  The first copyout() failure is
  * returned immediately.
  */
 static int
 pollout(struct thread *td, struct pollfd *fds, struct pollfd *ufds, u_int nfd)
 {
 	u_int idx, ready;
 	int error;
 
 	ready = 0;
 	for (idx = 0; idx < nfd; idx++) {
 		error = copyout(&fds[idx].revents, &ufds[idx].revents,
 		    sizeof(ufds[idx].revents));
 		if (error != 0)
 			return (error);
 		if (fds[idx].revents != 0)
 			ready++;
 	}
 	td->td_retval[0] = ready;
 	return (0);
 }
 
 /*
  * Perform the initial scan of the nfd kernel pollfd entries in fds and
  * register ourselves with each selinfo.
  *
  * Entries with a negative fd are skipped with revents cleared; entries
  * whose descriptor cannot be resolved are reported as POLLNVAL.  The
  * number of entries with non-zero revents is stored in td->td_retval[0].
  * Always returns 0.
  */
 static int
 pollscan(struct thread *td, struct pollfd *fds, u_int nfd)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 	u_int i;	/* unsigned to match nfd */
 	int n, error;
 	bool only_user;
 
 	n = 0;
 	fdp = td->td_proc->p_fd;
 	only_user = FILEDESC_IS_ONLY_USER(fdp);
 	for (i = 0; i < nfd; i++, fds++) {
 		if (fds->fd < 0) {
 			fds->revents = 0;
 			continue;
 		}
 		if (only_user)
 			error = fget_only_user(fdp, fds->fd, &cap_event_rights, &fp);
 		else
 			error = fget_unlocked(td, fds->fd, &cap_event_rights, &fp);
 		if (__predict_false(error != 0)) {
 			fds->revents = POLLNVAL;
 			n++;
 			continue;
 		}
 		/*
 		 * Preallocate selfds tagged with this pollfd entry, then
 		 * poll.  Note: backend also returns POLLHUP and
 		 * POLLERR if appropriate.
 		 */
 		selfdalloc(td, fds);
 		fds->revents = fo_poll(fp, fds->events,
 		    td->td_ucred, td);
 		/* Release the file the same way it was obtained. */
 		if (only_user)
 			fput_only_user(fdp, fp);
 		else
 			fdrop(fp, td);
 		/*
 		 * POSIX requires POLLOUT to be never
 		 * set simultaneously with POLLHUP.
 		 */
 		if ((fds->revents & POLLHUP) != 0)
 			fds->revents &= ~POLLOUT;
 
 		if (fds->revents != 0)
 			n++;
 	}
 	td->td_retval[0] = n;
 	return (0);
 }
 
 /*
  * XXX This was created specifically to support netncp and netsmb.  This
  * allows the caller to specify a socket to wait for events on.  It returns
  * 0 if any events matched and an error otherwise.  There is no way to
  * determine which events fired.
  */
 int
 selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td)
 {
 	struct timeval rtv;
 	sbintime_t asbt, precision, rsbt;
 	int error;
 
 	precision = 0;	/* stupid gcc! */
 	if (tvp != NULL) {
 		rtv = *tvp;
 		if (rtv.tv_sec < 0 || rtv.tv_usec < 0 || 
 		    rtv.tv_usec >= 1000000)
 			return (EINVAL);
 		if (!timevalisset(&rtv))
 			asbt = 0;
 		else if (rtv.tv_sec <= INT32_MAX) {
 			rsbt = tvtosbt(rtv);
 			precision = rsbt;
 			precision >>= tc_precexp;
 			if (TIMESEL(&asbt, rsbt))
 				asbt += tc_tick_sbt;
 			if (asbt <= SBT_MAX - rsbt)
 				asbt += rsbt;
 			else
 				asbt = -1;
 		} else
 			asbt = -1;
 	} else
 		asbt = -1;
 	seltdinit(td);
 	/*
 	 * Iterate until the timeout expires or the socket becomes ready.
 	 */
 	for (;;) {
 		selfdalloc(td, NULL);
 		if (sopoll(so, events, NULL, td) != 0) {
 			error = 0;
 			break;
 		}
 		error = seltdwait(td, asbt, precision);
 		if (error)
 			break;
 	}
 	seltdclear(td);
 	/* XXX Duplicates ncp/smb behavior. */
 	if (error == ERESTART)
 		error = 0;
 	return (error);
 }
 
 /*
  * Preallocate two selfds associated with 'cookie'.  Some fo_poll routines
  * have two select sets, one for read and another for write.
  */
 static void
 selfdalloc(struct thread *td, void *cookie)
 {
 	struct seltd *stp;
 
 	stp = td->td_sel;
 	if (stp->st_free1 == NULL)
 		stp->st_free1 = malloc(sizeof(*stp->st_free1), M_SELFD, M_WAITOK|M_ZERO);
 	stp->st_free1->sf_td = stp;
 	stp->st_free1->sf_cookie = cookie;
 	if (stp->st_free2 == NULL)
 		stp->st_free2 = malloc(sizeof(*stp->st_free2), M_SELFD, M_WAITOK|M_ZERO);
 	stp->st_free2->sf_td = stp;
 	stp->st_free2->sf_cookie = cookie;
 }
 
 /*
  * Unlink sfp from the thread's selq and, if it is still attached to a
  * selinfo, from that selinfo's list as well; then free it.
  */
 static void
 selfdfree(struct seltd *stp, struct selfd *sfp)
 {
 	STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link);
 	/*
 	 * Paired with doselwakeup.
 	 *
 	 * sf_si is read with acquire semantics first; the mutex is only
 	 * taken when the selfd still looks attached, and sf_si is checked
 	 * again under the mutex before the list is touched.
 	 */
 	if (atomic_load_acq_ptr((uintptr_t *)&sfp->sf_si) != (uintptr_t)NULL) {
 		mtx_lock(sfp->sf_mtx);
 		if (sfp->sf_si != NULL) {
 			TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads);
 		}
 		mtx_unlock(sfp->sf_mtx);
 	}
 	free(sfp, M_SELFD);
 }
 
 /*
  * Drain the waiters tied to all the selfds belonging to the specified
  * selinfo.
  *
  * doselwakeup() already provides exactly this, so it is simply called
  * here.  Eventually the caller's context should take care to avoid races
  * between threads calling select()/poll() and the file descriptor being
  * detached, but those races are just the same as for selwakeup().
  */
 void
 seldrain(struct selinfo *sip)
 {
 
 	doselwakeup(sip, -1);
 }
 
 /*
  * Record a select request.
  *
  * Attaches one of the selector thread's preallocated selfds to sip so that
  * doselwakeup() on sip can find the thread.  Nothing is recorded while the
  * thread is rescanning (SELTD_RESCAN).  Panics if both preallocated selfds
  * have already been used.
  */
 void
 selrecord(struct thread *selector, struct selinfo *sip)
 {
 	struct selfd *sfp;
 	struct seltd *stp;
 	struct mtx *mtxp;
 
 	stp = selector->td_sel;
 	/*
 	 * Don't record when doing a rescan.
 	 */
 	if (stp->st_flags & SELTD_RESCAN)
 		return;
 	/*
 	 * Grab one of the preallocated descriptors.
 	 */
 	sfp = NULL;
 	if ((sfp = stp->st_free1) != NULL)
 		stp->st_free1 = NULL;
 	else if ((sfp = stp->st_free2) != NULL)
 		stp->st_free2 = NULL;
 	else
 		panic("selrecord: No free selfd on selq");
 	/*
 	 * Use the selinfo's mutex if it already has one, otherwise pick one
 	 * from the select mutex pool.
 	 */
 	mtxp = sip->si_mtx;
 	if (mtxp == NULL)
 		mtxp = mtx_pool_find(mtxpool_select, sip);
 	/*
 	 * Initialize the sfp and queue it in the thread.
 	 */
 	sfp->sf_si = sip;
 	sfp->sf_mtx = mtxp;
 	STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link);
 	/*
 	 * Lock the sip, then check whether it still needs to be
 	 * initialized.
 	 */
 	mtx_lock(mtxp);
 	if (sip->si_mtx == NULL) {
 		sip->si_mtx = mtxp;
 		TAILQ_INIT(&sip->si_tdlist);
 	}
 	/*
 	 * Add this thread to the list of selfds listening on this selinfo.
 	 */
 	TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads);
 	mtx_unlock(sip->si_mtx);
 }
 
 /*
  * Wake up the threads selecting on sip.  No priority is requested
  * (-1 is passed to doselwakeup()).
  */
 void
 selwakeup(struct selinfo *sip)
 {
 
 	doselwakeup(sip, -1);
 }
 
 /*
  * Wake up the threads selecting on sip, passing priority pri on to
  * doselwakeup().
  */
 void
 selwakeuppri(struct selinfo *sip, int pri)
 {
 
 	doselwakeup(sip, pri);
 }
 
 /*
  * Do a wakeup when a selectable event occurs.
  *
  * Every selfd attached to sip is detached, its owning thread's seltd is
  * marked SELTD_PENDING and the thread's condition variable is broadcast
  * with priority pri (callers in this file pass -1 when they have no
  * priority to request).
  */
 static void
 doselwakeup(struct selinfo *sip, int pri)
 {
 	struct selfd *sfp;
 	struct selfd *sfn;
 	struct seltd *stp;
 
 	/* If it's not initialized there can't be any waiters. */
 	if (sip->si_mtx == NULL)
 		return;
 	/*
 	 * Locking the selinfo locks all selfds associated with it.
 	 */
 	mtx_lock(sip->si_mtx);
 	TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) {
 		/*
 		 * Once we remove this sfp from the list and clear the
 		 * sf_si seltdclear will know to ignore this si.
 		 */
 		TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads);
 		stp = sfp->sf_td;
 		/* Flag the event and wake the thread under its own mutex. */
 		mtx_lock(&stp->st_mtx);
 		stp->st_flags |= SELTD_PENDING;
 		cv_broadcastpri(&stp->st_wait, pri);
 		mtx_unlock(&stp->st_mtx);
 		/*
 		 * Paired with selfdfree.
 		 *
 		 * Storing this only after the wakeup provides an invariant that
 		 * stp is not used after selfdfree returns.
 		 */
 		atomic_store_rel_ptr((uintptr_t *)&sfp->sf_si, (uintptr_t)NULL);
 	}
 	mtx_unlock(sip->si_mtx);
 }
 
 /*
  * Prepare the thread's select state for a new select/poll call.
  *
  * The seltd is allocated and initialized the first time it is needed and
  * then kept attached to the thread; on later calls it is only checked to
  * be idle (no flags set, empty selq).
  */
 static void
 seltdinit(struct thread *td)
 {
 	struct seltd *stp;
 
 	stp = td->td_sel;
 	if (stp == NULL) {
 		stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK | M_ZERO);
 		mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF);
 		cv_init(&stp->st_wait, "select");
 		stp->st_flags = 0;
 		STAILQ_INIT(&stp->st_selq);
 		td->td_sel = stp;
 		return;
 	}
 	MPASS(stp->st_flags == 0);
 	MPASS(STAILQ_EMPTY(&stp->st_selq));
 }
 
 /*
  * Sleep until an event is posted to the thread's seltd or the deadline
  * passes.
  *
  * sbt is an absolute deadline; 0 means do not sleep at all and -1 means
  * sleep without a timeout.  Returns 0 immediately when an event is already
  * pending, EWOULDBLOCK when sbt is 0, and otherwise the result of the
  * condition variable wait.
  */
 static int
 seltdwait(struct thread *td, sbintime_t sbt, sbintime_t precision)
 {
 	struct seltd *stp;
 	int error;
 
 	stp = td->td_sel;
 	mtx_lock(&stp->st_mtx);
 	/*
 	 * Any further calls to selrecord will be a rescan.
 	 */
 	stp->st_flags |= SELTD_RESCAN;
 	/*
 	 * An event of interest may have occurred while the seltd was not
 	 * locked, so the pending flag is checked before sleeping.
 	 */
 	if ((stp->st_flags & SELTD_PENDING) != 0)
 		error = 0;
 	else if (sbt == 0)
 		error = EWOULDBLOCK;
 	else if (sbt == -1)
 		error = cv_wait_sig(&stp->st_wait, &stp->st_mtx);
 	else
 		error = cv_timedwait_sig_sbt(&stp->st_wait, &stp->st_mtx,
 		    sbt, precision, C_ABSOLUTE);
 	mtx_unlock(&stp->st_mtx);
 
 	return (error);
 }
 
 /*
  * Tear down the thread's select state, if it has any: free the spare
  * selfds, detach the seltd from the thread, destroy its condition
  * variable and mutex and free it.  The seltd must be idle.
  */
 void
 seltdfini(struct thread *td)
 {
 	struct seltd *stp;
 
 	if ((stp = td->td_sel) == NULL)
 		return;
 	MPASS(stp->st_flags == 0);
 	MPASS(STAILQ_EMPTY(&stp->st_selq));
 
 	if (stp->st_free1 != NULL)
 		free(stp->st_free1, M_SELFD);
 	if (stp->st_free2 != NULL)
 		free(stp->st_free2, M_SELFD);
 
 	td->td_sel = NULL;
 	cv_destroy(&stp->st_wait);
 	mtx_destroy(&stp->st_mtx);
 	free(stp, M_SELECT);
 }
 
 /*
  * Remove the references to the thread from all of the objects we were
  * polling.
  */
 static void
 seltdclear(struct thread *td)
 {
 	struct seltd *stp;
 	struct selfd *sfp;
 	struct selfd *sfn;
 
 	stp = td->td_sel;
 	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn)
 		selfdfree(stp, sfp);
 	stp->st_flags = 0;
 }
 
 /*
  * Boot-time initialization, registered through SYSINIT: create the mutex
  * pool that selrecord() draws selinfo mutexes from.
  */
 static void selectinit(void *);
 SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL);
 static void
 selectinit(void *dummy __unused)
 {
 
 	/* A pool of 128 MTX_DEF mutexes named "select mtxpool". */
 	mtxpool_select = mtx_pool_create("select mtxpool", 128, MTX_DEF);
 }
 
 /*
  * Set up a syscall return value that follows the convention specified for
  * posix_* functions.
  *
  * A positive error is stored in td->td_errno and td->td_retval[0],
  * TDP_NERRNO is set and 0 is returned.  Zero and negative values are
  * returned unchanged.
  */
 int
 kern_posix_error(struct thread *td, int error)
 {
 
 	if (error > 0) {
 		td->td_errno = error;
 		td->td_pflags |= TDP_NERRNO;
 		td->td_retval[0] = error;
 		return (0);
 	}
 	return (error);
 }
 
 /*
  * Three-way comparison of two values: returns 0 if a equals b, 1 if a is
  * less than b and 2 if a is greater than b.
  */
 int
 kcmp_cmp(uintptr_t a, uintptr_t b)
 {
 
 	if (a < b)
 		return (1);
 	return (a == b ? 0 : 2);
 }
 
 /*
  * Resolve pid to a process for kcmp.
  *
  * The calling thread's own process is returned directly; any other pid is
  * looked up with pget() using PGET_CANDEBUG | PGET_NOTWEXIT | PGET_HOLD.
  * Returns 0 with *pp set, or the error from pget().
  */
 static int
 kcmp_pget(struct thread *td, pid_t pid, struct proc **pp)
 {
 	struct proc *self;
 
 	self = td->td_proc;
 	if (pid != self->p_pid)
 		return (pget(pid, PGET_CANDEBUG | PGET_NOTWEXIT | PGET_HOLD,
 		    pp));
 	*pp = self;
 	return (0);
 }
 
 /*
  * Back end of kcmp(2): compare a kernel object of process pid1 with one of
  * process pid2.
  *
  * type selects what is compared; for KCMP_FILE and KCMP_FILEOBJ, idx1 and
  * idx2 are the descriptor numbers in the respective processes.  The
  * comparison result is stored in td->td_retval[0]; it stays -1 when no
  * comparison was made.  Returns 0 or an errno value (EINVAL for an
  * unknown type).
  */
 int
 kern_kcmp(struct thread *td, pid_t pid1, pid_t pid2, int type,
     uintptr_t idx1, uintptr_t idx2)
 {
 	struct proc *p1, *p2;
 	struct file *fp1, *fp2;
 	int error, res;
 
 	res = -1;
 	p1 = p2 = NULL;
 	error = kcmp_pget(td, pid1, &p1);
 	if (error == 0)
 		error = kcmp_pget(td, pid2, &p2);
 	if (error != 0)
 		goto out;
 
 	switch (type) {
 	case KCMP_FILE:
 	case KCMP_FILEOBJ:
 		/*
 		 * Fetch both files; KCMP_FILEOBJ lets the file's own compare
 		 * method decide, KCMP_FILE compares the file pointers.
 		 */
 		error = fget_remote(td, p1, idx1, &fp1);
 		if (error == 0) {
 			error = fget_remote(td, p2, idx2, &fp2);
 			if (error == 0) {
 				if (type == KCMP_FILEOBJ)
 					res = fo_cmp(fp1, fp2, td);
 				else
 					res = kcmp_cmp((uintptr_t)fp1,
 					    (uintptr_t)fp2);
 				fdrop(fp2, td);
 			}
 			fdrop(fp1, td);
 		}
 		break;
 	case KCMP_FILES:
 		/* Compare the descriptor table pointers. */
 		res = kcmp_cmp((uintptr_t)p1->p_fd, (uintptr_t)p2->p_fd);
 		break;
 	case KCMP_SIGHAND:
 		/* Compare the signal action pointers. */
 		res = kcmp_cmp((uintptr_t)p1->p_sigacts,
 		    (uintptr_t)p2->p_sigacts);
 		break;
 	case KCMP_VM:
 		/* Compare the vmspace pointers. */
 		res = kcmp_cmp((uintptr_t)p1->p_vmspace,
 		    (uintptr_t)p2->p_vmspace);
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 
 out:
 	/*
 	 * Only processes obtained through pget() are released; the calling
 	 * process is handed back by kcmp_pget() without going through pget().
 	 */
 	if (p1 != NULL && p1 != td->td_proc)
 		PRELE(p1);
 	if (p2 != NULL && p2 != td->td_proc)
 		PRELE(p2);
 
 	td->td_retval[0] = res;
 	return (error);
 }
 
 /*
  * kcmp(2) system call entry point; unpacks the arguments for kern_kcmp().
  */
 int
 sys_kcmp(struct thread *td, struct kcmp_args *uap)
 {
 
 	return (kern_kcmp(td, uap->pid1, uap->pid2, uap->type, uap->idx1,
 	    uap->idx2));
 }
 
 /*
  * Generic file comparison for kcmp: files of different f_type compare as
  * 3; files of the same type are ordered by their f_data pointers using
  * kcmp_cmp().  td is not used.
  */
 int
 file_kcmp_generic(struct file *fp1, struct file *fp2, struct thread *td)
 {
 
 	if (fp1->f_type == fp2->f_type)
 		return (kcmp_cmp((uintptr_t)fp1->f_data,
 		    (uintptr_t)fp2->f_data));
 	return (3);
 }
diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c
index 45f912a7cc2d..fd6682ef03b0 100644
--- a/sys/kern/vfs_aio.c
+++ b/sys/kern/vfs_aio.c
@@ -1,3164 +1,3164 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 1997 John S. Dyson.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. John S. Dyson's name may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
  * bad that happens because of using this software isn't the responsibility
  * of the author.  This software is distributed AS-IS.
  */
 
 /*
  * This file contains support for the POSIX 1003.1B AIO/LIO facility.
  */
 
 #include <sys/cdefs.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/capsicum.h>
 #include <sys/eventhandler.h>
 #include <sys/sysproto.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/kthread.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/unistd.h>
 #include <sys/posix4.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/syscallsubr.h>
 #include <sys/protosw.h>
 #include <sys/rwlock.h>
 #include <sys/sema.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syscall.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/sx.h>
 #include <sys/taskqueue.h>
 #include <sys/vnode.h>
 #include <sys/conf.h>
 #include <sys/event.h>
 #include <sys/mount.h>
 #include <geom/geom.h>
 
 #include <machine/atomic.h>
 
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vnode_pager.h>
 #include <vm/uma.h>
 #include <sys/aio.h>
 
 /*
  * Counter for allocating reference ids to new jobs.  Wrapped to 1 on
  * overflow. (XXX will be removed soon.)
  */
 static u_long jobrefid;
 
 /*
  * Counter for aio_fsync.
  */
 static uint64_t jobseqno;
 
 #ifndef MAX_AIO_PER_PROC
 #define MAX_AIO_PER_PROC	32
 #endif
 
 #ifndef MAX_AIO_QUEUE_PER_PROC
 #define MAX_AIO_QUEUE_PER_PROC	256
 #endif
 
 #ifndef MAX_AIO_QUEUE
 #define MAX_AIO_QUEUE		1024 /* Bigger than MAX_AIO_QUEUE_PER_PROC */
 #endif
 
 #ifndef MAX_BUF_AIO
 #define MAX_BUF_AIO		16
 #endif
 
 FEATURE(aio, "Asynchronous I/O");
 SYSCTL_DECL(_p1003_1b);
 
 static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list");
 static MALLOC_DEFINE(M_AIO, "aio", "structures for asynchronous I/O");
 
 static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Async IO management");
 
 static int enable_aio_unsafe = 0;
 SYSCTL_INT(_vfs_aio, OID_AUTO, enable_unsafe, CTLFLAG_RW, &enable_aio_unsafe, 0,
     "Permit asynchronous IO on all file types, not just known-safe types");
 
 static unsigned int unsafe_warningcnt = 1;
 SYSCTL_UINT(_vfs_aio, OID_AUTO, unsafe_warningcnt, CTLFLAG_RW,
     &unsafe_warningcnt, 0,
     "Warnings that will be triggered upon failed IO requests on unsafe files");
 
 static int max_aio_procs = MAX_AIO_PROCS;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0,
     "Maximum number of kernel processes to use for handling async IO ");
 
 static int num_aio_procs = 0;
 SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0,
     "Number of presently active kernel processes for async IO");
 
 /*
  * The code will adjust the actual number of AIO processes towards this
  * number when it gets a chance.
  */
 static int target_aio_procs = TARGET_AIO_PROCS;
 SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
     0,
     "Preferred number of ready kernel processes for async IO");
 
 static int max_queue_count = MAX_AIO_QUEUE;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
     "Maximum number of aio requests to queue, globally");
 
 static int num_queue_count = 0;
 SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
     "Number of queued aio requests");
 
 static int num_buf_aio = 0;
 SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
     "Number of aio requests presently handled by the buf subsystem");
 
 static int num_unmapped_aio = 0;
 SYSCTL_INT(_vfs_aio, OID_AUTO, num_unmapped_aio, CTLFLAG_RD, &num_unmapped_aio,
     0,
     "Number of aio requests presently handled by unmapped I/O buffers");
 
 /* Number of async I/O processes in the process of being started */
 /* XXX This should be local to aio_aqueue() */
 static int num_aio_resv_start = 0;
 
 static int aiod_lifetime;
 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
     "Maximum lifetime for idle aiod");
 
 static int max_aio_per_proc = MAX_AIO_PER_PROC;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
     0,
     "Maximum active aio requests per process");
 
 static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
     &max_aio_queue_per_proc, 0,
     "Maximum queued aio requests per process");
 
 static int max_buf_aio = MAX_BUF_AIO;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
     "Maximum buf aio requests per process");
 
 /* 
  * Though redundant with vfs.aio.max_aio_queue_per_proc, POSIX requires
  * sysconf(3) to support AIO_LISTIO_MAX, and we implement that with
  * vfs.aio.aio_listio_max.
  */
 SYSCTL_INT(_p1003_1b, CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max,
     CTLFLAG_RD | CTLFLAG_CAPRD, &max_aio_queue_per_proc,
     0, "Maximum aio requests for a single lio_listio call");
 
 #ifdef COMPAT_FREEBSD6
 /*
  * Control block layout compiled only under COMPAT_FREEBSD6.  It carries a
  * struct osigevent where the current control block carries a sigevent.
  * NOTE(review): presumably the pre-FreeBSD-7 userland aiocb ABI -- confirm.
  */
 typedef struct oaiocb {
 	int	aio_fildes;		/* File descriptor */
 	off_t	aio_offset;		/* File offset for I/O */
 	volatile void *aio_buf;         /* I/O buffer in process space */
 	size_t	aio_nbytes;		/* Number of bytes for I/O */
 	struct	osigevent aio_sigevent;	/* Signal to deliver */
 	int	aio_lio_opcode;		/* LIO opcode */
 	int	aio_reqprio;		/* Request priority -- ignored */
 	struct	__aiocb_private	_aiocb_private;
 } oaiocb_t;
 #endif
 
 /*
  * Below is a key of locks used to protect each member of struct kaiocb,
  * aioliojob and kaioinfo, and any backends.
  *
  * * - need not be protected
  * a - locked by kaioinfo lock
  * b - locked by backend lock, the backend lock can be null in some cases,
  *     for example, BIO belongs to this type, in this case, proc lock is
  *     reused.
  * c - locked by aio_job_mtx, the lock for the generic file I/O backend.
  */
 
 /*
  * If the routine that services an AIO request blocks while running in an
  * AIO kernel process it can starve other I/O requests.  BIO requests
  * queued via aio_qbio() complete asynchronously and do not use AIO kernel
  * processes at all.  Socket I/O requests use a separate pool of
  * kprocs and also force non-blocking I/O.  Other file I/O requests
  * use the generic fo_read/fo_write operations which can block.  The
  * fsync and mlock operations can also block while executing.  Ideally
  * none of these requests would block while executing.
  *
  * Note that the service routines cannot toggle O_NONBLOCK in the file
  * structure directly while handling a request due to races with
  * userland threads.
  */
 
 /* jobflags */
 #define	KAIOCB_QUEUEING		0x01
 #define	KAIOCB_CANCELLED	0x02
 #define	KAIOCB_CANCELLING	0x04
 #define	KAIOCB_CHECKSYNC	0x08
 #define	KAIOCB_CLEARED		0x10
 #define	KAIOCB_FINISHED		0x20
 
 /* ioflags */
 #define	KAIOCB_IO_FOFFSET	0x01
 
 /*
  * AIO process info
  */
 #define AIOP_FREE	0x1			/* proc on free queue */
 
 /*
  * Per-daemon bookkeeping.  One instance is allocated by each aio_daemon()
  * and linked onto aio_freeproc while that daemon is idle (AIOP_FREE set).
  */
 struct aioproc {
 	int	aioprocflags;			/* (c) AIO proc flags */
 	TAILQ_ENTRY(aioproc) list;		/* (c) list of processes */
 	struct	proc *aioproc;			/* (*) the AIO proc */
 };
 
 /*
  * data-structure for lio signal management
  *
  * lioj_finished_count is advanced as member jobs complete and the list is
  * considered done once it equals lioj_count.  Both counters are decremented
  * as jobs are freed, and the structure itself is released when lioj_count
  * reaches zero.
  */
 struct aioliojob {
 	int	lioj_flags;			/* (a) listio flags */
 	int	lioj_count;			/* (a) count of jobs */
 	int	lioj_finished_count;		/* (a) count of finished jobs */
 	struct	sigevent lioj_signal;		/* (a) signal on all I/O done */
 	TAILQ_ENTRY(aioliojob) lioj_list;	/* (a) lio list */
 	struct	knlist klist;			/* (a) list of knotes */
 	ksiginfo_t lioj_ksi;			/* (a) Realtime signal info */
 };
 
 #define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
 #define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
 #define LIOJ_KEVENT_POSTED	0x4	/* kevent triggered */
 
 /*
  * per process aio data structure
  *
  * Allocated by aio_init_aioinfo() and hung off p->p_aioinfo; torn down by
  * aio_proc_rundown().  Note that kaio_active_count is covered by the global
  * aio_job_mtx rather than by kaio_mtx.
  */
 struct kaioinfo {
 	struct	mtx kaio_mtx;		/* the lock to protect this struct */
 	int	kaio_flags;		/* (a) per process kaio flags */
 	int	kaio_active_count;	/* (c) number of currently used AIOs */
 	int	kaio_count;		/* (a) size of AIO queue */
 	int	kaio_buffer_count;	/* (a) number of bio buffers */
 	TAILQ_HEAD(,kaiocb) kaio_all;	/* (a) all AIOs in a process */
 	TAILQ_HEAD(,kaiocb) kaio_done;	/* (a) done queue for process */
 	TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */
 	TAILQ_HEAD(,kaiocb) kaio_jobqueue;	/* (a) job queue for process */
 	TAILQ_HEAD(,kaiocb) kaio_syncqueue;	/* (a) queue for aio_fsync */
 	TAILQ_HEAD(,kaiocb) kaio_syncready;  /* (a) second q for aio_fsync */
 	struct	task kaio_task;		/* (*) task to kick aio processes */
 	struct	task kaio_sync_task;	/* (*) task to schedule fsync jobs */
 };
 
 #define AIO_LOCK(ki)		mtx_lock(&(ki)->kaio_mtx)
 #define AIO_UNLOCK(ki)		mtx_unlock(&(ki)->kaio_mtx)
 #define AIO_LOCK_ASSERT(ki, f)	mtx_assert(&(ki)->kaio_mtx, (f))
 #define AIO_MTX(ki)		(&(ki)->kaio_mtx)
 
 #define KAIO_RUNDOWN	0x1	/* process is being run down */
 #define KAIO_WAKEUP	0x2	/* wakeup process when AIO completes */
 
 /*
  * Operations used to interact with userland aio control blocks.
  * Different ABIs provide their own operations.
  *
  * NOTE(review): member roles below are read off the names and signatures
  * only; the implementations are not visible here -- confirm.
  */
 struct aiocb_ops {
 	/* presumably copies the user control block into the kernel job */
 	int	(*aio_copyin)(struct aiocb *ujob, struct kaiocb *kjob, int ty);
 	/* presumably read the status/error fields back from user memory */
 	long	(*fetch_status)(struct aiocb *ujob);
 	long	(*fetch_error)(struct aiocb *ujob);
 	/* presumably write result fields into the user control block */
 	int	(*store_status)(struct aiocb *ujob, long status);
 	int	(*store_error)(struct aiocb *ujob, long error);
 	int	(*store_kernelinfo)(struct aiocb *ujob, long jobref);
 	int	(*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob);
 };
 
 static TAILQ_HEAD(,aioproc) aio_freeproc;		/* (c) Idle daemons */
 static struct sema aio_newproc_sem;
 static struct mtx aio_job_mtx;
 static TAILQ_HEAD(,kaiocb) aio_jobs;			/* (c) Async job list */
 static struct unrhdr *aiod_unr;
 
 static void	aio_biocleanup(struct bio *bp);
 void		aio_init_aioinfo(struct proc *p);
 static int	aio_onceonly(void);
 static int	aio_free_entry(struct kaiocb *job);
 static void	aio_process_rw(struct kaiocb *job);
 static void	aio_process_sync(struct kaiocb *job);
 static void	aio_process_mlock(struct kaiocb *job);
 static void	aio_schedule_fsync(void *context, int pending);
 static int	aio_newproc(int *);
 int		aio_aqueue(struct thread *td, struct aiocb *ujob,
 		    struct aioliojob *lio, int type, struct aiocb_ops *ops);
 static int	aio_queue_file(struct file *fp, struct kaiocb *job);
 static void	aio_biowakeup(struct bio *bp);
 static void	aio_proc_rundown(void *arg, struct proc *p);
 static void	aio_proc_rundown_exec(void *arg, struct proc *p,
 		    struct image_params *imgp);
 static int	aio_qbio(struct proc *p, struct kaiocb *job);
 static void	aio_daemon(void *param);
 static void	aio_bio_done_notify(struct proc *userp, struct kaiocb *job);
 static bool	aio_clear_cancel_function_locked(struct kaiocb *job);
 static int	aio_kick(struct proc *userp);
 static void	aio_kick_nowait(struct proc *userp);
 static void	aio_kick_helper(void *context, int pending);
 static int	filt_aioattach(struct knote *kn);
 static void	filt_aiodetach(struct knote *kn);
 static int	filt_aio(struct knote *kn, long hint);
 static int	filt_lioattach(struct knote *kn);
 static void	filt_liodetach(struct knote *kn);
 static int	filt_lio(struct knote *kn, long hint);
 
 /*
  * Zones for:
  * 	kaio	Per process async io info
  *	aiocb	async io jobs
  *	aiolio	list io jobs
  */
 static uma_zone_t kaio_zone, aiocb_zone, aiolio_zone;
 
 /*
  * kqueue filters for aio.  aio_filtops is registered for EVFILT_AIO and
  * lio_filtops for EVFILT_LIO in aio_onceonly().
  */
 static struct filterops aio_filtops = {
 	.f_isfd = 0,
 	.f_attach = filt_aioattach,
 	.f_detach = filt_aiodetach,
 	.f_event = filt_aio,
 };
 static struct filterops lio_filtops = {
 	.f_isfd = 0,
 	.f_attach = filt_lioattach,
 	.f_detach = filt_liodetach,
 	.f_event = filt_lio
 };
 
 static eventhandler_tag exit_tag, exec_tag;
 
 TASKQUEUE_DEFINE_THREAD(aiod_kick);
 
 /*
  * Main operations function for use as a kernel module.
  */
 static int
 aio_modload(struct module *module, int cmd, void *arg)
 {
 	int error = 0;
 
 	switch (cmd) {
 	case MOD_LOAD:
 		aio_onceonly();
 		break;
 	case MOD_SHUTDOWN:
 		break;
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 	return (error);
 }
 
 /* Module descriptor: name, event handler, and no private argument. */
 static moduledata_t aio_mod = {
 	"aio",
 	&aio_modload,
 	NULL
 };
 
 DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY);
 MODULE_VERSION(aio, 1);
 
 /*
  * Startup initialization, invoked from the MOD_LOAD handler.
  *
  * Registers the process exit/exec rundown hooks and the EVFILT_AIO and
  * EVFILT_LIO kqueue filters, initializes the global daemon and job lists
  * together with their lock and semaphore, creates the UMA zones, and
  * publishes the POSIX configuration values.  Always returns 0.
  */
 static int
 aio_onceonly(void)
 {
 
 	/* Run down a process's AIO state on both exit and exec. */
 	exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
 	    EVENTHANDLER_PRI_ANY);
 	exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec,
 	    NULL, EVENTHANDLER_PRI_ANY);
 	kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
 	kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
 	TAILQ_INIT(&aio_freeproc);
 	sema_init(&aio_newproc_sem, 0, "aio_new_proc");
 	mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
 	TAILQ_INIT(&aio_jobs);
 	/* Unit numbers used to name the "aiod%d" daemons. */
 	aiod_unr = new_unrhdr(1, INT_MAX, NULL);
 	kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, 0);
 	aiocb_zone = uma_zcreate("AIOCB", sizeof(struct kaiocb), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, 0);
 	aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
 	    NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
 	jobrefid = 1;
 	p31b_setcfg(CTL_P1003_1B_ASYNCHRONOUS_IO, _POSIX_ASYNCHRONOUS_IO);
 	p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE);
 	p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0);
 
 	return (0);
 }
 
 /*
  * Init the per-process aioinfo structure.  The aioinfo limits are set
  * per-process for user limit (resource) management.
  *
  * A fresh kaioinfo is built without holding the process lock and then
  * installed under it.  If p->p_aioinfo was set in the meantime, the new
  * structure is discarded.  Afterwards the AIO daemon pool is topped up to
  * MIN(target_aio_procs, max_aio_procs).
  */
 void
 aio_init_aioinfo(struct proc *p)
 {
 	struct kaioinfo *ki;
 
 	ki = uma_zalloc(kaio_zone, M_WAITOK);
 	mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW);
 	ki->kaio_flags = 0;
 	ki->kaio_active_count = 0;
 	ki->kaio_count = 0;
 	ki->kaio_buffer_count = 0;
 	TAILQ_INIT(&ki->kaio_all);
 	TAILQ_INIT(&ki->kaio_done);
 	TAILQ_INIT(&ki->kaio_jobqueue);
 	TAILQ_INIT(&ki->kaio_liojoblist);
 	TAILQ_INIT(&ki->kaio_syncqueue);
 	TAILQ_INIT(&ki->kaio_syncready);
 	TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
 	TASK_INIT(&ki->kaio_sync_task, 0, aio_schedule_fsync, ki);
 	PROC_LOCK(p);
 	if (p->p_aioinfo == NULL) {
 		p->p_aioinfo = ki;
 		PROC_UNLOCK(p);
 	} else {
 		/* Lost the race: keep the existing structure. */
 		PROC_UNLOCK(p);
 		mtx_destroy(&ki->kaio_mtx);
 		uma_zfree(kaio_zone, ki);
 	}
 
 	/*
 	 * aio_newproc() only advances num_aio_procs when the daemon was
 	 * created, so give up on the first failure instead of spinning
 	 * here for as long as creation keeps failing.
 	 */
 	while (num_aio_procs < MIN(target_aio_procs, max_aio_procs)) {
 		if (aio_newproc(NULL) != 0)
 			break;
 	}
 }
 
 /*
  * Deliver the completion signal described by sigev to process p.
  *
  * Returns the error from sigev_findtd() if no target could be determined.
  * Otherwise the signal is queued unless ksi is already on a queue, the
  * process lock is dropped, and 0 is returned.  ext additionally sets
  * KSI_EXT | KSI_INS on the ksiginfo.
  */
 static int
 aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi, bool ext)
 {
 	struct thread *target;
 	int rv;
 
 	rv = sigev_findtd(p, sigev, &target);
 	if (rv != 0)
 		return (rv);
 
 	if (!KSI_ONQ(ksi)) {
 		ksiginfo_set_sigev(ksi, sigev);
 		ksi->ksi_code = SI_ASYNCIO;
 		if (ext)
 			ksi->ksi_flags |= KSI_EXT | KSI_INS;
 		tdsendsignal(p, target, ksi->ksi_signo, ksi);
 	}
 	PROC_UNLOCK(p);
 	return (0);
 }
 
 /*
  * Free a finished job entry and release the resources it holds.
  *
  * Must be called with the kaioinfo lock held, for a job that belongs to the
  * current process and is already marked KAIOCB_FINISHED (it is removed from
  * the done queue here).  The lock is dropped while the file reference,
  * credential, uio and the job itself are released, and is held again on
  * return.  Always returns 0.
  */
 static int
 aio_free_entry(struct kaiocb *job)
 {
 	struct kaioinfo *ki;
 	struct aioliojob *lj;
 	struct proc *p;
 
 	p = job->userproc;
 	MPASS(curproc == p);
 	ki = p->p_aioinfo;
 	MPASS(ki != NULL);
 
 	AIO_LOCK_ASSERT(ki, MA_OWNED);
 	MPASS(job->jobflags & KAIOCB_FINISHED);
 
 	/* Undo the global and per-process queue accounting. */
 	atomic_subtract_int(&num_queue_count, 1);
 
 	ki->kaio_count--;
 	MPASS(ki->kaio_count >= 0);
 
 	TAILQ_REMOVE(&ki->kaio_done, job, plist);
 	TAILQ_REMOVE(&ki->kaio_all, job, allist);
 
 	lj = job->lio;
 	if (lj) {
 		lj->lioj_count--;
 		lj->lioj_finished_count--;
 
 		/* The last member of a list-I/O batch frees the batch. */
 		if (lj->lioj_count == 0) {
 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
 			/* lio is going away, we need to destroy any knotes */
 			knlist_delete(&lj->klist, curthread, 1);
 			PROC_LOCK(p);
 			sigqueue_take(&lj->lioj_ksi);
 			PROC_UNLOCK(p);
 			uma_zfree(aiolio_zone, lj);
 		}
 	}
 
 	/* job is going away, we need to destroy any knotes */
 	knlist_delete(&job->klist, curthread, 1);
 	PROC_LOCK(p);
 	sigqueue_take(&job->ksi);
 	PROC_UNLOCK(p);
 
 	AIO_UNLOCK(ki);
 
 	/*
 	 * The thread argument here is used to find the owning process
 	 * and is also passed to fo_close() which may pass it to various
 	 * places such as devsw close() routines.  Because of that, we
 	 * need a thread pointer from the process owning the job that is
 	 * persistent and won't disappear out from under us or move to
 	 * another process.
 	 *
 	 * Currently, all the callers of this function call it to remove
 	 * a kaiocb from the current process' job list either via a
 	 * syscall or due to the current process calling exit() or
 	 * execve().  Thus, we know that p == curproc.  We also know that
 	 * curthread can't exit since we are curthread.
 	 *
 	 * Therefore, we use curthread as the thread to pass to
 	 * knlist_delete() above and to fdrop() below.  This does mean that
 	 * it is possible for the thread pointer at close time to differ
 	 * from the thread pointer at open time, but this is already true
 	 * of file descriptors in a multithreaded process.
 	 */
 	if (job->fd_file)
 		fdrop(job->fd_file, curthread);
 	crfree(job->cred);
 	/* The uio is released only when it is not the embedded job->uio. */
 	if (job->uiop != &job->uio)
-		free(job->uiop, M_IOV);
+		freeuio(job->uiop);
 	uma_zfree(aiocb_zone, job);
 	AIO_LOCK(ki);
 
 	return (0);
 }
 
 /*
  * process_exec event handler: run down the process's AIO state exactly as
  * is done on exit.  The image parameters are not needed.
  */
 static void
 aio_proc_rundown_exec(void *arg, struct proc *p,
     struct image_params *imgp __unused)
 {
 	aio_proc_rundown(arg, p);
 }
 
 /*
  * Try to cancel a single job.  Called with the kaioinfo lock held; the lock
  * is dropped while the job's cancel callback runs.
  *
  * Returns non-zero only when, by the time the callback has returned, the job
  * has finished with ECANCELED.  Returns 0 when the job was already
  * cancelled or finished, has no cancel routine installed, or has not
  * completed yet.
  */
 static int
 aio_cancel_job(struct proc *p, struct kaioinfo *ki, struct kaiocb *job)
 {
 	aio_cancel_fn_t *func;
 	int cancelled;
 
 	AIO_LOCK_ASSERT(ki, MA_OWNED);
 	if (job->jobflags & (KAIOCB_CANCELLED | KAIOCB_FINISHED))
 		return (0);
 	MPASS((job->jobflags & KAIOCB_CANCELLING) == 0);
 	job->jobflags |= KAIOCB_CANCELLED;
 
 	func = job->cancel_fn;
 
 	/*
 	 * If there is no cancel routine, just leave the job marked as
 	 * cancelled.  The job should be in active use by a caller who
 	 * should complete it normally or when it fails to install a
 	 * cancel routine.
 	 */
 	if (func == NULL)
 		return (0);
 
 	/*
 	 * Set the CANCELLING flag so that aio_complete() will defer
 	 * completions of this job.  This prevents the job from being
 	 * freed out from under the cancel callback.  After the
 	 * callback any deferred completion (whether from the callback
 	 * or any other source) will be completed.
 	 */
 	job->jobflags |= KAIOCB_CANCELLING;
 	AIO_UNLOCK(ki);
 	func(job);
 	AIO_LOCK(ki);
 	job->jobflags &= ~KAIOCB_CANCELLING;
 	if (job->jobflags & KAIOCB_FINISHED) {
 		/*
 		 * Finish the completion that aio_complete() skipped while
 		 * CANCELLING was set.
 		 */
 		cancelled = job->uaiocb._aiocb_private.error == ECANCELED;
 		TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist);
 		aio_bio_done_notify(p, job);
 	} else {
 		/*
 		 * The cancel callback might have scheduled an
 		 * operation to cancel this request, but it is
 		 * only counted as cancelled if the request is
 		 * cancelled when the callback returns.
 		 */
 		cancelled = 0;
 	}
 	return (cancelled);
 }
 
 /*
  * Rundown the jobs for a given process.
  *
  * Registered as the process_exit handler and also reached from the
  * process_exec handler; p must be the current process.  Pending jobs are
  * cancelled, the function sleeps until nothing is queued or active,
  * completed jobs and list-I/O records are freed, and finally the kaioinfo
  * itself is destroyed.  A process without AIO state returns immediately.
  */
 static void
 aio_proc_rundown(void *arg, struct proc *p)
 {
 	struct kaioinfo *ki;
 	struct aioliojob *lj;
 	struct kaiocb *job, *jobn;
 
 	KASSERT(curthread->td_proc == p,
 	    ("%s: called on non-curproc", __func__));
 	ki = p->p_aioinfo;
 	if (ki == NULL)
 		return;
 
 	AIO_LOCK(ki);
 	/* Suppresses completion notifications in aio_bio_done_notify(). */
 	ki->kaio_flags |= KAIO_RUNDOWN;
 
 restart:
 
 	/*
 	 * Try to cancel all pending requests. This code simulates
 	 * aio_cancel on all pending I/O requests.
 	 */
 	TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) {
 		aio_cancel_job(p, ki, job);
 	}
 
 	/* Wait for all running I/O to be finished */
 	if (TAILQ_FIRST(&ki->kaio_jobqueue) || ki->kaio_active_count != 0) {
 		/*
 		 * Ask for a wakeup on the next completion, but also time
 		 * out after hz ticks and rescan.
 		 */
 		ki->kaio_flags |= KAIO_WAKEUP;
 		msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
 		goto restart;
 	}
 
 	/* Free all completed I/O requests. */
 	while ((job = TAILQ_FIRST(&ki->kaio_done)) != NULL)
 		aio_free_entry(job);
 
 	/* Only list-I/O records with no remaining jobs may be left. */
 	while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
 		if (lj->lioj_count == 0) {
 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
 			knlist_delete(&lj->klist, curthread, 1);
 			PROC_LOCK(p);
 			sigqueue_take(&lj->lioj_ksi);
 			PROC_UNLOCK(p);
 			uma_zfree(aiolio_zone, lj);
 		} else {
 			panic("LIO job not cleaned up: C:%d, FC:%d\n",
 			    lj->lioj_count, lj->lioj_finished_count);
 		}
 	}
 	AIO_UNLOCK(ki);
 	/* Make sure neither deferred task still references ki. */
 	taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_task);
 	taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_sync_task);
 	mtx_destroy(&ki->kaio_mtx);
 	uma_zfree(kaio_zone, ki);
 	p->p_aioinfo = NULL;
 }
 
 /*
  * Select a job to run (called by an AIO daemon).
  *
  * Must be called with aio_job_mtx held.  Walks the global job list and
  * picks the first job whose owning process is below max_aio_per_proc active
  * jobs.  The chosen job is unlinked from the list and counted as active for
  * its process.  Returns NULL when no job is eligible.  aiop is not used.
  */
 static struct kaiocb *
 aio_selectjob(struct aioproc *aiop)
 {
 	struct kaiocb *job;
 	struct kaioinfo *ki;
 	struct proc *userp;
 
 	mtx_assert(&aio_job_mtx, MA_OWNED);
 restart:
 	TAILQ_FOREACH(job, &aio_jobs, list) {
 		userp = job->userproc;
 		ki = userp->p_aioinfo;
 
 		if (ki->kaio_active_count < max_aio_per_proc) {
 			TAILQ_REMOVE(&aio_jobs, job, list);
 			/*
 			 * The cancel function could not be cleared, so the
 			 * job is not ours to run; rescan from the head
 			 * because the list was just modified.
 			 */
 			if (!aio_clear_cancel_function(job))
 				goto restart;
 
 			/* Account for currently active jobs. */
 			ki->kaio_active_count++;
 			break;
 		}
 	}
 	return (job);
 }
 
 /*
  * Move all data to a permanent storage device.  This code
  * simulates the fsync and fdatasync syscalls.
  */
 static int
 aio_fsync_vnode(struct thread *td, struct vnode *vp, int op)
 {
 	struct mount *mp;
 	int error;
 
 	for (;;) {
 		error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH);
 		if (error != 0)
 			break;
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		vnode_pager_clean_async(vp);
 		if (op == LIO_DSYNC)
 			error = VOP_FDATASYNC(vp, td);
 		else
 			error = VOP_FSYNC(vp, MNT_WAIT, td);
 
 		VOP_UNLOCK(vp);
 		vn_finished_write(mp);
 		if (error != ERELOOKUP)
 			break;
 	}
 	return (error);
 }
 
 /*
  * The AIO processing activity for LIO_READ/LIO_WRITE.  This is the code that
  * does the I/O request for the non-bio version of the operations.  The normal
  * vn operations are used, and this code should work in all instances for every
  * type of file, including pipes, sockets, fifos, and regular files.
  *
  * XXX I don't think it works well for socket, pipe, and fifo.
  */
 static void
 aio_process_rw(struct kaiocb *job)
 {
 	struct ucred *td_savedcred;
 	struct thread *td;
 	struct file *fp;
 	ssize_t cnt;
 	long msgsnd_st, msgsnd_end;
 	long msgrcv_st, msgrcv_end;
 	long oublock_st, oublock_end;
 	long inblock_st, inblock_end;
 	int error, opcode;
 
 	KASSERT(job->uaiocb.aio_lio_opcode == LIO_READ ||
 	    job->uaiocb.aio_lio_opcode == LIO_READV ||
 	    job->uaiocb.aio_lio_opcode == LIO_WRITE ||
 	    job->uaiocb.aio_lio_opcode == LIO_WRITEV,
 	    ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
 
 	/*
 	 * Run in the submitter's address space and with the credentials
 	 * saved in the job; the thread's own credentials are restored
 	 * before completion.
 	 */
 	aio_switch_vmspace(job);
 	td = curthread;
 	td_savedcred = td->td_ucred;
 	td->td_ucred = job->cred;
 	job->uiop->uio_td = td;
 	fp = job->fd_file;
 
 	opcode = job->uaiocb.aio_lio_opcode;
 	/* Requested length; turned into the transferred count below. */
 	cnt = job->uiop->uio_resid;
 
 	/* Snapshot the rusage counters so per-job deltas can be recorded. */
 	msgrcv_st = td->td_ru.ru_msgrcv;
 	msgsnd_st = td->td_ru.ru_msgsnd;
 	inblock_st = td->td_ru.ru_inblock;
 	oublock_st = td->td_ru.ru_oublock;
 
 	/*
 	 * aio_aqueue() acquires a reference to the file that is
 	 * released in aio_free_entry().
 	 */
 	if (opcode == LIO_READ || opcode == LIO_READV) {
 		/* A zero-length read succeeds without calling fo_read(). */
 		if (job->uiop->uio_resid == 0)
 			error = 0;
 		else
 			error = fo_read(fp, job->uiop, fp->f_cred,
 			    (job->ioflags & KAIOCB_IO_FOFFSET) != 0 ? 0 :
 			    FOF_OFFSET, td);
 	} else {
 		if (fp->f_type == DTYPE_VNODE)
 			bwillwrite();
 		error = fo_write(fp, job->uiop, fp->f_cred, (job->ioflags &
 		    KAIOCB_IO_FOFFSET) != 0 ? 0 : FOF_OFFSET, td);
 	}
 	msgrcv_end = td->td_ru.ru_msgrcv;
 	msgsnd_end = td->td_ru.ru_msgsnd;
 	inblock_end = td->td_ru.ru_inblock;
 	oublock_end = td->td_ru.ru_oublock;
 
 	job->msgrcv = msgrcv_end - msgrcv_st;
 	job->msgsnd = msgsnd_end - msgsnd_st;
 	job->inblock = inblock_end - inblock_st;
 	job->outblock = oublock_end - oublock_st;
 
 	/*
 	 * An error after a partial transfer: interruptions and would-block
 	 * are reported as success with the bytes moved so far.
 	 */
 	if (error != 0 && job->uiop->uio_resid != cnt) {
 		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
 			error = 0;
 		/*
 		 * NOTE(review): the bit test presumably also matches
 		 * LIO_WRITEV -- confirm against the opcode values in
 		 * sys/aio.h.
 		 */
 		if (error == EPIPE && (opcode & LIO_WRITE)) {
 			PROC_LOCK(job->userproc);
 			kern_psignal(job->userproc, SIGPIPE);
 			PROC_UNLOCK(job->userproc);
 		}
 	}
 
 	/* cnt now holds the number of bytes actually transferred. */
 	cnt -= job->uiop->uio_resid;
 	td->td_ucred = td_savedcred;
 	if (error)
 		aio_complete(job, -1, error);
 	else
 		aio_complete(job, cnt, 0);
 }
 
 static void
 aio_process_sync(struct kaiocb *job)
 {
 	struct thread *td = curthread;
 	struct ucred *td_savedcred = td->td_ucred;
 	struct file *fp = job->fd_file;
 	int error = 0;
 
 	KASSERT(job->uaiocb.aio_lio_opcode & LIO_SYNC,
 	    ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
 
 	td->td_ucred = job->cred;
 	if (fp->f_vnode != NULL) {
 		error = aio_fsync_vnode(td, fp->f_vnode,
 		    job->uaiocb.aio_lio_opcode);
 	}
 	td->td_ucred = td_savedcred;
 	if (error)
 		aio_complete(job, -1, error);
 	else
 		aio_complete(job, 0, 0);
 }
 
 static void
 aio_process_mlock(struct kaiocb *job)
 {
 	struct aiocb *cb = &job->uaiocb;
 	int error;
 
 	KASSERT(job->uaiocb.aio_lio_opcode == LIO_MLOCK,
 	    ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
 
 	aio_switch_vmspace(job);
 	error = kern_mlock(job->userproc, job->cred,
 	    __DEVOLATILE(uintptr_t, cb->aio_buf), cb->aio_nbytes);
 	aio_complete(job, error != 0 ? -1 : 0, error);
 }
 
 /*
  * Post-completion processing for a finished job.  Called with the kaioinfo
  * lock held.
  *
  * Moves the job onto the done queue, updates list-I/O accounting, delivers
  * signal and kevent notifications (skipped while the process is being run
  * down), releases fsync jobs that were waiting on this job, and wakes a
  * sleeper that asked for KAIO_WAKEUP.
  */
 static void
 aio_bio_done_notify(struct proc *userp, struct kaiocb *job)
 {
 	struct aioliojob *lj;
 	struct kaioinfo *ki;
 	struct kaiocb *sjob, *sjobn;
 	int lj_done;
 	bool schedule_fsync;
 
 	ki = userp->p_aioinfo;
 	AIO_LOCK_ASSERT(ki, MA_OWNED);
 	lj = job->lio;
 	lj_done = 0;
 	if (lj) {
 		lj->lioj_finished_count++;
 		/* Last outstanding member of the list-I/O batch. */
 		if (lj->lioj_count == lj->lioj_finished_count)
 			lj_done = 1;
 	}
 	TAILQ_INSERT_TAIL(&ki->kaio_done, job, plist);
 	MPASS(job->jobflags & KAIOCB_FINISHED);
 
 	if (ki->kaio_flags & KAIO_RUNDOWN)
 		goto notification_done;
 
 	if (job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
 	    job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
 		aio_sendsig(userp, &job->uaiocb.aio_sigevent, &job->ksi, true);
 
 	KNOTE_LOCKED(&job->klist, 1);
 
 	if (lj_done) {
 		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
 			lj->lioj_flags |= LIOJ_KEVENT_POSTED;
 			KNOTE_LOCKED(&lj->klist, 1);
 		}
 		/* Post the batch signal once: wanted and not yet posted. */
 		if ((lj->lioj_flags & (LIOJ_SIGNAL | LIOJ_SIGNAL_POSTED))
 		    == LIOJ_SIGNAL &&
 		    (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
 		    lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
 			aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi,
 			    true);
 			lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
 		}
 	}
 
 notification_done:
 	if (job->jobflags & KAIOCB_CHECKSYNC) {
 		schedule_fsync = false;
 		TAILQ_FOREACH_SAFE(sjob, &ki->kaio_syncqueue, list, sjobn) {
 			/*
 			 * Only sync jobs on the same file with a larger
 			 * sequence number than this job are waiting on it.
 			 */
 			if (job->fd_file != sjob->fd_file ||
 			    job->seqno >= sjob->seqno)
 				continue;
 			/* Still has other jobs outstanding. */
 			if (--sjob->pending > 0)
 				continue;
 			TAILQ_REMOVE(&ki->kaio_syncqueue, sjob, list);
 			if (!aio_clear_cancel_function_locked(sjob))
 				continue;
 			TAILQ_INSERT_TAIL(&ki->kaio_syncready, sjob, list);
 			schedule_fsync = true;
 		}
 		if (schedule_fsync)
 			taskqueue_enqueue(taskqueue_aiod_kick,
 			    &ki->kaio_sync_task);
 	}
 	if (ki->kaio_flags & KAIO_WAKEUP) {
 		ki->kaio_flags &= ~KAIO_WAKEUP;
 		wakeup(&userp->p_aioinfo);
 	}
 }
 
 static void
 aio_schedule_fsync(void *context, int pending)
 {
 	struct kaioinfo *ki;
 	struct kaiocb *job;
 
 	ki = context;
 	AIO_LOCK(ki);
 	while (!TAILQ_EMPTY(&ki->kaio_syncready)) {
 		job = TAILQ_FIRST(&ki->kaio_syncready);
 		TAILQ_REMOVE(&ki->kaio_syncready, job, list);
 		AIO_UNLOCK(ki);
 		aio_schedule(job, aio_process_sync);
 		AIO_LOCK(ki);
 	}
 	AIO_UNLOCK(ki);
 }
 
 /*
  * Report whether KAIOCB_CLEARED is set on the job.
  */
 bool
 aio_cancel_cleared(struct kaiocb *job)
 {
 	bool cleared;
 
 	/*
 	 * The caller is expected to hold the same queue lock that was held
 	 * when aio_clear_cancel_function() ran and set this flag, so that
 	 * the value read here is current.  That cannot be asserted.
 	 */
 	cleared = (job->jobflags & KAIOCB_CLEARED) != 0;
 	return (cleared);
 }
 
 /*
  * Remove the job's cancel function.  Called with the kaioinfo lock held and
  * with a cancel function installed.
  *
  * Returns true when the function was removed.  If a cancellation is in
  * progress (KAIOCB_CANCELLING), the function is left in place, the job is
  * flagged KAIOCB_CLEARED, and false is returned.
  */
 static bool
 aio_clear_cancel_function_locked(struct kaiocb *job)
 {
 
 	AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED);
 	MPASS(job->cancel_fn != NULL);
 	if ((job->jobflags & KAIOCB_CANCELLING) == 0) {
 		job->cancel_fn = NULL;
 		return (true);
 	}
 	job->jobflags |= KAIOCB_CLEARED;
 	return (false);
 }
 
 /*
  * Locking wrapper around aio_clear_cancel_function_locked(): takes the
  * owning process's kaioinfo lock for the duration of the call.
  */
 bool
 aio_clear_cancel_function(struct kaiocb *job)
 {
 	struct kaioinfo *ki = job->userproc->p_aioinfo;
 	bool cleared;
 
 	AIO_LOCK(ki);
 	cleared = aio_clear_cancel_function_locked(job);
 	AIO_UNLOCK(ki);
 	return (cleared);
 }
 
 /*
  * Install func as the job's cancel function.  Called with the kaioinfo lock
  * held.  Returns false, leaving the job untouched, if the job has already
  * been marked KAIOCB_CANCELLED; returns true otherwise.
  */
 static bool
 aio_set_cancel_function_locked(struct kaiocb *job, aio_cancel_fn_t *func)
 {
 
 	AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED);
 	if ((job->jobflags & KAIOCB_CANCELLED) == 0) {
 		job->cancel_fn = func;
 		return (true);
 	}
 	return (false);
 }
 
 /*
  * Locking wrapper around aio_set_cancel_function_locked(): takes the owning
  * process's kaioinfo lock for the duration of the call.
  */
 bool
 aio_set_cancel_function(struct kaiocb *job, aio_cancel_fn_t *func)
 {
 	struct kaioinfo *ki = job->userproc->p_aioinfo;
 	bool installed;
 
 	AIO_LOCK(ki);
 	installed = aio_set_cancel_function_locked(job, func);
 	AIO_UNLOCK(ki);
 	return (installed);
 }
 
 /*
  * Record the final status and error of a job and mark it finished.
  *
  * Unless the job is still being queued or a cancel callback is running on
  * it (KAIOCB_QUEUEING / KAIOCB_CANCELLING), it is also taken off the
  * per-process job queue and passed to aio_bio_done_notify().  In those two
  * cases the notification step is left to whoever clears the flag;
  * aio_cancel_job() does so for CANCELLING.
  */
 void
 aio_complete(struct kaiocb *job, long status, int error)
 {
 	struct kaioinfo *ki;
 	struct proc *userp;
 
 	job->uaiocb._aiocb_private.error = error;
 	job->uaiocb._aiocb_private.status = status;
 
 	userp = job->userproc;
 	ki = userp->p_aioinfo;
 
 	AIO_LOCK(ki);
 	KASSERT(!(job->jobflags & KAIOCB_FINISHED),
 	    ("duplicate aio_complete"));
 	job->jobflags |= KAIOCB_FINISHED;
 	if ((job->jobflags & (KAIOCB_QUEUEING | KAIOCB_CANCELLING)) == 0) {
 		TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist);
 		aio_bio_done_notify(userp, job);
 	}
 	AIO_UNLOCK(ki);
 }
 
 /*
  * Complete a job as cancelled: status -1, error ECANCELED.
  */
 void
 aio_cancel(struct kaiocb *job)
 {
 	aio_complete(job, -1, ECANCELED);
 }
 
 void
 aio_switch_vmspace(struct kaiocb *job)
 {
 
 	vmspace_switch_aio(job->userproc->p_vmspace);
 }
 
 /*
  * The AIO daemon, most of the actual work is done in aio_process_*,
  * but the setup (and address space mgmt) is done in this routine.
  *
  * _id is the unit number allocated by aio_newproc(); it is released when
  * the daemon exits.  The main loop runs every selectable job, switches back
  * to the daemon's own vmspace, and then parks on the free list.  A daemon
  * whose sleep times out exits only if the job list is empty, it is still
  * on the free list, and more than target_aio_procs daemons exist.
  */
 static void
 aio_daemon(void *_id)
 {
 	struct kaiocb *job;
 	struct aioproc *aiop;
 	struct kaioinfo *ki;
 	struct proc *p;
 	struct vmspace *myvm;
 	struct thread *td = curthread;
 	int id = (intptr_t)_id;
 
 	/*
 	 * Grab an extra reference on the daemon's vmspace so that it
 	 * doesn't get freed by jobs that switch to a different
 	 * vmspace.
 	 */
 	p = td->td_proc;
 	myvm = vmspace_acquire_ref(p);
 
 	KASSERT(p->p_textvp == NULL, ("kthread has a textvp"));
 
 	/*
 	 * Allocate and ready the aio control info.  There is one aiop structure
 	 * per daemon.
 	 */
 	aiop = malloc(sizeof(*aiop), M_AIO, M_WAITOK);
 	aiop->aioproc = p;
 	aiop->aioprocflags = 0;
 
 	/*
 	 * Wakeup parent process.  (Parent sleeps to keep from blasting away
 	 * and creating too many daemons.)
 	 */
 	sema_post(&aio_newproc_sem);
 
 	mtx_lock(&aio_job_mtx);
 	for (;;) {
 		/*
 		 * Take daemon off of free queue
 		 */
 		if (aiop->aioprocflags & AIOP_FREE) {
 			TAILQ_REMOVE(&aio_freeproc, aiop, list);
 			aiop->aioprocflags &= ~AIOP_FREE;
 		}
 
 		/*
 		 * Check for jobs.
 		 */
 		while ((job = aio_selectjob(aiop)) != NULL) {
 			mtx_unlock(&aio_job_mtx);
 
 			/*
 			 * Fetch ki before running the handler; the job is
 			 * not dereferenced again after handle_fn() returns.
 			 */
 			ki = job->userproc->p_aioinfo;
 			job->handle_fn(job);
 
 			mtx_lock(&aio_job_mtx);
 			/* Decrement the active job count. */
 			ki->kaio_active_count--;
 		}
 
 		/*
 		 * Disconnect from user address space.
 		 */
 		if (p->p_vmspace != myvm) {
 			mtx_unlock(&aio_job_mtx);
 			vmspace_switch_aio(myvm);
 			mtx_lock(&aio_job_mtx);
 			/*
 			 * We have to restart to avoid race, we only sleep if
 			 * no job can be selected.
 			 */
 			continue;
 		}
 
 		mtx_assert(&aio_job_mtx, MA_OWNED);
 
 		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
 		aiop->aioprocflags |= AIOP_FREE;
 
 		/*
 		 * If daemon is inactive for a long time, allow it to exit,
 		 * thereby freeing resources.
 		 */
 		if (msleep(p, &aio_job_mtx, PRIBIO, "aiordy",
 		    aiod_lifetime) == EWOULDBLOCK && TAILQ_EMPTY(&aio_jobs) &&
 		    (aiop->aioprocflags & AIOP_FREE) &&
 		    num_aio_procs > target_aio_procs)
 			break;
 	}
 	/* Exiting: still on the free list and still holding aio_job_mtx. */
 	TAILQ_REMOVE(&aio_freeproc, aiop, list);
 	num_aio_procs--;
 	mtx_unlock(&aio_job_mtx);
 	free(aiop, M_AIO);
 	free_unr(aiod_unr, id);
 	/* Drop the extra reference taken at startup. */
 	vmspace_free(myvm);
 
 	KASSERT(p->p_vmspace == myvm,
 	    ("AIOD: bad vmspace for exiting daemon"));
 	KASSERT(refcount_load(&myvm->vm_refcnt) > 1,
 	    ("AIOD: bad vm refcnt for exiting daemon: %d",
 	    refcount_load(&myvm->vm_refcnt)));
 	kproc_exit(0);
 }
 
 /*
  * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
  * AIO daemon modifies its environment itself.
  */
 static int
 aio_newproc(int *start)
 {
 	int error;
 	struct proc *p;
 	int id;
 
 	id = alloc_unr(aiod_unr);
 	error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p,
 		RFNOWAIT, 0, "aiod%d", id);
 	if (error == 0) {
 		/*
 		 * Wait until daemon is started.
 		 */
 		sema_wait(&aio_newproc_sem);
 		mtx_lock(&aio_job_mtx);
 		num_aio_procs++;
 		if (start != NULL)
 			(*start)--;
 		mtx_unlock(&aio_job_mtx);
 	} else {
 		free_unr(aiod_unr, id);
 	}
 	return (error);
 }
 
 /*
  * Try the high-performance, low-overhead bio method for eligible
  * VCHR devices.  This method doesn't use an aio helper thread, and
  * thus has very low overhead.
  *
  * Assumes that the caller, aio_aqueue(), has incremented the file
  * structure's reference count, preventing its deallocation for the
  * duration of this call.
  *
  * Returns:
  *   -1      the request is not eligible for this path (callers such as
  *           aio_queue_file() treat a negative value as "fall back");
  *   0       all bios were handed to the device's d_strategy routine;
  *   ENXIO   the device's cdevsw could not be referenced;
  *   EAGAIN  the per-process mapped-buffer limit (max_buf_aio) would be
  *           exceeded;
  *   EFAULT  the user buffer pages could not be held.
  */
 static int
 aio_qbio(struct proc *p, struct kaiocb *job)
 {
 	struct aiocb *cb;
 	struct file *fp;
 	struct buf *pbuf;
 	struct vnode *vp;
 	struct cdevsw *csw;
 	struct cdev *dev;
 	struct kaioinfo *ki;
 	struct bio **bios = NULL;
 	off_t offset;
 	int bio_cmd, error, i, iovcnt, opcode, poff, ref;
 	vm_prot_t prot;
 	bool use_unmapped;
 
 	cb = &job->uaiocb;
 	fp = job->fd_file;
 	opcode = cb->aio_lio_opcode;
 
 	/* Only plain and vectored reads/writes on a disk-like VCHR qualify. */
 	if (!(opcode == LIO_WRITE || opcode == LIO_WRITEV ||
 	    opcode == LIO_READ || opcode == LIO_READV))
 		return (-1);
 	if (fp == NULL || fp->f_type != DTYPE_VNODE)
 		return (-1);
 
 	vp = fp->f_vnode;
 	if (vp->v_type != VCHR)
 		return (-1);
 	if (vp->v_bufobj.bo_bsize == 0)
 		return (-1);
 
 	bio_cmd = (opcode & LIO_WRITE) ? BIO_WRITE : BIO_READ;
 	iovcnt = job->uiop->uio_iovcnt;
 	if (iovcnt > max_buf_aio)
 		return (-1);
 	/* Every segment must be block-size aligned in length and <= maxphys. */
 	for (i = 0; i < iovcnt; i++) {
 		if (job->uiop->uio_iov[i].iov_len % vp->v_bufobj.bo_bsize != 0)
 			return (-1);
 		if (job->uiop->uio_iov[i].iov_len > maxphys)
 			return (-1);
 	}
 	offset = cb->aio_offset;
 
 	ref = 0;
 	csw = devvn_refthread(vp, &dev, &ref);
 	if (csw == NULL)
 		return (ENXIO);
 
 	if ((csw->d_flags & D_DISK) == 0) {
 		error = -1;
 		goto unref;
 	}
 	if (job->uiop->uio_resid > dev->si_iosize_max) {
 		error = -1;
 		goto unref;
 	}
 
 	ki = p->p_aioinfo;
 	job->error = 0;
 
 	use_unmapped = (dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed;
 	if (!use_unmapped) {
 		/*
 		 * Mapped I/O consumes one pbuf per iovec; reserve all of
 		 * them against the per-process limit up front.
 		 */
 		AIO_LOCK(ki);
 		if (ki->kaio_buffer_count + iovcnt > max_buf_aio) {
 			AIO_UNLOCK(ki);
 			error = EAGAIN;
 			goto unref;
 		}
 		ki->kaio_buffer_count += iovcnt;
 		AIO_UNLOCK(ki);
 	}
 
 	bios = malloc(sizeof(struct bio *) * iovcnt, M_TEMP, M_WAITOK);
 	/* The job completes when the last of its bios calls aio_biowakeup. */
 	refcount_init(&job->nbio, iovcnt);
 	for (i = 0; i < iovcnt; i++) {
 		struct vm_page** pages;
 		struct bio *bp;
 		void *buf;
 		size_t nbytes;
 		int npages;
 
 		buf = job->uiop->uio_iov[i].iov_base;
 		nbytes = job->uiop->uio_iov[i].iov_len;
 
 		bios[i] = g_alloc_bio();
 		bp = bios[i];
 
 		poff = (vm_offset_t)buf & PAGE_MASK;
 		if (use_unmapped) {
 			pbuf = NULL;
 			pages = malloc(sizeof(vm_page_t) * (atop(round_page(
 			    nbytes)) + 1), M_TEMP, M_WAITOK | M_ZERO);
 		} else {
 			pbuf = uma_zalloc(pbuf_zone, M_WAITOK);
 			BUF_KERNPROC(pbuf);
 			pages = pbuf->b_pages;
 		}
 
 		bp->bio_length = nbytes;
 		bp->bio_bcount = nbytes;
 		bp->bio_done = aio_biowakeup;
 		bp->bio_offset = offset;
 		bp->bio_cmd = bio_cmd;
 		bp->bio_dev = dev;
 		bp->bio_caller1 = job;
 		bp->bio_caller2 = pbuf;
 
 		prot = VM_PROT_READ;
 		if (opcode == LIO_READ || opcode == LIO_READV)
 			prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
 		npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
 		    (vm_offset_t)buf, bp->bio_length, prot, pages,
 		    atop(maxphys) + 1);
 		if (npages < 0) {
 			if (pbuf != NULL) {
 				uma_zfree(pbuf_zone, pbuf);
 				/*
 				 * Bios 0..i-1 hand their reservation back in
 				 * aio_biocleanup() below.  This iovec and the
 				 * ones after it never reach that point, so
 				 * return their share of kaio_buffer_count
 				 * here; otherwise the count leaks upward.
 				 */
 				AIO_LOCK(ki);
 				ki->kaio_buffer_count -= iovcnt - i;
 				AIO_UNLOCK(ki);
 			} else
 				free(pages, M_TEMP);
 			error = EFAULT;
 			g_destroy_bio(bp);
 			/* Unwind only the fully constructed bios. */
 			i--;
 			goto destroy_bios;
 		}
 		if (pbuf != NULL) {
 			pmap_qenter((vm_offset_t)pbuf->b_data, pages, npages);
 			bp->bio_data = pbuf->b_data + poff;
 			pbuf->b_npages = npages;
 			atomic_add_int(&num_buf_aio, 1);
 		} else {
 			bp->bio_ma = pages;
 			bp->bio_ma_n = npages;
 			bp->bio_ma_offset = poff;
 			bp->bio_data = unmapped_buf;
 			bp->bio_flags |= BIO_UNMAPPED;
 			atomic_add_int(&num_unmapped_aio, 1);
 		}
 
 		offset += nbytes;
 	}
 
 	/* Perform transfer. */
 	for (i = 0; i < iovcnt; i++)
 		csw->d_strategy(bios[i]);
 	free(bios, M_TEMP);
 
 	dev_relthread(dev, ref);
 	return (0);
 
 destroy_bios:
 	for (; i >= 0; i--)
 		aio_biocleanup(bios[i]);
 	free(bios, M_TEMP);
 unref:
 	dev_relthread(dev, ref);
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD6
 /*
  * Translate an old-layout sigevent into the current structure.
  *
  * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are supported by AIO
  * with the old sigevent structure; any other notification type yields
  * EINVAL.  Returns 0 on success.
  */
 static int
 convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig)
 {
 
 	nsig->sigev_notify = osig->sigev_notify;
 
 	if (nsig->sigev_notify == SIGEV_NONE)
 		return (0);
 
 	if (nsig->sigev_notify == SIGEV_SIGNAL) {
 		nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
 		return (0);
 	}
 
 	if (nsig->sigev_notify == SIGEV_KEVENT) {
 		nsig->sigev_notify_kqueue =
 		    osig->__sigev_u.__sigev_notify_kqueue;
 		nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr;
 		return (0);
 	}
 
 	return (EINVAL);
 }
 
 /*
  * Copy in a userspace control block that uses the old (struct oaiocb)
  * layout, then rewrite its sigevent into the current form in place.
  * Returns the copyin() error, or the result of convert_old_sigevent().
  */
 static int
 aiocb_copyin_old_sigevent(struct aiocb *ujob, struct kaiocb *kjob,
     int type __unused)
 {
 	struct aiocb *dst;
 	struct oaiocb *legacy;
 	int rc;
 
 	dst = &kjob->uaiocb;
 
 	/* Zero the whole destination, then copy in only sizeof(struct oaiocb). */
 	bzero(dst, sizeof(struct aiocb));
 	rc = copyin(ujob, dst, sizeof(struct oaiocb));
 	if (rc != 0)
 		return (rc);
 
 	/* No need to copyin aio_iov, because it did not exist in FreeBSD 6 */
 	legacy = (struct oaiocb *)dst;
 	rc = convert_old_sigevent(&legacy->aio_sigevent, &dst->aio_sigevent);
 	return (rc);
 }
 #endif
 
 /*
  * Copy in a native-layout control block.  For vectored requests the iovec
  * array is copied in as well and a uio is allocated into kjob->uiop.
  * A type of LIO_NOP means "use the opcode stored in the control block".
  */
 static int
 aiocb_copyin(struct aiocb *ujob, struct kaiocb *kjob, int type)
 {
 	struct aiocb *dst;
 	int rc;
 
 	dst = &kjob->uaiocb;
 	rc = copyin(ujob, dst, sizeof(struct aiocb));
 	if (rc != 0)
 		return (rc);
 
 	if (type == LIO_NOP)
 		type = dst->aio_lio_opcode;
 	if ((type & LIO_VECTORED) == 0)
 		return (0);
 
 	/* malloc a uio and copy in the iovec */
 	rc = copyinuio(__DEVOLATILE(struct iovec*, dst->aio_iov),
 	    dst->aio_iovcnt, &kjob->uiop);
 	return (rc);
 }
 
 /* Read the private status word from the userspace control block. */
 static long
 aiocb_fetch_status(struct aiocb *ujob)
 {
 	long val;
 
 	val = fuword(&ujob->_aiocb_private.status);
 	return (val);
 }
 
 static long
 aiocb_fetch_error(struct aiocb *ujob)
 {
 
 	return (fuword(&ujob->_aiocb_private.error));
 }
 
 /* Write the private status word of the userspace control block. */
 static int
 aiocb_store_status(struct aiocb *ujob, long status)
 {
 	int rc;
 
 	rc = suword(&ujob->_aiocb_private.status, status);
 	return (rc);
 }
 
 static int
 aiocb_store_error(struct aiocb *ujob, long error)
 {
 
 	return (suword(&ujob->_aiocb_private.error, error));
 }
 
 /* Write the kernelinfo (job reference) word of the userspace control block. */
 static int
 aiocb_store_kernelinfo(struct aiocb *ujob, long jobref)
 {
 	int rc;
 
 	rc = suword(&ujob->_aiocb_private.kernelinfo, jobref);
 	return (rc);
 }
 
 /* Store a userspace control block pointer into a userspace pointer slot. */
 static int
 aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
 {
 	long word;
 
 	word = (long)ujob;
 	return (suword(ujobp, word));
 }
 
 /* Accessors for control blocks that use the native struct aiocb layout. */
 static struct aiocb_ops aiocb_ops = {
 	/* Copy the request in from userspace. */
 	.aio_copyin = aiocb_copyin,
 	/* Read back fields of the userspace control block. */
 	.fetch_error = aiocb_fetch_error,
 	.fetch_status = aiocb_fetch_status,
 	/* Update fields of the userspace control block. */
 	.store_error = aiocb_store_error,
 	.store_status = aiocb_store_status,
 	.store_kernelinfo = aiocb_store_kernelinfo,
 	.store_aiocb = aiocb_store_aiocb,
 };
 
 #ifdef COMPAT_FREEBSD6
 /*
  * Accessors for control blocks using the old sigevent layout; only the
  * copyin routine differs from the native table.
  */
 static struct aiocb_ops aiocb_ops_osigevent = {
 	/* Copy in the old layout and convert its sigevent. */
 	.aio_copyin = aiocb_copyin_old_sigevent,
 	/* Read back fields of the userspace control block. */
 	.fetch_error = aiocb_fetch_error,
 	.fetch_status = aiocb_fetch_status,
 	/* Update fields of the userspace control block. */
 	.store_error = aiocb_store_error,
 	.store_status = aiocb_store_status,
 	.store_kernelinfo = aiocb_store_kernelinfo,
 	.store_aiocb = aiocb_store_aiocb,
 };
 #endif
 
 /*
  * Queue a new AIO request.  Choosing either the threaded or direct bio VCHR
  * technique is done in this code.
  *
  * td    submitting thread.
  * ujob  userspace address of the control block.
  * lj    list-I/O job this request belongs to, or NULL.
  * type  opcode to force (LIO_READ, LIO_WRITE, ...); LIO_NOP means the
  *       opcode is taken from the control block itself.
  * ops   accessors used to copy the control block in and to write status
  *       back to userspace.
  *
  * Returns 0 when the request was queued (or was an LIO_NOP), otherwise an
  * errno value which is also stored in the userspace control block.
  */
 int
 aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj,
     int type, struct aiocb_ops *ops)
 {
 	struct proc *p = td->td_proc;
 	struct file *fp = NULL;
 	struct kaiocb *job;
 	struct kaioinfo *ki;
 	struct kevent kev;
 	int opcode;
 	int error;
 	int fd, kqfd;
 	int jid;
 	u_short evflags;
 
 	if (p->p_aioinfo == NULL)
 		aio_init_aioinfo(p);
 
 	ki = p->p_aioinfo;
 
 	/* Pre-set the userspace result fields; return values are ignored. */
 	ops->store_status(ujob, -1);
 	ops->store_error(ujob, 0);
 	ops->store_kernelinfo(ujob, -1);
 
 	/* Enforce the global and per-process queue limits. */
 	if (num_queue_count >= max_queue_count ||
 	    ki->kaio_count >= max_aio_queue_per_proc) {
 		error = EAGAIN;
 		goto err1;
 	}
 
 	job = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
 	knlist_init_mtx(&job->klist, AIO_MTX(ki));
 
 	error = ops->aio_copyin(ujob, job, type);
 	if (error)
 		goto err2;
 
 	if (job->uaiocb.aio_nbytes > IOSIZE_MAX) {
 		error = EINVAL;
 		goto err2;
 	}
 
 	/* Only these four notification types are accepted. */
 	if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
 	    job->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
 	    job->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
 	    job->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
 		error = EINVAL;
 		goto err2;
 	}
 
 	/* Signal-based notification needs a valid signal number. */
 	if ((job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
 	     job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
 		!_SIG_VALID(job->uaiocb.aio_sigevent.sigev_signo)) {
 		error = EINVAL;
 		goto err2;
 	}
 
 	/* Get the opcode. */
 	if (type == LIO_NOP) {
 		/*
 		 * Opcode comes from the control block; LIO_FOFFSET is a
 		 * modifier bit that is stripped and remembered in ioflags.
 		 */
 		switch (job->uaiocb.aio_lio_opcode & ~LIO_FOFFSET) {
 		case LIO_WRITE:
 		case LIO_WRITEV:
 		case LIO_NOP:
 		case LIO_READ:
 		case LIO_READV:
 			opcode = job->uaiocb.aio_lio_opcode & ~LIO_FOFFSET;
 			if ((job->uaiocb.aio_lio_opcode & LIO_FOFFSET) != 0)
 				job->ioflags |= KAIOCB_IO_FOFFSET;
 			break;
 		default:
 			error = EINVAL;
 			goto err2;
 		}
 	} else
 		opcode = job->uaiocb.aio_lio_opcode = type;
 
 	ksiginfo_init(&job->ksi);
 
 	/* Save userspace address of the job info. */
 	job->ujob = ujob;
 
 	/*
 	 * Validate the opcode and fetch the file object for the specified
 	 * file descriptor.
 	 *
 	 * XXXRW: Moved the opcode validation up here so that we don't
 	 * retrieve a file descriptor without knowing what the capability
 	 * should be.
 	 */
 	fd = job->uaiocb.aio_fildes;
 	switch (opcode) {
 	case LIO_WRITE:
 	case LIO_WRITEV:
 		error = fget_write(td, fd, &cap_pwrite_rights, &fp);
 		break;
 	case LIO_READ:
 	case LIO_READV:
 		error = fget_read(td, fd, &cap_pread_rights, &fp);
 		break;
 	case LIO_SYNC:
 	case LIO_DSYNC:
 		error = fget(td, fd, &cap_fsync_rights, &fp);
 		break;
 	case LIO_MLOCK:
 		/* No file is looked up; fp stays NULL and error stays 0. */
 		break;
 	case LIO_NOP:
 		error = fget(td, fd, &cap_no_rights, &fp);
 		break;
 	default:
 		error = EINVAL;
 	}
 	if (error)
 		goto err3;
 
 	if ((opcode & LIO_SYNC) && fp->f_vnode == NULL) {
 		error = EINVAL;
 		goto err3;
 	}
 
 	/* A negative offset is tolerated only for character devices. */
 	if ((opcode == LIO_READ || opcode == LIO_READV ||
 	    opcode == LIO_WRITE || opcode == LIO_WRITEV) &&
 	    job->uaiocb.aio_offset < 0 &&
 	    (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) {
 		error = EINVAL;
 		goto err3;
 	}
 
 	if (fp != NULL && fp->f_ops == &path_fileops) {
 		error = EBADF;
 		goto err3;
 	}
 
 	job->fd_file = fp;
 
 	/* Allocate the job reference id and sequence number. */
 	mtx_lock(&aio_job_mtx);
 	jid = jobrefid++;
 	job->seqno = jobseqno++;
 	mtx_unlock(&aio_job_mtx);
 	error = ops->store_kernelinfo(ujob, jid);
 	if (error) {
 		error = EINVAL;
 		goto err3;
 	}
 	job->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;
 
 	/* LIO_NOP requests are validated but never queued. */
 	if (opcode == LIO_NOP) {
 		fdrop(fp, td);
 		MPASS(job->uiop == &job->uio || job->uiop == NULL);
 		uma_zfree(aiocb_zone, job);
 		return (0);
 	}
 
 	if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
 		goto no_kqueue;
 	/* Register an EVFILT_AIO kevent on the caller's kqueue. */
 	evflags = job->uaiocb.aio_sigevent.sigev_notify_kevent_flags;
 	if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) {
 		error = EINVAL;
 		goto err3;
 	}
 	kqfd = job->uaiocb.aio_sigevent.sigev_notify_kqueue;
 	memset(&kev, 0, sizeof(kev));
 	kev.ident = (uintptr_t)job->ujob;
 	kev.filter = EVFILT_AIO;
 	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags;
 	kev.data = (intptr_t)job;
 	kev.udata = job->uaiocb.aio_sigevent.sigev_value.sival_ptr;
 	error = kqfd_register(kqfd, &kev, td, M_WAITOK);
 	if (error)
 		goto err3;
 
 no_kqueue:
 
 	ops->store_error(ujob, EINPROGRESS);
 	job->uaiocb._aiocb_private.error = EINPROGRESS;
 	job->userproc = p;
 	job->cred = crhold(td->td_ucred);
 	/*
 	 * KAIOCB_QUEUEING makes aio_complete() defer notification until
 	 * the flag is cleared further down.
 	 */
 	job->jobflags = KAIOCB_QUEUEING;
 	job->lio = lj;
 
 	if (opcode & LIO_VECTORED) {
 		/* Use the uio copied in by aio_copyin */
 		MPASS(job->uiop != &job->uio && job->uiop != NULL);
 	} else {
 		/* Setup the inline uio */
 		job->iov[0].iov_base = (void *)(uintptr_t)job->uaiocb.aio_buf;
 		job->iov[0].iov_len = job->uaiocb.aio_nbytes;
 		job->uio.uio_iov = job->iov;
 		job->uio.uio_iovcnt = 1;
 		job->uio.uio_resid = job->uaiocb.aio_nbytes;
 		job->uio.uio_segflg = UIO_USERSPACE;
 		job->uiop = &job->uio;
 	}
 	switch (opcode & (LIO_READ | LIO_WRITE)) {
 	case LIO_READ:
 		job->uiop->uio_rw = UIO_READ;
 		break;
 	case LIO_WRITE:
 		job->uiop->uio_rw = UIO_WRITE;
 		break;
 	}
 	job->uiop->uio_offset = job->uaiocb.aio_offset;
 	job->uiop->uio_td = td;
 
 	/*
 	 * Hand the job off: mlock goes straight to the daemons, files
 	 * without their own fo_aio_queue use aio_queue_file().
 	 */
 	if (opcode == LIO_MLOCK) {
 		aio_schedule(job, aio_process_mlock);
 		error = 0;
 	} else if (fp->f_ops->fo_aio_queue == NULL)
 		error = aio_queue_file(fp, job);
 	else
 		error = fo_aio_queue(fp, job);
 	if (error)
 		goto err4;
 
 	AIO_LOCK(ki);
 	job->jobflags &= ~KAIOCB_QUEUEING;
 	TAILQ_INSERT_TAIL(&ki->kaio_all, job, allist);
 	ki->kaio_count++;
 	if (lj)
 		lj->lioj_count++;
 	atomic_add_int(&num_queue_count, 1);
 	if (job->jobflags & KAIOCB_FINISHED) {
 		/*
 		 * The queue callback completed the request synchronously.
 		 * The bulk of the completion is deferred in that case
 		 * until this point.
 		 */
 		aio_bio_done_notify(p, job);
 	} else
 		TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, job, plist);
 	AIO_UNLOCK(ki);
 	return (0);
 
 	/*
 	 * Error unwinding: each label undoes what was set up after the
 	 * previous one and falls through to the next.
 	 */
 err4:
 	crfree(job->cred);
 err3:
 	if (fp)
 		fdrop(fp, td);
 	knlist_delete(&job->klist, curthread, 0);
 err2:
 	/*
 	 * Only a uio allocated by the copyin op (vectored requests) is
 	 * freed here; the inline uio lives inside the job itself.
 	 */
 	if (job->uiop != &job->uio)
-		free(job->uiop, M_IOV);
+		freeuio(job->uiop);
 	uma_zfree(aiocb_zone, job);
 err1:
 	/* Report the failure through the userspace control block as well. */
 	ops->store_error(ujob, error);
 	return (error);
 }
 
 static void
 aio_cancel_daemon_job(struct kaiocb *job)
 {
 
 	mtx_lock(&aio_job_mtx);
 	if (!aio_cancel_cleared(job))
 		TAILQ_REMOVE(&aio_jobs, job, list);
 	mtx_unlock(&aio_job_mtx);
 	aio_cancel(job);
 }
 
 /*
  * Put a job on the global daemon queue with func as its handler and kick
  * a daemon.  If the cancel routine cannot be installed, the job is
  * completed as cancelled instead.
  */
 void
 aio_schedule(struct kaiocb *job, aio_handle_fn_t *func)
 {
 
 	mtx_lock(&aio_job_mtx);
 	if (aio_set_cancel_function(job, aio_cancel_daemon_job)) {
 		job->handle_fn = func;
 		TAILQ_INSERT_TAIL(&aio_jobs, job, list);
 		aio_kick_nowait(job->userproc);
 		mtx_unlock(&aio_job_mtx);
 		return;
 	}
 	mtx_unlock(&aio_job_mtx);
 	aio_cancel(job);
 }
 
 static void
 aio_cancel_sync(struct kaiocb *job)
 {
 	struct kaioinfo *ki;
 
 	ki = job->userproc->p_aioinfo;
 	AIO_LOCK(ki);
 	if (!aio_cancel_cleared(job))
 		TAILQ_REMOVE(&ki->kaio_syncqueue, job, list);
 	AIO_UNLOCK(ki);
 	aio_cancel(job);
 }
 
 /*
  * Default queueing routine for file-backed requests.
  *
  * The bio fast path (aio_qbio()) is tried first; any non-negative result
  * from it is final.  Otherwise the request goes to the daemons, which is
  * refused with EOPNOTSUPP unless the file is a regular file or directory
  * with no mount or on a MNT_LOCAL mount, or enable_aio_unsafe is set.
  * Sync requests wait on the per-process sync queue while earlier non-sync
  * jobs on the same file are still queued.
  *
  * Returns 0, EOPNOTSUPP, EINVAL (unsupported opcode) or the aio_qbio()
  * error.
  */
 int
 aio_queue_file(struct file *fp, struct kaiocb *job)
 {
 	struct kaioinfo *ki;
 	struct kaiocb *other;
 	struct vnode *vp;
 	struct mount *mp;
 	int error;
 	bool local_fs;
 
 	ki = job->userproc->p_aioinfo;
 
 	error = aio_qbio(job->userproc, job);
 	if (error >= 0)
 		return (error);
 
 	local_fs = false;
 	if (fp->f_type == DTYPE_VNODE) {
 		vp = fp->f_vnode;
 		if (vp->v_type == VREG || vp->v_type == VDIR) {
 			mp = fp->f_vnode->v_mount;
 			if (mp == NULL || (mp->mnt_flag & MNT_LOCAL) != 0)
 				local_fs = true;
 		}
 	}
 	if (!local_fs && !enable_aio_unsafe) {
 		counted_warning(&unsafe_warningcnt,
 		    "is attempting to use unsafe AIO requests");
 		return (EOPNOTSUPP);
 	}
 
 	/* Reads and writes go straight to the daemon queue. */
 	if ((job->uaiocb.aio_lio_opcode & (LIO_WRITE | LIO_READ)) != 0) {
 		aio_schedule(job, aio_process_rw);
 		return (0);
 	}
 	if ((job->uaiocb.aio_lio_opcode & LIO_SYNC) == 0)
 		return (EINVAL);
 
 	/* Count earlier non-sync jobs on the same file and tag them. */
 	AIO_LOCK(ki);
 	TAILQ_FOREACH(other, &ki->kaio_jobqueue, plist) {
 		if (other->fd_file != job->fd_file)
 			continue;
 		if ((other->uaiocb.aio_lio_opcode & LIO_SYNC) != 0)
 			continue;
 		if (other->seqno >= job->seqno)
 			continue;
 		other->jobflags |= KAIOCB_CHECKSYNC;
 		job->pending++;
 	}
 	if (job->pending == 0) {
 		/* Nothing to wait for: schedule the sync right away. */
 		AIO_UNLOCK(ki);
 		aio_schedule(job, aio_process_sync);
 		return (0);
 	}
 	if (!aio_set_cancel_function_locked(job, aio_cancel_sync)) {
 		AIO_UNLOCK(ki);
 		aio_cancel(job);
 		return (0);
 	}
 	TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, job, list);
 	AIO_UNLOCK(ki);
 	return (0);
 }
 
 /*
  * Wake an idle daemon if there is one; otherwise, when the daemon limits
  * allow another one, enqueue the process's kick task on
  * taskqueue_aiod_kick.  Called with aio_job_mtx held.
  */
 static void
 aio_kick_nowait(struct proc *userp)
 {
 	struct kaioinfo *ki = userp->p_aioinfo;
 	struct aioproc *idle;
 
 	mtx_assert(&aio_job_mtx, MA_OWNED);
 
 	idle = TAILQ_FIRST(&aio_freeproc);
 	if (idle != NULL) {
 		TAILQ_REMOVE(&aio_freeproc, idle, list);
 		idle->aioprocflags &= ~AIOP_FREE;
 		wakeup(idle->aioproc);
 		return;
 	}
 
 	if (num_aio_resv_start + num_aio_procs >= max_aio_procs)
 		return;
 	if (ki->kaio_active_count + num_aio_resv_start >= max_aio_per_proc)
 		return;
 	taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_task);
 }
 
 /*
  * Wake an idle daemon, or create a new one when the limits allow it.
  * Called with aio_job_mtx held; the mutex is dropped around
  * aio_newproc().  If creation fails the whole decision is retried.
  *
  * Returns 0 when a daemon was woken or created, -1 when neither was
  * possible.
  */
 static int
 aio_kick(struct proc *userp)
 {
 	struct kaioinfo *ki = userp->p_aioinfo;
 	struct aioproc *idle;
 	int error;
 
 	mtx_assert(&aio_job_mtx, MA_OWNED);
 	for (;;) {
 		idle = TAILQ_FIRST(&aio_freeproc);
 		if (idle != NULL) {
 			TAILQ_REMOVE(&aio_freeproc, idle, list);
 			idle->aioprocflags &= ~AIOP_FREE;
 			wakeup(idle->aioproc);
 			return (0);
 		}
 
 		if (num_aio_resv_start + num_aio_procs >= max_aio_procs ||
 		    ki->kaio_active_count + num_aio_resv_start >=
 		    max_aio_per_proc)
 			return (-1);
 
 		/* Reserve a slot while the mutex is dropped. */
 		num_aio_resv_start++;
 		mtx_unlock(&aio_job_mtx);
 		error = aio_newproc(&num_aio_resv_start);
 		mtx_lock(&aio_job_mtx);
 		if (error == 0)
 			return (0);
 
 		/* Creation failed: give the reservation back and retry. */
 		num_aio_resv_start--;
 	}
 }
 
 /*
  * Task handler: call aio_kick() once per pending count, stopping early
  * as soon as it reports failure.  context is the target process.
  */
 static void
 aio_kick_helper(void *context, int pending)
 {
 	struct proc *userp = context;
 
 	mtx_lock(&aio_job_mtx);
 	for (; pending > 0; pending--) {
 		if (aio_kick(userp) != 0)
 			break;
 	}
 	mtx_unlock(&aio_job_mtx);
 }
 
 /*
  * Support the aio_return system call, as a side-effect, kernel resources are
  * released.
  */
 static int
 kern_aio_return(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops)
 {
 	struct proc *p = td->td_proc;
 	struct kaiocb *job;
 	struct kaioinfo *ki;
 	long status, error;
 
 	ki = p->p_aioinfo;
 	if (ki == NULL)
 		return (EINVAL);
 	AIO_LOCK(ki);
 	TAILQ_FOREACH(job, &ki->kaio_done, plist) {
 		if (job->ujob == ujob)
 			break;
 	}
 	if (job != NULL) {
 		MPASS(job->jobflags & KAIOCB_FINISHED);
 		status = job->uaiocb._aiocb_private.status;
 		error = job->uaiocb._aiocb_private.error;
 		td->td_retval[0] = status;
 		td->td_ru.ru_oublock += job->outblock;
 		td->td_ru.ru_inblock += job->inblock;
 		td->td_ru.ru_msgsnd += job->msgsnd;
 		td->td_ru.ru_msgrcv += job->msgrcv;
 		aio_free_entry(job);
 		AIO_UNLOCK(ki);
 		ops->store_error(ujob, error);
 		ops->store_status(ujob, status);
 	} else {
 		error = EINVAL;
 		AIO_UNLOCK(ki);
 	}
 	return (error);
 }
 
 int
 sys_aio_return(struct thread *td, struct aio_return_args *uap)
 {
 
 	return (kern_aio_return(td, uap->aiocbp, &aiocb_ops));
 }
 
 /*
  * Allow a process to wakeup when any of the I/O requests are completed.
  */
 static int
 kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist,
     struct timespec *ts)
 {
 	struct proc *p = td->td_proc;
 	struct timeval atv;
 	struct kaioinfo *ki;
 	struct kaiocb *firstjob, *job;
 	int error, i, timo;
 
 	timo = 0;
 	if (ts) {
 		if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
 			return (EINVAL);
 
 		TIMESPEC_TO_TIMEVAL(&atv, ts);
 		if (itimerfix(&atv))
 			return (EINVAL);
 		timo = tvtohz(&atv);
 	}
 
 	ki = p->p_aioinfo;
 	if (ki == NULL)
 		return (EAGAIN);
 
 	if (njoblist == 0)
 		return (0);
 
 	AIO_LOCK(ki);
 	for (;;) {
 		firstjob = NULL;
 		error = 0;
 		TAILQ_FOREACH(job, &ki->kaio_all, allist) {
 			for (i = 0; i < njoblist; i++) {
 				if (job->ujob == ujoblist[i]) {
 					if (firstjob == NULL)
 						firstjob = job;
 					if (job->jobflags & KAIOCB_FINISHED)
 						goto RETURN;
 				}
 			}
 		}
 		/* All tasks were finished. */
 		if (firstjob == NULL)
 			break;
 
 		ki->kaio_flags |= KAIO_WAKEUP;
 		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
 		    "aiospn", timo);
 		if (error == ERESTART)
 			error = EINTR;
 		if (error)
 			break;
 	}
 RETURN:
 	AIO_UNLOCK(ki);
 	return (error);
 }
 
 int
 sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap)
 {
 	struct timespec ts, *tsp;
 	struct aiocb **ujoblist;
 	int error;
 
 	if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc)
 		return (EINVAL);
 
 	if (uap->timeout) {
 		/* Get timespec struct. */
 		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
 			return (error);
 		tsp = &ts;
 	} else
 		tsp = NULL;
 
 	ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIO, M_WAITOK);
 	error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0]));
 	if (error == 0)
 		error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
 	free(ujoblist, M_AIO);
 	return (error);
 }
 
 /*
  * aio_cancel cancels any non-bio aio operations not currently in progress.
  *
  * The result is reported through td_retval[0]: AIO_CANCELED,
  * AIO_NOTCANCELED or AIO_ALLDONE.  Requests on disk vnodes are always
  * reported as AIO_NOTCANCELED.  Returns the fget() error or 0.
  */
 int
 sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct kaioinfo *ki;
 	struct kaiocb *job, *next;
 	struct file *fp;
 	int error, ncancelled, nrefused, result;
 
 	ncancelled = 0;
 	nrefused = 0;
 
 	/* Lookup file object. */
 	error = fget(td, uap->fd, &cap_no_rights, &fp);
 	if (error != 0)
 		return (error);
 
 	ki = p->p_aioinfo;
 	if (ki != NULL) {
 		if (fp->f_type == DTYPE_VNODE && vn_isdisk(fp->f_vnode)) {
 			fdrop(fp, td);
 			td->td_retval[0] = AIO_NOTCANCELED;
 			return (0);
 		}
 
 		AIO_LOCK(ki);
 		TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, next) {
 			if (uap->fd != job->uaiocb.aio_fildes)
 				continue;
 			if (uap->aiocbp != NULL && uap->aiocbp != job->ujob)
 				continue;
 
 			if (aio_cancel_job(p, ki, job))
 				ncancelled++;
 			else
 				nrefused++;
 
 			/* A specific request was named: stop after it. */
 			if (uap->aiocbp != NULL)
 				break;
 		}
 		AIO_UNLOCK(ki);
 	}
 
 	fdrop(fp, td);
 
 	if (uap->aiocbp != NULL && ncancelled != 0)
 		result = AIO_CANCELED;
 	else if (nrefused != 0)
 		result = AIO_NOTCANCELED;
 	else if (ncancelled != 0)
 		result = AIO_CANCELED;
 	else
 		result = AIO_ALLDONE;
 	td->td_retval[0] = result;
 
 	return (0);
 }
 
 /*
  * aio_error is implemented in the kernel level for compatibility purposes
  * only.  For a user mode async implementation, it would be best to do it in
  * a userland subroutine.
  *
  * The answer is delivered in td_retval[0]; the function itself always
  * returns 0.
  */
 static int
 kern_aio_error(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops)
 {
 	struct proc *p = td->td_proc;
 	struct kaioinfo *ki;
 	struct kaiocb *job;
 	int status;
 
 	ki = p->p_aioinfo;
 	if (ki == NULL) {
 		td->td_retval[0] = EINVAL;
 		return (0);
 	}
 
 	AIO_LOCK(ki);
 	TAILQ_FOREACH(job, &ki->kaio_all, allist) {
 		if (job->ujob == ujob)
 			break;
 	}
 	if (job != NULL) {
 		if ((job->jobflags & KAIOCB_FINISHED) != 0)
 			td->td_retval[0] = job->uaiocb._aiocb_private.error;
 		else
 			td->td_retval[0] = EINPROGRESS;
 		AIO_UNLOCK(ki);
 		return (0);
 	}
 	AIO_UNLOCK(ki);
 
 	/*
 	 * Hack for failure of aio_aqueue: the request is unknown to the
 	 * kernel, so fall back to the error left in the user's control
 	 * block when its status is still -1.
 	 */
 	status = ops->fetch_status(ujob);
 	if (status == -1)
 		td->td_retval[0] = ops->fetch_error(ujob);
 	else
 		td->td_retval[0] = EINVAL;
 	return (0);
 }
 
 int
 sys_aio_error(struct thread *td, struct aio_error_args *uap)
 {
 
 	return (kern_aio_error(td, uap->aiocbp, &aiocb_ops));
 }
 
 /* syscall - asynchronous read from a file (REALTIME) */
 #ifdef COMPAT_FREEBSD6
 /* Compat entry point: queue an LIO_READ request with old-sigevent accessors. */
 int
 freebsd6_aio_read(struct thread *td, struct freebsd6_aio_read_args *uap)
 {
 	struct aiocb *ujob;
 
 	ujob = (struct aiocb *)uap->aiocbp;
 	return (aio_aqueue(td, ujob, NULL, LIO_READ, &aiocb_ops_osigevent));
 }
 #endif
 
 int
 sys_aio_read(struct thread *td, struct aio_read_args *uap)
 {
 
 	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops));
 }
 
 int
 sys_aio_readv(struct thread *td, struct aio_readv_args *uap)
 {
 
 	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READV, &aiocb_ops));
 }
 
 /* syscall - asynchronous write to a file (REALTIME) */
 #ifdef COMPAT_FREEBSD6
 /* Compat entry point: queue an LIO_WRITE request with old-sigevent accessors. */
 int
 freebsd6_aio_write(struct thread *td, struct freebsd6_aio_write_args *uap)
 {
 	struct aiocb *ujob;
 
 	ujob = (struct aiocb *)uap->aiocbp;
 	return (aio_aqueue(td, ujob, NULL, LIO_WRITE, &aiocb_ops_osigevent));
 }
 #endif
 
 int
 sys_aio_write(struct thread *td, struct aio_write_args *uap)
 {
 
 	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops));
 }
 
 int
 sys_aio_writev(struct thread *td, struct aio_writev_args *uap)
 {
 
 	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITEV, &aiocb_ops));
 }
 
 int
 sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap)
 {
 
 	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops));
 }
 
 /*
  * Back end of lio_listio: submit a batch of requests as one list-I/O job.
  *
  * td         submitting thread.
  * mode       LIO_WAIT (sleep until every queued request has finished) or
  *            LIO_NOWAIT (return after queueing; notify through sig).
  * uacb_list  userspace address of the request array; used only as the
  *            kevent identifier.
  * acb_list   kernel copy of the nent userspace control block pointers;
  *            NULL entries are skipped.
  * nent       number of entries in acb_list.
  * sig        optional completion notification, honoured only for
  *            LIO_NOWAIT.
  * ops        control block accessors passed on to aio_aqueue().
  *
  * Returns EINVAL for a bad mode, nent or sigevent, or the
  * kqfd_register() error.  After submission: EIO if any request failed
  * with an error other than EAGAIN, else EAGAIN if any request failed with
  * EAGAIN, else the result of the wait (0 or EINTR).
  */
 static int
 kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list,
     struct aiocb **acb_list, int nent, struct sigevent *sig,
     struct aiocb_ops *ops)
 {
 	struct proc *p = td->td_proc;
 	struct aiocb *job;
 	struct kaioinfo *ki;
 	struct aioliojob *lj;
 	struct kevent kev;
 	int error;
 	int nagain, nerror;
 	int i;
 
 	if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT))
 		return (EINVAL);
 
 	if (nent < 0 || nent > max_aio_queue_per_proc)
 		return (EINVAL);
 
 	if (p->p_aioinfo == NULL)
 		aio_init_aioinfo(p);
 
 	ki = p->p_aioinfo;
 
 	/* Set up the list-I/O job that ties the batch together. */
 	lj = uma_zalloc(aiolio_zone, M_WAITOK);
 	lj->lioj_flags = 0;
 	lj->lioj_count = 0;
 	lj->lioj_finished_count = 0;
 	lj->lioj_signal.sigev_notify = SIGEV_NONE;
 	knlist_init_mtx(&lj->klist, AIO_MTX(ki));
 	ksiginfo_init(&lj->lioj_ksi);
 
 	/*
 	 * Setup signal.
 	 */
 	if (sig && (mode == LIO_NOWAIT)) {
 		bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal));
 		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
 			/* Assume only new style KEVENT */
 			memset(&kev, 0, sizeof(kev));
 			kev.filter = EVFILT_LIO;
 			kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
 			kev.ident = (uintptr_t)uacb_list; /* something unique */
 			kev.data = (intptr_t)lj;
 			/* pass user defined sigval data */
 			kev.udata = lj->lioj_signal.sigev_value.sival_ptr;
 			error = kqfd_register(
 			    lj->lioj_signal.sigev_notify_kqueue, &kev, td,
 			    M_WAITOK);
 			if (error) {
 				uma_zfree(aiolio_zone, lj);
 				return (error);
 			}
 		} else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) {
 			;
 		} else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
 			   lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) {
 				if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
 					uma_zfree(aiolio_zone, lj);
 					return EINVAL;
 				}
 				lj->lioj_flags |= LIOJ_SIGNAL;
 		} else {
 			/* Unsupported notification type. */
 			uma_zfree(aiolio_zone, lj);
 			return EINVAL;
 		}
 	}
 
 	AIO_LOCK(ki);
 	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
 	/*
 	 * Add extra aiocb count to avoid the lio to be freed
 	 * by other threads doing aio_waitcomplete or aio_return,
 	 * and prevent event from being sent until we have queued
 	 * all tasks.
 	 */
 	lj->lioj_count = 1;
 	AIO_UNLOCK(ki);
 
 	/*
 	 * Get pointers to the list of I/O requests.
 	 *
 	 * Each non-NULL entry is queued with type LIO_NOP so that
 	 * aio_aqueue() takes the opcode from the control block.  Failures
 	 * are only counted here and turned into a return value at the end.
 	 */
 	nagain = 0;
 	nerror = 0;
 	for (i = 0; i < nent; i++) {
 		job = acb_list[i];
 		if (job != NULL) {
 			error = aio_aqueue(td, job, lj, LIO_NOP, ops);
 			if (error == EAGAIN)
 				nagain++;
 			else if (error != 0)
 				nerror++;
 		}
 	}
 
 	error = 0;
 	AIO_LOCK(ki);
 	if (mode == LIO_WAIT) {
 		/*
 		 * Sleep until every queued request has finished; the "- 1"
 		 * discounts the extra reference taken above.
 		 */
 		while (lj->lioj_count - 1 != lj->lioj_finished_count) {
 			ki->kaio_flags |= KAIO_WAKEUP;
 			error = msleep(&p->p_aioinfo, AIO_MTX(ki),
 			    PRIBIO | PCATCH, "aiospn", 0);
 			if (error == ERESTART)
 				error = EINTR;
 			if (error)
 				break;
 		}
 	} else {
 		/*
 		 * Everything already finished while queueing: deliver the
 		 * requested notification from here.
 		 */
 		if (lj->lioj_count - 1 == lj->lioj_finished_count) {
 			if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
 				lj->lioj_flags |= LIOJ_KEVENT_POSTED;
 				KNOTE_LOCKED(&lj->klist, 1);
 			}
 			if ((lj->lioj_flags & (LIOJ_SIGNAL |
 			    LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL &&
 			    (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
 			    lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
 				aio_sendsig(p, &lj->lioj_signal, &lj->lioj_ksi,
 				    lj->lioj_count != 1);
 				lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
 			}
 		}
 	}
 	/* Drop the extra reference; free the lio job if it was the last. */
 	lj->lioj_count--;
 	if (lj->lioj_count == 0) {
 		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
 		knlist_delete(&lj->klist, curthread, 1);
 		PROC_LOCK(p);
 		sigqueue_take(&lj->lioj_ksi);
 		PROC_UNLOCK(p);
 		AIO_UNLOCK(ki);
 		uma_zfree(aiolio_zone, lj);
 	} else
 		AIO_UNLOCK(ki);
 
 	/* Submission failures take precedence over the wait result. */
 	if (nerror)
 		return (EIO);
 	else if (nagain)
 		return (EAGAIN);
 	else
 		return (error);
 }
 
 /* syscall - list directed I/O (REALTIME) */
 #ifdef COMPAT_FREEBSD6
 int
 freebsd6_lio_listio(struct thread *td, struct freebsd6_lio_listio_args *uap)
 {
 	struct aiocb **acb_list;
 	struct sigevent *sigp, sig;
 	struct osigevent osig;
 	int error, nent;
 
 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
 		return (EINVAL);
 
 	nent = uap->nent;
 	if (nent < 0 || nent > max_aio_queue_per_proc)
 		return (EINVAL);
 
 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
 		error = copyin(uap->sig, &osig, sizeof(osig));
 		if (error)
 			return (error);
 		error = convert_old_sigevent(&osig, &sig);
 		if (error)
 			return (error);
 		sigp = &sig;
 	} else
 		sigp = NULL;
 
 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
 	error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
 	if (error == 0)
 		error = kern_lio_listio(td, uap->mode,
 		    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
 		    &aiocb_ops_osigevent);
 	free(acb_list, M_LIO);
 	return (error);
 }
 #endif
 
 /* syscall - list directed I/O (REALTIME) */
 int
 sys_lio_listio(struct thread *td, struct lio_listio_args *uap)
 {
 	struct aiocb **acb_list;
 	struct sigevent *sigp, sig;
 	int error, nent;
 
 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
 		return (EINVAL);
 
 	nent = uap->nent;
 	if (nent < 0 || nent > max_aio_queue_per_proc)
 		return (EINVAL);
 
 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
 		error = copyin(uap->sig, &sig, sizeof(sig));
 		if (error)
 			return (error);
 		sigp = &sig;
 	} else
 		sigp = NULL;
 
 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
 	error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
 	if (error == 0)
 		error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list,
 		    nent, sigp, &aiocb_ops);
 	free(acb_list, M_LIO);
 	return (error);
 }
 
 /*
  * Release everything attached to one bio built by aio_qbio(): unhold the
  * user pages, free the pbuf (mapped case) or the page array (unmapped
  * case), adjust the matching counters and destroy the bio.
  */
 static void
 aio_biocleanup(struct bio *bp)
 {
 	struct kaiocb *job;
 	struct kaioinfo *ki;
 	struct buf *pbuf;
 
 	job = (struct kaiocb *)bp->bio_caller1;
 	pbuf = (struct buf *)bp->bio_caller2;
 
 	if (pbuf == NULL) {
 		/* Unmapped I/O: only the held pages and their array. */
 		MPASS(bp->bio_ma_n <= atop(maxphys) + 1);
 		vm_page_unhold_pages(bp->bio_ma, bp->bio_ma_n);
 		free(bp->bio_ma, M_TEMP);
 		atomic_subtract_int(&num_unmapped_aio, 1);
 	} else {
 		/* Release mapping into kernel space. */
 		MPASS(pbuf->b_npages <= atop(maxphys) + 1);
 		pmap_qremove((vm_offset_t)pbuf->b_data, pbuf->b_npages);
 		vm_page_unhold_pages(pbuf->b_pages, pbuf->b_npages);
 		uma_zfree(pbuf_zone, pbuf);
 		atomic_subtract_int(&num_buf_aio, 1);
 
 		/* Give back this bio's slot in the per-process buffer count. */
 		ki = job->userproc->p_aioinfo;
 		AIO_LOCK(ki);
 		ki->kaio_buffer_count--;
 		AIO_UNLOCK(ki);
 	}
 	g_destroy_bio(bp);
 }
 
 /*
  * Per-bio accounting for an AIO job that may be split across several bios.
  * Adds this bio's transferred byte and block counts to the job, records its
  * error if it had one, and completes the job when the last outstanding bio
  * reference (job->nbio) is released.
  */
 static void
 aio_biowakeup(struct bio *bp)
 {
 	struct kaiocb *job = (struct kaiocb *)bp->bio_caller1;
 	size_t nbytes;
 	/*
 	 * Everything needed from the bio is copied into locals here, because
 	 * aio_biocleanup() below destroys bp.
 	 */
 	long bcount = bp->bio_bcount;
 	long resid = bp->bio_resid;
 	int opcode, nblks;
 	int bio_error = bp->bio_error;
 	uint16_t flags = bp->bio_flags;
 
 	opcode = job->uaiocb.aio_lio_opcode;
 
 	aio_biocleanup(bp);
 
 	/* Bytes actually moved by this bio: requested minus residual. */
 	nbytes = bcount - resid;
 	atomic_add_acq_long(&job->nbytes, nbytes);
 	nblks = btodb(nbytes);
 
 	/*
 	 * If multiple bios experienced an error, the job will reflect the
 	 * error of whichever failed bio completed last.
 	 */
 	if (flags & BIO_ERROR)
 		atomic_store_int(&job->error, bio_error);
 	if (opcode & LIO_WRITE)
 		atomic_add_int(&job->outblock, nblks);
 	else
 		atomic_add_int(&job->inblock, nblks);
 
 	/*
 	 * Only the caller that drops the final bio reference completes the
 	 * job: with -1 and the recorded error if any bio failed, otherwise
 	 * with the accumulated byte count.
 	 */
 	if (refcount_release(&job->nbio)) {
 		bio_error = atomic_load_int(&job->error);
 		if (bio_error != 0)
 			aio_complete(job, -1, bio_error);
 		else
 			aio_complete(job, atomic_load_long(&job->nbytes), 0);
 	}
 }
 
 /*
  * Backend for aio_waitcomplete: wait for the next completion of an aio
  * request.
  *
  * td    - calling thread; td_retval[0] receives the job's status and the
  *         job's block/message counters are added to td_ru.
  * ujobp - user location that receives the completed job's user aiocb
  *         pointer; it is set to NULL up front.
  * ts    - NULL, an all-zero timespec (poll, never sleep), or a timeout.
  * ops   - accessors used for every store into user memory.
  *
  * Returns EINVAL for a bad timespec, EWOULDBLOCK when polling finds no
  * finished job, the msleep() error if the wait fails, and otherwise the
  * error value recorded in the reaped job.
  */
 static int
 kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp,
     struct timespec *ts, struct aiocb_ops *ops)
 {
 	struct proc *p = td->td_proc;
 	struct timeval atv;
 	struct kaioinfo *ki;
 	struct kaiocb *job;
 	struct aiocb *ujob;
 	long error, status;
 	int timo;
 
 	ops->store_aiocb(ujobp, NULL);
 
 	/*
 	 * timo encoding: 0 when no timespec was given (handed to msleep()
 	 * as-is; presumably "no timeout" -- confirm against msleep(9)),
 	 * -1 for an all-zero timespec (the loop below never sleeps), else
 	 * the timeout converted to ticks.
 	 */
 	if (ts == NULL) {
 		timo = 0;
 	} else if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
 		timo = -1;
 	} else {
 		if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000))
 			return (EINVAL);
 
 		TIMESPEC_TO_TIMEVAL(&atv, ts);
 		if (itimerfix(&atv))
 			return (EINVAL);
 		timo = tvtohz(&atv);
 	}
 
 	if (p->p_aioinfo == NULL)
 		aio_init_aioinfo(p);
 	ki = p->p_aioinfo;
 
 	error = 0;
 	job = NULL;
 	AIO_LOCK(ki);
 	/* Sleep until the done queue is non-empty or the wait fails. */
 	while ((job = TAILQ_FIRST(&ki->kaio_done)) == NULL) {
 		if (timo == -1) {
 			error = EWOULDBLOCK;
 			break;
 		}
 		ki->kaio_flags |= KAIO_WAKEUP;
 		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
 		    "aiowc", timo);
 		/* With a timeout in effect, ERESTART is reported as EINTR. */
 		if (timo && error == ERESTART)
 			error = EINTR;
 		if (error)
 			break;
 	}
 
 	if (job != NULL) {
 		MPASS(job->jobflags & KAIOCB_FINISHED);
 		/*
 		 * Copy out of the job everything that is needed later:
 		 * aio_free_entry() releases it while the lock is still held.
 		 */
 		ujob = job->ujob;
 		status = job->uaiocb._aiocb_private.status;
 		error = job->uaiocb._aiocb_private.error;
 		td->td_retval[0] = status;
 		td->td_ru.ru_oublock += job->outblock;
 		td->td_ru.ru_inblock += job->inblock;
 		td->td_ru.ru_msgsnd += job->msgsnd;
 		td->td_ru.ru_msgrcv += job->msgrcv;
 		aio_free_entry(job);
 		AIO_UNLOCK(ki);
 		/* User-memory stores are done after dropping the AIO lock. */
 		ops->store_aiocb(ujobp, ujob);
 		ops->store_error(ujob, error);
 		ops->store_status(ujob, status);
 	} else
 		AIO_UNLOCK(ki);
 
 	return (error);
 }
 
 int
 sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
 {
 	struct timespec ts, *tsp;
 	int error;
 
 	if (uap->timeout) {
 		/* Get timespec struct. */
 		error = copyin(uap->timeout, &ts, sizeof(ts));
 		if (error)
 			return (error);
 		tsp = &ts;
 	} else
 		tsp = NULL;
 
 	return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops));
 }
 
 /*
  * Queue an asynchronous sync request.  op must be O_SYNC or O_DSYNC and is
  * translated to the matching LIO_* opcode; anything else yields EINVAL.
  */
 static int
 kern_aio_fsync(struct thread *td, int op, struct aiocb *ujob,
     struct aiocb_ops *ops)
 {
 	int opcode;
 
 	if (op == O_SYNC)
 		opcode = LIO_SYNC;
 	else if (op == O_DSYNC)
 		opcode = LIO_DSYNC;
 	else
 		return (EINVAL);
 
 	return (aio_aqueue(td, ujob, NULL, opcode, ops));
 }
 
 /* Native aio_fsync() entry point; uses the native aiocb accessors. */
 int
 sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap)
 {
 	int rc;
 
 	rc = kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops);
 	return (rc);
 }
 
 /* kqueue attach function */
 static int
 filt_aioattach(struct knote *kn)
 {
 	struct kaiocb *kjob;
 
 	/*
 	 * kn_sdata is trusted to be a job pointer only when the request
 	 * carries EV_FLAG1, which the user cannot set; registration is
 	 * therefore restricted to the kernel.
 	 */
 	if ((kn->kn_flags & EV_FLAG1) == 0)
 		return (EPERM);
 
 	kjob = (struct kaiocb *)(uintptr_t)kn->kn_sdata;
 	kn->kn_ptr.p_aio = kjob;
 	kn->kn_flags &= ~EV_FLAG1;
 
 	/* Hook the knote onto the job's own knote list. */
 	knlist_add(&kjob->klist, kn, 0);
 
 	return (0);
 }
 
 /* kqueue detach function */
 static void
 filt_aiodetach(struct knote *kn)
 {
 	struct knlist *list = &kn->kn_ptr.p_aio->klist;
 
 	/* Take the list's own lock so the emptiness test and removal pair up. */
 	list->kl_lock(list->kl_lockarg);
 	if (!knlist_empty(list))
 		knlist_remove(list, kn, 1);
 	list->kl_unlock(list->kl_lockarg);
 }
 
 /* kqueue filter function */
 /*ARGSUSED*/
 static int
 filt_aio(struct knote *kn, long hint)
 {
 	struct kaiocb *job = kn->kn_ptr.p_aio;
 
 	kn->kn_data = job->uaiocb._aiocb_private.error;
 	if (!(job->jobflags & KAIOCB_FINISHED))
 		return (0);
 	kn->kn_flags |= EV_EOF;
 	return (1);
 }
 
 /* kqueue attach function */
 static int
 filt_lioattach(struct knote *kn)
 {
 	struct aioliojob *liojob;
 
 	/*
 	 * kn_sdata is trusted to be an aioliojob pointer only when the
 	 * request carries EV_FLAG1, which the user cannot set; registration
 	 * is therefore restricted to the kernel.
 	 */
 	if ((kn->kn_flags & EV_FLAG1) == 0)
 		return (EPERM);
 
 	liojob = (struct aioliojob *)(uintptr_t)kn->kn_sdata;
 	kn->kn_ptr.p_lio = liojob;
 	kn->kn_flags &= ~EV_FLAG1;
 
 	/* Hook the knote onto the list job's own knote list. */
 	knlist_add(&liojob->klist, kn, 0);
 
 	return (0);
 }
 
 /* kqueue detach function */
 static void
 filt_liodetach(struct knote *kn)
 {
 	struct knlist *list = &kn->kn_ptr.p_lio->klist;
 
 	/* Take the list's own lock so the emptiness test and removal pair up. */
 	list->kl_lock(list->kl_lockarg);
 	if (!knlist_empty(list))
 		knlist_remove(list, kn, 1);
 	list->kl_unlock(list->kl_lockarg);
 }
 
 /* kqueue filter function */
 /*ARGSUSED*/
 static int
 filt_lio(struct knote *kn, long hint)
 {
 	struct aioliojob *liojob;
 	int posted;
 
 	/* Non-zero once LIOJ_KEVENT_POSTED is set in the list job's flags. */
 	liojob = kn->kn_ptr.p_lio;
 	posted = liojob->lioj_flags & LIOJ_KEVENT_POSTED;
 	return (posted);
 }
 
 #ifdef COMPAT_FREEBSD32
 #include <sys/mount.h>
 #include <sys/socket.h>
 #include <sys/sysent.h>
 #include <compat/freebsd32/freebsd32.h>
 #include <compat/freebsd32/freebsd32_proto.h>
 #include <compat/freebsd32/freebsd32_signal.h>
 #include <compat/freebsd32/freebsd32_syscall.h>
 #include <compat/freebsd32/freebsd32_util.h>
 
 /*
  * 32-bit image of the private area of a user aiocb.  The status and error
  * words are the ones the aiocb32_fetch_*() and aiocb32_store_*() helpers
  * access with fuword32()/suword32(); kernelinfo is written by
  * aiocb32_store_kernelinfo().
  */
 struct __aiocb_private32 {
 	int32_t	status;
 	int32_t	error;
 	uint32_t kernelinfo;
 };
 
 #ifdef COMPAT_FREEBSD6
 /*
  * Control block layout read from user space by
  * aiocb32_copyin_old_sigevent().  Unlike struct aiocb32 it carries an
  * old-style struct osigevent32 ahead of aio_lio_opcode.  User pointers are
  * held as uint32_t.
  * NOTE(review): aio_offset is __packed, presumably so the 64-bit field
  * keeps the alignment the 32-bit ABI gives it -- confirm.
  */
 typedef struct oaiocb32 {
 	int	aio_fildes;		/* File descriptor */
 	uint64_t aio_offset __packed;	/* File offset for I/O */
 	uint32_t aio_buf;		/* I/O buffer in process space */
 	uint32_t aio_nbytes;		/* Number of bytes for I/O */
 	struct	osigevent32 aio_sigevent; /* Signal to deliver */
 	int	aio_lio_opcode;		/* LIO opcode */
 	int	aio_reqprio;		/* Request priority -- ignored */
 	struct	__aiocb_private32 _aiocb_private;
 } oaiocb32_t;
 #endif
 
 /*
  * Control block layout read from user space by aiocb32_copyin() and
  * addressed by the aiocb32_fetch_*()/aiocb32_store_*() helpers.  User
  * pointers are held as uint32_t and converted with PTRIN()/PTRIN_CP().
  * NOTE(review): aio_offset is __packed, presumably so the 64-bit field
  * keeps the alignment the 32-bit ABI gives it -- confirm.
  */
 typedef struct aiocb32 {
 	int32_t	aio_fildes;		/* File descriptor */
 	uint64_t aio_offset __packed;	/* File offset for I/O */
 	uint32_t aio_buf;	/* I/O buffer in process space */
 	uint32_t aio_nbytes;	/* Number of bytes for I/O */
 	int	__spare__[2];
 	uint32_t __spare2__;
 	int	aio_lio_opcode;		/* LIO opcode */
 	int	aio_reqprio;		/* Request priority -- ignored */
 	struct	__aiocb_private32 _aiocb_private;
 	struct	sigevent32 aio_sigevent;	/* Signal to deliver */
 } aiocb32_t;
 
 #ifdef COMPAT_FREEBSD6
 /*
  * Translate a 32-bit old-style sigevent into a native struct sigevent.
  * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are supported by AIO
  * with the old sigevent structure; any other notification type yields
  * EINVAL.
  */
 static int
 convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig)
 {
 
 	CP(*osig, *nsig, sigev_notify);
 
 	if (nsig->sigev_notify == SIGEV_NONE)
 		return (0);
 
 	if (nsig->sigev_notify == SIGEV_SIGNAL) {
 		nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
 		return (0);
 	}
 
 	if (nsig->sigev_notify == SIGEV_KEVENT) {
 		nsig->sigev_notify_kqueue =
 		    osig->__sigev_u.__sigev_notify_kqueue;
 		PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr);
 		return (0);
 	}
 
 	return (EINVAL);
 }
 
 /*
  * aio_copyin method for the 32-bit FreeBSD 6 control block: zero the
  * kernel copy, read a struct oaiocb32 from user space and convert it
  * field by field, finishing with the old-style sigevent.
  */
 static int
 aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct kaiocb *kjob,
     int type __unused)
 {
 	struct oaiocb32 ucb32;
 	struct aiocb *kcb;
 	int rc;
 
 	kcb = &kjob->uaiocb;
 	bzero(kcb, sizeof(struct aiocb));
 	rc = copyin(ujob, &ucb32, sizeof(ucb32));
 	if (rc != 0)
 		return (rc);
 
 	/* No need to copyin aio_iov, because it did not exist in FreeBSD 6 */
 
 	CP(ucb32, *kcb, aio_fildes);
 	CP(ucb32, *kcb, aio_offset);
 	PTRIN_CP(ucb32, *kcb, aio_buf);
 	CP(ucb32, *kcb, aio_nbytes);
 	CP(ucb32, *kcb, aio_lio_opcode);
 	CP(ucb32, *kcb, aio_reqprio);
 	CP(ucb32, *kcb, _aiocb_private.status);
 	CP(ucb32, *kcb, _aiocb_private.error);
 	PTRIN_CP(ucb32, *kcb, _aiocb_private.kernelinfo);
 
 	rc = convert_old_sigevent32(&ucb32.aio_sigevent, &kcb->aio_sigevent);
 	return (rc);
 }
 #endif
 
 static int
 aiocb32_copyin(struct aiocb *ujob, struct kaiocb *kjob, int type)
 {
 	struct aiocb32 job32;
 	struct aiocb *kcb = &kjob->uaiocb;
 	struct iovec32 *iov32;
 	int error;
 
 	error = copyin(ujob, &job32, sizeof(job32));
 	if (error)
 		return (error);
 	CP(job32, *kcb, aio_fildes);
 	CP(job32, *kcb, aio_offset);
 	CP(job32, *kcb, aio_lio_opcode);
 	if (type == LIO_NOP)
 		type = kcb->aio_lio_opcode;
 	if (type & LIO_VECTORED) {
 		iov32 = PTRIN(job32.aio_iov);
 		CP(job32, *kcb, aio_iovcnt);
 		/* malloc a uio and copy in the iovec */
 		error = freebsd32_copyinuio(iov32,
 		    kcb->aio_iovcnt, &kjob->uiop);
 		if (error)
 			return (error);
 	} else {
 		PTRIN_CP(job32, *kcb, aio_buf);
 		CP(job32, *kcb, aio_nbytes);
 	}
 	CP(job32, *kcb, aio_reqprio);
 	CP(job32, *kcb, _aiocb_private.status);
 	CP(job32, *kcb, _aiocb_private.error);
 	PTRIN_CP(job32, *kcb, _aiocb_private.kernelinfo);
 	error = convert_sigevent32(&job32.aio_sigevent, &kcb->aio_sigevent);
 
 	return (error);
 }
 
 /* Read the status word of a 32-bit user aiocb. */
 static long
 aiocb32_fetch_status(struct aiocb *ujob)
 {
 	struct aiocb32 *ucb32 = (struct aiocb32 *)ujob;
 
 	return (fuword32(&ucb32->_aiocb_private.status));
 }
 
 static long
 aiocb32_fetch_error(struct aiocb *ujob)
 {
 	struct aiocb32 *ujob32;
 
 	ujob32 = (struct aiocb32 *)ujob;
 	return (fuword32(&ujob32->_aiocb_private.error));
 }
 
 /* Write the status word of a 32-bit user aiocb. */
 static int
 aiocb32_store_status(struct aiocb *ujob, long status)
 {
 	struct aiocb32 *ucb32 = (struct aiocb32 *)ujob;
 
 	return (suword32(&ucb32->_aiocb_private.status, status));
 }
 
 static int
 aiocb32_store_error(struct aiocb *ujob, long error)
 {
 	struct aiocb32 *ujob32;
 
 	ujob32 = (struct aiocb32 *)ujob;
 	return (suword32(&ujob32->_aiocb_private.error, error));
 }
 
 /* Write the kernelinfo word of a 32-bit user aiocb. */
 static int
 aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref)
 {
 	struct aiocb32 *ucb32 = (struct aiocb32 *)ujob;
 
 	return (suword32(&ucb32->_aiocb_private.kernelinfo, jobref));
 }
 
 /* Store a user aiocb pointer into *ujobp as a 32-bit word. */
 static int
 aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
 {
 	long word;
 
 	word = (long)ujob;
 	return (suword32(ujobp, word));
 }
 
 /* Accessor table used for control blocks supplied by 32-bit processes. */
 static struct aiocb_ops aiocb32_ops = {
 	/* Import of the user control block. */
 	.aio_copyin = aiocb32_copyin,
 	/* Reads from the user control block. */
 	.fetch_error = aiocb32_fetch_error,
 	.fetch_status = aiocb32_fetch_status,
 	/* Writes to user memory. */
 	.store_error = aiocb32_store_error,
 	.store_status = aiocb32_store_status,
 	.store_kernelinfo = aiocb32_store_kernelinfo,
 	.store_aiocb = aiocb32_store_aiocb,
 };
 
 #ifdef COMPAT_FREEBSD6
 /*
  * Same as aiocb32_ops except that the control block is imported with the
  * FreeBSD 6 layout (old-style sigevent).
  */
 static struct aiocb_ops aiocb32_ops_osigevent = {
 	/* Import of the user control block. */
 	.aio_copyin = aiocb32_copyin_old_sigevent,
 	/* Reads from the user control block. */
 	.fetch_error = aiocb32_fetch_error,
 	.fetch_status = aiocb32_fetch_status,
 	/* Writes to user memory. */
 	.store_error = aiocb32_store_error,
 	.store_status = aiocb32_store_status,
 	.store_kernelinfo = aiocb32_store_kernelinfo,
 	.store_aiocb = aiocb32_store_aiocb,
 };
 #endif
 
 /* 32-bit aio_return(): kern_aio_return() with the 32-bit accessors. */
 int
 freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap)
 {
 	struct aiocb *ujob;
 
 	ujob = (struct aiocb *)uap->aiocbp;
 	return (kern_aio_return(td, ujob, &aiocb32_ops));
 }
 
 /*
  * 32-bit aio_suspend(): validate the entry count, convert the optional
  * 32-bit timeout, copy in the array of 32-bit user aiocb pointers, widen
  * it to native pointers and call kern_aio_suspend().
  */
 int
 freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap)
 {
 	struct timespec32 ts32;
 	struct timespec ts, *tsp;
 	struct aiocb **ujoblist;
 	uint32_t *ujoblist32;
 	int error, i;
 
 	if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc)
 		return (EINVAL);
 
 	if (uap->timeout) {
 		/* Get timespec struct. */
 		if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0)
 			return (error);
 		CP(ts32, ts, tv_sec);
 		CP(ts32, ts, tv_nsec);
 		tsp = &ts;
 	} else
 		tsp = NULL;
 
 	/*
 	 * One buffer sized for native pointers serves both views: the
 	 * 32-bit words are copied into its start and expanded in place.
 	 */
 	ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIO, M_WAITOK);
 	ujoblist32 = (uint32_t *)ujoblist;
 	error = copyin(uap->aiocbp, ujoblist32, uap->nent *
 	    sizeof(ujoblist32[0]));
 	if (error == 0) {
 		/*
 		 * Walk from the last entry down so that storing a widened
 		 * pointer never overwrites a 32-bit entry not yet read
 		 * (assumes native pointers are at least 32 bits wide).
 		 */
 		for (i = uap->nent - 1; i >= 0; i--)
 			ujoblist[i] = PTRIN(ujoblist32[i]);
 
 		error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
 	}
 	free(ujoblist, M_AIO);
 	return (error);
 }
 
 /* 32-bit aio_error(): kern_aio_error() with the 32-bit accessors. */
 int
 freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap)
 {
 	struct aiocb *ujob;
 
 	ujob = (struct aiocb *)uap->aiocbp;
 	return (kern_aio_error(td, ujob, &aiocb32_ops));
 }
 
 #ifdef COMPAT_FREEBSD6
 /* 32-bit FreeBSD 6 aio_read(): queue LIO_READ using the osigevent ops. */
 int
 freebsd6_freebsd32_aio_read(struct thread *td,
     struct freebsd6_freebsd32_aio_read_args *uap)
 {
 	struct aiocb *ujob;
 
 	ujob = (struct aiocb *)uap->aiocbp;
 	return (aio_aqueue(td, ujob, NULL, LIO_READ, &aiocb32_ops_osigevent));
 }
 #endif
 
 /* 32-bit aio_read(): queue an LIO_READ job using the 32-bit accessors. */
 int
 freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap)
 {
 	struct aiocb *ujob;
 
 	ujob = (struct aiocb *)uap->aiocbp;
 	return (aio_aqueue(td, ujob, NULL, LIO_READ, &aiocb32_ops));
 }
 
 /* 32-bit aio_readv(): queue an LIO_READV job using the 32-bit accessors. */
 int
 freebsd32_aio_readv(struct thread *td, struct freebsd32_aio_readv_args *uap)
 {
 	struct aiocb *ujob;
 
 	ujob = (struct aiocb *)uap->aiocbp;
 	return (aio_aqueue(td, ujob, NULL, LIO_READV, &aiocb32_ops));
 }
 
 #ifdef COMPAT_FREEBSD6
 /* 32-bit FreeBSD 6 aio_write(): queue LIO_WRITE using the osigevent ops. */
 int
 freebsd6_freebsd32_aio_write(struct thread *td,
     struct freebsd6_freebsd32_aio_write_args *uap)
 {
 	struct aiocb *ujob;
 
 	ujob = (struct aiocb *)uap->aiocbp;
 	return (aio_aqueue(td, ujob, NULL, LIO_WRITE, &aiocb32_ops_osigevent));
 }
 #endif
 
 /* 32-bit aio_write(): queue an LIO_WRITE job using the 32-bit accessors. */
 int
 freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap)
 {
 	struct aiocb *ujob;
 
 	ujob = (struct aiocb *)uap->aiocbp;
 	return (aio_aqueue(td, ujob, NULL, LIO_WRITE, &aiocb32_ops));
 }
 
 /* 32-bit aio_writev(): queue an LIO_WRITEV job using the 32-bit accessors. */
 int
 freebsd32_aio_writev(struct thread *td, struct freebsd32_aio_writev_args *uap)
 {
 	struct aiocb *ujob;
 
 	ujob = (struct aiocb *)uap->aiocbp;
 	return (aio_aqueue(td, ujob, NULL, LIO_WRITEV, &aiocb32_ops));
 }
 
 /* 32-bit aio_mlock(): queue an LIO_MLOCK job using the 32-bit accessors. */
 int
 freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap)
 {
 	struct aiocb *ujob;
 
 	ujob = (struct aiocb *)uap->aiocbp;
 	return (aio_aqueue(td, ujob, NULL, LIO_MLOCK, &aiocb32_ops));
 }
 
 int
 freebsd32_aio_waitcomplete(struct thread *td,
     struct freebsd32_aio_waitcomplete_args *uap)
 {
 	struct timespec32 ts32;
 	struct timespec ts, *tsp;
 	int error;
 
 	if (uap->timeout) {
 		/* Get timespec struct. */
 		error = copyin(uap->timeout, &ts32, sizeof(ts32));
 		if (error)
 			return (error);
 		CP(ts32, ts, tv_sec);
 		CP(ts32, ts, tv_nsec);
 		tsp = &ts;
 	} else
 		tsp = NULL;
 
 	return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp,
 	    &aiocb32_ops));
 }
 
 /* 32-bit aio_fsync(): kern_aio_fsync() with the 32-bit accessors. */
 int
 freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap)
 {
 	struct aiocb *ujob;
 
 	ujob = (struct aiocb *)uap->aiocbp;
 	return (kern_aio_fsync(td, uap->op, ujob, &aiocb32_ops));
 }
 
 #ifdef COMPAT_FREEBSD6
 int
 freebsd6_freebsd32_lio_listio(struct thread *td,
     struct freebsd6_freebsd32_lio_listio_args *uap)
 {
 	struct aiocb **acb_list;
 	struct sigevent *sigp, sig;
 	struct osigevent32 osig;
 	uint32_t *acb_list32;
 	int error, i, nent;
 
 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
 		return (EINVAL);
 
 	nent = uap->nent;
 	if (nent < 0 || nent > max_aio_queue_per_proc)
 		return (EINVAL);
 
 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
 		error = copyin(uap->sig, &osig, sizeof(osig));
 		if (error)
 			return (error);
 		error = convert_old_sigevent32(&osig, &sig);
 		if (error)
 			return (error);
 		sigp = &sig;
 	} else
 		sigp = NULL;
 
 	acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
 	error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
 	if (error) {
 		free(acb_list32, M_LIO);
 		return (error);
 	}
 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
 	for (i = 0; i < nent; i++)
 		acb_list[i] = PTRIN(acb_list32[i]);
 	free(acb_list32, M_LIO);
 
 	error = kern_lio_listio(td, uap->mode,
 	    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
 	    &aiocb32_ops_osigevent);
 	free(acb_list, M_LIO);
 	return (error);
 }
 #endif
 
 int
 freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap)
 {
 	struct aiocb **acb_list;
 	struct sigevent *sigp, sig;
 	struct sigevent32 sig32;
 	uint32_t *acb_list32;
 	int error, i, nent;
 
 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
 		return (EINVAL);
 
 	nent = uap->nent;
 	if (nent < 0 || nent > max_aio_queue_per_proc)
 		return (EINVAL);
 
 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
 		error = copyin(uap->sig, &sig32, sizeof(sig32));
 		if (error)
 			return (error);
 		error = convert_sigevent32(&sig32, &sig);
 		if (error)
 			return (error);
 		sigp = &sig;
 	} else
 		sigp = NULL;
 
 	acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
 	error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
 	if (error) {
 		free(acb_list32, M_LIO);
 		return (error);
 	}
 	acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
 	for (i = 0; i < nent; i++)
 		acb_list[i] = PTRIN(acb_list32[i]);
 	free(acb_list32, M_LIO);
 
 	error = kern_lio_listio(td, uap->mode,
 	    (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
 	    &aiocb32_ops);
 	free(acb_list, M_LIO);
 	return (error);
 }
 
 #endif
diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c
index c36fda4e8a9e..49d1d21d2d6d 100644
--- a/sys/kern/vfs_mount.c
+++ b/sys/kern/vfs_mount.c
@@ -1,3193 +1,3193 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1999-2004 Poul-Henning Kamp
  * Copyright (c) 1999 Michael Smith
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 #include <sys/param.h>
 #include <sys/conf.h>
 #include <sys/smp.h>
 #include <sys/devctl.h>
 #include <sys/eventhandler.h>
 #include <sys/fcntl.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/libkern.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/filedesc.h>
 #include <sys/reboot.h>
 #include <sys/sbuf.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysproto.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/taskqueue.h>
 #include <sys/vnode.h>
 #include <vm/uma.h>
 
 #include <geom/geom.h>
 
 #include <machine/stdarg.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #define	VFS_MOUNTARG_SIZE_MAX	(1024 * 64)
 
 static int	vfs_domount(struct thread *td, const char *fstype, char *fspath,
 		    uint64_t fsflags, bool jail_export,
 		    struct vfsoptlist **optlist);
 static void	free_mntarg(struct mntarg *ma);
 
 /*
  * Tunables and counters exported through sysctl, under the _vfs node and
  * the _vfs_deferred_unmount sub-node declared below.
  */
 static int	usermount = 0;
 SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
     "Unprivileged users may mount and unmount file systems");
 
 static bool	default_autoro = false;
 SYSCTL_BOOL(_vfs, OID_AUTO, default_autoro, CTLFLAG_RW, &default_autoro, 0,
     "Retry failed r/w mount as r/o if no explicit ro/rw option is specified");
 
 static bool	recursive_forced_unmount = false;
 SYSCTL_BOOL(_vfs, OID_AUTO, recursive_forced_unmount, CTLFLAG_RW,
     &recursive_forced_unmount, 0, "Recursively unmount stacked upper mounts"
     " when a file system is forcibly unmounted");
 
 static SYSCTL_NODE(_vfs, OID_AUTO, deferred_unmount,
     CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "deferred unmount controls");
 
 static unsigned int	deferred_unmount_retry_limit = 10;
 SYSCTL_UINT(_vfs_deferred_unmount, OID_AUTO, retry_limit, CTLFLAG_RW,
     &deferred_unmount_retry_limit, 0,
     "Maximum number of retries for deferred unmount failure");
 
 /* No static initializer: vfs_mount_init() sets this to hz. */
 static int	deferred_unmount_retry_delay_hz;
 SYSCTL_INT(_vfs_deferred_unmount, OID_AUTO, retry_delay_hz, CTLFLAG_RW,
     &deferred_unmount_retry_delay_hz, 0,
     "Delay in units of [1/kern.hz]s when retrying a failed deferred unmount");
 
 /* Exported read-only (CTLFLAG_RD). */
 static int	deferred_unmount_total_retries = 0;
 SYSCTL_INT(_vfs_deferred_unmount, OID_AUTO, total_retries, CTLFLAG_RD,
     &deferred_unmount_total_retries, 0,
     "Total number of retried deferred unmounts");
 
 MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure");
 MALLOC_DEFINE(M_STATFS, "statfs", "statfs structure");
 /* UMA zone for struct mount; created in vfs_mount_init(). */
 static uma_zone_t mount_zone;
 
 /* List of mounted filesystems. */
 struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);
 
 /* For any iteration/modification of mountlist */
 struct mtx_padalign __exclusive_cache_line mountlist_mtx;
 
 EVENTHANDLER_LIST_DEFINE(vfs_mounted);
 EVENTHANDLER_LIST_DEFINE(vfs_unmounted);
 
 /*
  * Deferred unmount state: the handler, its timeout task (initialized in
  * vfs_mount_init()), a mutex, the list of mounts and a dedicated
  * taskqueue thread.
  * NOTE(review): deferred_unmount_lock presumably protects
  * deferred_unmount_list -- confirm against the users of both.
  */
 static void vfs_deferred_unmount(void *arg, int pending);
 static struct timeout_task deferred_unmount_task;
 static struct mtx deferred_unmount_lock;
 MTX_SYSINIT(deferred_unmount, &deferred_unmount_lock, "deferred_unmount",
     MTX_DEF);
 static STAILQ_HEAD(, mount) deferred_unmount_list =
     STAILQ_HEAD_INITIALIZER(deferred_unmount_list);
 TASKQUEUE_DEFINE_THREAD(deferred_unmount);
 
 static void mount_devctl_event(const char *type, struct mount *mp, bool donew);
 
 /*
  * Global opts, taken by all filesystems.
  * The array is terminated by a NULL entry.
  */
 static const char *global_opts[] = {
 	"errmsg",
 	"fstype",
 	"fspath",
 	"ro",
 	"rw",
 	"nosuid",
 	"noexec",
 	NULL
 };
 
 /*
  * UMA init hook for the mount zone (see vfs_mount_init()): sets up the
  * locks, the per-CPU counter block and the initial reference and vfs_ops
  * counts of a struct mount.  Always returns 0.
  */
 static int
 mount_init(void *mem, int size, int flags)
 {
 	struct mount *mp = mem;
 
 	/* Locks. */
 	mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
 	mtx_init(&mp->mnt_listmtx, "struct mount vlist mtx", NULL, MTX_DEF);
 	lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0);
 
 	/* Zeroed per-CPU counters. */
 	mp->mnt_pcpu = uma_zalloc_pcpu(pcpu_zone_16, M_WAITOK | M_ZERO);
 
 	/* Initial scalar state. */
 	mp->mnt_ref = 0;
 	mp->mnt_vfs_ops = 1;
 	mp->mnt_rootvnode = NULL;
 
 	return (0);
 }
 
 /*
  * UMA fini hook for the mount zone: releases what mount_init() set up,
  * in the reverse order.
  */
 static void
 mount_fini(void *mem, int size)
 {
 	struct mount *mp = mem;
 
 	uma_zfree_pcpu(pcpu_zone_16, mp->mnt_pcpu);
 	lockdestroy(&mp->mnt_explock);
 	mtx_destroy(&mp->mnt_listmtx);
 	mtx_destroy(&mp->mnt_mtx);
 }
 
 /*
  * One-time setup of the mount subsystem, run through the SYSINIT below:
  * the deferred unmount timeout task, the default retry delay (hz ticks),
  * the UMA zone that backs struct mount, and the mountlist mutex.
  */
 static void
 vfs_mount_init(void *dummy __unused)
 {
 	TIMEOUT_TASK_INIT(taskqueue_deferred_unmount, &deferred_unmount_task,
 	    0, vfs_deferred_unmount, NULL);
 	deferred_unmount_retry_delay_hz = hz;
 	/* mount_init()/mount_fini() are the zone's init and fini hooks. */
 	mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL,
 	    NULL, mount_init, mount_fini, UMA_ALIGN_CACHE, UMA_ZONE_NOFREE);
 	mtx_init(&mountlist_mtx, "mountlist", NULL, MTX_DEF);
 }
 SYSINIT(vfs_mount, SI_SUB_VFS, SI_ORDER_ANY, vfs_mount_init, NULL);
 
 /*
  * ---------------------------------------------------------------------
  * Functions for building and sanitizing the mount options
  */
 
 /* Remove one mount option. */
 static void
 vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt)
 {
 	char *name;
 	void *value;
 
 	/* Unlink the entry, then release its strings and the entry itself. */
 	name = opt->name;
 	value = opt->value;
 	TAILQ_REMOVE(opts, opt, link);
 	free(name, M_MOUNT);
 	if (value != NULL)
 		free(value, M_MOUNT);
 	free(opt, M_MOUNT);
 }
 
 /* Release all resources related to the mount options. */
 void
 vfs_freeopts(struct vfsoptlist *opts)
 {
 	struct vfsopt *opt;
 
 	while (!TAILQ_EMPTY(opts)) {
 		opt = TAILQ_FIRST(opts);
 		vfs_freeopt(opts, opt);
 	}
 	free(opts, M_MOUNT);
 }
 
 /*
  * Remove every option called "name" from the list.  A NULL list is
  * accepted and ignored.
  */
 void
 vfs_deleteopt(struct vfsoptlist *opts, const char *name)
 {
 	struct vfsopt *cur, *next;
 
 	if (opts == NULL)
 		return;
 
 	/* The _SAFE variant allows freeing the current entry mid-walk. */
 	TAILQ_FOREACH_SAFE(cur, opts, link, next) {
 		if (strcmp(cur->name, name) != 0)
 			continue;
 		vfs_freeopt(opts, cur);
 	}
 }
 
 /* Return 1 if opt is one of the read-only spellings, 0 otherwise. */
 static int
 vfs_isopt_ro(const char *opt)
 {
 	static const char *const ro_names[] = { "ro", "rdonly", "norw" };
 	size_t i;
 
 	for (i = 0; i < sizeof(ro_names) / sizeof(ro_names[0]); i++) {
 		if (strcmp(opt, ro_names[i]) == 0)
 			return (1);
 	}
 	return (0);
 }
 
 /* Return 1 if opt is one of the read-write spellings, 0 otherwise. */
 static int
 vfs_isopt_rw(const char *opt)
 {
 	int is_rw;
 
 	is_rw = (strcmp(opt, "rw") == 0 || strcmp(opt, "noro") == 0);
 	return (is_rw);
 }
 
 /*
  * Check if options are equal (with or without the "no" prefix).
  */
 static int
 vfs_equalopts(const char *opt1, const char *opt2)
 {
 	char *dot;
 	size_t skip;
 
 	/* "opt" vs. "opt" or "noopt" vs. "noopt" */
 	if (strcmp(opt1, opt2) == 0)
 		return (1);
 	/* "noopt" vs. "opt" */
 	if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0)
 		return (1);
 	/* "opt" vs. "noopt" */
 	if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0)
 		return (1);
 
 	/*
 	 * As long as both names share the next dotted prefix ("foo."),
 	 * step past it on both sides and repeat the "no" comparison on
 	 * what follows.
 	 */
 	for (;;) {
 		dot = strchr(opt1, '.');
 		if (dot == NULL)
 			break;
 		/* Prefix length including the dot itself. */
 		skip = (size_t)(dot + 1 - opt1);
 		if (strncmp(opt1, opt2, skip) != 0)
 			break;
 		opt1 += skip;
 		opt2 += skip;
 		/* "foo.noopt" vs. "foo.opt" */
 		if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0)
 			return (1);
 		/* "foo.opt" vs. "foo.noopt" */
 		if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0)
 			return (1);
 	}
 
 	/* "ro" / "rdonly" / "norw" / "rw" / "noro" */
 	if (!vfs_isopt_ro(opt1) && !vfs_isopt_rw(opt1))
 		return (0);
 	if (!vfs_isopt_ro(opt2) && !vfs_isopt_rw(opt2))
 		return (0);
 	return (1);
 }
 
 /*
  * If a mount option is specified several times,
  * (with or without the "no" prefix) only keep
  * the last occurrence of it.
  */
 static void
 vfs_sanitizeopts(struct vfsoptlist *opts)
 {
 	struct vfsopt *keep, *cand, *prev;
 
 	/*
 	 * Walk from the tail; for each survivor, drop every earlier entry
 	 * that vfs_equalopts() considers the same option.
 	 */
 	TAILQ_FOREACH_REVERSE(keep, opts, vfsoptlist, link) {
 		for (cand = TAILQ_PREV(keep, vfsoptlist, link); cand != NULL;
 		    cand = prev) {
 			/* Fetch the predecessor before cand may be freed. */
 			prev = TAILQ_PREV(cand, vfsoptlist, link);
 			if (vfs_equalopts(keep->name, cand->name))
 				vfs_freeopt(opts, cand);
 		}
 	}
 }
 
 /*
  * Build a linked list of mount options from a struct uio.
  */
 /*
  * auio holds (name, value) iovec pairs: entry i is the option name and
  * entry i + 1 its value.  On success *options receives a newly allocated,
  * sanitized list and 0 is returned.  On failure everything built so far
  * is freed, *options is left untouched and an errno value is returned
  * (EINVAL for oversized input or a name that is empty or not
  * NUL-terminated, otherwise the copyin() error).
  *
  * NOTE(review): the loop reads uio_iov[i + 1] without checking that
  * iovcnt is even; sys_nmount() rejects odd counts before calling in, but
  * other callers should be confirmed.
  */
 int
 vfs_buildopts(struct uio *auio, struct vfsoptlist **options)
 {
 	struct vfsoptlist *opts;
 	struct vfsopt *opt;
 	size_t memused, namelen, optlen;
 	unsigned int i, iovcnt;
 	int error;
 
 	opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
 	TAILQ_INIT(opts);
 	memused = 0;
 	iovcnt = auio->uio_iovcnt;
 	for (i = 0; i < iovcnt; i += 2) {
 		namelen = auio->uio_iov[i].iov_len;
 		optlen = auio->uio_iov[i + 1].iov_len;
 		memused += sizeof(struct vfsopt) + optlen + namelen;
 		/*
 		 * Avoid consuming too much memory, and attempts to overflow
 		 * memused.
 		 */
 		if (memused > VFS_MOUNTARG_SIZE_MAX ||
 		    optlen > VFS_MOUNTARG_SIZE_MAX ||
 		    namelen > VFS_MOUNTARG_SIZE_MAX) {
 			error = EINVAL;
 			goto bad;
 		}
 
 		opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
 		opt->name = malloc(namelen, M_MOUNT, M_WAITOK);
 		opt->value = NULL;
 		opt->len = 0;
 		/* Index of this (name, value) pair within the request. */
 		opt->pos = i / 2;
 		opt->seen = 0;
 
 		/*
 		 * Do this early, so jumps to "bad" will free the current
 		 * option.
 		 */
 		TAILQ_INSERT_TAIL(opts, opt, link);
 
 		/* The iovecs may point at kernel or at user memory. */
 		if (auio->uio_segflg == UIO_SYSSPACE) {
 			bcopy(auio->uio_iov[i].iov_base, opt->name, namelen);
 		} else {
 			error = copyin(auio->uio_iov[i].iov_base, opt->name,
 			    namelen);
 			if (error)
 				goto bad;
 		}
 		/* Ensure names are null-terminated strings. */
 		if (namelen == 0 || opt->name[namelen - 1] != '\0') {
 			error = EINVAL;
 			goto bad;
 		}
 		/* A zero-length value leaves opt->value NULL and len 0. */
 		if (optlen != 0) {
 			opt->len = optlen;
 			opt->value = malloc(optlen, M_MOUNT, M_WAITOK);
 			if (auio->uio_segflg == UIO_SYSSPACE) {
 				bcopy(auio->uio_iov[i + 1].iov_base, opt->value,
 				    optlen);
 			} else {
 				error = copyin(auio->uio_iov[i + 1].iov_base,
 				    opt->value, optlen);
 				if (error)
 					goto bad;
 			}
 		}
 	}
 	/* Collapse duplicate options, keeping the last occurrence. */
 	vfs_sanitizeopts(opts);
 	*options = opts;
 	return (0);
 bad:
 	vfs_freeopts(opts);
 	return (error);
 }
 
 /*
  * Merge the old mount options with the new ones passed
  * in the MNT_UPDATE case.
  *
  * XXX: This function will keep a "nofoo" option in the new
  * options.  E.g, if the option's canonical name is "foo",
  * "nofoo" ends up in the mount point's active options.
  */
 static void
 vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *oldopts)
 {
 	struct vfsopt *src, *copy;
 
 	TAILQ_FOREACH(src, oldopts, link) {
 		/*
 		 * Duplicate the old entry.  Only name, value, len and seen
 		 * are filled in.
 		 * NOTE(review): pos is not assigned for the copy -- confirm
 		 * nothing reads it on a merged list.
 		 */
 		copy = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
 		copy->name = strdup(src->name, M_MOUNT);
 		copy->value = NULL;
 		if (src->len != 0) {
 			copy->value = malloc(src->len, M_MOUNT, M_WAITOK);
 			bcopy(src->value, copy->value, src->len);
 		}
 		copy->len = src->len;
 		copy->seen = src->seen;
 
 		/*
 		 * Old options go in front of the new ones, so that
 		 * vfs_sanitizeopts() keeps the new spelling on a clash.
 		 */
 		TAILQ_INSERT_HEAD(toopts, copy, link);
 	}
 	vfs_sanitizeopts(toopts);
 }
 
 /*
  * Mount a filesystem.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct nmount_args {
 	struct iovec *iovp;
 	unsigned int iovcnt;
 	int flags;
 };
 #endif
 /*
  * nmount(2) entry point: validate the iovec count, copy the iovec array
  * in from user space as a uio and hand it to vfs_donmount() together with
  * the mount flags (with MNT_ROOTFS masked off).  Returns EINVAL for an odd
  * iovcnt or one below 4, the copyinuio() error, or the result of
  * vfs_donmount().
  */
 int
 sys_nmount(struct thread *td, struct nmount_args *uap)
 {
 	struct uio *auio;
 	int error;
 	u_int iovcnt;
 	uint64_t flags;
 
 	/*
 	 * Mount flags are now 64-bits. On 32-bit architectures only
 	 * 32-bits are passed in, but from here on everything handles
 	 * 64-bit flags correctly.
 	 */
 	flags = uap->flags;
 
 	AUDIT_ARG_FFLAGS(flags);
 	CTR4(KTR_VFS, "%s: iovp %p with iovcnt %d and flags %d", __func__,
 	    uap->iovp, uap->iovcnt, flags);
 
 	/*
 	 * Filter out MNT_ROOTFS.  We do not want clients of nmount() in
 	 * userspace to set this flag, but we must filter it out if we want
 	 * MNT_UPDATE on the root file system to work.
 	 * MNT_ROOTFS should only be set by the kernel when mounting its
 	 * root file system.
 	 */
 	flags &= ~MNT_ROOTFS;
 
 	iovcnt = uap->iovcnt;
 	/*
 	 * Check that we have an even number of iovec's
 	 * and that we have at least two options.
 	 */
 	if ((iovcnt & 1) || (iovcnt < 4)) {
 		CTR2(KTR_VFS, "%s: failed for invalid iovcnt %d", __func__,
 		    uap->iovcnt);
 		return (EINVAL);
 	}
 
 	error = copyinuio(uap->iovp, iovcnt, &auio);
 	if (error) {
 		CTR2(KTR_VFS, "%s: failed for invalid uio op with %d errno",
 		    __func__, error);
 		return (error);
 	}
 	error = vfs_donmount(td, flags, auio);
 
-	free(auio, M_IOV);
+	freeuio(auio);
 	return (error);
 }
 
 /*
  * ---------------------------------------------------------------------
  * Various utility functions
  */
 
 /*
  * Acquire a reference on the mount point a vnode currently belongs to.
  *
  * The vnode may be passed unlocked and may be doomed concurrently.  In
  * that case nothing guarantees that the returned mount point is still
  * the one associated with the vnode once this function has returned.
  *
  * Returns the referenced mount point, or NULL when the vnode has no mount
  * point or v_mount changed while the reference was being taken.
  */
 struct mount *
 vfs_ref_from_vp(struct vnode *vp)
 {
 	struct mount_pcpu *mpcpu;
 	struct mount *mp;
 
 	mp = atomic_load_ptr(&vp->v_mount);
 	if (__predict_false(mp == NULL))
 		return (NULL);
 
 	/* Preferred path: account the reference in the mount_pcpu counters. */
 	if (vfs_op_thread_enter(mp, mpcpu)) {
 		if (__predict_false(mp != vp->v_mount)) {
 			vfs_op_thread_exit(mp, mpcpu);
 			return (NULL);
 		}
 		vfs_mp_count_add_pcpu(mpcpu, ref, 1);
 		vfs_op_thread_exit(mp, mpcpu);
 		return (mp);
 	}
 
 	/* Otherwise re-check v_mount and take the reference under MNT_ILOCK. */
 	MNT_ILOCK(mp);
 	if (mp != vp->v_mount) {
 		MNT_IUNLOCK(mp);
 		return (NULL);
 	}
 	MNT_REF(mp);
 	MNT_IUNLOCK(mp);
 	return (mp);
 }
 
 /*
  * Take one more reference on mount point mp.
  */
 void
 vfs_ref(struct mount *mp)
 {
 	struct mount_pcpu *mpcpu;
 
 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 	if (vfs_op_thread_enter(mp, mpcpu)) {
 		/* Count the reference in the mount_pcpu counters. */
 		vfs_mp_count_add_pcpu(mpcpu, ref, 1);
 		vfs_op_thread_exit(mp, mpcpu);
 	} else {
 		/* Could not enter; take the reference under MNT_ILOCK. */
 		MNT_ILOCK(mp);
 		MNT_REF(mp);
 		MNT_IUNLOCK(mp);
 	}
 }
 
 /*
  * Make ump an upper mount of the mount that vnode vp belongs to.
  *
  * The registration is tracked through the mount_upper_node upper, which
  * the caller allocates and keeps in its per-mount data.
  *
  * On success the lower mount is returned with an extra reference held
  * and upper linked on its mnt_uppers list; it cannot be unmounted until
  * ump has been unregistered again.
  *
  * NULL is returned when vp has no mount, when v_mount changed before the
  * interlock was taken, or when the lower mount has MNTK_UNMOUNT or
  * MNTK_RECURSE set.
  */
 struct mount *
 vfs_register_upper_from_vp(struct vnode *vp, struct mount *ump,
     struct mount_upper_node *upper)
 {
 	struct mount *lowermp;
 	bool usable;
 
 	lowermp = atomic_load_ptr(&vp->v_mount);
 	if (lowermp == NULL)
 		return (NULL);
 
 	MNT_ILOCK(lowermp);
 	usable = (lowermp == vp->v_mount &&
 	    (lowermp->mnt_kern_flag & (MNTK_UNMOUNT | MNTK_RECURSE)) == 0);
 	if (usable) {
 		KASSERT(ump != lowermp,
 		    ("upper and lower mounts are identical"));
 		upper->mp = ump;
 		MNT_REF(lowermp);
 		TAILQ_INSERT_TAIL(&lowermp->mnt_uppers, upper, mnt_upper_link);
 	}
 	MNT_IUNLOCK(lowermp);
 
 	return (usable ? lowermp : NULL);
 }
 
 /*
  * Register upper mount ump to receive vnode unlink/reclaim
  * notifications from lower mount mp. This registration will
  * be tracked through mount_upper_node upper, which should be
  * allocated by the caller and stored in per-mount data
  * associated with mp.
  *
  * ump must already be registered as an upper mount of mp
  * through a call to vfs_register_upper_from_vp().
  *
  * The node is appended to mp->mnt_notify with MNT_ILOCK(mp) held.
  */
 void
 vfs_register_for_notification(struct mount *mp, struct mount *ump,
     struct mount_upper_node *upper)
 {
 	/* Fill in the node before it becomes reachable through the list. */
 	upper->mp = ump;
 	MNT_ILOCK(mp);
 	TAILQ_INSERT_TAIL(&mp->mnt_notify, upper, mnt_upper_link);
 	MNT_IUNLOCK(mp);
 }
 
 /*
  * Wait until mp->mnt_upper_pending reaches zero.
  *
  * The caller must own MNT_MTX(mp); this is asserted.  Before every sleep
  * on &mp->mnt_uppers the MNTK_UPPER_WAITER flag is set, presumably so
  * that whoever decrements mnt_upper_pending knows a wakeup is wanted
  * (that code is not part of this function).
  */
 static void
 vfs_drain_upper_locked(struct mount *mp)
 {
 	mtx_assert(MNT_MTX(mp), MA_OWNED);
 	while (mp->mnt_upper_pending != 0) {
 		mp->mnt_kern_flag |= MNTK_UPPER_WAITER;
 		msleep(&mp->mnt_uppers, MNT_MTX(mp), 0, "mntupw", 0);
 	}
 }
 
 /*
  * Undo a previous call to vfs_register_for_notification().
  * The mount represented by upper must be currently registered
  * as an upper mount for mp.
  *
  * vfs_drain_upper_locked() is called first, so the node is only taken
  * off mp->mnt_notify once mp->mnt_upper_pending has dropped to zero.
  */
 void
 vfs_unregister_for_notification(struct mount *mp,
     struct mount_upper_node *upper)
 {
 	MNT_ILOCK(mp);
 	vfs_drain_upper_locked(mp);
 	TAILQ_REMOVE(&mp->mnt_notify, upper, mnt_upper_link);
 	MNT_IUNLOCK(mp);
 }
 
 /*
  * Reverse vfs_register_upper_from_vp(): take upper off mp's list of
  * upper mounts and drop the reference the registration acquired.
  * mp cannot be unmounted before this has been done.
  */
 void
 vfs_unregister_upper(struct mount *mp, struct mount_upper_node *upper)
 {
 	MNT_ILOCK(mp);
 	KASSERT((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0,
 	    ("registered upper with pending unmount"));
 
 	/* Wait out pending upper operations before unlinking the node. */
 	vfs_drain_upper_locked(mp);
 	TAILQ_REMOVE(&mp->mnt_uppers, upper, mnt_upper_link);
 
 	/*
 	 * If a waiter flagged MNTK_TASKQUEUE_WAITER and this was the last
 	 * upper mount, clear the flag and wake the waiter up.
 	 */
 	if ((mp->mnt_kern_flag & MNTK_TASKQUEUE_WAITER) != 0) {
 		if (TAILQ_EMPTY(&mp->mnt_uppers)) {
 			mp->mnt_kern_flag &= ~MNTK_TASKQUEUE_WAITER;
 			wakeup(&mp->mnt_taskqueue_link);
 		}
 	}
 
 	MNT_REL(mp);
 	MNT_IUNLOCK(mp);
 }
 
 /*
  * Drop one reference on mount point mp.
  */
 void
 vfs_rel(struct mount *mp)
 {
 	struct mount_pcpu *mpcpu;
 
 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 	if (vfs_op_thread_enter(mp, mpcpu)) {
 		/* Account the release in the mount_pcpu counters. */
 		vfs_mp_count_sub_pcpu(mpcpu, ref, 1);
 		vfs_op_thread_exit(mp, mpcpu);
 	} else {
 		/* Could not enter; release under MNT_ILOCK instead. */
 		MNT_ILOCK(mp);
 		MNT_REL(mp);
 		MNT_IUNLOCK(mp);
 	}
 }
 
 /*
  * Allocate and initialize the mount point struct.
  *
  * vp is recorded as the vnode to be covered, vfsp supplies the vfsops,
  * type number and type name, fspath is copied into f_mntonname and cred
  * is duplicated into mnt_cred (its cr_uid becomes f_owner).
  *
  * The returned mount has been passed through vfs_busy() and holds a
  * reference on vfsp (vfc_refcount is incremented here).  The allocation
  * uses M_WAITOK.
  */
 struct mount *
 vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath,
     struct ucred *cred)
 {
 	struct mount *mp;
 
 	mp = uma_zalloc(mount_zone, M_WAITOK);
 	/* Only the mnt_startzero..mnt_endzero range is cleared wholesale. */
 	bzero(&mp->mnt_startzero,
 	    __rangeof(struct mount, mnt_startzero, mnt_endzero));
 	mp->mnt_kern_flag = 0;
 	mp->mnt_flag = 0;
 	mp->mnt_rootvnode = NULL;
 	mp->mnt_vnodecovered = NULL;
 	mp->mnt_op = NULL;
 	mp->mnt_vfc = NULL;
 	TAILQ_INIT(&mp->mnt_nvnodelist);
 	mp->mnt_nvnodelistsize = 0;
 	TAILQ_INIT(&mp->mnt_lazyvnodelist);
 	mp->mnt_lazyvnodelistsize = 0;
 	/* A mount coming out of the zone is expected to carry no counts. */
 	MPPASS(mp->mnt_ref == 0 && mp->mnt_lockref == 0 &&
 	    mp->mnt_writeopcount == 0, mp);
 	MPASSERT(mp->mnt_vfs_ops == 1, mp,
 	    ("vfs_ops should be 1 but %d found", mp->mnt_vfs_ops));
 	/* The result of vfs_busy() is deliberately ignored here. */
 	(void) vfs_busy(mp, MBF_NOWAIT);
 	atomic_add_acq_int(&vfsp->vfc_refcount, 1);
 	mp->mnt_op = vfsp->vfc_vfsops;
 	mp->mnt_vfc = vfsp;
 	mp->mnt_stat.f_type = vfsp->vfc_typenum;
 	mp->mnt_gen++;
 	strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
 	mp->mnt_vnodecovered = vp;
 	mp->mnt_cred = crdup(cred);
 	mp->mnt_stat.f_owner = cred->cr_uid;
 	strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN);
 	mp->mnt_iosize_max = DFLTPHYS;
 #ifdef MAC
 	mac_mount_init(mp);
 	mac_mount_create(cred, mp);
 #endif
 	/* Fill the per-mount hash seed with random bytes. */
 	arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0);
 	mp->mnt_upper_pending = 0;
 	TAILQ_INIT(&mp->mnt_uppers);
 	TAILQ_INIT(&mp->mnt_notify);
 	mp->mnt_taskqueue_flags = 0;
 	mp->mnt_unmount_retries = 0;
 	return (mp);
 }
 
 /*
  * Destroy the mount struct previously allocated by vfs_mount_alloc().
  *
  * Sets MNTK_REFEXPIRE, wakes any MNTK_MWAIT sleeper and then sleeps until
  * mnt_ref has dropped to zero.  After that the reference on the vfsconf
  * is dropped, the covered vnode (if any) is released, option lists,
  * export data and credentials are freed and the structure is returned to
  * mount_zone.  Panics if vnodes are still on mnt_nvnodelist.
  */
 void
 vfs_mount_destroy(struct mount *mp)
 {
 
 	MPPASS(mp->mnt_vfs_ops != 0, mp);
 
 	vfs_assert_mount_counters(mp);
 
 	MNT_ILOCK(mp);
 	mp->mnt_kern_flag |= MNTK_REFEXPIRE;
 	if (mp->mnt_kern_flag & MNTK_MWAIT) {
 		mp->mnt_kern_flag &= ~MNTK_MWAIT;
 		wakeup(mp);
 	}
 	/* Wait for the remaining references to be dropped. */
 	while (mp->mnt_ref)
 		msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0);
 	KASSERT(mp->mnt_ref == 0,
 	    ("%s: invalid refcount in the drain path @ %s:%d", __func__,
 	    __FILE__, __LINE__));
 	MPPASS(mp->mnt_writeopcount == 0, mp);
 	MPPASS(mp->mnt_secondary_writes == 0, mp);
 	/* Give back the vfsconf reference taken in vfs_mount_alloc(). */
 	atomic_subtract_rel_int(&mp->mnt_vfc->vfc_refcount, 1);
 	if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) {
 		struct vnode *vp;
 
 		/* Print every leftover vnode before giving up. */
 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes)
 			vn_printf(vp, "dangling vnode ");
 		panic("unmount: dangling vnode");
 	}
 	KASSERT(mp->mnt_upper_pending == 0, ("mnt_upper_pending"));
 	KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers"));
 	KASSERT(TAILQ_EMPTY(&mp->mnt_notify), ("mnt_notify"));
 	MPPASS(mp->mnt_nvnodelistsize == 0, mp);
 	MPPASS(mp->mnt_lazyvnodelistsize == 0, mp);
 	MPPASS(mp->mnt_lockref == 0, mp);
 	MNT_IUNLOCK(mp);
 
 	MPASSERT(mp->mnt_vfs_ops == 1, mp,
 	    ("vfs_ops should be 1 but %d found", mp->mnt_vfs_ops));
 
 	MPASSERT(mp->mnt_rootvnode == NULL, mp,
 	    ("mount point still has a root vnode %p", mp->mnt_rootvnode));
 
 	if (mp->mnt_vnodecovered != NULL)
 		vrele(mp->mnt_vnodecovered);
 #ifdef MAC
 	mac_mount_destroy(mp);
 #endif
 	if (mp->mnt_opt != NULL)
 		vfs_freeopts(mp->mnt_opt);
 	if (mp->mnt_exjail != NULL) {
 		/* Undo the export accounting kept on the exporting prison. */
 		atomic_subtract_int(&mp->mnt_exjail->cr_prison->pr_exportcnt,
 		    1);
 		crfree(mp->mnt_exjail);
 	}
 	if (mp->mnt_export != NULL) {
 		vfs_free_addrlist(mp->mnt_export);
 		free(mp->mnt_export, M_MOUNT);
 	}
 	crfree(mp->mnt_cred);
 	uma_zfree(mount_zone, mp);
 }
 
 /*
  * Decide whether a failed mount attempt should be retried read-only.
  *
  * fsflags are the mount flags of the failed attempt and error is the
  * error it returned.  Returns true only for a fresh (non-update),
  * read/write mount that failed with ENODEV, EACCES or EROFS.
  */
 static bool
 vfs_should_downgrade_to_ro_mount(uint64_t fsflags, int error)
 {
 	/*
 	 * Updates of an existing mount and mounts that already are
 	 * read-only are never retried.
 	 */
 	if ((fsflags & (MNT_UPDATE | MNT_RDONLY)) != 0)
 		return (false);
 
 	/*
 	 * These errors can be returned by the storage layer to signal
 	 * that the media is read-only.  No harm in the R/O mount
 	 * attempt if the error was returned for some other reason.
 	 */
 	return (error == ENODEV ||	/* generic, geom, ... */
 	    error == EACCES ||		/* cam/scsi, ... */
 	    error == EROFS);		/* md, mmcsd, ... */
 }
 
 /*
  * Store msg in the caller-supplied "errmsg" option buffer, if any.
  *
  * errmsg_len is the size of the buffer.  Unlike a bare strncpy(), the
  * result is always NUL terminated, even when the buffer is shorter than
  * the message; a missing or empty buffer is left alone.
  */
 static void
 vfs_donmount_errmsg(char *errmsg, int errmsg_len, const char *msg)
 {
 	if (errmsg == NULL || errmsg_len <= 0)
 		return;
 	strncpy(errmsg, msg, errmsg_len);
 	errmsg[errmsg_len - 1] = '\0';
 }
 
 /*
  * Common back end of the nmount path: turn the option vector fsoptions
  * into an option list, translate the generic option names into mount
  * flags and call vfs_domount().
  *
  * td is the calling thread, fsflags the initial mount flags and
  * fsoptions the name/value option vector.  "fstype" and "fspath" are
  * mandatory and must be NUL terminated.  When an "errmsg" option is
  * present, its buffer is copied back into the matching iovec of
  * fsoptions before returning.
  *
  * Returns 0 on success or an errno value.  ENODEV from vfs_domount() is
  * reported as EINVAL.
  */
 int
 vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions)
 {
 	struct vfsoptlist *optlist;
 	struct vfsopt *opt, *tmp_opt;
 	char *fstype, *fspath, *errmsg;
 	int error, fstypelen, fspathlen, errmsg_len, errmsg_pos;
 	bool autoro, has_nonexport, jail_export;
 
 	errmsg = fspath = NULL;
 	errmsg_len = fspathlen = 0;
 	errmsg_pos = -1;
 	autoro = default_autoro;
 
 	error = vfs_buildopts(fsoptions, &optlist);
 	if (error)
 		return (error);
 
 	/* Remember where "errmsg" sits so it can be copied back at bail. */
 	if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0)
 		errmsg_pos = vfs_getopt_pos(optlist, "errmsg");
 
 	/*
 	 * We need these two options before the others,
 	 * and they are mandatory for any filesystem.
 	 * Ensure they are NUL terminated as well.
 	 */
 	fstypelen = 0;
 	error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen);
 	if (error || fstypelen <= 0 || fstype[fstypelen - 1] != '\0') {
 		error = EINVAL;
 		vfs_donmount_errmsg(errmsg, errmsg_len, "Invalid fstype");
 		goto bail;
 	}
 	fspathlen = 0;
 	error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen);
 	if (error || fspathlen <= 0 || fspath[fspathlen - 1] != '\0') {
 		error = EINVAL;
 		vfs_donmount_errmsg(errmsg, errmsg_len, "Invalid fspath");
 		goto bail;
 	}
 
 	/*
 	 * Check to see that "export" is only used with the "update", "fstype",
 	 * "fspath", "from" and "errmsg" options when in a vnet jail.
 	 * These are the ones used to set/update exports by mountd(8).
 	 * If only the above options are set in a jail that can run mountd(8),
 	 * then the jail_export argument of vfs_domount() will be true.
 	 * When jail_export is true, the vfs_suser() check does not cause
 	 * failure, but limits the update to exports only.
 	 * This allows mountd(8) running within the vnet jail
 	 * to export file systems visible within the jail, but
 	 * mounted outside of the jail.
 	 */
 	/*
 	 * We need to see if we have the "update" option
 	 * before we call vfs_domount(), since vfs_domount() has special
 	 * logic based on MNT_UPDATE.  This is very important
 	 * when we want to update the root filesystem.
 	 */
 	has_nonexport = false;
 	jail_export = false;
 	TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) {
 		/* Set for options that are consumed here and then dropped. */
 		int do_freeopt = 0;
 
 		if (jailed(td->td_ucred) &&
 		    strcmp(opt->name, "export") != 0 &&
 		    strcmp(opt->name, "update") != 0 &&
 		    strcmp(opt->name, "fstype") != 0 &&
 		    strcmp(opt->name, "fspath") != 0 &&
 		    strcmp(opt->name, "from") != 0 &&
 		    strcmp(opt->name, "errmsg") != 0)
 			has_nonexport = true;
 		if (strcmp(opt->name, "update") == 0) {
 			fsflags |= MNT_UPDATE;
 			do_freeopt = 1;
 		}
 		else if (strcmp(opt->name, "async") == 0)
 			fsflags |= MNT_ASYNC;
 		else if (strcmp(opt->name, "force") == 0) {
 			fsflags |= MNT_FORCE;
 			do_freeopt = 1;
 		}
 		else if (strcmp(opt->name, "reload") == 0) {
 			fsflags |= MNT_RELOAD;
 			do_freeopt = 1;
 		}
 		else if (strcmp(opt->name, "multilabel") == 0)
 			fsflags |= MNT_MULTILABEL;
 		else if (strcmp(opt->name, "noasync") == 0)
 			fsflags &= ~MNT_ASYNC;
 		else if (strcmp(opt->name, "noatime") == 0)
 			fsflags |= MNT_NOATIME;
 		else if (strcmp(opt->name, "atime") == 0) {
 			/* Positive forms are renamed to "no" + negative form. */
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("nonoatime", M_MOUNT);
 		}
 		else if (strcmp(opt->name, "noclusterr") == 0)
 			fsflags |= MNT_NOCLUSTERR;
 		else if (strcmp(opt->name, "clusterr") == 0) {
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("nonoclusterr", M_MOUNT);
 		}
 		else if (strcmp(opt->name, "noclusterw") == 0)
 			fsflags |= MNT_NOCLUSTERW;
 		else if (strcmp(opt->name, "clusterw") == 0) {
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("nonoclusterw", M_MOUNT);
 		}
 		else if (strcmp(opt->name, "noexec") == 0)
 			fsflags |= MNT_NOEXEC;
 		else if (strcmp(opt->name, "exec") == 0) {
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("nonoexec", M_MOUNT);
 		}
 		else if (strcmp(opt->name, "nosuid") == 0)
 			fsflags |= MNT_NOSUID;
 		else if (strcmp(opt->name, "suid") == 0) {
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("nonosuid", M_MOUNT);
 		}
 		else if (strcmp(opt->name, "nosymfollow") == 0)
 			fsflags |= MNT_NOSYMFOLLOW;
 		else if (strcmp(opt->name, "symfollow") == 0) {
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("nonosymfollow", M_MOUNT);
 		}
 		else if (strcmp(opt->name, "noro") == 0) {
 			fsflags &= ~MNT_RDONLY;
 			autoro = false;
 		}
 		else if (strcmp(opt->name, "rw") == 0) {
 			fsflags &= ~MNT_RDONLY;
 			autoro = false;
 		}
 		else if (strcmp(opt->name, "ro") == 0) {
 			fsflags |= MNT_RDONLY;
 			autoro = false;
 		}
 		else if (strcmp(opt->name, "rdonly") == 0) {
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("ro", M_MOUNT);
 			fsflags |= MNT_RDONLY;
 			autoro = false;
 		}
 		else if (strcmp(opt->name, "autoro") == 0) {
 			do_freeopt = 1;
 			autoro = true;
 		}
 		else if (strcmp(opt->name, "suiddir") == 0)
 			fsflags |= MNT_SUIDDIR;
 		else if (strcmp(opt->name, "sync") == 0)
 			fsflags |= MNT_SYNCHRONOUS;
 		else if (strcmp(opt->name, "union") == 0)
 			fsflags |= MNT_UNION;
 		else if (strcmp(opt->name, "export") == 0) {
 			fsflags |= MNT_EXPORTED;
 			jail_export = true;
 		} else if (strcmp(opt->name, "automounted") == 0) {
 			fsflags |= MNT_AUTOMOUNTED;
 			do_freeopt = 1;
 		} else if (strcmp(opt->name, "nocover") == 0) {
 			fsflags |= MNT_NOCOVER;
 			do_freeopt = 1;
 		} else if (strcmp(opt->name, "cover") == 0) {
 			fsflags &= ~MNT_NOCOVER;
 			do_freeopt = 1;
 		} else if (strcmp(opt->name, "emptydir") == 0) {
 			fsflags |= MNT_EMPTYDIR;
 			do_freeopt = 1;
 		} else if (strcmp(opt->name, "noemptydir") == 0) {
 			fsflags &= ~MNT_EMPTYDIR;
 			do_freeopt = 1;
 		}
 		if (do_freeopt)
 			vfs_freeopt(optlist, opt);
 	}
 
 	/*
 	 * Be ultra-paranoid about making sure the type and fspath
 	 * variables will fit in our mp buffers, including the
 	 * terminating NUL.
 	 */
 	if (fstypelen > MFSNAMELEN || fspathlen > MNAMELEN) {
 		error = ENAMETOOLONG;
 		goto bail;
 	}
 
 	/*
 	 * If has_nonexport is true or the caller is not running within a
 	 * vnet prison that can run mountd(8), set jail_export false.
 	 */
 	if (has_nonexport || !jailed(td->td_ucred) ||
 	    !prison_check_nfsd(td->td_ucred))
 		jail_export = false;
 
 	error = vfs_domount(td, fstype, fspath, fsflags, jail_export, &optlist);
 	if (error == ENODEV) {
 		error = EINVAL;
 		vfs_donmount_errmsg(errmsg, errmsg_len, "Invalid fstype");
 		goto bail;
 	}
 
 	/*
 	 * See if we can mount in the read-only mode if the error code suggests
 	 * that it could be possible and the mount options allow for that.
 	 * Never try it if "[no]{ro|rw}" has been explicitly requested and not
 	 * overridden by "autoro".
 	 */
 	if (autoro && vfs_should_downgrade_to_ro_mount(fsflags, error)) {
 		printf("%s: R/W mount failed, possibly R/O media,"
 		    " trying R/O mount\n", __func__);
 		fsflags |= MNT_RDONLY;
 		error = vfs_domount(td, fstype, fspath, fsflags, jail_export,
 		    &optlist);
 	}
 bail:
 	/*
 	 * copyout the errmsg.  Options come in name/value iovec pairs, so
 	 * the value of option number errmsg_pos is iovec 2 * errmsg_pos + 1.
 	 */
 	if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt)
 	    && errmsg_len > 0 && errmsg != NULL) {
 		if (fsoptions->uio_segflg == UIO_SYSSPACE) {
 			bcopy(errmsg,
 			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
 			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
 		} else {
 			(void)copyout(errmsg,
 			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
 			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
 		}
 	}
 
 	/* vfs_domount() clears optlist once the mount has taken it over. */
 	if (optlist != NULL)
 		vfs_freeopts(optlist);
 	return (error);
 }
 
 /*
  * Old mount API.
  *
  * Argument block of the mount system call.  This definition is only
  * compiled when _SYS_SYSPROTO_H_ is not defined.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct mount_args {
 	char	*type;		/* file system type name (copied in by sys_mount) */
 	char	*path;		/* mount path, passed on as the "fspath" option */
 	int	flags;		/* mount flags; widened to 64 bits by sys_mount() */
 	caddr_t	data;		/* handed unchanged to the vfs_cmount method */
 };
 #endif
 /*
  * mount system call entry point (old API).
  *
  * Looks up the file system type named by uap->type, builds the common
  * mount arguments and passes them, together with uap->data, to the
  * vfs_cmount method of that file system.
  *
  * Returns the copyinstr() error for a bad type string, EINVAL when the
  * type is unknown, EOPNOTSUPP when the file system has no vfs_cmount
  * method, and otherwise the result of vfs_cmount.
  */
 /* ARGSUSED */
 int
 sys_mount(struct thread *td, struct mount_args *uap)
 {
 	struct vfsconf *vfsp;
 	struct mntarg *ma;
 	uint64_t flags;
 	char *fstype;
 	int error;
 	bool sbdry;
 
 	/*
 	 * The system call only carries an int worth of flags; widen them
 	 * here so that everything below works on 64-bit mount flags.
 	 */
 	flags = uap->flags;
 
 	AUDIT_ARG_FFLAGS(flags);
 
 	/*
 	 * MNT_ROOTFS is reserved for the kernel mounting its own root file
 	 * system.  Userspace must not be able to set it, and stripping it
 	 * is needed for MNT_UPDATE on the root file system to work.
 	 */
 	flags &= ~MNT_ROOTFS;
 
 	/* Fetch the type name and look up (or load) the file system. */
 	fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK);
 	error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL);
 	if (error != 0) {
 		free(fstype, M_TEMP);
 		return (error);
 	}
 
 	AUDIT_ARG_TEXT(fstype);
 	vfsp = vfs_byname_kld(fstype, td, &error);
 	free(fstype, M_TEMP);
 	if (vfsp == NULL)
 		return (EINVAL);
 
 	/* VFCF_SBDRY selects which vfsops table supplies vfs_cmount. */
 	sbdry = (vfsp->vfc_flags & VFCF_SBDRY) != 0;
 	if (sbdry) {
 		if (vfsp->vfc_vfsops_sd->vfs_cmount == NULL)
 			return (EOPNOTSUPP);
 	} else {
 		if (vfsp->vfc_vfsops->vfs_cmount == NULL)
 			return (EOPNOTSUPP);
 	}
 
 	/* Arguments common to every file system. */
 	ma = mount_argsu(NULL, "fstype", uap->type, MFSNAMELEN);
 	ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN);
 	ma = mount_argb(ma, flags & MNT_RDONLY, "noro");
 	ma = mount_argb(ma, !(flags & MNT_NOSUID), "nosuid");
 	ma = mount_argb(ma, !(flags & MNT_NOEXEC), "noexec");
 
 	if (sbdry)
 		return (vfsp->vfc_vfsops_sd->vfs_cmount(ma, uap->data, flags));
 	return (vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, flags));
 }
 
 /*
  * vfs_domount_first(): first file system mount (not update)
  *
  * vp must be exclusively locked on entry (asserted) and fsflags must not
  * contain MNT_UPDATE.  vp is unlocked and released on every return path:
  * with vput() on the early failures, with vrele() after a failed
  * VFS_MOUNT/VFS_STATFS/VFS_ROOT, and it stays referenced through
  * mnt_vnodecovered on success.
  *
  * On success *optlist is handed over to the new mount (mnt_opt) and set
  * to NULL, and 0 is returned.  On failure an errno value is returned and
  * *optlist is left for the caller.
  */
 static int
 vfs_domount_first(
 	struct thread *td,		/* Calling thread. */
 	struct vfsconf *vfsp,		/* File system type. */
 	char *fspath,			/* Mount path. */
 	struct vnode *vp,		/* Vnode to be covered. */
 	uint64_t fsflags,		/* Flags common to all filesystems. */
 	struct vfsoptlist **optlist	/* Options local to the filesystem. */
 	)
 {
 	struct vattr va;
 	struct mount *mp;
 	struct vnode *newdp, *rootvp;
 	int error, error1;
 	bool unmounted;
 
 	ASSERT_VOP_ELOCKED(vp, __func__);
 	KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here"));
 
 	/*
 	 * If the jail of the calling thread lacks permission for this type of
 	 * file system, or is trying to cover its own root, deny immediately.
 	 */
 	if (jailed(td->td_ucred) && (!prison_allow(td->td_ucred,
 	    vfsp->vfc_prison_flag) || vp == td->td_ucred->cr_prison->pr_root)) {
 		vput(vp);
 		return (EPERM);
 	}
 
 	/*
 	 * If the user is not root, ensure that they own the directory
 	 * onto which we are attempting to mount.
 	 */
 	error = VOP_GETATTR(vp, &va, td->td_ucred);
 	if (error == 0 && va.va_uid != td->td_ucred->cr_uid)
 		error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN);
 	if (error == 0)
 		error = vinvalbuf(vp, V_SAVE, 0, 0);
 	if (vfsp->vfc_flags & VFCF_FILEMOUNT) {
 		if (error == 0 && vp->v_type != VDIR && vp->v_type != VREG)
 			error = EINVAL;
 		/*
 		 * For file mounts, ensure that there is only one hardlink to the file.
 		 */
 		if (error == 0 && vp->v_type == VREG && va.va_nlink != 1)
 			error = EINVAL;
 	} else {
 		if (error == 0 && vp->v_type != VDIR)
 			error = ENOTDIR;
 	}
 	if (error == 0 && (fsflags & MNT_EMPTYDIR) != 0)
 		error = vn_dir_check_empty(vp);
 	if (error == 0) {
 		/*
 		 * Claim the vnode with VI_MOUNT; fail with EBUSY if another
 		 * mount is in progress or something is already mounted here.
 		 */
 		VI_LOCK(vp);
 		if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL)
 			vp->v_iflag |= VI_MOUNT;
 		else
 			error = EBUSY;
 		VI_UNLOCK(vp);
 	}
 	if (error != 0) {
 		vput(vp);
 		return (error);
 	}
 	vn_seqc_write_begin(vp);
 	VOP_UNLOCK(vp);
 
 	/* Allocate and initialize the filesystem. */
 	mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred);
 	/* XXXMAC: pass to vfs_mount_alloc? */
 	mp->mnt_optnew = *optlist;
 	/* Set the mount level flags. */
 	mp->mnt_flag = (fsflags &
 	    (MNT_UPDATEMASK | MNT_ROOTFS | MNT_RDONLY | MNT_FORCE));
 
 	/*
 	 * Mount the filesystem.
 	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
 	 * get.  No freeing of cn_pnbuf.
 	 */
 	error1 = 0;
 	unmounted = true;
 	if ((error = VFS_MOUNT(mp)) != 0 ||
 	    (error1 = VFS_STATFS(mp, &mp->mnt_stat)) != 0 ||
 	    (error1 = VFS_ROOT(mp, LK_EXCLUSIVE, &newdp)) != 0) {
 		rootvp = NULL;
 		if (error1 != 0) {
 			/*
 			 * VFS_MOUNT succeeded but a later step failed:
 			 * roll the mount back with VFS_UNMOUNT.
 			 */
 			MPASS(error == 0);
 			rootvp = vfs_cache_root_clear(mp);
 			if (rootvp != NULL) {
 				vhold(rootvp);
 				vrele(rootvp);
 			}
 			(void)vn_start_write(NULL, &mp, V_WAIT);
 			MNT_ILOCK(mp);
 			mp->mnt_kern_flag |= MNTK_UNMOUNT | MNTK_UNMOUNTF;
 			MNT_IUNLOCK(mp);
 			VFS_PURGE(mp);
 			error = VFS_UNMOUNT(mp, 0);
 			vn_finished_write(mp);
 			if (error != 0) {
 				printf(
 		    "failed post-mount (%d): rollback unmount returned %d\n",
 				    error1, error);
 				unmounted = false;
 			}
 			/* Report the original failure, not the rollback's. */
 			error = error1;
 		}
 		vfs_unbusy(mp);
 		mp->mnt_vnodecovered = NULL;
 		/* The mount is only destroyed if the rollback succeeded. */
 		if (unmounted) {
 			/* XXXKIB wait for mnt_lockref drain? */
 			vfs_mount_destroy(mp);
 		}
 		VI_LOCK(vp);
 		vp->v_iflag &= ~VI_MOUNT;
 		VI_UNLOCK(vp);
 		if (rootvp != NULL) {
 			vn_seqc_write_end(rootvp);
 			vdrop(rootvp);
 		}
 		vn_seqc_write_end(vp);
 		vrele(vp);
 		return (error);
 	}
 	vn_seqc_write_begin(newdp);
 	VOP_UNLOCK(newdp);
 
 	/* The new options become the mount's options; the caller loses them. */
 	if (mp->mnt_opt != NULL)
 		vfs_freeopts(mp->mnt_opt);
 	mp->mnt_opt = mp->mnt_optnew;
 	*optlist = NULL;
 
 	/*
 	 * Prevent external consumers of mount options from reading mnt_optnew.
 	 */
 	mp->mnt_optnew = NULL;
 
 	MNT_ILOCK(mp);
 	if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
 	    (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
 		mp->mnt_kern_flag |= MNTK_ASYNC;
 	else
 		mp->mnt_kern_flag &= ~MNTK_ASYNC;
 	MNT_IUNLOCK(mp);
 
 	/*
 	 * VIRF_MOUNTPOINT and v_mountedhere need to be set under the
 	 * vp lock to satisfy vfs_lookup() requirements.
 	 */
 	VOP_LOCK(vp, LK_EXCLUSIVE | LK_RETRY);
 	VI_LOCK(vp);
 	vn_irflag_set_locked(vp, VIRF_MOUNTPOINT);
 	vp->v_mountedhere = mp;
 	VI_UNLOCK(vp);
 	VOP_UNLOCK(vp);
 	cache_purge(vp);
 
 	/*
 	 * We need to lock both vnodes.
 	 *
 	 * Use vn_lock_pair to avoid establishing an ordering between vnodes
 	 * from different filesystems.
 	 */
 	vn_lock_pair(vp, false, LK_EXCLUSIVE, newdp, false, LK_EXCLUSIVE);
 
 	VI_LOCK(vp);
 	vp->v_iflag &= ~VI_MOUNT;
 	VI_UNLOCK(vp);
 	/* Place the new filesystem at the end of the mount list. */
 	mtx_lock(&mountlist_mtx);
 	TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
 	mtx_unlock(&mountlist_mtx);
 	vfs_event_signal(NULL, VQ_MOUNT, 0);
 	VOP_UNLOCK(vp);
 	EVENTHANDLER_DIRECT_INVOKE(vfs_mounted, mp, newdp, td);
 	VOP_UNLOCK(newdp);
 	mount_devctl_event("MOUNT", mp, false);
 	mountcheckdirs(vp, newdp);
 	vn_seqc_write_end(vp);
 	vn_seqc_write_end(newdp);
 	vrele(newdp);
 	if ((mp->mnt_flag & MNT_RDONLY) == 0)
 		vfs_allocate_syncvnode(mp);
 	vfs_op_exit(mp);
 	vfs_unbusy(mp);
 	return (0);
 }
 
 /*
  * vfs_domount_update(): update of mounted file system
  *
  * vp must be exclusively locked on entry (asserted) and fsflags must
  * contain MNT_UPDATE.  vp is unlocked and released on every return path.
  *
  * The new options are merged with the mount's current ones, VFS_MOUNT()
  * is called (unless jail_export is set) and an "export" option, if
  * present, is processed.  If VFS_MOUNT() fails the old mount flags are
  * restored, apart from MNT_QUOTA, whose current value is kept.
  *
  * On success *optlist is handed over to the mount and set to NULL.
  * Returns the VFS_MOUNT()/validation error if there is one, otherwise
  * the result of the export processing (0 when all went well).
  */
 static int
 vfs_domount_update(
 	struct thread *td,		/* Calling thread. */
 	struct vnode *vp,		/* Mount point vnode. */
 	uint64_t fsflags,		/* Flags common to all filesystems. */
 	bool jail_export,		/* Got export option in vnet prison. */
 	struct vfsoptlist **optlist	/* Options local to the filesystem. */
 	)
 {
 	struct export_args export;
 	struct o2export_args o2export;
 	struct vnode *rootvp;
 	void *bufp;
 	struct mount *mp;
 	int error, export_error, i, len, fsid_up_len;
 	uint64_t flag;
 	gid_t *grps;
 	fsid_t *fsid_up;
 	bool vfs_suser_failed;
 
 	ASSERT_VOP_ELOCKED(vp, __func__);
 	KASSERT((fsflags & MNT_UPDATE) != 0, ("MNT_UPDATE should be here"));
 	mp = vp->v_mount;
 
 	/*
 	 * Updates must name the root vnode of the mount.  An export request
 	 * on any other vnode gets EXDEV, everything else EINVAL.
 	 */
 	if ((vp->v_vflag & VV_ROOT) == 0) {
 		if (vfs_copyopt(*optlist, "export", &export, sizeof(export))
 		    == 0)
 			error = EXDEV;
 		else
 			error = EINVAL;
 		vput(vp);
 		return (error);
 	}
 
 	/*
 	 * We only allow the filesystem to be reloaded if it
 	 * is currently mounted read-only.
 	 */
 	flag = mp->mnt_flag;
 	if ((fsflags & MNT_RELOAD) != 0 && (flag & MNT_RDONLY) == 0) {
 		vput(vp);
 		return (EOPNOTSUPP);	/* Needs translation */
 	}
 	/*
 	 * Only privileged root, or (if MNT_USER is set) the user that
 	 * did the original mount is permitted to update it.
 	 */
 	/*
 	 * For the case of mountd(8) doing exports in a jail, the vfs_suser()
 	 * call does not cause failure.  vfs_domount() has already checked
 	 * that "root" is doing this and vfs_suser() will fail when
 	 * the file system has been mounted outside the jail.
 	 * jail_export set true indicates that "export" is not mixed
 	 * with other options that change mount behaviour.
 	 */
 	vfs_suser_failed = false;
 	error = vfs_suser(mp, td);
 	if (jail_export && error != 0) {
 		error = 0;
 		vfs_suser_failed = true;
 	}
 	if (error != 0) {
 		vput(vp);
 		return (error);
 	}
 	if (vfs_busy(mp, MBF_NOWAIT)) {
 		vput(vp);
 		return (EBUSY);
 	}
 	VI_LOCK(vp);
 	if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) {
 		VI_UNLOCK(vp);
 		vfs_unbusy(mp);
 		vput(vp);
 		return (EBUSY);
 	}
 	vp->v_iflag |= VI_MOUNT;
 	VI_UNLOCK(vp);
 	VOP_UNLOCK(vp);
 
 	rootvp = NULL;
 	vfs_op_enter(mp);
 	vn_seqc_write_begin(vp);
 
 	/*
 	 * An optional "fsid" option must match the mount's file system id;
 	 * once checked it is removed from the option list.
 	 */
 	if (vfs_getopt(*optlist, "fsid", (void **)&fsid_up,
 	    &fsid_up_len) == 0) {
 		if (fsid_up_len != sizeof(*fsid_up)) {
 			error = EINVAL;
 			goto end;
 		}
 		if (fsidcmp(fsid_up, &mp->mnt_stat.f_fsid) != 0) {
 			error = ENOENT;
 			goto end;
 		}
 		vfs_deleteopt(*optlist, "fsid");
 	}
 
 	MNT_ILOCK(mp);
 	if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
 		MNT_IUNLOCK(mp);
 		error = EBUSY;
 		goto end;
 	}
 	if (vfs_suser_failed) {
 		KASSERT((fsflags & (MNT_EXPORTED | MNT_UPDATE)) ==
 		    (MNT_EXPORTED | MNT_UPDATE),
 		    ("%s: jailed export did not set expected fsflags",
 		     __func__));
 		/*
 		 * For this case, only MNT_UPDATE and
 		 * MNT_EXPORTED have been set in fsflags
 		 * by the options.  Only set MNT_UPDATE,
 		 * since that is the one that would be set
 		 * when set in fsflags, below.
 		 */
 		mp->mnt_flag |= MNT_UPDATE;
 	} else {
 		mp->mnt_flag &= ~MNT_UPDATEMASK;
 		mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE |
 		    MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY);
 		if ((mp->mnt_flag & MNT_ASYNC) == 0)
 			mp->mnt_kern_flag &= ~MNTK_ASYNC;
 	}
 	rootvp = vfs_cache_root_clear(mp);
 	MNT_IUNLOCK(mp);
 	mp->mnt_optnew = *optlist;
 	vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt);
 
 	/*
 	 * Mount the filesystem.
 	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
 	 * get.  No freeing of cn_pnbuf.
 	 */
 	/*
 	 * For the case of mountd(8) doing exports from within a vnet jail,
 	 * "from" is typically not set correctly such that VFS_MOUNT() will
 	 * return ENOENT. It is not obvious that VFS_MOUNT() ever needs to be
 	 * called when mountd is doing exports, but this check only applies to
 	 * the specific case where it is running inside a vnet jail, to
 	 * avoid any POLA violation.
 	 */
 	error = 0;
 	if (!jail_export)
 		error = VFS_MOUNT(mp);
 
 	export_error = 0;
 	/* Process the export option. */
 	if (error == 0 && vfs_getopt(mp->mnt_optnew, "export", &bufp,
 	    &len) == 0) {
 		/* Assume that there is only 1 ABI for each length. */
 		switch (len) {
 		case (sizeof(struct oexport_args)):
 			/*
 			 * The oldest layout is shorter than o2export; clear
 			 * the whole structure first so the tail reads as 0.
 			 */
 			bzero(&o2export, sizeof(o2export));
 			/* FALLTHROUGH */
 		case (sizeof(o2export)):
 			/* Convert the older layout field by field. */
 			bcopy(bufp, &o2export, len);
 			export.ex_flags = (uint64_t)o2export.ex_flags;
 			export.ex_root = o2export.ex_root;
 			export.ex_uid = o2export.ex_anon.cr_uid;
 			export.ex_groups = NULL;
 			export.ex_ngroups = o2export.ex_anon.cr_ngroups;
 			if (export.ex_ngroups > 0) {
 				if (export.ex_ngroups <= XU_NGROUPS) {
 					export.ex_groups = malloc(
 					    export.ex_ngroups * sizeof(gid_t),
 					    M_TEMP, M_WAITOK);
 					for (i = 0; i < export.ex_ngroups; i++)
 						export.ex_groups[i] =
 						  o2export.ex_anon.cr_groups[i];
 				} else
 					export_error = EINVAL;
 			} else if (export.ex_ngroups < 0)
 				export_error = EINVAL;
 			export.ex_addr = o2export.ex_addr;
 			export.ex_addrlen = o2export.ex_addrlen;
 			export.ex_mask = o2export.ex_mask;
 			export.ex_masklen = o2export.ex_masklen;
 			export.ex_indexfile = o2export.ex_indexfile;
 			export.ex_numsecflavors = o2export.ex_numsecflavors;
 			if (export.ex_numsecflavors < MAXSECFLAVORS) {
 				for (i = 0; i < export.ex_numsecflavors; i++)
 					export.ex_secflavors[i] =
 					    o2export.ex_secflavors[i];
 			} else
 				export_error = EINVAL;
 			if (export_error == 0)
 				export_error = vfs_export(mp, &export, true);
 			free(export.ex_groups, M_TEMP);
 			break;
 		case (sizeof(export)):
 			/*
 			 * Current layout: ex_groups is fetched with copyin()
 			 * into a temporary kernel array.
 			 */
 			bcopy(bufp, &export, len);
 			grps = NULL;
 			if (export.ex_ngroups > 0) {
 				if (export.ex_ngroups <= NGROUPS_MAX) {
 					grps = malloc(export.ex_ngroups *
 					    sizeof(gid_t), M_TEMP, M_WAITOK);
 					export_error = copyin(export.ex_groups,
 					    grps, export.ex_ngroups *
 					    sizeof(gid_t));
 					if (export_error == 0)
 						export.ex_groups = grps;
 				} else
 					export_error = EINVAL;
 			} else if (export.ex_ngroups == 0)
 				export.ex_groups = NULL;
 			else
 				export_error = EINVAL;
 			if (export_error == 0)
 				export_error = vfs_export(mp, &export, true);
 			free(grps, M_TEMP);
 			break;
 		default:
 			export_error = EINVAL;
 			break;
 		}
 	}
 
 	MNT_ILOCK(mp);
 	if (error == 0) {
 		mp->mnt_flag &=	~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE |
 		    MNT_SNAPSHOT);
 	} else {
 		/*
 		 * If we fail, restore old mount flags. MNT_QUOTA is special,
 		 * because it is not part of MNT_UPDATEMASK, but it could have
 		 * changed in the meantime if quotactl(2) was called.
 		 * All in all we want current value of MNT_QUOTA, not the old
 		 * one.
 		 */
 		mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
 	}
 	if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
 	    (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
 		mp->mnt_kern_flag |= MNTK_ASYNC;
 	else
 		mp->mnt_kern_flag &= ~MNTK_ASYNC;
 	MNT_IUNLOCK(mp);
 
 	if (error != 0)
 		goto end;
 
 	mount_devctl_event("REMOUNT", mp, true);
 	/* The merged options replace the mount's previous option list. */
 	if (mp->mnt_opt != NULL)
 		vfs_freeopts(mp->mnt_opt);
 	mp->mnt_opt = mp->mnt_optnew;
 	*optlist = NULL;
 	(void)VFS_STATFS(mp, &mp->mnt_stat);
 	/*
 	 * Prevent external consumers of mount options from reading
 	 * mnt_optnew.
 	 */
 	mp->mnt_optnew = NULL;
 
 	if ((mp->mnt_flag & MNT_RDONLY) == 0)
 		vfs_allocate_syncvnode(mp);
 	else
 		vfs_deallocate_syncvnode(mp);
 end:
 	/* Common exit: undo vfs_op_enter(), the busy state and VI_MOUNT. */
 	vfs_op_exit(mp);
 	if (rootvp != NULL) {
 		vn_seqc_write_end(rootvp);
 		vrele(rootvp);
 	}
 	vn_seqc_write_end(vp);
 	vfs_unbusy(mp);
 	VI_LOCK(vp);
 	vp->v_iflag &= ~VI_MOUNT;
 	VI_UNLOCK(vp);
 	vrele(vp);
 	/* A mount error takes precedence over an export error. */
 	return (error != 0 ? error : export_error);
 }
 
 /*
  * vfs_domount(): actually attempt a filesystem mount.
  *
  * Checks the name lengths and the caller's privileges, resolves fspath
  * with namei() and then dispatches to vfs_domount_first() for a new mount
  * or to vfs_domount_update() when MNT_UPDATE is set in fsflags.
  *
  * Returns 0 on success or an errno value.
  */
 static int
 vfs_domount(
 	struct thread *td,		/* Calling thread. */
 	const char *fstype,		/* Filesystem type. */
 	char *fspath,			/* Mount path. */
 	uint64_t fsflags,		/* Flags common to all filesystems. */
 	bool jail_export,		/* Got export option in vnet prison. */
 	struct vfsoptlist **optlist	/* Options local to the filesystem. */
 	)
 {
 	struct vfsconf *vfsp;
 	struct nameidata nd;
 	struct vnode *vp;
 	char *pathbuf;
 	int error;
 
 	/*
 	 * Be ultra-paranoid about making sure the type and fspath
 	 * variables will fit in our mp buffers, including the
 	 * terminating NUL.
 	 */
 	if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
 		return (ENAMETOOLONG);
 
 	/*
 	 * A jail export request is checked against PRIV_NFS_DAEMON only;
 	 * otherwise PRIV_VFS_MOUNT is required for jailed callers or when
 	 * user mounts are disabled.
 	 */
 	if (jail_export) {
 		error = priv_check(td, PRIV_NFS_DAEMON);
 		if (error)
 			return (error);
 	} else if (jailed(td->td_ucred) || usermount == 0) {
 		if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0)
 			return (error);
 	}
 
 	/*
 	 * Do not allow NFS export or MNT_SUIDDIR by unprivileged users.
 	 */
 	if (fsflags & MNT_EXPORTED) {
 		error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED);
 		if (error)
 			return (error);
 	}
 	if (fsflags & MNT_SUIDDIR) {
 		error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR);
 		if (error)
 			return (error);
 	}
 	/*
 	 * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users.
 	 */
 	if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) {
 		if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0)
 			fsflags |= MNT_NOSUID | MNT_USER;
 	}
 
 	/* Load KLDs before we lock the covered vnode to avoid reversals. */
 	/* vfsp stays NULL for MNT_UPDATE; only vfs_domount_first() uses it. */
 	vfsp = NULL;
 	if ((fsflags & MNT_UPDATE) == 0) {
 		/* Don't try to load KLDs if we're mounting the root. */
 		if (fsflags & MNT_ROOTFS) {
 			if ((vfsp = vfs_byname(fstype)) == NULL)
 				return (ENODEV);
 		} else {
 			if ((vfsp = vfs_byname_kld(fstype, td, &error)) == NULL)
 				return (error);
 		}
 	}
 
 	/*
 	 * Get vnode to be covered or mount point's vnode in case of MNT_UPDATE.
 	 * WANTPARENT is requested, so nd.ni_dvp must be released at "out".
 	 */
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1 | WANTPARENT,
 	    UIO_SYSSPACE, fspath);
 	error = namei(&nd);
 	if (error != 0)
 		return (error);
 	vp = nd.ni_vp;
 	/*
 	 * Don't allow stacking file mounts to work around problems with the way
 	 * that namei sets nd.ni_dvp to vp_crossmp for these.
 	 */
 	if (vp->v_type == VREG)
 		fsflags |= MNT_NOCOVER;
 	if ((fsflags & MNT_UPDATE) == 0) {
 		/* MNT_NOCOVER refuses to mount on top of a filesystem root. */
 		if ((vp->v_vflag & VV_ROOT) != 0 &&
 		    (fsflags & MNT_NOCOVER) != 0) {
 			vput(vp);
 			error = EBUSY;
 			goto out;
 		}
 		/* The length check at the top guarantees fspath fits. */
 		pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
 		strcpy(pathbuf, fspath);
 		/*
 		 * Note: we allow any vnode type here. If the path sanity check
 		 * succeeds, the type will be validated in vfs_domount_first
 		 * above.
 		 */
 		if (vp->v_type == VDIR)
 			error = vn_path_to_global_path(td, vp, pathbuf,
 			    MNAMELEN);
 		else
 			error = vn_path_to_global_path_hardlink(td, vp,
 			    nd.ni_dvp, pathbuf, MNAMELEN,
 			    nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen);
 		/*
 		 * NOTE(review): vp is not released here when the path
 		 * conversion fails; presumably the helpers dispose of it on
 		 * error -- confirm.
 		 */
 		if (error == 0) {
 			error = vfs_domount_first(td, vfsp, pathbuf, vp,
 			    fsflags, optlist);
 		}
 		free(pathbuf, M_TEMP);
 	} else
 		error = vfs_domount_update(td, vp, fsflags, jail_export,
 		    optlist);
 
 out:
 	NDFREE_PNBUF(&nd);
 	vrele(nd.ni_dvp);
 
 	return (error);
 }
 
 /*
  * unmount(2) system call entry point.
  *
  * The path argument names the vnode that is mounted on, not the special
  * file (as it did historically).  All of the work is done by
  * kern_unmount(); this wrapper only unpacks the syscall arguments.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct unmount_args {
 	char	*path;
 	int	flags;
 };
 #endif
 /* ARGSUSED */
 int
 sys_unmount(struct thread *td, struct unmount_args *uap)
 {
 	const char *upath;
 	int uflags;
 
 	upath = uap->path;
 	uflags = uap->flags;
 	return (kern_unmount(td, upath, uflags));
 }
 
 /*
  * Locate the mount described by path and hand it to dounmount().
  *
  * path is copied in with copyinstr().  With MNT_BYFSID in flags it is
  * parsed as "FSID:<val0>:<val1>" and looked up with vfs_getvfs();
  * otherwise it is compared against f_mntonname of the entries on the
  * mount list.  Mounts flagged MNT_ROOTFS are refused with EINVAL.
  *
  * Returns 0 or an errno value.
  */
 int
 kern_unmount(struct thread *td, const char *path, int flags)
 {
 	struct nameidata nd;
 	struct mount *mp;
 	char *fsidbuf, *pathbuf;
 	fsid_t fsid;
 	int error;
 
 	AUDIT_ARG_VALUE(flags);
 	if (jailed(td->td_ucred) || usermount == 0) {
 		error = priv_check(td, PRIV_VFS_UNMOUNT);
 		if (error)
 			return (error);
 	}
 
 	if (flags & MNT_BYFSID) {
 		fsidbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
 		error = copyinstr(path, fsidbuf, MNAMELEN, NULL);
 		if (error) {
 			free(fsidbuf, M_TEMP);
 			return (error);
 		}
 
 		AUDIT_ARG_TEXT(fsidbuf);
 		/* Decode the filesystem ID. */
 		if (sscanf(fsidbuf, "FSID:%d:%d", &fsid.val[0], &fsid.val[1]) != 2) {
 			free(fsidbuf, M_TEMP);
 			return (EINVAL);
 		}
 
 		/*
 		 * NOTE(review): mp is later passed to vfs_rel()/dounmount()
 		 * without an explicit vfs_ref() on this branch; presumably
 		 * vfs_getvfs() returns it referenced -- confirm.
 		 */
 		mp = vfs_getvfs(&fsid);
 		free(fsidbuf, M_TEMP);
 		if (mp == NULL) {
 			return (ENOENT);
 		}
 	} else {
 		pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
 		error = copyinstr(path, pathbuf, MNAMELEN, NULL);
 		if (error) {
 			free(pathbuf, M_TEMP);
 			return (error);
 		}
 
 		/*
 		 * Try to find global path for path argument.
 		 * A failed lookup or conversion is not treated as an error
 		 * here: the result is not checked before the mount list
 		 * search below.
 		 */
 		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
 		    UIO_SYSSPACE, pathbuf);
 		if (namei(&nd) == 0) {
 			NDFREE_PNBUF(&nd);
 			error = vn_path_to_global_path(td, nd.ni_vp, pathbuf,
 			    MNAMELEN);
 			/*
 			 * NOTE(review): the vnode is only released here on
 			 * success; presumably vn_path_to_global_path()
 			 * disposes of it on failure -- confirm.
 			 */
 			if (error == 0)
 				vput(nd.ni_vp);
 		}
 		/* Search from the tail of the mount list for an exact match. */
 		mtx_lock(&mountlist_mtx);
 		TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
 			if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0) {
 				/* Take the reference that dounmount() is given. */
 				vfs_ref(mp);
 				break;
 			}
 		}
 		mtx_unlock(&mountlist_mtx);
 		free(pathbuf, M_TEMP);
 		if (mp == NULL) {
 			/*
 			 * Previously we returned ENOENT for a nonexistent path and
 			 * EINVAL for a non-mountpoint.  We cannot tell these apart
 			 * now, so in the !MNT_BYFSID case return the more likely
 			 * EINVAL for compatibility.
 			 */
 			return (EINVAL);
 		}
 	}
 
 	/*
 	 * Don't allow unmounting the root filesystem.
 	 */
 	if (mp->mnt_flag & MNT_ROOTFS) {
 		vfs_rel(mp);
 		return (EINVAL);
 	}
 	error = dounmount(mp, flags, td);
 	return (error);
 }
 
 /*
  * Return error if any of the vnodes, ignoring the root vnode
  * and the syncer vnode, have non-zero usecount.
  *
  * This function is purely advisory - it can return false positives
  * and negatives.
  */
 static int
 vfs_check_usecounts(struct mount *mp)
 {
 	struct vnode *vp, *mvp;
 
 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 		if ((vp->v_vflag & VV_ROOT) == 0 && vp->v_type != VNON &&
 		    vp->v_usecount != 0) {
 			VI_UNLOCK(vp);
 			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 			return (EBUSY);
 		}
 		VI_UNLOCK(vp);
 	}
 
 	return (0);
 }
 
 /*
  * Back out of an unmount attempt on behalf of dounmount().
  *
  * Must be called with the mount interlock held (asserted below); the
  * interlock is released before returning.  mntkflags is the set of
  * mnt_kern_flag bits to clear.  coveredvp, when not NULL, is unlocked and
  * its hold is dropped.
  */
 static void
 dounmount_cleanup(struct mount *mp, struct vnode *coveredvp, int mntkflags)
 {
 
 	mtx_assert(MNT_MTX(mp), MA_OWNED);
 	mp->mnt_kern_flag &= ~mntkflags;
 	/* Wake any thread sleeping on mp with MNTK_MWAIT set. */
 	if ((mp->mnt_kern_flag & MNTK_MWAIT) != 0) {
 		mp->mnt_kern_flag &= ~MNTK_MWAIT;
 		wakeup(mp);
 	}
 	vfs_op_exit_locked(mp);
 	MNT_IUNLOCK(mp);
 	if (coveredvp != NULL) {
 		VOP_UNLOCK(coveredvp);
 		vdrop(coveredvp);
 	}
 	/* Pair with the vn_start_write() and the reference held by dounmount(). */
 	vn_finished_write(mp);
 	vfs_rel(mp);
 }
 
 /*
  * There are various reference counters associated with the mount point.
  * Normally it is permitted to modify them without taking the mnt ilock,
  * but this behavior can be temporarily disabled if stable value is needed
  * or callers are expected to block (e.g. to not allow new users during
  * forced unmount).
  *
  * vfs_op_enter() bumps mnt_vfs_ops.  Only the first entrant (count goes
  * from 0 to 1) runs the barrier and folds the per-CPU ref, lockref and
  * writeopcount values into the fields of struct mount, zeroing the per-CPU
  * copies.  Paired with vfs_op_exit()/vfs_op_exit_locked().
  */
 void
 vfs_op_enter(struct mount *mp)
 {
 	struct mount_pcpu *mpcpu;
 	int cpu;
 
 	MNT_ILOCK(mp);
 	mp->mnt_vfs_ops++;
 	/* Nested entry: the counters were already folded by the first caller. */
 	if (mp->mnt_vfs_ops > 1) {
 		MNT_IUNLOCK(mp);
 		return;
 	}
 	vfs_op_barrier_wait(mp);
 	CPU_FOREACH(cpu) {
 		mpcpu = vfs_mount_pcpu_remote(mp, cpu);
 
 		mp->mnt_ref += mpcpu->mntp_ref;
 		mpcpu->mntp_ref = 0;
 
 		mp->mnt_lockref += mpcpu->mntp_lockref;
 		mpcpu->mntp_lockref = 0;
 
 		mp->mnt_writeopcount += mpcpu->mntp_writeopcount;
 		mpcpu->mntp_writeopcount = 0;
 	}
 	MPASSERT(mp->mnt_ref > 0 && mp->mnt_lockref >= 0 &&
 	    mp->mnt_writeopcount >= 0, mp,
 	    ("invalid count(s): ref %d lockref %d writeopcount %d",
 	    mp->mnt_ref, mp->mnt_lockref, mp->mnt_writeopcount));
 	MNT_IUNLOCK(mp);
 	vfs_assert_mount_counters(mp);
 }
 
 /*
  * Drop one mnt_vfs_ops count taken by vfs_op_enter().
  *
  * The caller must hold the mount interlock (asserted).  The assertions
  * also check that the count is positive and that it does not reach zero
  * while MNTK_UNMOUNT or MNTK_SUSPEND is set.
  */
 void
 vfs_op_exit_locked(struct mount *mp)
 {
 
 	mtx_assert(MNT_MTX(mp), MA_OWNED);
 
 	MPASSERT(mp->mnt_vfs_ops > 0, mp,
 	    ("invalid vfs_ops count %d", mp->mnt_vfs_ops));
 	MPASSERT(mp->mnt_vfs_ops > 1 ||
 	    (mp->mnt_kern_flag & (MNTK_UNMOUNT | MNTK_SUSPEND)) == 0, mp,
 	    ("vfs_ops too low %d in unmount or suspend", mp->mnt_vfs_ops));
 	mp->mnt_vfs_ops--;
 }
 
 /*
  * Unlocked variant of vfs_op_exit_locked(): takes the mount interlock,
  * drops one mnt_vfs_ops count and releases the interlock again.
  */
 void
 vfs_op_exit(struct mount *mp)
 {
 
 	MNT_ILOCK(mp);
 	vfs_op_exit_locked(mp);
 	MNT_IUNLOCK(mp);
 }
 
 /*
  * Argument bundle for vfs_op_barrier_wait().  The rendezvous callbacks
  * receive a pointer to srcra and recover the enclosing structure (and
  * thus mp) with __containerof().
  */
 struct vfs_op_barrier_ipi {
 	struct mount *mp;	/* Mount the barrier is run for. */
 	struct smp_rendezvous_cpus_retry_arg srcra;	/* Passed to the rendezvous. */
 };
 
 /*
  * Rendezvous action callback for vfs_op_barrier_wait().
  *
  * arg points at the srcra member of a struct vfs_op_barrier_ipi.  The
  * rendezvous is reported as done unless vfs_op_thread_entered() says a
  * thread is inside the mount's op section.
  */
 static void
 vfs_op_action_func(void *arg)
 {
 	struct vfs_op_barrier_ipi *ipi;
 	struct mount *mp;
 
 	ipi = __containerof(arg, struct vfs_op_barrier_ipi, srcra);
 	mp = ipi->mp;
 
 	if (vfs_op_thread_entered(mp))
 		return;
 	smp_rendezvous_cpus_done(arg);
 }
 
 /*
  * Rendezvous wait callback for vfs_op_barrier_wait().
  *
  * Spins until the mntp_thread_in_ops field of the given CPU's per-CPU
  * mount data reads as zero.
  */
 static void
 vfs_op_wait_func(void *arg, int cpu)
 {
 	struct vfs_op_barrier_ipi *ipi;
 	struct mount_pcpu *remote;
 	struct mount *mp;
 
 	ipi = __containerof(arg, struct vfs_op_barrier_ipi, srcra);
 	mp = ipi->mp;
 	remote = vfs_mount_pcpu_remote(mp, cpu);
 
 	while (atomic_load_int(&remote->mntp_thread_in_ops) != 0)
 		cpu_spinwait();
 }
 
 /*
  * Run a rendezvous across all CPUs for mp, using vfs_op_action_func() as
  * the action and vfs_op_wait_func() as the wait callback.
  *
  * NOTE(review): the intent appears to be that no thread is still inside
  * the mount's per-CPU op section when this returns -- confirm against
  * smp_rendezvous_cpus_retry().
  */
 void
 vfs_op_barrier_wait(struct mount *mp)
 {
 	struct vfs_op_barrier_ipi vfsopipi;
 
 	/* Only mp is set here; srcra is handed to the rendezvous as-is. */
 	vfsopipi.mp = mp;
 
 	smp_rendezvous_cpus_retry(all_cpus,
 	    smp_no_rendezvous_barrier,
 	    vfs_op_action_func,
 	    smp_no_rendezvous_barrier,
 	    vfs_op_wait_func,
 	    &vfsopipi.srcra);
 }
 
 #ifdef DIAGNOSTIC
 /*
  * Diagnostic check: while mnt_vfs_ops is non-zero every per-CPU ref,
  * lockref and writeopcount value is expected to be zero.  Any CPU that
  * violates this triggers vfs_dump_mount_counters().
  */
 void
 vfs_assert_mount_counters(struct mount *mp)
 {
 	struct mount_pcpu *pc;
 	int cpu;
 
 	if (mp->mnt_vfs_ops != 0) {
 		CPU_FOREACH(cpu) {
 			pc = vfs_mount_pcpu_remote(mp, cpu);
 			if (pc->mntp_ref == 0 && pc->mntp_lockref == 0 &&
 			    pc->mntp_writeopcount == 0)
 				continue;
 			vfs_dump_mount_counters(mp);
 		}
 	}
 }
 
 /*
  * Print the per-CPU and struct-level ref, lockref and writeopcount
  * values of a mount together with their totals, then panic.
  */
 void
 vfs_dump_mount_counters(struct mount *mp)
 {
 	struct mount_pcpu *pc;
 	int total_ref, total_lockref, total_writeops;
 	int cpu;
 
 	printf("%s: mp %p vfs_ops %d\n", __func__, mp, mp->mnt_vfs_ops);
 
 	/* One row per counter: every CPU's value, summed as we go. */
 	total_ref = mp->mnt_ref;
 	printf("        ref : ");
 	CPU_FOREACH(cpu) {
 		pc = vfs_mount_pcpu_remote(mp, cpu);
 		total_ref += pc->mntp_ref;
 		printf("%d ", pc->mntp_ref);
 	}
 	printf("\n");
 
 	total_lockref = mp->mnt_lockref;
 	printf("    lockref : ");
 	CPU_FOREACH(cpu) {
 		pc = vfs_mount_pcpu_remote(mp, cpu);
 		total_lockref += pc->mntp_lockref;
 		printf("%d ", pc->mntp_lockref);
 	}
 	printf("\n");
 
 	total_writeops = mp->mnt_writeopcount;
 	printf("writeopcount: ");
 	CPU_FOREACH(cpu) {
 		pc = vfs_mount_pcpu_remote(mp, cpu);
 		total_writeops += pc->mntp_writeopcount;
 		printf("%d ", pc->mntp_writeopcount);
 	}
 	printf("\n");
 
 	/* Summary: value stored in struct mount versus the grand total. */
 	printf("counter       struct total\n");
 	printf("ref             %-5d  %-5d\n", mp->mnt_ref, total_ref);
 	printf("lockref         %-5d  %-5d\n", mp->mnt_lockref, total_lockref);
 	printf("writeopcount    %-5d  %-5d\n", mp->mnt_writeopcount,
 	    total_writeops);
 
 	panic("invalid counts on struct mount");
 }
 #endif
 
 /*
  * Return the current total of one of the mount's counters: the value
  * stored in struct mount plus the per-CPU contribution of every CPU.
  *
  * which selects the counter (MNT_COUNT_REF, MNT_COUNT_LOCKREF or
  * MNT_COUNT_WRITEOPCOUNT).  No lock is taken here, so the result is only
  * a snapshot.
  */
 int
 vfs_mount_fetch_counter(struct mount *mp, enum mount_counter which)
 {
 	struct mount_pcpu *mpcpu;
 	int cpu, sum;
 
 	/*
 	 * Start from zero so that a selector outside the handled set yields
 	 * a defined value instead of reading an uninitialized variable.
 	 */
 	sum = 0;
 	switch (which) {
 	case MNT_COUNT_REF:
 		sum = mp->mnt_ref;
 		break;
 	case MNT_COUNT_LOCKREF:
 		sum = mp->mnt_lockref;
 		break;
 	case MNT_COUNT_WRITEOPCOUNT:
 		sum = mp->mnt_writeopcount;
 		break;
 	}
 
 	CPU_FOREACH(cpu) {
 		mpcpu = vfs_mount_pcpu_remote(mp, cpu);
 		switch (which) {
 		case MNT_COUNT_REF:
 			sum += mpcpu->mntp_ref;
 			break;
 		case MNT_COUNT_LOCKREF:
 			sum += mpcpu->mntp_lockref;
 			break;
 		case MNT_COUNT_WRITEOPCOUNT:
 			sum += mpcpu->mntp_writeopcount;
 			break;
 		}
 	}
 	return (sum);
 }
 
 /*
  * Put mp on the deferred unmount list and schedule the taskqueue task.
  *
  * The mount is queued when it is not already marked MNT_DEFERRED in
  * mnt_taskqueue_flags, or unconditionally when requeue is true.  flags
  * (with MNT_DEFERRED added) are recorded for the eventual unmount and
  * timeout_ticks is passed to taskqueue_enqueue_timeout().
  *
  * Returns true if the mount was queued, false otherwise.
  */
 static bool
 deferred_unmount_enqueue(struct mount *mp, uint64_t flags, bool requeue,
     int timeout_ticks)
 {
 	bool queued;
 
 	mtx_lock(&deferred_unmount_lock);
 	queued = (mp->mnt_taskqueue_flags & MNT_DEFERRED) == 0 || requeue;
 	if (queued) {
 		mp->mnt_taskqueue_flags = flags | MNT_DEFERRED;
 		STAILQ_INSERT_TAIL(&deferred_unmount_list, mp,
 		    mnt_taskqueue_link);
 	}
 	mtx_unlock(&deferred_unmount_lock);
 
 	if (!queued)
 		return (false);
 
 	/* Kick the task only after the list lock has been dropped. */
 	taskqueue_enqueue_timeout(taskqueue_deferred_unmount,
 	    &deferred_unmount_task, timeout_ticks);
 	return (true);
 }
 
 /*
  * Taskqueue handler for processing async/recursive unmounts
  *
  * Takes over the whole deferred unmount list and calls dounmount() on
  * every entry with the flags recorded at enqueue time.  A failed unmount
  * is requeued after deferred_unmount_retry_delay_hz unless the mount has
  * MNTK_REFEXPIRE set or the retry limit has been reached; in those cases
  * the reference on the mount is released instead.
  */
 static void
 vfs_deferred_unmount(void *argi __unused, int pending __unused)
 {
 	STAILQ_HEAD(, mount) local_unmounts;
 	uint64_t flags;
 	struct mount *mp, *tmp;
 	int error;
 	unsigned int retries;
 	bool unmounted;
 
 	/* Detach the pending list so it can be walked without the lock. */
 	STAILQ_INIT(&local_unmounts);
 	mtx_lock(&deferred_unmount_lock);
 	STAILQ_CONCAT(&local_unmounts, &deferred_unmount_list);
 	mtx_unlock(&deferred_unmount_lock);
 
 	STAILQ_FOREACH_SAFE(mp, &local_unmounts, mnt_taskqueue_link, tmp) {
 		flags = mp->mnt_taskqueue_flags;
 		KASSERT((flags & MNT_DEFERRED) != 0,
 		    ("taskqueue unmount without MNT_DEFERRED"));
 		error = dounmount(mp, flags, curthread);
 		if (error != 0) {
 			MNT_ILOCK(mp);
 			unmounted = ((mp->mnt_kern_flag & MNTK_REFEXPIRE) != 0);
 			MNT_IUNLOCK(mp);
 
 			/*
 			 * The deferred unmount thread is the only thread that
 			 * modifies the retry counts, so locking/atomics aren't
 			 * needed here.
 			 */
 			retries = (mp->mnt_unmount_retries)++;
 			deferred_unmount_total_retries++;
 			if (!unmounted && retries < deferred_unmount_retry_limit) {
 				deferred_unmount_enqueue(mp, flags, true,
 				    -deferred_unmount_retry_delay_hz);
 			} else {
 				if (retries >= deferred_unmount_retry_limit) {
 					/* retries is unsigned: print with %u. */
 					printf("giving up on deferred unmount "
 					    "of %s after %u retries, error %d\n",
 					    mp->mnt_stat.f_mntonname, retries, error);
 				}
 				vfs_rel(mp);
 			}
 		}
 	}
 }
 
 /*
  * Do the actual filesystem unmount.
  *
  * mp is the mount to unmount, flags are the MNT_* unmount flags and td is
  * the thread whose credentials are checked with vfs_suser().
  *
  * Returns 0 once the mount has been destroyed, EINPROGRESS when the
  * request was handed to the deferred unmount taskqueue, EBUSY when the
  * mount cannot be unmounted right now, or another errno value (for
  * instance from vfs_suser(), an interrupted sleep or VFS_UNMOUNT()).
  */
 int
 dounmount(struct mount *mp, uint64_t flags, struct thread *td)
 {
 	struct mount_upper_node *upper;
 	struct vnode *coveredvp, *rootvp;
 	int error;
 	uint64_t async_flag;
 	int mnt_gen_r;
 	unsigned int retries;
 
 	KASSERT((flags & MNT_DEFERRED) == 0 ||
 	    (flags & (MNT_RECURSE | MNT_FORCE)) == (MNT_RECURSE | MNT_FORCE),
 	    ("MNT_DEFERRED requires MNT_RECURSE | MNT_FORCE"));
 
 	/*
 	 * If the caller has explicitly requested the unmount to be handled by
 	 * the taskqueue and we're not already in taskqueue context, queue
 	 * up the unmount request and exit.  This is done prior to any
 	 * credential checks; MNT_DEFERRED should be used only for kernel-
 	 * initiated unmounts and will therefore be processed with the
 	 * (kernel) credentials of the taskqueue thread.  Still, callers
 	 * should be sure this is the behavior they want.
 	 */
 	if ((flags & MNT_DEFERRED) != 0 &&
 	    taskqueue_member(taskqueue_deferred_unmount, curthread) == 0) {
 		if (!deferred_unmount_enqueue(mp, flags, false, 0))
 			vfs_rel(mp);
 		return (EINPROGRESS);
 	}
 
 	/*
 	 * Only privileged root, or (if MNT_USER is set) the user that did the
 	 * original mount is permitted to unmount this filesystem.
 	 * This check should be made prior to queueing up any recursive
 	 * unmounts of upper filesystems.  Those unmounts will be executed
 	 * with kernel thread credentials and are expected to succeed, so
 	 * we must at least ensure the originating context has sufficient
 	 * privilege to unmount the base filesystem before proceeding with
 	 * the uppers.
 	 */
 	error = vfs_suser(mp, td);
 	if (error != 0) {
 		KASSERT((flags & MNT_DEFERRED) == 0,
 		    ("taskqueue unmount with insufficient privilege"));
 		vfs_rel(mp);
 		return (error);
 	}
 
 	if (recursive_forced_unmount && ((flags & MNT_FORCE) != 0))
 		flags |= MNT_RECURSE;
 
 	if ((flags & MNT_RECURSE) != 0) {
 		KASSERT((flags & MNT_FORCE) != 0,
 		    ("MNT_RECURSE requires MNT_FORCE"));
 
 		MNT_ILOCK(mp);
 		/*
 		 * Set MNTK_RECURSE to prevent new upper mounts from being
 		 * added, and note that an operation on the uppers list is in
 		 * progress.  This will ensure that unregistration from the
 		 * uppers list, and therefore any pending unmount of the upper
 		 * FS, can't complete until after we finish walking the list.
 		 */
 		mp->mnt_kern_flag |= MNTK_RECURSE;
 		mp->mnt_upper_pending++;
 		TAILQ_FOREACH(upper, &mp->mnt_uppers, mnt_upper_link) {
 			/* An upper that ran out of retries makes us fail. */
 			retries = upper->mp->mnt_unmount_retries;
 			if (retries > deferred_unmount_retry_limit) {
 				error = EBUSY;
 				continue;
 			}
 			MNT_IUNLOCK(mp);
 
 			vfs_ref(upper->mp);
 			if (!deferred_unmount_enqueue(upper->mp, flags,
 			    false, 0))
 				vfs_rel(upper->mp);
 			MNT_ILOCK(mp);
 		}
 		mp->mnt_upper_pending--;
 		if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 &&
 		    mp->mnt_upper_pending == 0) {
 			mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER;
 			wakeup(&mp->mnt_uppers);
 		}
 
 		/*
 		 * If we're not on the taskqueue, wait until the uppers list
 		 * is drained before proceeding with unmount.  Otherwise, if
 		 * we are on the taskqueue and there are still pending uppers,
 		 * just re-enqueue on the end of the taskqueue.
 		 */
 		if ((flags & MNT_DEFERRED) == 0) {
 			while (error == 0 && !TAILQ_EMPTY(&mp->mnt_uppers)) {
 				mp->mnt_kern_flag |= MNTK_TASKQUEUE_WAITER;
 				error = msleep(&mp->mnt_taskqueue_link,
 				    MNT_MTX(mp), PCATCH, "umntqw", 0);
 			}
 			if (error != 0) {
 				MNT_REL(mp);
 				MNT_IUNLOCK(mp);
 				return (error);
 			}
 		} else if (!TAILQ_EMPTY(&mp->mnt_uppers)) {
 			MNT_IUNLOCK(mp);
 			if (error == 0)
 				deferred_unmount_enqueue(mp, flags, true, 0);
 			return (error);
 		}
 		MNT_IUNLOCK(mp);
 		KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers not empty"));
 	}
 
 	/* Allow the taskqueue to safely re-enqueue on failure */
 	if ((flags & MNT_DEFERRED) != 0)
 		vfs_ref(mp);
 
 	if ((coveredvp = mp->mnt_vnodecovered) != NULL) {
 		mnt_gen_r = mp->mnt_gen;
 		VI_LOCK(coveredvp);
 		vholdl(coveredvp);
 		vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY);
 		/*
 		 * Check for mp being unmounted while waiting for the
 		 * covered vnode lock.
 		 */
 		if (coveredvp->v_mountedhere != mp ||
 		    coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) {
 			VOP_UNLOCK(coveredvp);
 			vdrop(coveredvp);
 			vfs_rel(mp);
 			return (EBUSY);
 		}
 	}
 
 	vfs_op_enter(mp);
 
 	vn_start_write(NULL, &mp, V_WAIT);
 	MNT_ILOCK(mp);
 	/* Refuse if another unmount or an update is running, or uppers remain. */
 	if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 ||
 	    (mp->mnt_flag & MNT_UPDATE) != 0 ||
 	    !TAILQ_EMPTY(&mp->mnt_uppers)) {
 		dounmount_cleanup(mp, coveredvp, 0);
 		return (EBUSY);
 	}
 	mp->mnt_kern_flag |= MNTK_UNMOUNT;
 	rootvp = vfs_cache_root_clear(mp);
 	if (coveredvp != NULL)
 		vn_seqc_write_begin(coveredvp);
 	if (flags & MNT_NONBUSY) {
 		MNT_IUNLOCK(mp);
 		error = vfs_check_usecounts(mp);
 		MNT_ILOCK(mp);
 		if (error != 0) {
 			/*
 			 * Only end the seqc write section if one was begun
 			 * above; coveredvp may be NULL.
 			 */
 			if (coveredvp != NULL)
 				vn_seqc_write_end(coveredvp);
 			dounmount_cleanup(mp, coveredvp, MNTK_UNMOUNT);
 			if (rootvp != NULL) {
 				vn_seqc_write_end(rootvp);
 				vrele(rootvp);
 			}
 			return (error);
 		}
 	}
 	/* Allow filesystems to detect that a forced unmount is in progress. */
 	if (flags & MNT_FORCE) {
 		mp->mnt_kern_flag |= MNTK_UNMOUNTF;
 		MNT_IUNLOCK(mp);
 		/*
 		 * Must be done after setting MNTK_UNMOUNTF and before
 		 * waiting for mnt_lockref to become 0.
 		 */
 		VFS_PURGE(mp);
 		MNT_ILOCK(mp);
 	}
 	error = 0;
 	/* Sleep until the remaining lock references have gone away. */
 	if (mp->mnt_lockref) {
 		mp->mnt_kern_flag |= MNTK_DRAINING;
 		error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS,
 		    "mount drain", 0);
 	}
 	MNT_IUNLOCK(mp);
 	KASSERT(mp->mnt_lockref == 0,
 	    ("%s: invalid lock refcount in the drain path @ %s:%d",
 	    __func__, __FILE__, __LINE__));
 	KASSERT(error == 0,
 	    ("%s: invalid return value for msleep in the drain path @ %s:%d",
 	    __func__, __FILE__, __LINE__));
 
 	/*
 	 * We want to keep the vnode around so that we can vn_seqc_write_end
 	 * after we are done with unmount. Downgrade our reference to a mere
 	 * hold count so that we don't interfere with anything.
 	 */
 	if (rootvp != NULL) {
 		vhold(rootvp);
 		vrele(rootvp);
 	}
 
 	if (mp->mnt_flag & MNT_EXPUBLIC)
 		vfs_setpublicfs(NULL, NULL, NULL);
 
 	vfs_periodic(mp, MNT_WAIT);
 	/* Remember MNT_ASYNC so it can be restored if the unmount fails. */
 	MNT_ILOCK(mp);
 	async_flag = mp->mnt_flag & MNT_ASYNC;
 	mp->mnt_flag &= ~MNT_ASYNC;
 	mp->mnt_kern_flag &= ~MNTK_ASYNC;
 	MNT_IUNLOCK(mp);
 	vfs_deallocate_syncvnode(mp);
 	error = VFS_UNMOUNT(mp, flags);
 	vn_finished_write(mp);
 	vfs_rel(mp);
 	/*
 	 * If we failed to flush the dirty blocks for this mount point,
 	 * undo all the cdir/rdir and rootvnode changes we made above.
 	 * Unless we failed to do so because the device is reporting that
 	 * it doesn't exist anymore.
 	 */
 	if (error && error != ENXIO) {
 		MNT_ILOCK(mp);
 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
 			MNT_IUNLOCK(mp);
 			vfs_allocate_syncvnode(mp);
 			MNT_ILOCK(mp);
 		}
 		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
 		mp->mnt_flag |= async_flag;
 		if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
 		    (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
 			mp->mnt_kern_flag |= MNTK_ASYNC;
 		if (mp->mnt_kern_flag & MNTK_MWAIT) {
 			mp->mnt_kern_flag &= ~MNTK_MWAIT;
 			wakeup(mp);
 		}
 		vfs_op_exit_locked(mp);
 		MNT_IUNLOCK(mp);
 		if (coveredvp) {
 			vn_seqc_write_end(coveredvp);
 			VOP_UNLOCK(coveredvp);
 			vdrop(coveredvp);
 		}
 		if (rootvp != NULL) {
 			vn_seqc_write_end(rootvp);
 			vdrop(rootvp);
 		}
 		return (error);
 	}
 
 	/* The unmount went through: unlink the mount and tear it down. */
 	mtx_lock(&mountlist_mtx);
 	TAILQ_REMOVE(&mountlist, mp, mnt_list);
 	mtx_unlock(&mountlist_mtx);
 	EVENTHANDLER_DIRECT_INVOKE(vfs_unmounted, mp, td);
 	if (coveredvp != NULL) {
 		VI_LOCK(coveredvp);
 		vn_irflag_unset_locked(coveredvp, VIRF_MOUNTPOINT);
 		coveredvp->v_mountedhere = NULL;
 		vn_seqc_write_end_locked(coveredvp);
 		VI_UNLOCK(coveredvp);
 		VOP_UNLOCK(coveredvp);
 		vdrop(coveredvp);
 	}
 	mount_devctl_event("UNMOUNT", mp, false);
 	if (rootvp != NULL) {
 		vn_seqc_write_end(rootvp);
 		vdrop(rootvp);
 	}
 	vfs_event_signal(NULL, VQ_UNMOUNT, 0);
 	if (rootvnode != NULL && mp == rootvnode->v_mount) {
 		vrele(rootvnode);
 		rootvnode = NULL;
 	}
 	if (mp == rootdevmp)
 		rootdevmp = NULL;
 	/* Drop the extra reference taken above for the taskqueue. */
 	if ((flags & MNT_DEFERRED) != 0)
 		vfs_rel(mp);
 	vfs_mount_destroy(mp);
 	return (0);
 }
 
 /*
  * Report errors during filesystem mounting.
  */
 void
 vfs_mount_error(struct mount *mp, const char *fmt, ...)
 {
 	struct vfsoptlist *moptlist = mp->mnt_optnew;
 	va_list ap;
 	int error, len;
 	char *errmsg;
 
 	error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len);
 	if (error || errmsg == NULL || len <= 0)
 		return;
 
 	va_start(ap, fmt);
 	vsnprintf(errmsg, (size_t)len, fmt, ap);
 	va_end(ap);
 }
 
 void
 vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...)
 {
 	va_list ap;
 	int error, len;
 	char *errmsg;
 
 	error = vfs_getopt(opts, "errmsg", (void **)&errmsg, &len);
 	if (error || errmsg == NULL || len <= 0)
 		return;
 
 	va_start(ap, fmt);
 	vsnprintf(errmsg, (size_t)len, fmt, ap);
 	va_end(ap);
 }
 
 /*
  * ---------------------------------------------------------------------
  * Functions for querying mount options/arguments from filesystems.
  */
 
 /*
  * Check that no unknown options are given
  *
  * Every option name in opts must appear in global_opts or in the
  * NULL-terminated array legal, either as-is or with a leading "no"
  * stripped.  Returns 0 when all options are known and EINVAL otherwise.
  *
  * On failure a message naming the last unknown option is copied into the
  * "errmsg" option's buffer when the list has one, and printed otherwise.
  */
 int
 vfs_filteropt(struct vfsoptlist *opts, const char **legal)
 {
 	struct vfsopt *opt;
 	char errmsg[255];
 	const char **t, *p, *q;
 	int ret = 0;
 
 	TAILQ_FOREACH(opt, opts, link) {
 		p = opt->name;
 		/* q is the name without its "no" prefix, if it has one. */
 		q = NULL;
 		if (p[0] == 'n' && p[1] == 'o')
 			q = p + 2;
 		for (t = global_opts; *t != NULL; t++) {
 			if (strcmp(*t, p) == 0)
 				break;
 			if (q != NULL) {
 				if (strcmp(*t, q) == 0)
 					break;
 			}
 		}
 		if (*t != NULL)
 			continue;
 		for (t = legal; *t != NULL; t++) {
 			if (strcmp(*t, p) == 0)
 				break;
 			if (q != NULL) {
 				if (strcmp(*t, q) == 0)
 					break;
 			}
 		}
 		if (*t != NULL)
 			continue;
 		snprintf(errmsg, sizeof(errmsg),
 		    "mount option <%s> is unknown", p);
 		ret = EINVAL;
 	}
 	if (ret != 0) {
 		TAILQ_FOREACH(opt, opts, link) {
 			if (strcmp(opt->name, "errmsg") == 0) {
 				/*
 				 * strlcpy() always NUL-terminates, even when
 				 * the caller's buffer is shorter than the
 				 * message.
 				 */
 				if (opt->value != NULL && opt->len > 0)
 					strlcpy((char *)opt->value, errmsg,
 					    opt->len);
 				break;
 			}
 		}
 		/* No "errmsg" option in the list: report on the console. */
 		if (opt == NULL)
 			printf("%s\n", errmsg);
 	}
 	return (ret);
 }
 
 /*
  * Look up a mount option by name.
  *
  * Returns 0 and marks the option as seen when it is found, ENOENT when
  * it is not.  On success the option's length is stored through len and
  * the address of its value through buf; either pointer may be NULL to
  * skip that output.
  */
 int
 vfs_getopt(struct vfsoptlist *opts, const char *name, void **buf, int *len)
 {
 	struct vfsopt *cur;
 
 	KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
 
 	TAILQ_FOREACH(cur, opts, link) {
 		if (strcmp(name, cur->name) != 0)
 			continue;
 
 		cur->seen = 1;
 		if (len != NULL)
 			*len = cur->len;
 		if (buf != NULL)
 			*buf = cur->value;
 		return (0);
 	}
 	return (ENOENT);
 }
 
 /*
  * Return the recorded position (the pos field) of the named option and
  * mark it as seen.  Returns -1 when opts is NULL or the option is absent.
  */
 int
 vfs_getopt_pos(struct vfsoptlist *opts, const char *name)
 {
 	struct vfsopt *cur;
 
 	if (opts != NULL) {
 		TAILQ_FOREACH(cur, opts, link) {
 			if (strcmp(name, cur->name) != 0)
 				continue;
 			cur->seen = 1;
 			return (cur->pos);
 		}
 	}
 	return (-1);
 }
 
 /*
  * Parse the named option as a non-negative size.
  *
  * The value must be a non-empty, NUL-terminated string holding a number
  * (as accepted by strtoq() with base 0), optionally followed by exactly
  * one of the suffixes k, m, g or t (either case), each scaling by a
  * further factor of 1024.  The result is stored in *value.
  *
  * Returns 0 on success, the vfs_getopt() error when the option is not
  * found, or EINVAL for a malformed or negative value.
  */
 int
 vfs_getopt_size(struct vfsoptlist *opts, const char *name, off_t *value)
 {
 	char *str, *suffix;
 	quad_t num;
 	int rc, slen;
 
 	rc = vfs_getopt(opts, name, (void **)&str, &slen);
 	if (rc != 0)
 		return (rc);
 
 	/* Insist on a non-empty, properly terminated string. */
 	if (slen == 0 || str == NULL ||
 	    str[0] == '\0' || str[slen - 1] != '\0')
 		return (EINVAL);
 
 	num = strtoq(str, &suffix, 0);
 	if (suffix == str)
 		return (EINVAL);
 	/* At most one character may follow the digits. */
 	if (suffix[0] != '\0' && suffix[1] != '\0')
 		return (EINVAL);
 	if (num < 0)
 		return (EINVAL);
 
 	/* Larger units fall through the smaller ones to accumulate 1024s. */
 	switch (suffix[0]) {
 	case 't': case 'T':
 		num *= 1024;
 		/* FALLTHROUGH */
 	case 'g': case 'G':
 		num *= 1024;
 		/* FALLTHROUGH */
 	case 'm': case 'M':
 		num *= 1024;
 		/* FALLTHROUGH */
 	case 'k': case 'K':
 		num *= 1024;
 		/* FALLTHROUGH */
 	case '\0':
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	*value = num;
 	return (0);
 }
 
 char *
 vfs_getopts(struct vfsoptlist *opts, const char *name, int *error)
 {
 	struct vfsopt *opt;
 
 	*error = 0;
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) != 0)
 			continue;
 		opt->seen = 1;
 		if (opt->len == 0 ||
 		    ((char *)opt->value)[opt->len - 1] != '\0') {
 			*error = EINVAL;
 			return (NULL);
 		}
 		return (opt->value);
 	}
 	*error = ENOENT;
 	return (NULL);
 }
 
 /*
  * Translate the presence of a named option into a flag bit.
  *
  * When the option exists it is marked as seen, val is OR-ed into *w and
  * 1 is returned; otherwise val is cleared from *w and 0 is returned.  w
  * may be NULL, in which case only the return value is produced.
  */
 int
 vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w,
 	uint64_t val)
 {
 	struct vfsopt *match;
 
 	TAILQ_FOREACH(match, opts, link) {
 		if (strcmp(name, match->name) == 0)
 			break;
 	}
 
 	if (match == NULL) {
 		if (w != NULL)
 			*w &= ~val;
 		return (0);
 	}
 
 	match->seen = 1;
 	if (w != NULL)
 		*w |= val;
 	return (1);
 }
 
 /*
  * Parse the value of the named option with a scanf-style format.
  *
  * Returns the vsscanf() result (number of items assigned) for the first
  * option called name.  Returns 0 when the option is absent, has no value
  * or its value is not NUL-terminated.  A found option is marked as seen.
  */
 int
 vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...)
 {
 	va_list ap;
 	struct vfsopt *opt;
 	int ret;
 
 	KASSERT(opts != NULL, ("vfs_scanopt: caller passed 'opts' as NULL"));
 
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) != 0)
 			continue;
 		opt->seen = 1;
 		if (opt->len == 0 || opt->value == NULL)
 			return (0);
 		/* Refuse to scan a buffer that is not a terminated string. */
 		if (((char *)opt->value)[opt->len - 1] != '\0')
 			return (0);
 		va_start(ap, fmt);
 		ret = vsscanf(opt->value, fmt, ap);
 		va_end(ap);
 		return (ret);
 	}
 	return (0);
 }
 
 int
 vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len)
 {
 	struct vfsopt *opt;
 
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) != 0)
 			continue;
 		opt->seen = 1;
 		if (opt->value == NULL)
 			opt->len = len;
 		else {
 			if (opt->len != len)
 				return (EINVAL);
 			bcopy(value, opt->value, len);
 		}
 		return (0);
 	}
 	return (ENOENT);
 }
 
 int
 vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len)
 {
 	struct vfsopt *opt;
 
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) != 0)
 			continue;
 		opt->seen = 1;
 		if (opt->value == NULL)
 			opt->len = len;
 		else {
 			if (opt->len < len)
 				return (EINVAL);
 			opt->len = len;
 			bcopy(value, opt->value, len);
 		}
 		return (0);
 	}
 	return (ENOENT);
 }
 
 int
 vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value)
 {
 	struct vfsopt *opt;
 
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) != 0)
 			continue;
 		opt->seen = 1;
 		if (opt->value == NULL)
 			opt->len = strlen(value) + 1;
 		else if (strlcpy(opt->value, value, opt->len) >= opt->len)
 			return (EINVAL);
 		return (0);
 	}
 	return (ENOENT);
 }
 
 /*
  * Find and copy a mount option.
  *
  * The size of the buffer has to be specified
  * in len, if it is not the same length as the
  * mount option, EINVAL is returned.
  * Returns ENOENT if the option is not found.
  */
 int
 vfs_copyopt(struct vfsoptlist *opts, const char *name, void *dest, int len)
 {
 	struct vfsopt *opt;
 
 	KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL"));
 
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) == 0) {
 			opt->seen = 1;
 			if (len != opt->len)
 				return (EINVAL);
 			bcopy(opt->value, dest, opt->len);
 			return (0);
 		}
 	}
 	return (ENOENT);
 }
 
 /*
  * Fill in *sbp for mp and let the filesystem's vfs_statfs method update
  * it.  Returns whatever that method returns.
  */
 int
 __vfs_statfs(struct mount *mp, struct statfs *sbp)
 {
 	struct statfs *cached;
 
 	/*
 	 * The filesystem only refreshes part of the structure, so seed the
 	 * caller's buffer with the cached copy unless it is that very copy.
 	 */
 	cached = &mp->mnt_stat;
 	if (sbp != cached)
 		memcpy(sbp, cached, sizeof(*sbp));
 
 	/*
 	 * Provide defaults for fields the filesystem might leave alone.
 	 */
 	sbp->f_version = STATFS_VERSION;
 	sbp->f_namemax = NAME_MAX;
 	sbp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
 	sbp->f_nvnodelistsize = mp->mnt_nvnodelistsize;
 
 	return (mp->mnt_op->vfs_statfs(mp, sbp));
 }
 
 void
 vfs_mountedfrom(struct mount *mp, const char *from)
 {
 
 	bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname);
 	strlcpy(mp->mnt_stat.f_mntfromname, from,
 	    sizeof mp->mnt_stat.f_mntfromname);
 }
 
 /*
  * ---------------------------------------------------------------------
  * This is the api for building mount args and mounting filesystems from
  * inside the kernel.
  *
  * The API works by accumulation of individual args.  First error is
  * latched.
  *
  * XXX: should be documented in new manpage kernel_mount(9)
  */
 
 /*
  * A memory allocation which must be freed when we are done.
  * The payload is stored directly after this header (users address it as
  * "maa + 1"); free_mntarg() releases every entry on the list.
  */
 struct mntaarg {
 	SLIST_ENTRY(mntaarg)	next;	/* Linkage on mntarg.list. */
 };
 
 /* The header for the mount arguments */
 struct mntarg {
 	struct iovec *v;	/* Name/value iovec pairs, grown with realloc(). */
 	int len;		/* Number of iovec entries used in v. */
 	int error;		/* First error latched; later adds become no-ops. */
 	SLIST_HEAD(, mntaarg)	list;	/* Allocations to free with the args. */
 };
 
 /*
  * Add a boolean argument.
  *
  * name must begin with "no" (asserted).  When flag is non-zero the
  * option is added without that prefix, otherwise it is added with the
  * name exactly as given.  The option carries no value.
  */
 struct mntarg *
 mount_argb(struct mntarg *ma, int flag, const char *name)
 {
 	const char *optname;
 
 	KASSERT(name[0] == 'n' && name[1] == 'o',
 	    ("mount_argb(...,%s): name must start with 'no'", name));
 
 	optname = flag ? name + 2 : name;
 	return (mount_arg(ma, optname, NULL, 0));
 }
 
 /*
  * Add an argument printf style
  */
 struct mntarg *
 mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...)
 {
 	va_list ap;
 	struct mntaarg *maa;
 	struct sbuf *sb;
 	int len;
 
 	if (ma == NULL) {
 		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
 		SLIST_INIT(&ma->list);
 	}
 	if (ma->error)
 		return (ma);
 
 	ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
 	    M_MOUNT, M_WAITOK);
 	ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
 	ma->v[ma->len].iov_len = strlen(name) + 1;
 	ma->len++;
 
 	sb = sbuf_new_auto();
 	va_start(ap, fmt);
 	sbuf_vprintf(sb, fmt, ap);
 	va_end(ap);
 	sbuf_finish(sb);
 	len = sbuf_len(sb) + 1;
 	maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
 	SLIST_INSERT_HEAD(&ma->list, maa, next);
 	bcopy(sbuf_data(sb), maa + 1, len);
 	sbuf_delete(sb);
 
 	ma->v[ma->len].iov_base = maa + 1;
 	ma->v[ma->len].iov_len = len;
 	ma->len++;
 
 	return (ma);
 }
 
 /*
  * Add an argument which is a userland string.
  */
 struct mntarg *
 mount_argsu(struct mntarg *ma, const char *name, const void *val, int len)
 {
 	struct mntaarg *maa;
 	char *tbuf;
 
 	if (val == NULL)
 		return (ma);
 	if (ma == NULL) {
 		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
 		SLIST_INIT(&ma->list);
 	}
 	if (ma->error)
 		return (ma);
 	maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
 	SLIST_INSERT_HEAD(&ma->list, maa, next);
 	tbuf = (void *)(maa + 1);
 	ma->error = copyinstr(val, tbuf, len, NULL);
 	return (mount_arg(ma, name, tbuf, -1));
 }
 
 /*
  * Plain argument.
  *
  * If length is -1, treat value as a C string.
  */
 struct mntarg *
 mount_arg(struct mntarg *ma, const char *name, const void *val, int len)
 {
 
 	if (ma == NULL) {
 		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
 		SLIST_INIT(&ma->list);
 	}
 	if (ma->error)
 		return (ma);
 
 	ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
 	    M_MOUNT, M_WAITOK);
 	ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
 	ma->v[ma->len].iov_len = strlen(name) + 1;
 	ma->len++;
 
 	ma->v[ma->len].iov_base = (void *)(uintptr_t)val;
 	if (len < 0)
 		ma->v[ma->len].iov_len = strlen(val) + 1;
 	else
 		ma->v[ma->len].iov_len = len;
 	ma->len++;
 	return (ma);
 }
 
 /*
  * Free a mntarg structure
  */
 static void
 free_mntarg(struct mntarg *ma)
 {
 	struct mntaarg *maa;
 
 	while (!SLIST_EMPTY(&ma->list)) {
 		maa = SLIST_FIRST(&ma->list);
 		SLIST_REMOVE_HEAD(&ma->list, next);
 		free(maa, M_MOUNT);
 	}
 	free(ma->v, M_MOUNT);
 	free(ma, M_MOUNT);
 }
 
 /*
  * Mount a filesystem from inside the kernel.
  *
  * ma holds the accumulated arguments.  If an error was latched while
  * building them it is returned without attempting the mount; otherwise
  * the arguments are passed to vfs_donmount() for the current thread.
  * ma is always freed before returning.
  */
 int
 kernel_mount(struct mntarg *ma, uint64_t flags)
 {
 	struct uio auio;
 	int rc;
 
 	KASSERT(ma != NULL, ("kernel_mount NULL ma"));
 	KASSERT(ma->error != 0 || ma->v != NULL, ("kernel_mount NULL ma->v"));
 	KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len));
 
 	if (ma->error != 0) {
 		rc = ma->error;
 	} else {
 		auio.uio_iov = ma->v;
 		auio.uio_iovcnt = ma->len;
 		auio.uio_segflg = UIO_SYSSPACE;
 		rc = vfs_donmount(curthread, flags, &auio);
 	}
 
 	free_mntarg(ma);
 	return (rc);
 }
 
 /*
  * Map from mount options to printable formats.
  *
  * mount_devctl_event() walks this table until it meets an entry whose
  * o_opt is 0, so the MNTOPT_NAMES expansion is expected to end with such
  * a terminator (NOTE(review): defined outside this file; confirm).
  */
 static struct mntoptnames optnames[] = {
 	MNTOPT_NAMES
 };
 
 /* Size of the fixed buffer used to format one devctl notification. */
 #define DEVCTL_LEN 1024
 
 /*
  * Send a devctl notification (system "VFS", subsystem "FS") of the given
  * 'type' describing mount point 'mp': mount point, device, filesystem
  * type, fsid as hex bytes, owner and the names of the set mount flags.
  *
  * The message is formatted into a fixed-size buffer obtained with
  * M_NOWAIT.  The event is silently dropped if that allocation fails or
  * if the sbuf reports an error after formatting.  'donew' is not used
  * by this function.
  */
 static void
 mount_devctl_event(const char *type, struct mount *mp, bool donew)
 {
 	const uint8_t *fsid_bytes;
 	struct mntoptnames *on;
 	struct statfs *st;
 	struct sbuf sb;
 	char *storage;
 
 	storage = malloc(DEVCTL_LEN, M_MOUNT, M_NOWAIT);
 	if (storage == NULL)
 		return;
 	st = &mp->mnt_stat;
 
 	sbuf_new(&sb, storage, DEVCTL_LEN, SBUF_FIXEDLEN);
 	sbuf_cpy(&sb, "mount-point=\"");
 	devctl_safe_quote_sb(&sb, st->f_mntonname);
 	sbuf_cat(&sb, "\" mount-dev=\"");
 	devctl_safe_quote_sb(&sb, st->f_mntfromname);
 	sbuf_cat(&sb, "\" mount-type=\"");
 	devctl_safe_quote_sb(&sb, st->f_fstypename);
 
 	/* The fsid is emitted as a raw byte-by-byte hex dump. */
 	sbuf_cat(&sb, "\" fsid=0x");
 	fsid_bytes = (const uint8_t *)&st->f_fsid.val[0];
 	for (int i = 0; i < sizeof(st->f_fsid); i++)
 		sbuf_printf(&sb, "%02x", fsid_bytes[i]);
 
 	/* Each set flag is followed by ';', including the last one. */
 	sbuf_printf(&sb, " owner=%u flags=\"", st->f_owner);
 	for (on = optnames; on->o_opt != 0; on++) {
 		if ((mp->mnt_flag & on->o_opt) == 0)
 			continue;
 		sbuf_cat(&sb, on->o_name);
 		sbuf_putc(&sb, ';');
 	}
 	sbuf_putc(&sb, '"');
 	sbuf_finish(&sb);
 
 	/*
 	 * Mount options are deliberately left out: their form depends on
 	 * the file system and may include binary data, and they do not
 	 * necessarily give devd enough information to act upon.
 	 */
 
 	if (sbuf_error(&sb) == 0)
 		devctl_notify("VFS", "FS", type, sbuf_data(&sb));
 	sbuf_delete(&sb);
 	free(storage, M_MOUNT);
 }
 
 /*
  * Force remount specified mount point to read-only.  The argument
  * must be busied to avoid parallel unmount attempts.
  *
  * Intended use is to prevent further writes if some metadata
  * inconsistency is detected.  Note that the function still flushes
  * all cached metadata and data for the mount point, which might be
  * not always suitable.
  *
  * Returns 0 on success.  Fails with the vget() error if the covered
  * vnode cannot be locked without sleeping, with EBUSY if the covered
  * vnode already has VI_MOUNT set or the mount is already read-only, or
  * with the error returned by VFS_MOUNT().
  */
 int
 vfs_remount_ro(struct mount *mp)
 {
 	struct vfsoptlist *opts;
 	struct vfsopt *opt;
 	struct vnode *vp_covered, *rootvp;
 	int error;
 
 	vfs_op_enter(mp);
 	KASSERT(mp->mnt_lockref > 0,
 	    ("vfs_remount_ro: mp %p is not busied", mp));
 	KASSERT((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0,
 	    ("vfs_remount_ro: mp %p is being unmounted (and busy?)", mp));
 
 	rootvp = NULL;
 	vp_covered = mp->mnt_vnodecovered;
 	/* LK_NOWAIT: give up instead of sleeping for the covered vnode. */
 	error = vget(vp_covered, LK_EXCLUSIVE | LK_NOWAIT);
 	if (error != 0) {
 		vfs_op_exit(mp);
 		return (error);
 	}
 	/* Claim VI_MOUNT on the covered vnode, or bail if it is taken. */
 	VI_LOCK(vp_covered);
 	if ((vp_covered->v_iflag & VI_MOUNT) != 0) {
 		VI_UNLOCK(vp_covered);
 		vput(vp_covered);
 		vfs_op_exit(mp);
 		return (EBUSY);
 	}
 	vp_covered->v_iflag |= VI_MOUNT;
 	VI_UNLOCK(vp_covered);
 	vn_seqc_write_begin(vp_covered);
 
 	MNT_ILOCK(mp);
 	if ((mp->mnt_flag & MNT_RDONLY) != 0) {
 		MNT_IUNLOCK(mp);
 		error = EBUSY;
 		goto out;
 	}
 	/* Present the request to VFS_MOUNT() as a forced update to r/o. */
 	mp->mnt_flag |= MNT_UPDATE | MNT_FORCE | MNT_RDONLY;
 	rootvp = vfs_cache_root_clear(mp);
 	MNT_IUNLOCK(mp);
 
 	/*
 	 * Build the new option list: a single "ro" option, into which the
 	 * current options are merged by vfs_mergeopts().
 	 */
 	opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK | M_ZERO);
 	TAILQ_INIT(opts);
 	opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK | M_ZERO);
 	opt->name = strdup("ro", M_MOUNT);
 	opt->value = NULL;
 	TAILQ_INSERT_TAIL(opts, opt, link);
 	vfs_mergeopts(opts, mp->mnt_opt);
 	mp->mnt_optnew = opts;
 
 	error = VFS_MOUNT(mp);
 
 	if (error == 0) {
 		/* Keep MNT_RDONLY and adopt the new option list. */
 		MNT_ILOCK(mp);
 		mp->mnt_flag &= ~(MNT_UPDATE | MNT_FORCE);
 		MNT_IUNLOCK(mp);
 		vfs_deallocate_syncvnode(mp);
 		if (mp->mnt_opt != NULL)
 			vfs_freeopts(mp->mnt_opt);
 		mp->mnt_opt = mp->mnt_optnew;
 	} else {
 		/* Roll back all three flags and discard the new options. */
 		MNT_ILOCK(mp);
 		mp->mnt_flag &= ~(MNT_UPDATE | MNT_FORCE | MNT_RDONLY);
 		MNT_IUNLOCK(mp);
 		vfs_freeopts(mp->mnt_optnew);
 	}
 	mp->mnt_optnew = NULL;
 
 out:
 	/* Common exit once VI_MOUNT has been claimed on vp_covered. */
 	vfs_op_exit(mp);
 	VI_LOCK(vp_covered);
 	vp_covered->v_iflag &= ~VI_MOUNT;
 	VI_UNLOCK(vp_covered);
 	vput(vp_covered);
 	vn_seqc_write_end(vp_covered);
 	/* rootvp is only non-NULL if vfs_cache_root_clear() returned one. */
 	if (rootvp != NULL) {
 		vn_seqc_write_end(rootvp);
 		vrele(rootvp);
 	}
 	return (error);
 }
 
 /*
  * Suspend write operations on all local writeable filesystems.  Does
  * full sync of them in the process.
  *
  * Iterate over the mount points in reverse order, suspending most
  * recently mounted filesystems first.  It handles a case where a
  * filesystem mounted from a md(4) vnode-backed device should be
  * suspended before the filesystem that owns the vnode.
  */
 void
 suspend_all_fs(void)
 {
 	struct mount *mp;
 	int error;
 
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
 		error = vfs_busy(mp, MBF_MNTLSTLOCK | MBF_NOWAIT);
 		if (error != 0)
 			continue;
 		if ((mp->mnt_flag & (MNT_RDONLY | MNT_LOCAL)) != MNT_LOCAL ||
 		    (mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
 			mtx_lock(&mountlist_mtx);
 			vfs_unbusy(mp);
 			continue;
 		}
 		error = vfs_write_suspend(mp, 0);
 		if (error == 0) {
 			MNT_ILOCK(mp);
 			MPASS((mp->mnt_kern_flag & MNTK_SUSPEND_ALL) == 0);
 			mp->mnt_kern_flag |= MNTK_SUSPEND_ALL;
 			MNT_IUNLOCK(mp);
 			mtx_lock(&mountlist_mtx);
 		} else {
 			printf("suspend of %s failed, error %d\n",
 			    mp->mnt_stat.f_mntonname, error);
 			mtx_lock(&mountlist_mtx);
 			vfs_unbusy(mp);
 		}
 	}
 	mtx_unlock(&mountlist_mtx);
 }
 
 /*
  * Clone the mnt_exjail field to a new mount point.
  */
 void
 vfs_exjail_clone(struct mount *inmp, struct mount *outmp)
 {
 	struct ucred *cr;
 	struct prison *pr;
 
 	MNT_ILOCK(inmp);
 	cr = inmp->mnt_exjail;
 	if (cr != NULL) {
 		crhold(cr);
 		MNT_IUNLOCK(inmp);
 		pr = cr->cr_prison;
 		sx_slock(&allprison_lock);
 		if (!prison_isalive(pr)) {
 			sx_sunlock(&allprison_lock);
 			crfree(cr);
 			return;
 		}
 		MNT_ILOCK(outmp);
 		if (outmp->mnt_exjail == NULL) {
 			outmp->mnt_exjail = cr;
 			atomic_add_int(&pr->pr_exportcnt, 1);
 			cr = NULL;
 		}
 		MNT_IUNLOCK(outmp);
 		sx_sunlock(&allprison_lock);
 		if (cr != NULL)
 			crfree(cr);
 	} else
 		MNT_IUNLOCK(inmp);
 }
 
 /*
  * Resume writes on every mount tagged MNTK_SUSPEND_ALL, i.e. the ones
  * suspended by suspend_all_fs(), and unbusy them.
  *
  * mountlist_mtx is dropped while a mount is being resumed and is taken
  * again before the mount is unbusied and the walk continues.
  */
 void
 resume_all_fs(void)
 {
 	struct mount *mp;
 
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		if ((mp->mnt_kern_flag & MNTK_SUSPEND_ALL) != 0) {
 			mtx_unlock(&mountlist_mtx);
 
 			MNT_ILOCK(mp);
 			MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) != 0);
 			mp->mnt_kern_flag &= ~MNTK_SUSPEND_ALL;
 			MNT_IUNLOCK(mp);
 			vfs_write_resume(mp, 0);
 
 			mtx_lock(&mountlist_mtx);
 			vfs_unbusy(mp);
 		}
 	}
 	mtx_unlock(&mountlist_mtx);
 }
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index b650b32ccc55..62339387d7d0 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -1,4241 +1,4241 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
  * Copyright (c) 2013, 2014 The FreeBSD Foundation
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 #include "opt_hwpmc_hooks.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/disk.h>
 #include <sys/fail.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/kdb.h>
 #include <sys/ktr.h>
 #include <sys/stat.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/vnode.h>
 #include <sys/dirent.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/filio.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/prng.h>
 #include <sys/sx.h>
 #include <sys/sleepqueue.h>
 #include <sys/sysctl.h>
 #include <sys/ttycom.h>
 #include <sys/conf.h>
 #include <sys/syslog.h>
 #include <sys/unistd.h>
 #include <sys/user.h>
 #include <sys/ktrace.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vnode_pager.h>
 
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 
 /*
  * Forward declarations of the file-private fileops handlers defined in
  * this file.
  */
 static fo_rdwr_t	vn_read;
 static fo_rdwr_t	vn_write;
 static fo_rdwr_t	vn_io_fault;
 static fo_truncate_t	vn_truncate;
 static fo_ioctl_t	vn_ioctl;
 static fo_poll_t	vn_poll;
 static fo_kqfilter_t	vn_kqfilter;
 static fo_close_t	vn_closefile;
 static fo_mmap_t	vn_mmap;
 static fo_fallocate_t	vn_fallocate;
 static fo_fspacectl_t	vn_fspacectl;
 
 /*
  * File operations vector for DTYPE_VNODE files (vn_open_vnode() installs
  * it together with that type).  Both fo_read and fo_write are routed
  * through vn_io_fault; vn_read and vn_write are not referenced by the
  * table directly.
  */
 struct 	fileops vnops = {
 	.fo_read = vn_io_fault,
 	.fo_write = vn_io_fault,
 	.fo_truncate = vn_truncate,
 	.fo_ioctl = vn_ioctl,
 	.fo_poll = vn_poll,
 	.fo_kqfilter = vn_kqfilter,
 	.fo_stat = vn_statfile,
 	.fo_close = vn_closefile,
 	.fo_chmod = vn_chmod,
 	.fo_chown = vn_chown,
 	.fo_sendfile = vn_sendfile,
 	.fo_seek = vn_seek,
 	.fo_fill_kinfo = vn_fill_kinfo,
 	.fo_mmap = vn_mmap,
 	.fo_fallocate = vn_fallocate,
 	.fo_fspacectl = vn_fspacectl,
 	.fo_cmp = vn_cmp,
 	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
 };
 
 /*
  * Knobs and counters for the vn_io_fault and page cache read paths.
  * io_hold_cnt has external linkage; within this file it sizes the page
  * array in vn_read_from_obj() (io_hold_cnt + 2 entries).
  */
 const u_int io_hold_cnt = 16;
 /* Consulted by do_vn_io_fault(); 0 disables the vn_io_fault mode. */
 static int vn_io_fault_enable = 1;
 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RWTUN,
     &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
 static int vn_io_fault_prefault = 0;
 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RWTUN,
     &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting");
 static int vn_io_pgcache_read_enable = 1;
 SYSCTL_INT(_debug, OID_AUTO, vn_io_pgcache_read_enable, CTLFLAG_RWTUN,
     &vn_io_pgcache_read_enable, 0,
     "Enable copying from page cache for reads, avoiding fs");
 /* Exported read-only (CTLFLAG_RD) as debug.vn_io_faults. */
 static u_long vn_io_faults_cnt;
 SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
     &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
 
 /* Off by default; runtime-writable (CTLFLAG_RW) but not a tunable. */
 static int vfs_allow_read_dir = 0;
 SYSCTL_INT(_security_bsd, OID_AUTO, allow_read_dir, CTLFLAG_RW,
     &vfs_allow_read_dir, 0,
     "Enable read(2) of directory by root for filesystems that support it");
 
 /*
  * Returns true if vn_io_fault mode of handling the i/o request should
  * be used.
  */
 static bool
 do_vn_io_fault(struct vnode *vp, struct uio *uio)
 {
 	struct mount *mp;
 
 	return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG &&
 	    (mp = vp->v_mount) != NULL &&
 	    (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable);
 }
 
 /*
  * Structure used to pass arguments to vn_io_fault1(), to do either
  * file- or vnode-based I/O calls.
  */
 struct vn_io_fault_args {
 	/*
 	 * Discriminator for the 'args' union: vn_rdwr() sets
 	 * VN_IO_FAULT_VOP together with args.vop_args.
 	 */
 	enum {
 		VN_IO_FAULT_FOP,
 		VN_IO_FAULT_VOP
 	} kind;
 	struct ucred *cred;
 	/* vn_rdwr() stores its ioflg here. */
 	int flags;
 	union {
 		/* File-based call: the file and the fo_rdwr_t to invoke. */
 		struct fop_args_tag {
 			struct file *fp;
 			fo_rdwr_t *doio;
 		} fop_args;
 		/* Vnode-based call: the target vnode. */
 		struct vop_args_tag {
 			struct vnode *vp;
 		} vop_args;
 	} args;
 };
 
 /* Defined later in this file; called by vn_rdwr() in vn_io_fault mode. */
 static int vn_io_fault1(struct vnode *vp, struct uio *uio,
     struct vn_io_fault_args *args, struct thread *td);
 
 int
 vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp)
 {
 	struct thread *td = curthread;
 
 	return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
 }
 
 /*
  * Translate open(2) flags and VN_OPEN_* flags into namei flags.
  *
  * ISOPEN and LOCKLEAF are always part of the result.  AUDITVNODE1 is
  * included unless VN_OPEN_NOAUDIT is given; every other flag is added
  * only when its open/VN_OPEN counterpart is set.
  */
 static uint64_t
 open2nameif(int fmode, u_int vn_open_flags)
 {
 	uint64_t nameiflags;
 
 	nameiflags = ISOPEN | LOCKLEAF;
 
 	/* Flags derived from the open(2) mode. */
 	nameiflags |= (fmode & O_RESOLVE_BENEATH) != 0 ? RBENEATH : 0;
 	nameiflags |= (fmode & O_EMPTY_PATH) != 0 ? EMPTYPATH : 0;
 	nameiflags |= (fmode & FREAD) != 0 ? OPENREAD : 0;
 	nameiflags |= (fmode & FWRITE) != 0 ? OPENWRITE : 0;
 
 	/* Flags derived from the in-kernel VN_OPEN_* request. */
 	nameiflags |= (vn_open_flags & VN_OPEN_NOAUDIT) == 0 ? AUDITVNODE1 : 0;
 	nameiflags |= (vn_open_flags & VN_OPEN_NOCAPCHECK) != 0 ?
 	    NOCAPCHECK : 0;
 	nameiflags |= (vn_open_flags & VN_OPEN_WANTIOCTLCAPS) != 0 ?
 	    WANTIOCTLCAPS : 0;
 
 	return (nameiflags);
 }
 
 /*
  * Common code for vnode open operations via a name lookup.
  * Lookup the vnode and invoke VOP_CREATE if needed.
  * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
  *
  * Note that this does NOT free nameidata for the successful case,
  * due to the NDINIT being done elsewhere.
  *
  * *flagp supplies the open flags.  On success and on the "bad" error
  * path it is rewritten with the adjusted flags: O_TRUNC is cleared when
  * the file was created here, O_CREAT is cleared when the file already
  * existed.  cmode is the mode given to a newly created file.
  *
  * Returns 0 on success with the opened vnode in ndp->ni_vp.  On the
  * "bad" path the vnode is released with vput() and ndp->ni_vp is reset
  * to NULL.
  */
 int
 vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
     struct ucred *cred, struct file *fp)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	struct vattr vat;
 	struct vattr *vap = &vat;
 	int fmode, error;
 	bool first_open;
 
 restart:
 	first_open = false;
 	fmode = *flagp;
 	/*
 	 * O_CREAT is rejected when combined with both O_EXCL and
 	 * O_DIRECTORY, or with O_EMPTY_PATH.
 	 */
 	if ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) == (O_CREAT |
 	    O_EXCL | O_DIRECTORY) ||
 	    (fmode & (O_CREAT | O_EMPTY_PATH)) == (O_CREAT | O_EMPTY_PATH))
 		return (EINVAL);
 	else if ((fmode & (O_CREAT | O_DIRECTORY)) == O_CREAT) {
 		/* O_CREAT without O_DIRECTORY: look up for creation. */
 		ndp->ni_cnd.cn_nameiop = CREATE;
 		ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags);
 		/*
 		 * Set NOCACHE to avoid flushing the cache when
 		 * rolling in many files at once.
 		 *
 		 * Set NC_KEEPPOSENTRY to keep positive entries if they already
 		 * exist despite NOCACHE.
 		 */
 		ndp->ni_cnd.cn_flags |= LOCKPARENT | NOCACHE | NC_KEEPPOSENTRY;
 		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
 			ndp->ni_cnd.cn_flags |= FOLLOW;
 		if ((vn_open_flags & VN_OPEN_INVFS) == 0)
 			bwillwrite();
 		if ((error = namei(ndp)) != 0)
 			return (error);
 		if (ndp->ni_vp == NULL) {
 			/* The name does not exist yet: create a regular file. */
 			VATTR_NULL(vap);
 			vap->va_type = VREG;
 			vap->va_mode = cmode;
 			if (fmode & O_EXCL)
 				vap->va_vaflags |= VA_EXCLUSIVE;
 			/*
 			 * If the write cannot be started without sleeping,
 			 * drop the parent directory, wait (V_XSLEEP) and
 			 * redo the whole lookup.
 			 */
 			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
 				NDFREE_PNBUF(ndp);
 				vput(ndp->ni_dvp);
 				if ((error = vn_start_write(NULL, &mp,
 				    V_XSLEEP | V_PCATCH)) != 0)
 					return (error);
 				NDREINIT(ndp);
 				goto restart;
 			}
 			if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0)
 				ndp->ni_cnd.cn_flags |= MAKEENTRY;
 			/*
 			 * With MAC compiled in, VOP_CREATE() below is the
 			 * body of the "if (error == 0)" inside the #ifdef.
 			 */
 #ifdef MAC
 			error = mac_vnode_check_create(cred, ndp->ni_dvp,
 			    &ndp->ni_cnd, vap);
 			if (error == 0)
 #endif
 				error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
 				    &ndp->ni_cnd, vap);
 			vp = ndp->ni_vp;
 			/*
 			 * An exclusive create that also requests a flock
 			 * marks the new vnode VI_FOPENING until
 			 * vn_open_vnode() below has returned.
 			 */
 			if (error == 0 && (fmode & O_EXCL) != 0 &&
 			    (fmode & (O_EXLOCK | O_SHLOCK)) != 0) {
 				VI_LOCK(vp);
 				vp->v_iflag |= VI_FOPENING;
 				VI_UNLOCK(vp);
 				first_open = true;
 			}
 			VOP_VPUT_PAIR(ndp->ni_dvp, error == 0 ? &vp : NULL,
 			    false);
 			vn_finished_write(mp);
 			if (error) {
 				NDFREE_PNBUF(ndp);
 				/* ERELOOKUP asks for the lookup to be redone. */
 				if (error == ERELOOKUP) {
 					NDREINIT(ndp);
 					goto restart;
 				}
 				return (error);
 			}
 			/* A file created here is not truncated on open. */
 			fmode &= ~O_TRUNC;
 		} else {
 			/* The name already exists: release the parent. */
 			if (ndp->ni_dvp == ndp->ni_vp)
 				vrele(ndp->ni_dvp);
 			else
 				vput(ndp->ni_dvp);
 			ndp->ni_dvp = NULL;
 			vp = ndp->ni_vp;
 			if (fmode & O_EXCL) {
 				error = EEXIST;
 				goto bad;
 			}
 			if (vp->v_type == VDIR) {
 				error = EISDIR;
 				goto bad;
 			}
 			fmode &= ~O_CREAT;
 		}
 	} else {
 		/* Plain lookup; the leaf is locked shared unless writing. */
 		ndp->ni_cnd.cn_nameiop = LOOKUP;
 		ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags);
 		ndp->ni_cnd.cn_flags |= (fmode & O_NOFOLLOW) != 0 ? NOFOLLOW :
 		    FOLLOW;
 		if ((fmode & FWRITE) == 0)
 			ndp->ni_cnd.cn_flags |= LOCKSHARED;
 		if ((error = namei(ndp)) != 0)
 			return (error);
 		vp = ndp->ni_vp;
 	}
 	error = vn_open_vnode(vp, fmode, cred, curthread, fp);
 	/* Clear VI_FOPENING and wake anyone sleeping on the vnode. */
 	if (first_open) {
 		VI_LOCK(vp);
 		vp->v_iflag &= ~VI_FOPENING;
 		wakeup(vp);
 		VI_UNLOCK(vp);
 	}
 	if (error)
 		goto bad;
 	*flagp = fmode;
 	return (0);
 bad:
 	NDFREE_PNBUF(ndp);
 	vput(vp);
 	*flagp = fmode;
 	ndp->ni_vp = NULL;
 	return (error);
 }
 
 static int
 vn_open_vnode_advlock(struct vnode *vp, int fmode, struct file *fp)
 {
 	struct flock lf;
 	int error, lock_flags, type;
 
 	ASSERT_VOP_LOCKED(vp, "vn_open_vnode_advlock");
 	if ((fmode & (O_EXLOCK | O_SHLOCK)) == 0)
 		return (0);
 	KASSERT(fp != NULL, ("open with flock requires fp"));
 	if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE)
 		return (EOPNOTSUPP);
 
 	lock_flags = VOP_ISLOCKED(vp);
 	VOP_UNLOCK(vp);
 
 	lf.l_whence = SEEK_SET;
 	lf.l_start = 0;
 	lf.l_len = 0;
 	lf.l_type = (fmode & O_EXLOCK) != 0 ? F_WRLCK : F_RDLCK;
 	type = F_FLOCK;
 	if ((fmode & FNONBLOCK) == 0)
 		type |= F_WAIT;
 	if ((fmode & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
 		type |= F_FIRSTOPEN;
 	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
 	if (error == 0)
 		fp->f_flag |= FHASLOCK;
 
 	vn_lock(vp, lock_flags | LK_RETRY);
 	return (error);
 }
 
 /*
  * Common code for vnode open operations once a vnode is located.
  * Check permissions, and call the VOP_OPEN routine.
  *
  * Returns 0 on success.  Errors produced here are EMLINK (symlink
  * opened without O_PATH, or with FEXEC), ENOTDIR (O_DIRECTORY on a
  * non-directory), EOPNOTSUPP (socket without O_PATH) and EISDIR (write
  * or truncate of a directory).  Errors from the MAC check, VOP_ACCESS(),
  * VOP_OPEN(), the advisory lock and VOP_ADD_WRITECOUNT() are passed
  * through.
  *
  * fp may be NULL for a kernel-mode open on the paths that do not
  * dereference it; see the error handling at the end.
  */
 int
 vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
     struct thread *td, struct file *fp)
 {
 	accmode_t accmode;
 	int error;
 
 	if (vp->v_type == VLNK) {
 		if ((fmode & O_PATH) == 0 || (fmode & FEXEC) != 0)
 			return (EMLINK);
 	}
 	if (vp->v_type != VDIR && fmode & O_DIRECTORY)
 		return (ENOTDIR);
 
 	/* Derive the access mode to check from the open flags. */
 	accmode = 0;
 	if ((fmode & O_PATH) == 0) {
 		if (vp->v_type == VSOCK)
 			return (EOPNOTSUPP);
 		if ((fmode & (FWRITE | O_TRUNC)) != 0) {
 			if (vp->v_type == VDIR)
 				return (EISDIR);
 			accmode |= VWRITE;
 		}
 		if ((fmode & FREAD) != 0)
 			accmode |= VREAD;
 		if ((fmode & O_APPEND) && (fmode & FWRITE))
 			accmode |= VAPPEND;
 #ifdef MAC
 		if ((fmode & O_CREAT) != 0)
 			accmode |= VCREAT;
 #endif
 	}
 	if ((fmode & FEXEC) != 0)
 		accmode |= VEXEC;
 #ifdef MAC
 	if ((fmode & O_VERIFY) != 0)
 		accmode |= VVERIFY;
 	error = mac_vnode_check_open(cred, vp, accmode);
 	if (error != 0)
 		return (error);
 
 	/* VCREAT and VVERIFY are only passed to the MAC check. */
 	accmode &= ~(VCREAT | VVERIFY);
 #endif
 	/* The VOP_ACCESS() check is skipped when O_CREAT is still set. */
 	if ((fmode & O_CREAT) == 0 && accmode != 0) {
 		error = VOP_ACCESS(vp, accmode, cred, td);
 		if (error != 0)
 			return (error);
 	}
 	/*
 	 * O_PATH opens stop here without calling VOP_OPEN().  FKQALLOWED
 	 * is granted only if the vnode is neither a fifo nor a socket
 	 * and is readable by cred.
 	 */
 	if ((fmode & O_PATH) != 0) {
 		if (vp->v_type != VFIFO && vp->v_type != VSOCK &&
 		    VOP_ACCESS(vp, VREAD, cred, td) == 0)
 			fp->f_flag |= FKQALLOWED;
 		return (0);
 	}
 
 	/* Fifos are opened with the vnode lock held exclusively. */
 	if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
 		vn_lock(vp, LK_UPGRADE | LK_RETRY);
 	error = VOP_OPEN(vp, fmode, cred, td, fp);
 	if (error != 0)
 		return (error);
 
 	error = vn_open_vnode_advlock(vp, fmode, fp);
 	if (error == 0 && (fmode & FWRITE) != 0) {
 		error = VOP_ADD_WRITECOUNT(vp, 1);
 		if (error == 0) {
 			CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
 			     __func__, vp, vp->v_writecount);
 		}
 	}
 
 	/*
 	 * Error from advlock or VOP_ADD_WRITECOUNT() still requires
 	 * calling VOP_CLOSE() to pair with earlier VOP_OPEN().
 	 */
 	if (error != 0) {
 		if (fp != NULL) {
 			/*
 			 * Arrange the call by having fdrop() to use
 			 * vn_closefile().  This is to satisfy
 			 * filesystems like devfs or tmpfs, which
 			 * override fo_close().
 			 */
 			fp->f_flag |= FOPENFAILED;
 			fp->f_vnode = vp;
 			if (fp->f_ops == &badfileops) {
 				fp->f_type = DTYPE_VNODE;
 				fp->f_ops = &vnops;
 			}
 			/* Extra reference for the fp->f_vnode pointer. */
 			vref(vp);
 		} else {
 			/*
 			 * If there is no fp, due to kernel-mode open,
 			 * we can call VOP_CLOSE() now.
 			 */
 			if ((vp->v_type == VFIFO ||
 			    !MNT_EXTENDED_SHARED(vp->v_mount)) &&
 			    VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
 				vn_lock(vp, LK_UPGRADE | LK_RETRY);
 			(void)VOP_CLOSE(vp, fmode & (FREAD | FWRITE | FEXEC),
 			    cred, td);
 		}
 	}
 
 	ASSERT_VOP_LOCKED(vp, "vn_open_vnode");
 	return (error);
 
 }
 
 /*
  * Check whether the specified vnode may be written.
  * A vnode that VOP_IS_TEXT() reports as text cannot be written and
  * yields ETXTBSY; anything else yields 0.
  * The check is racy.
  */
 int
 vn_writechk(struct vnode *vp)
 {
 
 	ASSERT_VOP_LOCKED(vp, "vn_writechk");
 	return (VOP_IS_TEXT(vp) ? ETXTBSY : 0);
 }
 
 /*
  * Close a vnode.
  *
  * The vnode is locked here (shared when the vnode is not a fifo and its
  * mount allows extended shared locking, exclusive otherwise), the write
  * count taken at open time is dropped for a successfully opened writer,
  * and VOP_CLOSE() is called.  With 'keep_ref' the vnode is merely
  * unlocked afterwards; without it the reference is released as well.
  * Returns the VOP_CLOSE() result.
  */
 static int
 vn_close1(struct vnode *vp, int flags, struct ucred *file_cred,
     struct thread *td, bool keep_ref)
 {
 	struct mount *mp;
 	int error, lktype;
 
 	if (vp->v_type != VFIFO && MNT_EXTENDED_SHARED(vp->v_mount))
 		lktype = LK_SHARED;
 	else
 		lktype = LK_EXCLUSIVE;
 
 	vn_start_write(vp, &mp, V_WAIT);
 	vn_lock(vp, lktype | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 
 	/* A failed open (FOPENFAILED) never took the write count. */
 	if ((flags & (FWRITE | FOPENFAILED)) == FWRITE) {
 		VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
 		CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
 		    __func__, vp, vp->v_writecount);
 	}
 	error = VOP_CLOSE(vp, flags, file_cred, td);
 
 	if (!keep_ref)
 		vput(vp);
 	else
 		VOP_UNLOCK(vp);
 	vn_finished_write(mp);
 	return (error);
 }
 
 int
 vn_close(struct vnode *vp, int flags, struct ucred *file_cred,
     struct thread *td)
 {
 
 	return (vn_close1(vp, flags, file_cred, td, false));
 }
 
 /*
  * Heuristic to detect sequential operation.
  */
 static int
 sequential_heuristic(struct uio *uio, struct file *fp)
 {
 	enum uio_rw rw;
 
 	ASSERT_VOP_LOCKED(fp->f_vnode, __func__);
 
 	rw = uio->uio_rw;
 	if (fp->f_flag & FRDAHEAD)
 		return (fp->f_seqcount[rw] << IO_SEQSHIFT);
 
 	/*
 	 * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
 	 * that the first I/O is normally considered to be slightly
 	 * sequential.  Seeking to offset 0 doesn't change sequentiality
 	 * unless previous seeks have reduced f_seqcount to 0, in which
 	 * case offset 0 is not special.
 	 */
 	if ((uio->uio_offset == 0 && fp->f_seqcount[rw] > 0) ||
 	    uio->uio_offset == fp->f_nextoff[rw]) {
 		/*
 		 * f_seqcount is in units of fixed-size blocks so that it
 		 * depends mainly on the amount of sequential I/O and not
 		 * much on the number of sequential I/O's.  The fixed size
 		 * of 16384 is hard-coded here since it is (not quite) just
 		 * a magic size that works well here.  This size is more
 		 * closely related to the best I/O size for real disks than
 		 * to any block size used by software.
 		 */
 		if (uio->uio_resid >= IO_SEQMAX * 16384)
 			fp->f_seqcount[rw] = IO_SEQMAX;
 		else {
 			fp->f_seqcount[rw] += howmany(uio->uio_resid, 16384);
 			if (fp->f_seqcount[rw] > IO_SEQMAX)
 				fp->f_seqcount[rw] = IO_SEQMAX;
 		}
 		return (fp->f_seqcount[rw] << IO_SEQSHIFT);
 	}
 
 	/* Not sequential.  Quickly draw-down sequentiality. */
 	if (fp->f_seqcount[rw] > 1)
 		fp->f_seqcount[rw] = 1;
 	else
 		fp->f_seqcount[rw] = 0;
 	return (0);
 }
 
 /*
  * Package up an I/O request on a vnode into a uio and do it.
  *
  * Unless IO_NODELOCKED is set in ioflg, this function takes the range
  * lock (skipped with IO_RANGELOCKED), starts a write for UIO_WRITE on
  * vnodes other than VCHR, and locks the vnode, undoing all of that
  * before it returns.  With IO_NODELOCKED the vnode lock must already be
  * held; this is asserted.
  *
  * file_cred, when not NULL, is preferred over active_cred for the I/O
  * itself.  If aresid is not NULL the residual count is stored there;
  * otherwise a short transfer that reported no error is turned into EIO.
  * A negative offset is rejected with EINVAL except on VCHR vnodes.
  */
 int
 vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
     enum uio_seg segflg, int ioflg, struct ucred *active_cred,
     struct ucred *file_cred, ssize_t *aresid, struct thread *td)
 {
 	struct uio auio;
 	struct iovec aiov;
 	struct mount *mp;
 	struct ucred *cred;
 	void *rl_cookie;
 	struct vn_io_fault_args args;
 	int error, lock_flags;
 
 	if (offset < 0 && vp->v_type != VCHR)
 		return (EINVAL);
 	/* Single-segment uio describing [base, base + len). */
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	aiov.iov_base = base;
 	aiov.iov_len = len;
 	auio.uio_resid = len;
 	auio.uio_offset = offset;
 	auio.uio_segflg = segflg;
 	auio.uio_rw = rw;
 	auio.uio_td = td;
 	error = 0;
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		if ((ioflg & IO_RANGELOCKED) == 0) {
 			/* An appending write locks the whole file range. */
 			if (rw == UIO_READ) {
 				rl_cookie = vn_rangelock_rlock(vp, offset,
 				    offset + len);
 			} else if ((ioflg & IO_APPEND) != 0) {
 				rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 			} else {
 				rl_cookie = vn_rangelock_wlock(vp, offset,
 				    offset + len);
 			}
 		} else
 			rl_cookie = NULL;
 		mp = NULL;
 		if (rw == UIO_WRITE) { 
 			/* On failure only the range lock needs undoing. */
 			if (vp->v_type != VCHR &&
 			    (error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH))
 			    != 0)
 				goto out;
 			lock_flags = vn_lktype_write(mp, vp);
 		} else
 			lock_flags = LK_SHARED;
 		vn_lock(vp, lock_flags | LK_RETRY);
 	} else
 		rl_cookie = NULL;
 
 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 #ifdef MAC
 	if ((ioflg & IO_NOMACCHECK) == 0) {
 		if (rw == UIO_READ)
 			error = mac_vnode_check_read(active_cred, file_cred,
 			    vp);
 		else
 			error = mac_vnode_check_write(active_cred, file_cred,
 			    vp);
 	}
 #endif
 	if (error == 0) {
 		if (file_cred != NULL)
 			cred = file_cred;
 		else
 			cred = active_cred;
 		/* Pick the vn_io_fault path when do_vn_io_fault() says so. */
 		if (do_vn_io_fault(vp, &auio)) {
 			args.kind = VN_IO_FAULT_VOP;
 			args.cred = cred;
 			args.flags = ioflg;
 			args.args.vop_args.vp = vp;
 			error = vn_io_fault1(vp, &auio, &args, td);
 		} else if (rw == UIO_READ) {
 			error = VOP_READ(vp, &auio, ioflg, cred);
 		} else /* if (rw == UIO_WRITE) */ {
 			error = VOP_WRITE(vp, &auio, ioflg, cred);
 		}
 	}
 	/* Without aresid, a short transfer is reported as EIO. */
 	if (aresid)
 		*aresid = auio.uio_resid;
 	else
 		if (auio.uio_resid && error == 0)
 			error = EIO;
 	/* mp is only initialized when the vnode was locked here. */
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		VOP_UNLOCK(vp);
 		if (mp != NULL)
 			vn_finished_write(mp);
 	}
  out:
 	if (rl_cookie != NULL)
 		vn_rangelock_unlock(vp, rl_cookie);
 	return (error);
 }
 
 /*
  * Package up an I/O request on a vnode into a uio and do it, one chunk
  * at a time.
  *
  * Splitting the request avoids saturating the buffer cache while a
  * vnode may be held locked: bwillwrite() is called before each
  * vn_rdwr() that writes to a regular file, and kern_yield() is called
  * after each chunk so that other processes get a chance to lock the
  * vnode (other processes core'ing the same binary, or unrelated ones
  * scanning the directory).
  *
  * If aresid is not NULL it receives the number of bytes that were not
  * transferred.
  */
 int
 vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len,
     off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred,
     struct ucred *file_cred, size_t *aresid, struct thread *td)
 {
 	ssize_t chunk_resid;
 	int chunk, error;
 
 	error = 0;
 	do {
 		/*
 		 * Every chunk but possibly the first one starts on a
 		 * MAXBSIZE boundary, so that filesystems only have to
 		 * deal with partial blocks in the first and last chunks.
 		 */
 		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
 		if (chunk > len)
 			chunk = len;
 
 		if (rw != UIO_READ && vp->v_type == VREG)
 			bwillwrite();
 
 		chunk_resid = 0;
 		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
 		    ioflg, active_cred, file_cred, &chunk_resid, td);
 
 		/* The residual of this chunk is added back at the end. */
 		len -= chunk;
 		if (error != 0)
 			break;
 
 		base = (char *)base + chunk;
 		offset += chunk;
 		kern_yield(PRI_USER);
 	} while (len != 0);
 
 	if (aresid != NULL)
 		*aresid = len + chunk_resid;
 	return (error);
 }
 
 #if OFF_MAX <= LONG_MAX
 /*
  * Return the current offset of 'fp', first acquiring the offset lock
  * unless FOF_NOLOCK is passed.
  *
  * This variant is compiled when OFF_MAX <= LONG_MAX, so f_offset is
  * accessed with the atomic long primitives.  The lock is the
  * FOFFSET_LOCKED bit of f_vnread_flags; contended acquirers set
  * FOFFSET_LOCK_WAITING and sleep on the sleepqueue keyed by
  * &fp->f_vnread_flags.  FOF_OFFSET must not be passed.
  */
 off_t
 foffset_lock(struct file *fp, int flags)
 {
 	volatile short *flagsp;
 	off_t res;
 	short state;
 
 	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 
 	if ((flags & FOF_NOLOCK) != 0)
 		return (atomic_load_long(&fp->f_offset));
 
 	/*
 	 * According to McKusick the vn lock was protecting f_offset here.
 	 * It is now protected by the FOFFSET_LOCKED flag.
 	 */
 	flagsp = &fp->f_vnread_flags;
 	/* Fast path: no holder and no waiters. */
 	if (atomic_cmpset_acq_16(flagsp, 0, FOFFSET_LOCKED))
 		return (atomic_load_long(&fp->f_offset));
 
 	/* Slow path, run with the sleepqueue chain lock held. */
 	sleepq_lock(&fp->f_vnread_flags);
 	state = atomic_load_16(flagsp);
 	for (;;) {
 		/*
 		 * A failed atomic_fcmpset refreshes 'state' with the
 		 * value it found, hence the bare 'continue's below.
 		 */
 		if ((state & FOFFSET_LOCKED) == 0) {
 			if (!atomic_fcmpset_acq_16(flagsp, &state,
 			    FOFFSET_LOCKED))
 				continue;
 			break;
 		}
 		/* Announce ourselves so that the unlocker wakes us up. */
 		if ((state & FOFFSET_LOCK_WAITING) == 0) {
 			if (!atomic_fcmpset_acq_16(flagsp, &state,
 			    state | FOFFSET_LOCK_WAITING))
 				continue;
 		}
 		DROP_GIANT();
 		sleepq_add(&fp->f_vnread_flags, NULL, "vofflock", 0, 0);
 		sleepq_wait(&fp->f_vnread_flags, PUSER -1);
 		PICKUP_GIANT();
 		/* Retake the chain lock before looking at the flags again. */
 		sleepq_lock(&fp->f_vnread_flags);
 		state = atomic_load_16(flagsp);
 	}
 	res = atomic_load_long(&fp->f_offset);
 	sleepq_release(&fp->f_vnread_flags);
 	return (res);
 }
 
 /*
  * Store the new offset and next-offset hints as requested by 'flags'
  * and, unless FOF_NOLOCK is passed, release the offset lock taken by
  * foffset_lock().
  *
  * 'val' is written to f_offset unless FOF_NOUPDATE is set, and to
  * f_nextoff[UIO_READ] / f_nextoff[UIO_WRITE] when FOF_NEXTOFF_R /
  * FOF_NEXTOFF_W are set.  FOF_OFFSET must not be passed.
  */
 void
 foffset_unlock(struct file *fp, off_t val, int flags)
 {
 	volatile short *flagsp;
 	short state;
 
 	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 
 	if ((flags & FOF_NOUPDATE) == 0)
 		atomic_store_long(&fp->f_offset, val);
 	if ((flags & FOF_NEXTOFF_R) != 0)
 		fp->f_nextoff[UIO_READ] = val;
 	if ((flags & FOF_NEXTOFF_W) != 0)
 		fp->f_nextoff[UIO_WRITE] = val;
 
 	if ((flags & FOF_NOLOCK) != 0)
 		return;
 
 	flagsp = &fp->f_vnread_flags;
 	state = atomic_load_16(flagsp);
 	/*
 	 * Fast path: nobody is waiting, clear the flags in one go.  If
 	 * the cmpset fails the flags changed meanwhile, so fall through
 	 * to the slow path.
 	 */
 	if ((state & FOFFSET_LOCK_WAITING) == 0 &&
 	    atomic_cmpset_rel_16(flagsp, state, 0))
 		return;
 
 	/* Slow path: clear the flags and wake up every waiter. */
 	sleepq_lock(&fp->f_vnread_flags);
 	MPASS((fp->f_vnread_flags & FOFFSET_LOCKED) != 0);
 	MPASS((fp->f_vnread_flags & FOFFSET_LOCK_WAITING) != 0);
 	fp->f_vnread_flags = 0;
 	sleepq_broadcast(&fp->f_vnread_flags, SLEEPQ_SLEEP, 0, 0);
 	sleepq_release(&fp->f_vnread_flags);
 }
 
 /*
  * Read the current file offset without taking the offset lock.
  */
 static off_t
 foffset_read(struct file *fp)
 {
 	off_t cur;
 
 	cur = atomic_load_long(&fp->f_offset);
 	return (cur);
 }
 #else
 /*
  * Return the current offset of 'fp', first acquiring the offset lock
  * unless FOF_NOLOCK is passed.
  *
  * This variant is used when off_t does not fit in a long: f_offset and
  * f_vnread_flags are both accessed under a pool mutex keyed by 'fp'.
  * FOF_OFFSET must not be passed.
  */
 off_t
 foffset_lock(struct file *fp, int flags)
 {
 	struct mtx *mtxp;
 	off_t cur;
 
 	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 
 	mtxp = mtx_pool_find(mtxpool_sleep, fp);
 	mtx_lock(mtxp);
 	if ((flags & FOF_NOLOCK) == 0) {
 		/* Sleep until the current holder clears FOFFSET_LOCKED. */
 		for (;;) {
 			if ((fp->f_vnread_flags & FOFFSET_LOCKED) == 0)
 				break;
 			fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
 			msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
 			    "vofflock", 0);
 		}
 		fp->f_vnread_flags |= FOFFSET_LOCKED;
 	}
 	cur = fp->f_offset;
 	mtx_unlock(mtxp);
 	return (cur);
 }
 
 /*
  * Store the new offset and next-offset hints as requested by 'flags'
  * and, unless FOF_NOLOCK is passed, release the offset lock and wake up
  * any waiters.  Everything happens under the pool mutex keyed by 'fp'.
  * FOF_OFFSET must not be passed.
  */
 void
 foffset_unlock(struct file *fp, off_t val, int flags)
 {
 	struct mtx *mtxp;
 
 	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 
 	mtxp = mtx_pool_find(mtxpool_sleep, fp);
 	mtx_lock(mtxp);
 
 	if ((flags & FOF_NOUPDATE) == 0)
 		fp->f_offset = val;
 	if ((flags & FOF_NEXTOFF_R) != 0)
 		fp->f_nextoff[UIO_READ] = val;
 	if ((flags & FOF_NEXTOFF_W) != 0)
 		fp->f_nextoff[UIO_WRITE] = val;
 
 	if ((flags & FOF_NOLOCK) == 0) {
 		KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0,
 		    ("Lost FOFFSET_LOCKED"));
 		if ((fp->f_vnread_flags & FOFFSET_LOCK_WAITING) != 0)
 			wakeup(&fp->f_vnread_flags);
 		fp->f_vnread_flags = 0;
 	}
 
 	mtx_unlock(mtxp);
 }
 
 /*
  * Read the current file offset without taking the offset lock; the
  * read itself still goes through foffset_lock() with FOF_NOLOCK.
  */
 static off_t
 foffset_read(struct file *fp)
 {
 	off_t cur;
 
 	cur = foffset_lock(fp, FOF_NOLOCK);
 	return (cur);
 }
 #endif
 
 /*
  * Load the file offset into uio->uio_offset via foffset_lock().  Does
  * nothing when FOF_OFFSET is set, i.e. when the uio already carries the
  * offset to use.
  */
 void
 foffset_lock_uio(struct file *fp, struct uio *uio, int flags)
 {
 
 	if ((flags & FOF_OFFSET) != 0)
 		return;
 	uio->uio_offset = foffset_lock(fp, flags);
 }
 
 /*
  * Hand uio->uio_offset to foffset_unlock().  Does nothing when
  * FOF_OFFSET is set.
  */
 void
 foffset_unlock_uio(struct file *fp, struct uio *uio, int flags)
 {
 
 	if ((flags & FOF_OFFSET) != 0)
 		return;
 	foffset_unlock(fp, uio->uio_offset, flags);
 }
 
 /*
  * Return the POSIX_FADV_* advice applying to the request in 'uio'.
  *
  * The advice recorded on the file is returned only for a regular file
  * and only when the whole request lies within the advised range;
  * POSIX_FADV_NORMAL is returned otherwise.
  */
 static int
 get_advice(struct file *fp, struct uio *uio)
 {
 	struct mtx *mtxp;
 	int advice;
 
 	/* Unlocked pre-check; f_advice is checked again under the mutex. */
 	if (fp->f_advice == NULL || fp->f_vnode->v_type != VREG)
 		return (POSIX_FADV_NORMAL);
 
 	advice = POSIX_FADV_NORMAL;
 	mtxp = mtx_pool_find(mtxpool_sleep, fp);
 	mtx_lock(mtxp);
 	if (fp->f_advice != NULL &&
 	    uio->uio_offset >= fp->f_advice->fa_start &&
 	    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
 		advice = fp->f_advice->fa_advice;
 	mtx_unlock(mtxp);
 	return (advice);
 }
 
 /*
  * Translate the open-file flags of fp, and the MNT_SYNCHRONOUS flag of
  * the vnode's mount (if any), into IO_* flags for a write-type
  * operation.
  */
 static int
 get_write_ioflag(struct file *fp)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	int ioflag;
 
 	vp = fp->f_vnode;
 	mp = atomic_load_ptr(&vp->v_mount);
 
 	ioflag = (fp->f_flag & O_DIRECT) != 0 ? IO_DIRECT : 0;
 
 	if ((fp->f_flag & O_FSYNC) != 0)
 		ioflag |= IO_SYNC;
 	else if (mp != NULL && (mp->mnt_flag & MNT_SYNCHRONOUS) != 0)
 		ioflag |= IO_SYNC;
 
 	/*
 	 * O_DSYNC requests IO_DATASYNC, but IO_SYNC is set as well so that
 	 * a VOP_WRITE() or VOP_DEALLOCATE() implementation that does not
 	 * know about IO_DATASYNC falls back to full O_SYNC behavior.
 	 */
 	if ((fp->f_flag & O_DSYNC) != 0)
 		ioflag |= IO_SYNC | IO_DATASYNC;
 
 	return (ioflag);
 }
 
 /*
  * Try to satisfy a read request by copying directly from pages that are
  * already present in the vnode's VM object.
  *
  * Returns 0 only when the whole request was transferred (uio_resid
  * reached zero), or the error from vn_io_fault_pgmove() if the copy
  * failed.  EJUSTRETURN is returned in every other case: no VM object, a
  * dead object, no usable page at the starting offset, a doomed vnode, a
  * starting offset at or past the object size, or a transfer that covered
  * only part of the request.
  */
 int
 vn_read_from_obj(struct vnode *vp, struct uio *uio)
 {
 	vm_object_t obj;
 	vm_page_t ma[io_hold_cnt + 2];
 	off_t off, vsz;
 	ssize_t resid;
 	int error, i, j;
 
 	/* The request must fit into the on-stack page array. */
 	MPASS(uio->uio_resid <= ptoa(io_hold_cnt + 2));
 	obj = atomic_load_ptr(&vp->v_object);
 	if (obj == NULL)
 		return (EJUSTRETURN);
 
 	/*
 	 * Depends on type stability of vm_objects.
 	 */
 	vm_object_pip_add(obj, 1);
 	if ((obj->flags & OBJ_DEAD) != 0) {
 		/*
 		 * Note that object might be already reused from the
 		 * vnode, and the OBJ_DEAD flag cleared.  This is fine,
 		 * we recheck for DOOMED vnode state after all pages
 		 * are busied, and retract then.
 		 *
 		 * But we check for OBJ_DEAD to ensure that we do not
 		 * busy pages while vm_object_terminate_pages()
 		 * processes the queue.
 		 */
 		error = EJUSTRETURN;
 		goto out_pip;
 	}
 
 	/*
 	 * Grab consecutive pages covering the request.  The loop stops at
 	 * the first page that cannot be grabbed or has no valid bits, so
 	 * on exit ma[0..i-1] are the pages held busy by this function.
 	 */
 	resid = uio->uio_resid;
 	off = uio->uio_offset;
 	for (i = 0; resid > 0; i++) {
 		MPASS(i < io_hold_cnt + 2);
 		ma[i] = vm_page_grab_unlocked(obj, atop(off),
 		    VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY |
 		    VM_ALLOC_NOWAIT);
 		if (ma[i] == NULL)
 			break;
 
 		/*
 		 * Skip invalid pages.  Valid mask can be partial only
 		 * at EOF, and we clip later.
 		 */
 		if (vm_page_none_valid(ma[i])) {
 			vm_page_sunbusy(ma[i]);
 			break;
 		}
 
 		resid -= PAGE_SIZE;
 		off += PAGE_SIZE;
 	}
 	if (i == 0) {
 		/* Not even the first page was usable. */
 		error = EJUSTRETURN;
 		goto out_pip;
 	}
 
 	/*
 	 * Check VIRF_DOOMED after we busied our pages.  Since
 	 * vgonel() terminates the vnode's vm_object, it cannot
 	 * process past pages busied by us.
 	 */
 	if (VN_IS_DOOMED(vp)) {
 		error = EJUSTRETURN;
 		goto out;
 	}
 
 	/*
 	 * Number of bytes covered by the i grabbed pages, counted from
 	 * the in-page position of uio_offset, clipped to the request.
 	 */
 	resid = PAGE_SIZE - (uio->uio_offset & PAGE_MASK) + ptoa(i - 1);
 	if (resid > uio->uio_resid)
 		resid = uio->uio_resid;
 
 	/*
 	 * Unlocked read of vnp_size is safe because truncation cannot
 	 * pass busied page.  But we load vnp_size into a local
 	 * variable so that possible concurrent extension does not
 	 * break calculation.
 	 */
 #if defined(__powerpc__) && !defined(__powerpc64__)
 	vsz = obj->un_pager.vnp.vnp_size;
 #else
 	vsz = atomic_load_64(&obj->un_pager.vnp.vnp_size);
 #endif
 	if (uio->uio_offset >= vsz) {
 		error = EJUSTRETURN;
 		goto out;
 	}
 	/* Do not copy past the object size. */
 	if (uio->uio_offset + resid > vsz)
 		resid = vsz - uio->uio_offset;
 
 	error = vn_io_fault_pgmove(ma, uio->uio_offset & PAGE_MASK, resid, uio);
 
 out:
 	/*
 	 * Release every page grabbed above; they are marked referenced
 	 * only when the copy succeeded.
 	 */
 	for (j = 0; j < i; j++) {
 		if (error == 0)
 			vm_page_reference(ma[j]);
 		vm_page_sunbusy(ma[j]);
 	}
 out_pip:
 	vm_object_pip_wakeup(obj);
 	if (error != 0)
 		return (error);
 	/* A partial transfer is reported as EJUSTRETURN, not as success. */
 	return (uio->uio_resid == 0 ? 0 : EJUSTRETURN);
 }
 
 /*
  * File table vnode read routine.
  *
  * The caller must pass FOF_OFFSET (asserted), so the position is taken
  * from uio->uio_offset.  When the page cache read path is enabled and
  * applicable, VOP_READ_PGCACHE() is tried first without the vnode lock;
  * a result of EJUSTRETURN makes the function fall through to the regular
  * VOP_READ() under a shared vnode lock.  f_nextoff[UIO_READ] is updated
  * to the resulting offset after a successful page cache read and after
  * every VOP_READ() attempt.
  */
 static int
 vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
     struct thread *td)
 {
 	struct vnode *vp;
 	off_t orig_offset;
 	int error, ioflag;
 	int advice;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
 	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
 	vp = fp->f_vnode;
 	ioflag = 0;
 	if (fp->f_flag & FNONBLOCK)
 		ioflag |= IO_NDELAY;
 	if (fp->f_flag & O_DIRECT)
 		ioflag |= IO_DIRECT;
 
 	/*
 	 * Try to read from page cache.  VIRF_DOOMED check is racy but
 	 * allows us to avoid unneeded work outright.
 	 */
 	if (vn_io_pgcache_read_enable && !mac_vnode_check_read_enabled() &&
 	    (vn_irflag_read(vp) & (VIRF_DOOMED | VIRF_PGREAD)) == VIRF_PGREAD) {
 		error = VOP_READ_PGCACHE(vp, uio, ioflag, fp->f_cred);
 		if (error == 0) {
 			fp->f_nextoff[UIO_READ] = uio->uio_offset;
 			return (0);
 		}
 		/* EJUSTRETURN means "use the locked path below". */
 		if (error != EJUSTRETURN)
 			return (error);
 	}
 
 	advice = get_advice(fp, uio);
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 
 	switch (advice) {
 	case POSIX_FADV_NORMAL:
 	case POSIX_FADV_SEQUENTIAL:
 	case POSIX_FADV_NOREUSE:
 		ioflag |= sequential_heuristic(uio, fp);
 		break;
 	case POSIX_FADV_RANDOM:
 		/* Disable read-ahead for random I/O. */
 		break;
 	}
 	/* Remembered so the range actually read can be advised below. */
 	orig_offset = uio->uio_offset;
 
 #ifdef MAC
 	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
 	if (error == 0)
 #endif
 		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
 	fp->f_nextoff[UIO_READ] = uio->uio_offset;
 	VOP_UNLOCK(vp);
 	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
 	    orig_offset != uio->uio_offset)
 		/*
 		 * Use POSIX_FADV_DONTNEED to flush pages and buffers
 		 * for the backing file after a POSIX_FADV_NOREUSE
 		 * read(2).
 		 */
 		error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
 		    POSIX_FADV_DONTNEED);
 	return (error);
 }
 
 /*
  * File table vnode write routine.
  *
  * The caller must pass FOF_OFFSET (asserted), so the position is taken
  * from uio->uio_offset.  For every vnode type except VCHR the write is
  * bracketed by vn_start_write()/vn_finished_write().  The vnode is
  * locked with the lock type chosen by vn_lktype_write() around
  * VOP_WRITE(), and f_nextoff[UIO_WRITE] is updated to the resulting
  * offset.
  */
 static int
 vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
     struct thread *td)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	off_t orig_offset;
 	int error, ioflag;
 	int advice;
 	bool need_finished_write;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
 	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
 	vp = fp->f_vnode;
 	if (vp->v_type == VREG)
 		bwillwrite();
 	ioflag = IO_UNIT;
 	/* O_APPEND is honored for regular files only. */
 	if (vp->v_type == VREG && (fp->f_flag & O_APPEND) != 0)
 		ioflag |= IO_APPEND;
 	if ((fp->f_flag & FNONBLOCK) != 0)
 		ioflag |= IO_NDELAY;
 	ioflag |= get_write_ioflag(fp);
 
 	mp = NULL;
 	need_finished_write = false;
 	if (vp->v_type != VCHR) {
 		error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH);
 		if (error != 0)
 			goto unlock;
 		need_finished_write = true;
 	}
 
 	advice = get_advice(fp, uio);
 
 	vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY);
 	switch (advice) {
 	case POSIX_FADV_NORMAL:
 	case POSIX_FADV_SEQUENTIAL:
 	case POSIX_FADV_NOREUSE:
 		ioflag |= sequential_heuristic(uio, fp);
 		break;
 	case POSIX_FADV_RANDOM:
 		/* XXX: Is this correct? */
 		break;
 	}
 	/* Remembered so the range actually written can be advised below. */
 	orig_offset = uio->uio_offset;
 
 #ifdef MAC
 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
 	if (error == 0)
 #endif
 		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
 	fp->f_nextoff[UIO_WRITE] = uio->uio_offset;
 	VOP_UNLOCK(vp);
 	if (need_finished_write)
 		vn_finished_write(mp);
 	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
 	    orig_offset != uio->uio_offset)
 		/*
 		 * Use POSIX_FADV_DONTNEED to flush pages and buffers
 		 * for the backing file after a POSIX_FADV_NOREUSE
 		 * write(2).
 		 */
 		error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
 		    POSIX_FADV_DONTNEED);
 	/* Despite its name, this label only returns; nothing is held here. */
 unlock:
 	return (error);
 }
 
 /*
  * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
  * prevent the following deadlock:
  *
  * Assume that thread A reads from the vnode vp1 into the userspace
  * buffer buf1 backed by the pages of vnode vp2.  If a page in buf1 is
  * currently not resident, then the system ends up with the call chain
  *   vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
  *     vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
  * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
  * If, at the same time, thread B reads from vnode vp2 into buffer buf2
  * backed by the pages of vnode vp1, and some page in buf2 is not
  * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
  *
  * To prevent the lock order reversal and deadlock, vn_io_fault() does
  * not allow page faults to happen during VOP_READ() or VOP_WRITE().
  * Instead, it first tries to do the whole range i/o with pagefaults
  * disabled. If all pages in the i/o buffer are resident and mapped,
  * VOP will succeed (ignoring the genuine filesystem errors).
  * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do
  * i/o in chunks, with all pages in the chunk prefaulted and held
  * using vm_fault_quick_hold_pages().
  *
  * Filesystems using this deadlock avoidance scheme should use the
  * array of the held pages from uio, saved in the curthread->td_ma,
  * instead of doing uiomove().  A helper function
  * vn_io_fault_uiomove() converts uiomove request into
  * uiomove_fromphys() over td_ma array.
  *
  * Since vnode locks do not cover the whole i/o anymore, rangelocks
  * make the current i/o request atomic with respect to other i/os and
  * truncations.
  */
 
 /*
  * Dispatch the i/o request encoded in args with page faults disabled.
  *
  * VN_IO_FAULT_FOP requests call the stored file operation; for
  * VN_IO_FAULT_VOP requests VOP_READ() or VOP_WRITE() is called on the
  * stored vnode according to uio->uio_rw.  Any other kind panics.  The
  * previous page fault state is restored before returning the i/o error.
  */
 static int
 vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio,
     struct thread *td)
 {
 	int error, save;
 
 	error = 0;
 	save = vm_fault_disable_pagefaults();
 	if (args->kind == VN_IO_FAULT_FOP) {
 		error = (args->args.fop_args.doio)(args->args.fop_args.fp,
 		    uio, args->cred, args->flags, td);
 	} else if (args->kind == VN_IO_FAULT_VOP) {
 		switch (uio->uio_rw) {
 		case UIO_READ:
 			error = VOP_READ(args->args.vop_args.vp, uio,
 			    args->flags, args->cred);
 			break;
 		case UIO_WRITE:
 			error = VOP_WRITE(args->args.vop_args.vp, uio,
 			    args->flags, args->cred);
 			break;
 		default:
 			/* Neither direction: error stays 0. */
 			break;
 		}
 	} else {
 		panic("vn_io_fault_doio: unknown kind of io %d %d",
 		    args->kind, uio->uio_rw);
 	}
 	vm_fault_enable_pagefaults(save);
 	return (error);
 }
 
 /*
  * Touch one byte of user memory at base: fetch it with fubyte() and, for
  * a UIO_READ request, store the same value back with subyte().  Returns
  * EFAULT if either access reports -1, otherwise 0.
  */
 static int
 vn_io_fault_touch(char *base, const struct uio *uio)
 {
 	int byte;
 
 	byte = fubyte(base);
 	if (byte == -1)
 		return (EFAULT);
 	if (uio->uio_rw != UIO_READ)
 		return (0);
 	return (subyte(base, byte) == -1 ? EFAULT : 0);
 }
 
 /*
  * Walk the user buffers described by uio and touch them with
  * vn_io_fault_touch(): the current position of each iovec, then every
  * PAGE_SIZE step, and finally the last byte of the iovec once fewer than
  * PAGE_SIZE bytes remain in it.  The uio itself is not modified.
  *
  * Returns 0 on success or the error from vn_io_fault_touch() for the
  * first byte that could not be accessed.
  */
 static int
 vn_io_fault_prefault_user(const struct uio *uio)
 {
 	char *base;
 	const struct iovec *iov;
 	size_t len;
 	ssize_t resid;
 	int error, i;
 
 	KASSERT(uio->uio_segflg == UIO_USERSPACE,
 	    ("vn_io_fault_prefault userspace"));
 
 	error = i = 0;
 	iov = uio->uio_iov;
 	resid = uio->uio_resid;
 	base = iov->iov_base;
 	len = iov->iov_len;
 	while (resid > 0) {
 		error = vn_io_fault_touch(base, uio);
 		if (error != 0)
 			break;
 		if (len < PAGE_SIZE) {
 			/*
 			 * Tail of the current iovec: touch its last byte
 			 * (unless nothing is left) and move on to the next
 			 * iovec, if there is one.
 			 */
 			if (len != 0) {
 				error = vn_io_fault_touch(base + len - 1, uio);
 				if (error != 0)
 					break;
 				resid -= len;
 			}
 			if (++i >= uio->uio_iovcnt)
 				break;
 			iov = uio->uio_iov + i;
 			base = iov->iov_base;
 			len = iov->iov_len;
 		} else {
 			/* Step one page forward within the current iovec. */
 			len -= PAGE_SIZE;
 			base += PAGE_SIZE;
 			resid -= PAGE_SIZE;
 		}
 	}
 	return (error);
 }
 
 /*
  * Common code for vn_io_fault(), agnostic to the kind of i/o request.
  * Uses vn_io_fault_doio() to make the call to an actual i/o function.
  * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request
  * into args and call vn_io_fault1() to handle faults during the user
  * mode buffer accesses.
  *
  * The whole request is tried first.  Only when that attempt returns
  * EFAULT is the remainder redone in chunks of at most io_hold_cnt pages,
  * each chunk with its user pages held and published through td->td_ma.
  * Returns the error of the last i/o attempt, or EFAULT when the user
  * pages of a chunk could not be held.
  */
 static int
 vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args,
     struct thread *td)
 {
 	vm_page_t ma[io_hold_cnt + 2];
 	struct uio *uio_clone, short_uio;
 	struct iovec short_iovec[1];
 	vm_page_t *prev_td_ma;
 	vm_prot_t prot;
 	vm_offset_t addr, end;
 	size_t len, resid;
 	ssize_t adv;
 	int error, cnt, saveheld, prev_td_ma_cnt;
 
 	if (vn_io_fault_prefault) {
 		error = vn_io_fault_prefault_user(uio);
 		if (error != 0)
 			return (error); /* Or ignore ? */
 	}
 
 	/*
 	 * For UIO_READ the user buffer is the destination of the copy, so
 	 * the held pages are requested with write access.
 	 */
 	prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ;
 
 	/*
 	 * The UFS follows IO_UNIT directive and replays back both
 	 * uio_offset and uio_resid if an error is encountered during the
 	 * operation.  But, since the iovec may be already advanced,
 	 * uio is still in an inconsistent state.
 	 *
 	 * Cache a copy of the original uio, which is advanced to the redo
 	 * point using UIO_NOCOPY below.
 	 */
 	uio_clone = cloneuio(uio);
 	resid = uio->uio_resid;
 
 	short_uio.uio_segflg = UIO_USERSPACE;
 	short_uio.uio_rw = uio->uio_rw;
 	short_uio.uio_td = uio->uio_td;
 
 	/* First attempt: the whole request, page faults disabled. */
 	error = vn_io_fault_doio(args, uio, td);
 	if (error != EFAULT)
 		goto out;
 
 	/*
 	 * Advance the clone past the bytes the first attempt already
 	 * transferred, without copying any data.
 	 */
 	atomic_add_long(&vn_io_faults_cnt, 1);
 	uio_clone->uio_segflg = UIO_NOCOPY;
 	uiomove(NULL, resid - uio->uio_resid, uio_clone);
 	uio_clone->uio_segflg = uio->uio_segflg;
 
 	/* Save the thread's held-page state; it is restored after the loop. */
 	saveheld = curthread_pflags_set(TDP_UIOHELD);
 	prev_td_ma = td->td_ma;
 	prev_td_ma_cnt = td->td_ma_cnt;
 
 	while (uio_clone->uio_resid != 0) {
 		len = uio_clone->uio_iov->iov_len;
 		if (len == 0) {
 			/* Current iovec is exhausted; step to the next one. */
 			KASSERT(uio_clone->uio_iovcnt >= 1,
 			    ("iovcnt underflow"));
 			uio_clone->uio_iov++;
 			uio_clone->uio_iovcnt--;
 			continue;
 		}
 		if (len > ptoa(io_hold_cnt))
 			len = ptoa(io_hold_cnt);
 		addr = (uintptr_t)uio_clone->uio_iov->iov_base;
 		end = round_page(addr + len);
 		/* Reject a chunk whose end wraps around the address space. */
 		if (end < addr) {
 			error = EFAULT;
 			break;
 		}
 		/*
 		 * A perfectly misaligned address and length could cause
 		 * both the start and the end of the chunk to use partial
 		 * page.  +2 accounts for such a situation.
 		 */
 		cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
 		    addr, len, prot, ma, io_hold_cnt + 2);
 		if (cnt == -1) {
 			error = EFAULT;
 			break;
 		}
 		/* Describe just this chunk with a single-iovec uio. */
 		short_uio.uio_iov = &short_iovec[0];
 		short_iovec[0].iov_base = (void *)addr;
 		short_uio.uio_iovcnt = 1;
 		short_uio.uio_resid = short_iovec[0].iov_len = len;
 		short_uio.uio_offset = uio_clone->uio_offset;
 		td->td_ma = ma;
 		td->td_ma_cnt = cnt;
 
 		error = vn_io_fault_doio(args, &short_uio, td);
 		vm_page_unhold_pages(ma, cnt);
 		/* Bytes actually transferred for this chunk. */
 		adv = len - short_uio.uio_resid;
 
 		uio_clone->uio_iov->iov_base =
 		    (char *)uio_clone->uio_iov->iov_base + adv;
 		uio_clone->uio_iov->iov_len -= adv;
 		uio_clone->uio_resid -= adv;
 		uio_clone->uio_offset += adv;
 
 		/* Only resid and offset of the caller's uio are kept in sync. */
 		uio->uio_resid -= adv;
 		uio->uio_offset += adv;
 
 		if (error != 0 || adv == 0)
 			break;
 	}
 	td->td_ma = prev_td_ma;
 	td->td_ma_cnt = prev_td_ma_cnt;
 	curthread_pflags_restore(saveheld);
 out:
-	free(uio_clone, M_IOV);
+	freeuio(uio_clone);
 	return (error);
 }
 
 /*
  * Wrapper around vn_read() and vn_write().  It takes the file offset
  * lock (unless FOF_OFFSET is set), takes a range lock when
  * do_vn_io_fault() requests the fault-avoiding path or the vnode has
  * VIRF_PGREAD set, and then performs the i/o either through
  * vn_io_fault1() or by calling vn_read()/vn_write() directly.  The inner
  * routine is always called with FOF_OFFSET added to flags.
  */
 static int
 vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	fo_rdwr_t *doio;
 	struct vnode *vp;
 	void *rl_cookie;
 	struct vn_io_fault_args args;
 	int error;
 	bool do_io_fault, do_rangelock;
 
 	doio = uio->uio_rw == UIO_READ ? vn_read : vn_write;
 	vp = fp->f_vnode;
 
 	/*
 	 * The ability to read(2) on a directory has historically been
 	 * allowed for all users, but this can and has been the source of
 	 * at least one security issue in the past.  As such, it is now hidden
 	 * away behind a sysctl for those that actually need it to use it, and
 	 * restricted to root when it's turned on to make it relatively safe to
 	 * leave on for longer sessions of need.
 	 */
 	if (vp->v_type == VDIR) {
 		KASSERT(uio->uio_rw == UIO_READ,
 		    ("illegal write attempted on a directory"));
 		if (!vfs_allow_read_dir)
 			return (EISDIR);
 		/*
 		 * A priv_check() failure is reported as EISDIR rather than
 		 * as the priv_check() error itself.
 		 */
 		if ((error = priv_check(td, PRIV_VFS_READ_DIR)) != 0)
 			return (EISDIR);
 	}
 
 	do_io_fault = do_vn_io_fault(vp, uio);
 	do_rangelock = do_io_fault || (vn_irflag_read(vp) & VIRF_PGREAD) != 0;
 	foffset_lock_uio(fp, uio, flags);
 	if (do_rangelock) {
 		if (uio->uio_rw == UIO_READ) {
 			rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
 			    uio->uio_offset + uio->uio_resid);
 		} else if ((fp->f_flag & O_APPEND) != 0 ||
 		    (flags & FOF_OFFSET) == 0) {
 			/* For appenders, punt and lock the whole range. */
 			rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 		} else {
 			rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
 			    uio->uio_offset + uio->uio_resid);
 		}
 	}
 	if (do_io_fault) {
 		args.kind = VN_IO_FAULT_FOP;
 		args.args.fop_args.fp = fp;
 		args.args.fop_args.doio = doio;
 		args.cred = active_cred;
 		args.flags = flags | FOF_OFFSET;
 		error = vn_io_fault1(vp, uio, &args, td);
 	} else {
 		error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
 	}
 	/* rl_cookie is only set, and only used, when do_rangelock is true. */
 	if (do_rangelock)
 		vn_rangelock_unlock(vp, rl_cookie);
 	foffset_unlock_uio(fp, uio, flags);
 	return (error);
 }
 
 /*
  * Helper function to perform the requested uiomove operation using
  * the held pages for io->uio_iov[0].iov_base buffer instead of
  * copyin/copyout.  Access to the pages with uiomove_fromphys()
  * instead of iov_base prevents page faults that could occur due to
  * pmap_collect() invalidating the mapping created by
  * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
  * object cleanup revoking the write access from page mappings.
  *
  * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove()
  * instead of plain uiomove().
  *
  * When the current thread does not have TDP_UIOHELD set, or the uio is
  * not UIO_USERSPACE, this is a plain uiomove().  Otherwise at most
  * uio->uio_resid bytes are moved, the thread's td_ma/td_ma_cnt are
  * advanced past the fully consumed pages, and uio is advanced by the
  * number of bytes moved.  Returns the error from uiomove() or
  * uiomove_fromphys().
  */
 int
 vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
 {
 	struct uio transp_uio;
 	struct iovec transp_iov[1];
 	struct thread *td;
 	size_t adv;
 	int error, pgadv;
 
 	td = curthread;
 	if ((td->td_pflags & TDP_UIOHELD) == 0 ||
 	    uio->uio_segflg != UIO_USERSPACE)
 		return (uiomove(data, xfersize, uio));
 
 	KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
 	/* Build a kernel-space uio that describes the data buffer. */
 	transp_iov[0].iov_base = data;
 	transp_uio.uio_iov = &transp_iov[0];
 	transp_uio.uio_iovcnt = 1;
 	if (xfersize > uio->uio_resid)
 		xfersize = uio->uio_resid;
 	transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
 	transp_uio.uio_offset = 0;
 	transp_uio.uio_segflg = UIO_SYSSPACE;
 	/*
 	 * Since transp_iov points to data, and td_ma page array
 	 * corresponds to original uio->uio_iov, we need to invert the
 	 * direction of the i/o operation as passed to
 	 * uiomove_fromphys().
 	 */
 	switch (uio->uio_rw) {
 	case UIO_WRITE:
 		transp_uio.uio_rw = UIO_READ;
 		break;
 	case UIO_READ:
 		transp_uio.uio_rw = UIO_WRITE;
 		break;
 	}
 	transp_uio.uio_td = uio->uio_td;
 	error = uiomove_fromphys(td->td_ma,
 	    ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
 	    xfersize, &transp_uio);
 	/* Bytes actually moved. */
 	adv = xfersize - transp_uio.uio_resid;
 	/* Number of page boundaries crossed in the user buffer. */
 	pgadv =
 	    (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
 	    (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
 	td->td_ma += pgadv;
 	KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
 	    pgadv));
 	td->td_ma_cnt -= pgadv;
 	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
 	uio->uio_iov->iov_len -= adv;
 	uio->uio_resid -= adv;
 	uio->uio_offset += adv;
 	return (error);
 }
 
 /*
  * Page-array counterpart of vn_io_fault_uiomove(): move up to xfersize
  * bytes between the pages ma[] (starting at byte offset) and the buffer
  * described by uio.
  *
  * When the current thread does not have TDP_UIOHELD set, or the uio is
  * not UIO_USERSPACE, the work is delegated to uiomove_fromphys() and its
  * result is returned.  Otherwise the data is copied page-to-page between
  * ma[] and the thread's td_ma array with pmap_copy_pages(), td_ma and
  * uio are advanced by the amount copied, and 0 is returned.
  */
 int
 vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
     struct uio *uio)
 {
 	struct thread *td;
 	vm_offset_t iov_base;
 	int cnt, pgadv;
 
 	td = curthread;
 	if ((td->td_pflags & TDP_UIOHELD) == 0 ||
 	    uio->uio_segflg != UIO_USERSPACE)
 		return (uiomove_fromphys(ma, offset, xfersize, uio));
 
 	KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
 	/* Never copy more than the uio still asks for. */
 	cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize;
 	iov_base = (vm_offset_t)uio->uio_iov->iov_base;
 	switch (uio->uio_rw) {
 	case UIO_WRITE:
 		/* Source is the td_ma pages, destination is ma[]. */
 		pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma,
 		    offset, cnt);
 		break;
 	case UIO_READ:
 		/* Source is ma[], destination is the td_ma pages. */
 		pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK,
 		    cnt);
 		break;
 	}
 	/* Number of page boundaries crossed in the user buffer. */
 	pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT);
 	td->td_ma += pgadv;
 	KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
 	    pgadv));
 	td->td_ma_cnt -= pgadv;
 	uio->uio_iov->iov_base = (char *)(iov_base + cnt);
 	uio->uio_iov->iov_len -= cnt;
 	uio->uio_resid -= cnt;
 	uio->uio_offset += cnt;
 	return (0);
 }
 
 /*
  * File table truncate routine.
  *
  * Truncates the vnode behind fp to length while holding a write range
  * lock over the whole file, a write-start reference on the mount and
  * the exclusive vnode lock.  Directories are rejected with EISDIR.  The
  * whole sequence is restarted when it ends with ERELOOKUP; any other
  * result is returned to the caller.
  */
 static int
 vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
     struct thread *td)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	void *rl_cookie;
 	int error;
 
 	vp = fp->f_vnode;
 
 retry:
 	/*
 	 * Lock the whole range for truncation.  Otherwise split i/o
 	 * might happen partly before and partly after the truncation.
 	 */
 	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 	error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH);
 	if (error)
 		goto out1;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	if (vp->v_type == VDIR) {
 		error = EISDIR;
 		goto out;
 	}
 #ifdef MAC
 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
 	if (error)
 		goto out;
 #endif
 	/* O_FSYNC on the descriptor requests a synchronous truncation. */
 	error = vn_truncate_locked(vp, length, (fp->f_flag & O_FSYNC) != 0,
 	    fp->f_cred);
 out:
 	VOP_UNLOCK(vp);
 	vn_finished_write(mp);
 out1:
 	vn_rangelock_unlock(vp, rl_cookie);
 	/* Everything has been released by now, so a retry starts afresh. */
 	if (error == ERELOOKUP)
 		goto retry;
 	return (error);
 }
 
 /*
  * Truncate a file that is already locked.
  *
  * Sets the size of vp to length through VOP_SETATTR(), with VA_SYNC
  * requested when sync is true.  The attribute change is bracketed by a
  * +1/-1 write count adjustment; if the write count cannot be raised,
  * that error is returned and nothing else is done.
  */
 int
 vn_truncate_locked(struct vnode *vp, off_t length, bool sync,
     struct ucred *cred)
 {
 	struct vattr va;
 	int error;
 
 	error = VOP_ADD_WRITECOUNT(vp, 1);
 	if (error != 0)
 		return (error);
 
 	VATTR_NULL(&va);
 	va.va_size = length;
 	if (sync)
 		va.va_vaflags |= VA_SYNC;
 	error = VOP_SETATTR(vp, &va, cred);
 	VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
 	return (error);
 }
 
 /*
  * File table vnode stat routine.
  *
  * Calls VOP_STAT() on the vnode behind fp under a shared vnode lock and
  * returns its result.
  */
 int
 vn_statfile(struct file *fp, struct stat *sb, struct ucred *active_cred)
 {
 	struct vnode *vp;
 	int rc;
 
 	vp = fp->f_vnode;
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	rc = VOP_STAT(vp, sb, active_cred, fp->f_cred);
 	VOP_UNLOCK(vp);
 	return (rc);
 }
 
 /*
  * File table vnode ioctl routine.
  *
  * For directories and regular files FIONREAD and FIOBMAP2 are handled
  * here, FIONBIO and FIOASYNC are accepted as no-ops, and every other
  * command is forwarded to VOP_IOCTL().  Character devices forward all
  * commands to VOP_IOCTL().  Any other vnode type yields ENOTTY.
  */
 static int
 vn_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
     struct thread *td)
 {
 	struct vnode *vp;
 	struct fiobmap2_arg *bmarg;
 	off_t size;
 	int error;
 
 	vp = fp->f_vnode;
 	switch (vp->v_type) {
 	case VDIR:
 	case VREG:
 		switch (com) {
 		case FIONREAD:
 			/*
 			 * Distance from the current file offset to the file
 			 * size; the off_t difference is stored into an int.
 			 */
 			error = vn_getsize(vp, &size, active_cred);
 			if (error == 0)
 				*(int *)data = size - fp->f_offset;
 			return (error);
 		case FIOBMAP2:
 			/* bmarg->bn is both the input and an output here. */
 			bmarg = (struct fiobmap2_arg *)data;
 			vn_lock(vp, LK_SHARED | LK_RETRY);
 #ifdef MAC
 			error = mac_vnode_check_read(active_cred, fp->f_cred,
 			    vp);
 			if (error == 0)
 #endif
 				error = VOP_BMAP(vp, bmarg->bn, NULL,
 				    &bmarg->bn, &bmarg->runp, &bmarg->runb);
 			VOP_UNLOCK(vp);
 			return (error);
 		case FIONBIO:
 		case FIOASYNC:
 			return (0);
 		default:
 			return (VOP_IOCTL(vp, com, data, fp->f_flag,
 			    active_cred, td));
 		}
 		break;
 	case VCHR:
 		return (VOP_IOCTL(vp, com, data, fp->f_flag,
 		    active_cred, td));
 	default:
 		return (ENOTTY);
 	}
 }
 
 /*
  * File table vnode poll routine.
  *
  * Forwards to VOP_POLL().  In kernels built with MAC or AUDIT, and only
  * when the thread is audited or the MAC poll check is enabled, the vnode
  * is first locked exclusively to record the audit argument and run
  * mac_vnode_check_poll(); a failing check is returned without polling.
  */
 static int
 vn_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 	struct vnode *vp;
 	int error;
 
 	vp = fp->f_vnode;
 #if defined(MAC) || defined(AUDIT)
 	if (AUDITING_TD(td) || mac_vnode_check_poll_enabled()) {
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		AUDIT_ARG_VNODE1(vp);
 		error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
 		VOP_UNLOCK(vp);
 		if (error != 0)
 			return (error);
 	}
 #endif
 	/* VOP_POLL() itself is called without the vnode lock held here. */
 	error = VOP_POLL(vp, events, fp->f_cred, td);
 	return (error);
 }
 
 /*
  * Acquire the requested lock and then check for validity.  LK_RETRY
  * permits vn_lock to return doomed vnodes.
  *
  * Slow path of _vn_lock(), entered when VOP_LOCK1() failed (error != 0)
  * or succeeded on a doomed vnode (error == 0).  Without LK_RETRY a
  * locked doomed vnode is unlocked and ENOENT is returned, and a lock
  * failure is returned as is.  With LK_RETRY a locked doomed vnode is
  * returned as success, and a failed lock is retried until it succeeds.
  */
 static int __noinline
 _vn_lock_fallback(struct vnode *vp, int flags, const char *file, int line,
     int error)
 {
 
 	KASSERT((flags & LK_RETRY) == 0 || error == 0,
 	    ("vn_lock: error %d incompatible with flags %#x", error, flags));
 
 	/* With the lock held, the only reason to be here is a doomed vnode. */
 	if (error == 0)
 		VNASSERT(VN_IS_DOOMED(vp), vp, ("vnode not doomed"));
 
 	if ((flags & LK_RETRY) == 0) {
 		if (error == 0) {
 			VOP_UNLOCK(vp);
 			error = ENOENT;
 		}
 		return (error);
 	}
 
 	/*
 	 * LK_RETRY case.
 	 *
 	 * Nothing to do if we got the lock.
 	 */
 	if (error == 0)
 		return (0);
 
 	/*
 	 * Interlock was dropped by the call in _vn_lock.
 	 */
 	flags &= ~LK_INTERLOCK;
 	do {
 		error = VOP_LOCK1(vp, flags, file, line);
 	} while (error != 0);
 	return (0);
 }
 
 /*
  * Lock vp with the lock type given in flags.  The common case, where
  * VOP_LOCK1() succeeds and the vnode is not doomed, returns 0 directly;
  * everything else is handled by _vn_lock_fallback().  file and line
  * identify the call site and are passed through to VOP_LOCK1().
  */
 int
 _vn_lock(struct vnode *vp, int flags, const char *file, int line)
 {
 	int rc;
 
 	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
 	    ("vn_lock: no locktype (%d passed)", flags));
 	VNPASS(vp->v_holdcnt > 0, vp);
 
 	rc = VOP_LOCK1(vp, flags, file, line);
 	if (__predict_true(rc == 0 && !VN_IS_DOOMED(vp)))
 		return (0);
 	return (_vn_lock_fallback(vp, flags, file, line, rc));
 }
 
 /*
  * File table vnode close routine.
  */
 static int
 vn_closefile(struct file *fp, struct thread *td)
 {
 	struct vnode *vp;
 	struct flock lf;
 	int error;
 	bool ref;
 
 	vp = fp->f_vnode;
 	fp->f_ops = &badfileops;
 	ref = (fp->f_flag & FHASLOCK) != 0;
 
 	error = vn_close1(vp, fp->f_flag, fp->f_cred, td, ref);
 
 	if (__predict_false(ref)) {
 		lf.l_whence = SEEK_SET;
 		lf.l_start = 0;
 		lf.l_len = 0;
 		lf.l_type = F_UNLCK;
 		(void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
 		vrele(vp);
 	}
 	return (error);
 }
 
 /*
  * Preparing to start a filesystem write operation. If the operation is
  * permitted, then we bump the count of operations in progress and
  * proceed. If a suspend request is in progress, we wait until the
  * suspension is over, and then proceed.
  *
  * mplocked tells whether the caller already holds the mount interlock
  * (asserted when true); the interlock is always released before
  * returning from the slow path.  On error, and whenever V_XSLEEP is
  * set, MNT_REL() is applied to the mount.  Returns 0, EWOULDBLOCK when
  * V_NOWAIT is set and the filesystem is being suspended, or the error
  * from msleep().
  */
 static int
 vn_start_write_refed(struct mount *mp, int flags, bool mplocked)
 {
 	struct mount_pcpu *mpcpu;
 	int error, mflags;
 
 	/* Fast path: bump the per-CPU counter without the interlock. */
 	if (__predict_true(!mplocked) && (flags & V_XSLEEP) == 0 &&
 	    vfs_op_thread_enter(mp, mpcpu)) {
 		MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0);
 		vfs_mp_count_add_pcpu(mpcpu, writeopcount, 1);
 		vfs_op_thread_exit(mp, mpcpu);
 		return (0);
 	}
 
 	if (mplocked)
 		mtx_assert(MNT_MTX(mp), MA_OWNED);
 	else
 		MNT_ILOCK(mp);
 
 	error = 0;
 
 	/*
 	 * Check on status of suspension.
 	 */
 	/*
 	 * The wait is skipped only for the thread that owns the
 	 * suspension and has TDP_IGNSUSP set.
 	 */
 	if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
 	    mp->mnt_susp_owner != curthread) {
 		mflags = 0;
 		/* V_PCATCH is honored only for VFCF_SBDRY filesystems. */
 		if ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0) {
 			if (flags & V_PCATCH)
 				mflags |= PCATCH;
 		}
 		mflags |= (PUSER - 1);
 		while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
 			if ((flags & V_NOWAIT) != 0) {
 				error = EWOULDBLOCK;
 				goto unlock;
 			}
 			error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags,
 			    "suspfs", 0);
 			if (error != 0)
 				goto unlock;
 		}
 	}
 	/* V_XSLEEP: the write count is not bumped in this case. */
 	if ((flags & V_XSLEEP) != 0)
 		goto unlock;
 	mp->mnt_writeopcount++;
 unlock:
 	if (error != 0 || (flags & V_XSLEEP) != 0)
 		MNT_REL(mp);
 	MNT_IUNLOCK(mp);
 	return (error);
 }
 
 /*
  * Start a filesystem write operation on the mount of vp or, when vp is
  * NULL, on the mount passed in through *mpp.
  *
  * With a vnode, *mpp is set from VOP_GETWRITEMOUNT(); if that fails,
  * *mpp is set to NULL and the error is returned, except that EOPNOTSUPP
  * is turned into success.  A NULL mount also yields success with
  * nothing done.  Otherwise the result of vn_start_write_refed() is
  * returned, and on error *mpp is cleared unless V_NOWAIT was given.
  */
 int
 vn_start_write(struct vnode *vp, struct mount **mpp, int flags)
 {
 	struct mount *mp;
 	int error;
 
 	KASSERT((flags & ~V_VALID_FLAGS) == 0,
 	    ("%s: invalid flags passed %d\n", __func__, flags));
 
 	error = 0;
 	/*
 	 * If a vnode is provided, get and return the mount point
 	 * to which it will write.
 	 */
 	if (vp != NULL) {
 		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
 			*mpp = NULL;
 			if (error != EOPNOTSUPP)
 				return (error);
 			return (0);
 		}
 	}
 	if ((mp = *mpp) == NULL)
 		return (0);
 
 	/*
 	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
 	 * a vfs_ref().
 	 * As long as a vnode is not provided we need to acquire a
 	 * refcount for the provided mountpoint too, in order to
 	 * emulate a vfs_ref().
 	 */
 	if (vp == NULL)
 		vfs_ref(mp);
 
 	error = vn_start_write_refed(mp, flags, false);
 	if (error != 0 && (flags & V_NOWAIT) == 0)
 		*mpp = NULL;
 	return (error);
 }
 
 /*
  * Secondary suspension. Used by operations such as vop_inactive
  * routines that are needed by the higher level functions. These
  * are allowed to proceed until all the higher level functions have
  * completed (indicated by mnt_writeopcount dropping to zero). At that
  * time, these operations are halted until the suspension is over.
  *
  * V_XSLEEP is not accepted here (asserted).  Returns 0 when the
  * secondary write may proceed or when there is no mount to account
  * against, EWOULDBLOCK when V_NOWAIT is set and the filesystem is
  * suspended, or the error from VOP_GETWRITEMOUNT() or msleep(); in the
  * error cases *mpp is set to NULL.
  */
 int
 vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags)
 {
 	struct mount *mp;
 	int error, mflags;
 
 	KASSERT((flags & (~V_VALID_FLAGS | V_XSLEEP)) == 0,
 	    ("%s: invalid flags passed %d\n", __func__, flags));
 
  retry:
 	if (vp != NULL) {
 		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
 			*mpp = NULL;
 			if (error != EOPNOTSUPP)
 				return (error);
 			return (0);
 		}
 	}
 	/*
 	 * If we are not suspended or have not yet reached suspended
 	 * mode, then let the operation proceed.
 	 */
 	if ((mp = *mpp) == NULL)
 		return (0);
 
 	/*
 	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
 	 * a vfs_ref().
 	 * As long as a vnode is not provided we need to acquire a
 	 * refcount for the provided mountpoint too, in order to
 	 * emulate a vfs_ref().
 	 */
 	MNT_ILOCK(mp);
 	if (vp == NULL)
 		MNT_REF(mp);
 	if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
 		mp->mnt_secondary_writes++;
 		mp->mnt_secondary_accwrites++;
 		MNT_IUNLOCK(mp);
 		return (0);
 	}
 	if ((flags & V_NOWAIT) != 0) {
 		MNT_REL(mp);
 		MNT_IUNLOCK(mp);
 		*mpp = NULL;
 		return (EWOULDBLOCK);
 	}
 	/*
 	 * Wait for the suspension to finish.
 	 */
 	mflags = 0;
 	/* V_PCATCH is honored only for VFCF_SBDRY filesystems. */
 	if ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0) {
 		if ((flags & V_PCATCH) != 0)
 			mflags |= PCATCH;
 	}
 	/* PDROP: msleep() returns with the mount interlock released. */
 	mflags |= (PUSER - 1) | PDROP;
 	error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags, "suspfs", 0);
 	/* The reference is dropped here and re-acquired on retry. */
 	vfs_rel(mp);
 	if (error == 0)
 		goto retry;
 	*mpp = NULL;
 	return (error);
 }
 
 /*
  * Filesystem write operation has completed. If we are suspending and this
  * operation is the last one, notify the suspender that the suspension is
  * now in effect.
  *
  * A NULL mp is accepted and ignored.  Both the write operation count and
  * one mount reference are dropped, either through the per-CPU counters
  * or, under the mount interlock, through the fields of the mount itself.
  */
 void
 vn_finished_write(struct mount *mp)
 {
 	struct mount_pcpu *mpcpu;
 	int c;
 
 	if (mp == NULL)
 		return;
 
 	/* Fast path: adjust the per-CPU counters without the interlock. */
 	if (vfs_op_thread_enter(mp, mpcpu)) {
 		vfs_mp_count_sub_pcpu(mpcpu, writeopcount, 1);
 		vfs_mp_count_sub_pcpu(mpcpu, ref, 1);
 		vfs_op_thread_exit(mp, mpcpu);
 		return;
 	}
 
 	MNT_ILOCK(mp);
 	vfs_assert_mount_counters(mp);
 	MNT_REL(mp);
 	c = --mp->mnt_writeopcount;
 	if (mp->mnt_vfs_ops == 0) {
 		MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0);
 		MNT_IUNLOCK(mp);
 		return;
 	}
 	/* A negative count is a bug; dump the counters for diagnosis. */
 	if (c < 0)
 		vfs_dump_mount_counters(mp);
 	/* Last writer during a suspension: wake the suspending thread. */
 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && c == 0)
 		wakeup(&mp->mnt_writeopcount);
 	MNT_IUNLOCK(mp);
 }
 
 /*
  * A secondary filesystem write operation has completed.  Drop the mount
  * reference and the secondary write count; when a suspension is in
  * progress and no secondary writes remain, wake up whoever sleeps on
  * mnt_secondary_writes.  A NULL mp is accepted and ignored, and a
  * negative count panics.
  */
 void
 vn_finished_secondary_write(struct mount *mp)
 {
 	if (mp != NULL) {
 		MNT_ILOCK(mp);
 		MNT_REL(mp);
 		if (--mp->mnt_secondary_writes < 0)
 			panic("vn_finished_secondary_write: neg cnt");
 		if (mp->mnt_secondary_writes <= 0 &&
 		    (mp->mnt_kern_flag & MNTK_SUSPEND) != 0)
 			wakeup(&mp->mnt_secondary_writes);
 		MNT_IUNLOCK(mp);
 	}
 }
 
 /*
  * Request a filesystem to suspend write operations.
  *
  * Returns EALREADY when the calling thread already owns the suspension,
  * EBUSY when VS_SKIP_UNMOUNT is set and the mount is being unmounted,
  * and otherwise the result of VFS_SYNC(mp, MNT_SUSPEND).  If that sync
  * fails, the suspension is undone with vfs_write_resume() before the
  * error is returned.
  */
 int
 vfs_write_suspend(struct mount *mp, int flags)
 {
 	int error;
 
 	vfs_op_enter(mp);
 
 	MNT_ILOCK(mp);
 	vfs_assert_mount_counters(mp);
 	if (mp->mnt_susp_owner == curthread) {
 		vfs_op_exit_locked(mp);
 		MNT_IUNLOCK(mp);
 		return (EALREADY);
 	}
 	/* Wait for a suspension started by another thread to end. */
 	while (mp->mnt_kern_flag & MNTK_SUSPEND)
 		msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
 
 	/*
 	 * Unmount holds a write reference on the mount point.  If we
 	 * own busy reference and drain for writers, we deadlock with
 	 * the reference draining in the unmount path.  Callers of
 	 * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if
 	 * vfs_busy() reference is owned and caller is not in the
 	 * unmount context.
 	 */
 	if ((flags & VS_SKIP_UNMOUNT) != 0 &&
 	    (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
 		vfs_op_exit_locked(mp);
 		MNT_IUNLOCK(mp);
 		return (EBUSY);
 	}
 
 	mp->mnt_kern_flag |= MNTK_SUSPEND;
 	mp->mnt_susp_owner = curthread;
 	/*
 	 * Sleep once if writers are still active.  Either branch leaves
 	 * the mount interlock released (PDROP in the msleep() case).
 	 */
 	if (mp->mnt_writeopcount > 0)
 		(void) msleep(&mp->mnt_writeopcount, 
 		    MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
 	else
 		MNT_IUNLOCK(mp);
 	if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0) {
 		vfs_write_resume(mp, 0);
 		/* vfs_write_resume does vfs_op_exit() for us */
 	}
 	return (error);
 }
 
 /*
  * Request a filesystem to resume write operations.
  *
  * If the mount is marked MNTK_SUSPEND, the suspension flags
  * (MNTK_SUSPEND, MNTK_SUSPEND2, MNTK_SUSPENDED) and the owner are
  * cleared, sleepers on &mp->mnt_writeopcount and &mp->mnt_flag are woken,
  * and TDP_IGNSUSP is cleared on curthread.  VFS_SUSP_CLEAN() is then
  * called unless VR_NO_SUSPCLR is set in flags, followed by vfs_op_exit().
  *
  * With VR_START_WRITE set in flags, a mount reference and a write
  * operation count are taken on behalf of the caller, whether or not the
  * mount was suspended.
  */
 void
 vfs_write_resume(struct mount *mp, int flags)
 {
 
 	MNT_ILOCK(mp);
 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
 		/* Only the thread that suspended the mount may resume it. */
 		KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
 		mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
 				       MNTK_SUSPENDED);
 		mp->mnt_susp_owner = NULL;
 		wakeup(&mp->mnt_writeopcount);
 		wakeup(&mp->mnt_flag);
 		curthread->td_pflags &= ~TDP_IGNSUSP;
 		if ((flags & VR_START_WRITE) != 0) {
 			/* Start the write directly while the interlock is held. */
 			MNT_REF(mp);
 			mp->mnt_writeopcount++;
 		}
 		MNT_IUNLOCK(mp);
 		if ((flags & VR_NO_SUSPCLR) == 0)
 			VFS_SUSP_CLEAN(mp);
 		vfs_op_exit(mp);
 	} else if ((flags & VR_START_WRITE) != 0) {
 		/*
 		 * Not suspended.  No unlock is visible in this arm, so
 		 * vn_start_write_refed() presumably releases the interlock
 		 * -- TODO confirm against its definition.
 		 */
 		MNT_REF(mp);
 		vn_start_write_refed(mp, 0, true);
 	} else {
 		MNT_IUNLOCK(mp);
 	}
 }
 
 /*
  * Helper loop around vfs_write_suspend() for filesystem unmount VFS
  * methods.
  *
  * Each iteration drops the caller's write reference with
  * vn_finished_write(), calls vfs_write_suspend(mp, 0) and checks whether
  * the mount ended up with MNTK_SUSPENDED set.  If it did not, the write
  * reference is re-taken with vn_start_write() and the loop retries.
  *
  * Returns 0 with MNTK_SUSPENDED and MNTK_SUSPEND2 cleared and
  * TDP_IGNSUSP set on curthread.  If vfs_write_suspend() fails, the write
  * reference is re-taken and that error is returned.
  */
 int
 vfs_write_suspend_umnt(struct mount *mp)
 {
 	int error;
 
 	KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0,
 	    ("vfs_write_suspend_umnt: recursed"));
 
 	/* dounmount() already called vn_start_write(). */
 	for (;;) {
 		vn_finished_write(mp);
 		error = vfs_write_suspend(mp, 0);
 		if (error != 0) {
 			/* Restore the write reference the caller came in with. */
 			vn_start_write(NULL, &mp, V_WAIT);
 			return (error);
 		}
 		MNT_ILOCK(mp);
 		/* Leave the loop with the mount interlock still held. */
 		if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0)
 			break;
 		MNT_IUNLOCK(mp);
 		vn_start_write(NULL, &mp, V_WAIT);
 	}
 	mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2);
 	wakeup(&mp->mnt_flag);
 	MNT_IUNLOCK(mp);
 	curthread->td_pflags |= TDP_IGNSUSP;
 	return (0);
 }
 
 /*
  * kqueue filter attach for vnode-backed files: pass the knote to
  * VOP_KQFILTER() on the file's vnode and return its result.
  */
 static int
 vn_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct vnode *vp;
 
 	vp = fp->f_vnode;
 	return (VOP_KQFILTER(vp, kn));
 }
 
 int
 vn_kqfilter_opath(struct file *fp, struct knote *kn)
 {
 	if ((fp->f_flag & FKQALLOWED) == 0)
 		return (EBADF);
 	return (vn_kqfilter(fp, kn));
 }
 
 /*
  * Simplified in-kernel wrapper calls for extended attribute access.
  * Both calls pass in a NULL credential, authorizing as "kernel" access.
  * Set IO_NODELOCKED in ioflg if the vnode is already locked.
  */
 int
 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
     const char *attrname, int *buflen, char *buf, struct thread *td)
 {
 	struct uio	auio;
 	struct iovec	iov;
 	int	error;
 
 	iov.iov_len = *buflen;
 	iov.iov_base = buf;
 
 	auio.uio_iov = &iov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;
 	auio.uio_resid = *buflen;
 
 	if ((ioflg & IO_NODELOCKED) == 0)
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 
 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 
 	/* authorize attribute retrieval as kernel */
 	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
 	    td);
 
 	if ((ioflg & IO_NODELOCKED) == 0)
 		VOP_UNLOCK(vp);
 
 	if (error == 0) {
 		*buflen = *buflen - auio.uio_resid;
 	}
 
 	return (error);
 }
 
 /*
  * XXX failure mode if partially written?
  */
 int
 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
     const char *attrname, int buflen, char *buf, struct thread *td)
 {
 	struct uio	auio;
 	struct iovec	iov;
 	struct mount	*mp;
 	int	error;
 
 	iov.iov_len = buflen;
 	iov.iov_base = buf;
 
 	auio.uio_iov = &iov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;
 	auio.uio_resid = buflen;
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
 			return (error);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 
 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 
 	/* authorize attribute setting as kernel */
 	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		vn_finished_write(mp);
 		VOP_UNLOCK(vp);
 	}
 
 	return (error);
 }
 
 int
 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
     const char *attrname, struct thread *td)
 {
 	struct mount	*mp;
 	int	error;
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
 			return (error);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 
 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 
 	/* authorize attribute removal as kernel */
 	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
 	if (error == EOPNOTSUPP)
 		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
 		    NULL, td);
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		vn_finished_write(mp);
 		VOP_UNLOCK(vp);
 	}
 
 	return (error);
 }
 
 static int
 vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags,
     struct vnode **rvp)
 {
 
 	return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp));
 }
 
 /*
  * Fetch the vnode for inode number ino through vn_vget_ino_gen(), using
  * vn_get_ino_alloc_vget() as the lookup callback.  The result is stored
  * in *rvp and the error from vn_vget_ino_gen() is returned.
  */
 int
 vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
 {
 	int error;
 
 	error = vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino, lkflags,
 	    rvp);
 	return (error);
 }
 
 /*
  * Obtain another vnode through the alloc callback while the locked
  * vnode vp is temporarily unlocked.
  *
  * vp must be locked (shared or exclusive) on entry; the lock type is
  * recorded and used for every relock.  The mount is busied before vp is
  * unlocked for the callback.  On return vp is locked again, except when
  * the callback succeeded and returned vp itself in *rvp, in which case
  * no relock is done here.
  *
  * Returns the callback's error, or ENOENT if the mount could not be
  * busied or vp was found doomed after relocking.  In the doomed case a
  * successfully returned *rvp is released first.
  */
 int
 vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg,
     int lkflags, struct vnode **rvp)
 {
 	struct mount *mp;
 	int ltype, error;
 
 	ASSERT_VOP_LOCKED(vp, "vn_vget_ino_get");
 	mp = vp->v_mount;
 	ltype = VOP_ISLOCKED(vp);
 	KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
 	    ("vn_vget_ino: vp not locked"));
 	error = vfs_busy(mp, MBF_NOWAIT);
 	if (error != 0) {
 		/*
 		 * The non-sleeping busy failed.  Hold a mount reference,
 		 * drop the vnode lock and retry with a busy that may sleep.
 		 */
 		vfs_ref(mp);
 		VOP_UNLOCK(vp);
 		error = vfs_busy(mp, 0);
 		vn_lock(vp, ltype | LK_RETRY);
 		vfs_rel(mp);
 		if (error != 0)
 			return (ENOENT);
 		/* vp may have been reclaimed while it was unlocked. */
 		if (VN_IS_DOOMED(vp)) {
 			vfs_unbusy(mp);
 			return (ENOENT);
 		}
 	}
 	VOP_UNLOCK(vp);
 	error = alloc(mp, alloc_arg, lkflags, rvp);
 	vfs_unbusy(mp);
 	/* Skip the relock only when alloc succeeded and handed back vp. */
 	if (error != 0 || *rvp != vp)
 		vn_lock(vp, ltype | LK_RETRY);
 	if (VN_IS_DOOMED(vp)) {
 		if (error == 0) {
 			/* Dispose of the result: vunref for vp, vput otherwise. */
 			if (*rvp == vp)
 				vunref(vp);
 			else
 				vput(*rvp);
 		}
 		error = ENOENT;
 	}
 	return (error);
 }
 
 /*
  * Post SIGXFSZ to process p with kern_psignal(), holding the process
  * lock around the call.
  */
 static void
 vn_send_sigxfsz(struct proc *p)
 {
 	PROC_LOCK(p);
 	kern_psignal(p, SIGXFSZ);
 	PROC_UNLOCK(p);
 }
 
 /*
  * Check a target file size against td's current RLIMIT_FSIZE.  Returns 0
  * if size is within the limit; otherwise sends SIGXFSZ to td's process
  * and returns EFBIG.
  */
 int
 vn_rlimit_trunc(u_quad_t size, struct thread *td)
 {
 	if (size > lim_cur(td, RLIMIT_FSIZE)) {
 		vn_send_sigxfsz(td->td_proc);
 		return (EFBIG);
 	}
 	return (0);
 }
 
 /*
  * Worker for vn_rlimit_fsizex(): apply the filesystem maximum file size
  * (maxfsz, where 0 disables the check) and the thread's file size limit
  * to the pending write described by uio.
  *
  * Non-VREG vnodes are never limited.  When adj is true the write may be
  * shortened by reducing uio->uio_resid instead of failing, as long as
  * uio_offset is still below the bound in question.  A NULL td, or a
  * thread with TDP2_ACCT set, skips the per-thread limit.  For a thread
  * with TDP_INKTRACE set the limit is td_ktr_io_lim rather than
  * RLIMIT_FSIZE, and SIGXFSZ is sent only if ktr_filesize_limit_signal is
  * non-zero.
  *
  * Returns 0 if the (possibly shortened) write may proceed, EFBIG
  * otherwise.
  */
 static int
 vn_rlimit_fsizex1(const struct vnode *vp, struct uio *uio, off_t maxfsz,
     bool adj, struct thread *td)
 {
 	off_t lim;
 	bool ktr_write;
 
 	if (vp->v_type != VREG)
 		return (0);
 
 	/*
 	 * Handle file system maximum file size.
 	 */
 	if (maxfsz != 0 && uio->uio_offset + uio->uio_resid > maxfsz) {
 		if (!adj || uio->uio_offset >= maxfsz)
 			return (EFBIG);
 		/* Clamp so that the write ends exactly at maxfsz. */
 		uio->uio_resid = maxfsz - uio->uio_offset;
 	}
 
 	/*
 	 * This is kernel write (e.g. vnode_pager) or accounting
 	 * write, ignore limit.
 	 */
 	if (td == NULL || (td->td_pflags2 & TDP2_ACCT) != 0)
 		return (0);
 
 	/*
 	 * Calculate file size limit.
 	 */
 	ktr_write = (td->td_pflags & TDP_INKTRACE) != 0;
 	lim = __predict_false(ktr_write) ? td->td_ktr_io_lim :
 	    lim_cur(td, RLIMIT_FSIZE);
 
 	/*
 	 * Is the limit reached?
 	 */
 	if (__predict_true((uoff_t)uio->uio_offset + uio->uio_resid <= lim))
 		return (0);
 
 	/*
 	 * Prepared filesystems can handle writes truncated to the
 	 * file size limit.
 	 */
 	if (adj && (uoff_t)uio->uio_offset < lim) {
 		uio->uio_resid = lim - (uoff_t)uio->uio_offset;
 		return (0);
 	}
 
 	/* ktrace writes signal only when ktr_filesize_limit_signal is set. */
 	if (!ktr_write || ktr_filesize_limit_signal)
 		vn_send_sigxfsz(td->td_proc);
 	return (EFBIG);
 }
 
 /*
  * Helper for VOP_WRITE() implementations, the common code to
  * handle maximum supported file size on the filesystem, and
  * RLIMIT_FSIZE, except for special writes from accounting subsystem
  * and ktrace.
  *
  * For maximum file size (maxfsz argument):
  * - return EFBIG if uio_offset is beyond it
  * - otherwise, clamp uio_resid if write would extend file beyond maxfsz.
  *
  * For RLIMIT_FSIZE:
  * - return EFBIG and send SIGXFSZ if uio_offset is beyond the limit
  * - otherwise, clamp uio_resid if write would extend file beyond limit.
  *
  * If clamping occurred, the adjustment for uio_resid is stored in
  * *resid_adj, to be re-applied by vn_rlimit_fsizex_res() on return
  * from the VOP.
  */
 int
 vn_rlimit_fsizex(const struct vnode *vp, struct uio *uio, off_t maxfsz,
     ssize_t *resid_adj, struct thread *td)
 {
 	ssize_t before;
 	int error;
 
 	/* Clamping is permitted only if the caller can receive the delta. */
 	before = uio->uio_resid;
 	error = vn_rlimit_fsizex1(vp, uio, maxfsz, resid_adj != NULL, td);
 	if (resid_adj != NULL)
 		*resid_adj = before - uio->uio_resid;
 	return (error);
 }
 
 /*
  * Add resid_adj, as produced by vn_rlimit_fsizex(), back onto
  * uio->uio_resid.
  */
 void
 vn_rlimit_fsizex_res(struct uio *uio, ssize_t resid_adj)
 {
 	uio->uio_resid = uio->uio_resid + resid_adj;
 }
 
 /*
  * Non-adjusting form of vn_rlimit_fsizex(): no maximum file size is
  * supplied and no residual adjustment is requested.
  */
 int
 vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
     struct thread *td)
 {
 	struct uio *wuio;
 
 	/* The callee takes a non-const uio, so the qualifier is dropped. */
 	wuio = __DECONST(struct uio *, uio);
 	return (vn_rlimit_fsizex(vp, wuio, 0, NULL, td));
 }
 
 /*
  * Change the mode of the vnode behind fp by calling setfmode().  With
  * AUDIT compiled in, the vnode is first recorded as an audit argument
  * while it is locked shared.
  */
 int
 vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 	struct vnode *vp = fp->f_vnode;
 	int error;
 
 #ifdef AUDIT
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	VOP_UNLOCK(vp);
 #endif
 	error = setfmode(td, active_cred, vp, mode);
 	return (error);
 }
 
 /*
  * Change the owner and group of the vnode behind fp by calling
  * setfown().  With AUDIT compiled in, the vnode is first recorded as an
  * audit argument while it is locked shared.
  */
 int
 vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 	struct vnode *vp = fp->f_vnode;
 	int error;
 
 #ifdef AUDIT
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	VOP_UNLOCK(vp);
 #endif
 	error = setfown(td, active_cred, vp, uid, gid);
 	return (error);
 }
 
 /*
  * Drop the pages in ["start", "end") from the VM object attached to vp.
  * An "end" of 0 extends the range to the end of the object.  A vnode
  * without a VM object is left alone.
  */
 void
 vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
 {
 	vm_object_t obj;
 
 	obj = vp->v_object;
 	if (obj != NULL) {
 		VM_OBJECT_WLOCK(obj);
 		vm_object_page_remove(obj, start, end, 0);
 		VM_OBJECT_WUNLOCK(obj);
 	}
 }
 
 /*
  * Same as vn_pages_remove(), except that OBJPR_VALIDONLY is passed so
  * that invalid pages, which by definition are not mapped into any
  * process' address space, are skipped.  Filesystems may prefer this to
  * vn_pages_remove() to avoid blocking on pages busied in preparation for
  * a VOP_GETPAGES.
  */
 void
 vn_pages_remove_valid(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
 {
 	vm_object_t obj;
 
 	obj = vp->v_object;
 	if (obj != NULL) {
 		VM_OBJECT_WLOCK(obj);
 		vm_object_page_remove(obj, start, end, OBJPR_VALIDONLY);
 		VM_OBJECT_WUNLOCK(obj);
 	}
 }
 
 /*
  * Implement FIOSEEKHOLE / FIOSEEKDATA on an exclusively locked vnode by
  * probing the file with VOP_BMAP() in units of the mount's f_iosize.  A
  * block for which VOP_BMAP() reports -1 is treated as a hole.
  *
  * On entry *off is the starting offset; on success it is replaced with
  * the offset found, never less than the starting offset.  The end of
  * file counts as a hole for FIOSEEKHOLE.
  *
  * Returns ENOTTY if vp is not a regular file or VOP_BMAP() is not
  * supported, ENXIO if *off lies outside [0, size) or FIOSEEKDATA finds
  * no data before end of file, or the error from vn_getsize_locked().
  */
 int
 vn_bmap_seekhole_locked(struct vnode *vp, u_long cmd, off_t *off,
     struct ucred *cred)
 {
 	off_t size;
 	daddr_t bn, bnp;
 	uint64_t bsize;
 	off_t noff;
 	int error;
 
 	KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
 	    ("%s: Wrong command %lu", __func__, cmd));
 	ASSERT_VOP_ELOCKED(vp, "vn_bmap_seekhole_locked");
 
 	if (vp->v_type != VREG) {
 		error = ENOTTY;
 		goto out;
 	}
 	error = vn_getsize_locked(vp, &size, cred);
 	if (error != 0)
 		goto out;
 	noff = *off;
 	if (noff < 0 || noff >= size) {
 		error = ENXIO;
 		goto out;
 	}
 
 	/* See the comment in ufs_bmap_seekdata(). */
 	vnode_pager_clean_sync(vp);
 
 	bsize = vp->v_mount->mnt_stat.f_iosize;
 	/*
 	 * The first step may begin in the middle of a block; the increment
 	 * advances noff to the next block boundary each time round.
 	 */
 	for (bn = noff / bsize; noff < size; bn++, noff += bsize -
 	    noff % bsize) {
 		error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
 		if (error == EOPNOTSUPP) {
 			error = ENOTTY;
 			goto out;
 		}
 		/*
 		 * NOTE(review): other VOP_BMAP() errors are not examined
 		 * before bnp is tested below.
 		 */
 		if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
 		    (bnp != -1 && cmd == FIOSEEKDATA)) {
 			/* Report the block start, but never before *off. */
 			noff = bn * bsize;
 			if (noff < *off)
 				noff = *off;
 			goto out;
 		}
 	}
 	if (noff > size)
 		noff = size;
 	/* noff == size. There is an implicit hole at the end of file. */
 	if (cmd == FIOSEEKDATA)
 		error = ENXIO;
 out:
 	/* *off is only updated on success. */
 	if (error == 0)
 		*off = noff;
 	return (error);
 }
 
 /*
  * Unlocked entry point for vn_bmap_seekhole_locked(): take the vnode
  * lock exclusively, run the seek and drop the lock again.  Returns EBADF
  * if the lock cannot be obtained.
  */
 int
 vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
 {
 	int error;
 
 	KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
 	    ("%s: Wrong command %lu", __func__, cmd));
 
 	if (vn_lock(vp, LK_EXCLUSIVE) == 0) {
 		error = vn_bmap_seekhole_locked(vp, cmd, off, cred);
 		VOP_UNLOCK(vp);
 	} else {
 		error = EBADF;
 	}
 	return (error);
 }
 
 /*
  * Seek on a vnode-backed file.
  *
  * whence selects how the new position is computed: L_SET uses offset as
  * is, L_INCR adds the current file offset, L_XTND adds the file size (or
  * the media size reported by DIOCGMEDIASIZE for a VCHR vnode whose size
  * reads as 0), and SEEK_DATA / SEEK_HOLE delegate to the FIOSEEKDATA /
  * FIOSEEKHOLE ioctls.  On success the new offset is stored in
  * td->td_uretoff.tdu_off and committed by foffset_unlock().
  *
  * For anything other than a VCHR vnode, negative results are rejected
  * with EINVAL and additions that would exceed OFF_MAX with EOVERFLOW.
  * An unknown whence yields EINVAL, as does ENOTTY from the seek ioctls.
  */
 int
 vn_seek(struct file *fp, off_t offset, int whence, struct thread *td)
 {
 	struct ucred *cred;
 	struct vnode *vp;
 	off_t foffset, fsize, size;
 	int error, noneg;
 
 	cred = td->td_ucred;
 	vp = fp->f_vnode;
 	/* Negative offsets are tolerated only on character devices. */
 	noneg = (vp->v_type != VCHR);
 	/*
 	 * Try to dodge locking for common case of querying the offset.
 	 */
 	if (whence == L_INCR && offset == 0) {
 		foffset = foffset_read(fp);
 		if (__predict_false(foffset < 0 && noneg)) {
 			return (EOVERFLOW);
 		}
 		td->td_uretoff.tdu_off = foffset;
 		return (0);
 	}
 	foffset = foffset_lock(fp, 0);
 	error = 0;
 	switch (whence) {
 	case L_INCR:
 		/* Refuse if foffset + offset would exceed OFF_MAX. */
 		if (noneg &&
 		    (foffset < 0 ||
 		    (offset > 0 && foffset > OFF_MAX - offset))) {
 			error = EOVERFLOW;
 			break;
 		}
 		offset += foffset;
 		break;
 	case L_XTND:
 		error = vn_getsize(vp, &fsize, cred);
 		if (error != 0)
 			break;
 
 		/*
 		 * If the file references a disk device, then fetch
 		 * the media size and use that to determine the ending
 		 * offset.
 		 */
 		if (fsize == 0 && vp->v_type == VCHR &&
 		    fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
 			fsize = size;
 		if (noneg && offset > 0 && fsize > OFF_MAX - offset) {
 			error = EOVERFLOW;
 			break;
 		}
 		offset += fsize;
 		break;
 	case L_SET:
 		break;
 	case SEEK_DATA:
 		error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
 		if (error == ENOTTY)
 			error = EINVAL;
 		break;
 	case SEEK_HOLE:
 		error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
 		if (error == ENOTTY)
 			error = EINVAL;
 		break;
 	default:
 		error = EINVAL;
 	}
 	if (error == 0 && noneg && offset < 0)
 		error = EINVAL;
 	if (error != 0)
 		goto drop;
 	VFS_KNOTE_UNLOCKED(vp, 0);
 	td->td_uretoff.tdu_off = offset;
 drop:
 	/* On failure FOF_NOUPDATE is passed to foffset_unlock(). */
 	foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
 	return (error);
 }
 
 /*
  * Decide whether cred may set the timestamps of vp.  Returns 0 when
  * permitted, otherwise the error from the failing access check.
  */
 int
 vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred,
     struct thread *td)
 {
 	int error;
 
 	/*
 	 * Grant permission if the caller is the owner of the file, or
 	 * the super-user, or has ACL_WRITE_ATTRIBUTES permission on
 	 * the file.  If the time pointer is null, then write
 	 * permission on the file is also sufficient.
 	 *
 	 * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes:
 	 * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES
 	 * will be allowed to set the times [..] to the current
 	 * server time.
 	 */
 	error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td);
 	if (error == 0)
 		return (0);
 	/* Without VA_UTIMES_NULL there is no fallback check. */
 	if ((vap->va_vaflags & VA_UTIMES_NULL) == 0)
 		return (error);
 	return (VOP_ACCESS(vp, VWRITE, cred, td));
 }
 
 /*
  * Fill kif for a vnode-backed file.  The type is KF_TYPE_FIFO for
  * DTYPE_FIFO files and KF_TYPE_VNODE otherwise.  The vnode is
  * referenced and the filedesc shared lock is dropped around the call to
  * vn_fill_kinfo_vnode(); the lock is taken again before returning.
  */
 int
 vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 	struct vnode *vp;
 	int error;
 
 	kif->kf_type = (fp->f_type == DTYPE_FIFO) ? KF_TYPE_FIFO :
 	    KF_TYPE_VNODE;
 
 	/* Hold the vnode while the filedesc lock is not held. */
 	vp = fp->f_vnode;
 	vref(vp);
 	FILEDESC_SUNLOCK(fdp);
 
 	error = vn_fill_kinfo_vnode(vp, kif);
 
 	vrele(vp);
 	FILEDESC_SLOCK(fdp);
 	return (error);
 }
 
 /*
  * Perturb kif->kf_path to a random length: a longer path is cut short
  * and ended with '$', a shorter one is padded with 'A' characters.
  */
 static inline void
 vn_fill_junk(struct kinfo_file *kif)
 {
 	char *path;
 	size_t cur, target;
 
 	/*
 	 * Simulate vn_fullpath returning changing values for a given
 	 * vp during e.g. coredump.
 	 */
 	path = kif->kf_path;
 	target = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1;
 	cur = strlen(path);
 	if (target < cur) {
 		/* Shorten: the last kept character becomes '$'. */
 		path[target - 1] = '$';
 		path[target] = '\0';
 		return;
 	}
 	/* Lengthen: append 'A', keeping the string terminated each step. */
 	while (cur < target) {
 		path[cur] = 'A';
 		path[cur + 1] = '\0';
 		cur++;
 	}
 }
 
 /*
  * Fill the vnode-specific part of kif for vp: the file type, the path
  * from vn_fullpath() and the attributes from VOP_GETATTR().
  *
  * kf_path is written only when vn_fullpath() succeeds.  Returns 0 on
  * success or the VOP_GETATTR() error; on that error the attribute
  * fields are not filled in.
  */
 int
 vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif)
 {
 	struct vattr va;
 	char *fullpath, *freepath;
 	int error;
 
 	kif->kf_un.kf_file.kf_file_type = vntype_to_kinfo(vp->v_type);
 	freepath = NULL;
 	fullpath = "-";
 	error = vn_fullpath(vp, &fullpath, &freepath);
 	if (error == 0) {
 		strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
 	}
 	if (freepath != NULL)
 		free(freepath, M_TEMP);
 
 	/* Fail point: runs vn_fill_junk() on the path when triggered. */
 	KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path,
 		vn_fill_junk(kif);
 	);
 
 	/*
 	 * Retrieve vnode attributes.
 	 */
 	/* Preset so that values VOP_GETATTR() leaves untouched are detectable. */
 	va.va_fsid = VNOVAL;
 	va.va_rdev = NODEV;
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	error = VOP_GETATTR(vp, &va, curthread->td_ucred);
 	VOP_UNLOCK(vp);
 	if (error != 0)
 		return (error);
 	/* Fall back to the mount's f_fsid when va_fsid is still VNOVAL. */
 	if (va.va_fsid != VNOVAL)
 		kif->kf_un.kf_file.kf_file_fsid = va.va_fsid;
 	else
 		kif->kf_un.kf_file.kf_file_fsid =
 		    vp->v_mount->mnt_stat.f_fsid.val[0];
 	kif->kf_un.kf_file.kf_file_fsid_freebsd11 =
 	    kif->kf_un.kf_file.kf_file_fsid; /* truncate */
 	kif->kf_un.kf_file.kf_file_fileid = va.va_fileid;
 	kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode);
 	kif->kf_un.kf_file.kf_file_size = va.va_size;
 	kif->kf_un.kf_file.kf_file_rdev = va.va_rdev;
 	kif->kf_un.kf_file.kf_file_rdev_freebsd11 =
 	    kif->kf_un.kf_file.kf_file_rdev; /* truncate */
 	kif->kf_un.kf_file.kf_file_nlink = va.va_nlink;
 	return (0);
 }
 
 /*
  * mmap handler for vnode-backed files.
  *
  * Derives the maximum protection from the mount's MNT_NOEXEC flag, the
  * file's FREAD / FWRITE open flags and cap_maxprot, rejects requests
  * that ask for more than the file allows, validates the offset and size
  * and then creates the mapping through vm_mmap_vnode() and
  * vm_mmap_object().
  *
  * Returns 0 on success, EACCES if prot asks for access the file or mount
  * does not permit, EINVAL if foff + size would exceed OFF_MAX, or the
  * error from vm_mmap_vnode() / vm_mmap_object().
  */
 int
 vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
     vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
     struct thread *td)
 {
 #ifdef HWPMC_HOOKS
 	struct pmckern_map_in pkm;
 #endif
 	struct mount *mp;
 	struct vnode *vp;
 	vm_object_t object;
 	vm_prot_t maxprot;
 	boolean_t writecounted;
 	int error;
 
 #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
     defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
 	/*
 	 * POSIX shared-memory objects are defined to have
 	 * kernel persistence, and are not defined to support
 	 * read(2)/write(2) -- or even open(2).  Thus, we can
 	 * use MAP_NOSYNC to trade on-disk coherence for speed.
 	 * The shm_open(3) library routine turns on the FPOSIXSHM
 	 * flag to request this behavior.
 	 */
 	if ((fp->f_flag & FPOSIXSHM) != 0)
 		flags |= MAP_NOSYNC;
 #endif
 	vp = fp->f_vnode;
 
 	/*
 	 * Ensure that file and memory protections are
 	 * compatible.  Note that we only worry about
 	 * writability if mapping is shared; in this case,
 	 * current and max prot are dictated by the open file.
 	 * XXX use the vnode instead?  Problem is: what
 	 * credentials do we use for determination? What if
 	 * proc does a setuid?
 	 */
 	mp = vp->v_mount;
 	if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
 		maxprot = VM_PROT_NONE;
 		if ((prot & VM_PROT_EXECUTE) != 0)
 			return (EACCES);
 	} else
 		maxprot = VM_PROT_EXECUTE;
 	if ((fp->f_flag & FREAD) != 0)
 		maxprot |= VM_PROT_READ;
 	else if ((prot & VM_PROT_READ) != 0)
 		return (EACCES);
 
 	/*
 	 * If we are sharing potential changes via MAP_SHARED and we
 	 * are trying to get write permission although we opened it
 	 * without asking for it, bail out.
 	 */
 	if ((flags & MAP_SHARED) != 0) {
 		if ((fp->f_flag & FWRITE) != 0)
 			maxprot |= VM_PROT_WRITE;
 		else if ((prot & VM_PROT_WRITE) != 0)
 			return (EACCES);
 	} else {
 		/* Non-shared mappings are always allowed to be writable. */
 		maxprot |= VM_PROT_WRITE;
 		cap_maxprot |= VM_PROT_WRITE;
 	}
 	maxprot &= cap_maxprot;
 
 	/*
 	 * For regular files and shared memory, POSIX requires that
 	 * the value of foff be a legitimate offset within the data
 	 * object.  In particular, negative offsets are invalid.
 	 * Blocking negative offsets and overflows here avoids
 	 * possible wraparound or user-level access into reserved
 	 * ranges of the data object later.  In contrast, POSIX does
 	 * not dictate how offsets are used by device drivers, so in
 	 * the case of a device mapping a negative offset is passed
 	 * on.
 	 */
 	if (
 #ifdef _LP64
 	    size > OFF_MAX ||
 #endif
 	    foff > OFF_MAX - size)
 		return (EINVAL);
 
 	writecounted = FALSE;
 	error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp,
 	    &foff, &object, &writecounted);
 	if (error != 0)
 		return (error);
 	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
 	    foff, writecounted, td);
 	if (error != 0) {
 		/*
 		 * If this mapping was accounted for in the vnode's
 		 * writecount, then undo that now.
 		 */
 		if (writecounted)
 			vm_pager_release_writecount(object, 0, size);
 		vm_object_deallocate(object);
 	}
 #ifdef HWPMC_HOOKS
 	/* Inform hwpmc(4) if an executable is being mapped. */
 	if (PMC_HOOK_INSTALLED(PMC_FN_MMAP)) {
 		if ((prot & VM_PROT_EXECUTE) != 0 && error == 0) {
 			pkm.pm_file = vp;
 			pkm.pm_address = (uintptr_t) *addr;
 			PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_MMAP, (void *) &pkm);
 		}
 	}
 #endif
 	return (error);
 }
 
 void
 vn_fsid(struct vnode *vp, struct vattr *va)
 {
 	fsid_t *f;
 
 	f = &vp->v_mount->mnt_stat.f_fsid;
 	va->va_fsid = (uint32_t)f->val[1];
 	va->va_fsid <<= sizeof(f->val[1]) * NBBY;
 	va->va_fsid += (uint32_t)f->val[0];
 }
 
 /*
  * Write out the dirty buffers on vp's buffer object.
  *
  * Each buffer on the dirty list is locked, removed from its queue and
  * written asynchronously; BV_SCANNED marks buffers already visited so
  * that a pass terminates.  With waitfor == MNT_WAIT the function also
  * waits for the writes in progress and rescans while dirty buffers
  * remain, giving up after maxretry rescans or when a buffer carries an
  * error.
  *
  * Returns 0 on success, a buffer's b_error, or EAGAIN when dirty
  * buffers remain without a recorded error.
  */
 int
 vn_fsync_buf(struct vnode *vp, int waitfor)
 {
 	struct buf *bp, *nbp;
 	struct bufobj *bo;
 	struct mount *mp;
 	int error, maxretry;
 
 	error = 0;
 	maxretry = 10000;     /* large, arbitrarily chosen */
 	mp = NULL;
 	/* For a device vnode, note the mount recorded on the device. */
 	if (vp->v_type == VCHR) {
 		VI_LOCK(vp);
 		mp = vp->v_rdev->si_mountpt;
 		VI_UNLOCK(vp);
 	}
 	bo = &vp->v_bufobj;
 	BO_LOCK(bo);
 loop1:
 	/*
 	 * MARK/SCAN initialization to avoid infinite loops.
 	 */
         TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
 		bp->b_vflags &= ~BV_SCANNED;
 		bp->b_error = 0;
 	}
 
 	/*
 	 * Flush all dirty buffers associated with a vnode.
 	 */
 loop2:
 	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 		if ((bp->b_vflags & BV_SCANNED) != 0)
 			continue;
 		bp->b_vflags |= BV_SCANNED;
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
 			/* Busy buffer: skip it unless a full sync was asked for. */
 			if (waitfor != MNT_WAIT)
 				continue;
 			/*
 			 * Sleep for the buffer lock, handing over the bufobj
 			 * lock as interlock.  On failure start again from the
 			 * marking pass.
 			 */
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL,
 			    BO_LOCKPTR(bo)) != 0) {
 				BO_LOCK(bo);
 				goto loop1;
 			}
 			BO_LOCK(bo);
 		}
 		BO_UNLOCK(bo);
 		KASSERT(bp->b_bufobj == bo,
 		    ("bp %p wrong b_bufobj %p should be %p",
 		    bp, bp->b_bufobj, bo));
 		if ((bp->b_flags & B_DELWRI) == 0)
 			panic("fsync: not dirty");
 		if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) {
 			vfs_bio_awrite(bp);
 		} else {
 			bremfree(bp);
 			bawrite(bp);
 		}
 		/* Throttle once most of the retry budget has been used. */
 		if (maxretry < 1000)
 			pause("dirty", hz < 1000 ? 1 : hz / 1000);
 		/* The bufobj lock was dropped, so rescan from the list head. */
 		BO_LOCK(bo);
 		goto loop2;
 	}
 
 	/*
 	 * If synchronous the caller expects us to completely resolve all
 	 * dirty buffers in the system.  Wait for in-progress I/O to
 	 * complete (which could include background bitmap writes), then
 	 * retry if dirty blocks still exist.
 	 */
 	if (waitfor == MNT_WAIT) {
 		bufobj_wwait(bo, 0, 0);
 		if (bo->bo_dirty.bv_cnt > 0) {
 			/*
 			 * If we are unable to write any of these buffers
 			 * then we fail now rather than trying endlessly
 			 * to write them out.
 			 */
 			TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
 				if ((error = bp->b_error) != 0)
 					break;
 			/*
 			 * Rescan while the mount has secondary writes
 			 * outstanding, or while no error was seen and the
 			 * retry budget lasts.
 			 */
 			if ((mp != NULL && mp->mnt_secondary_writes > 0) ||
 			    (error == 0 && --maxretry >= 0))
 				goto loop1;
 			if (error == 0)
 				error = EAGAIN;
 		}
 	}
 	BO_UNLOCK(bo);
 	if (error != 0)
 		vn_printf(vp, "fsync: giving up on dirty (error = %d) ", error);
 
 	return (error);
 }
 
 /*
  * Copies a byte range from invp to outvp.  Calls VOP_COPY_FILE_RANGE()
  * or vn_generic_copy_file_range() after rangelocking the byte ranges,
  * to do the actual copy.
  * vn_generic_copy_file_range() is factored out, so it can be called
  * from a VOP_COPY_FILE_RANGE() call as well, but handles vnodes from
  * different file systems.
  *
  * *lenp is the requested length on entry.  It is set to 0 before any
  * check, so every early return reports zero bytes; otherwise it is
  * handed to the copy routine, which updates it.  The length is first
  * clamped so that neither *inoffp + len nor *outoffp + len exceeds
  * INT64_MAX.
  *
  * Returns EISDIR if either vnode is a directory, EINVAL for negative
  * offsets or non-regular files, the error from VOP_GETLOWVNODE() or
  * vfs_busy(), or the result of the copy routine.
  */
 int
 vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp,
     off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred,
     struct ucred *outcred, struct thread *fsize_td)
 {
 	struct mount *inmp, *outmp;
 	struct vnode *invpl, *outvpl;
 	int error;
 	size_t len;
 	uint64_t uval;
 
 	invpl = outvpl = NULL;
 	len = *lenp;
 	*lenp = 0;		/* For error returns. */
 	error = 0;
 
 	/* Do some sanity checks on the arguments. */
 	if (invp->v_type == VDIR || outvp->v_type == VDIR)
 		error = EISDIR;
 	else if (*inoffp < 0 || *outoffp < 0 ||
 	    invp->v_type != VREG || outvp->v_type != VREG)
 		error = EINVAL;
 	if (error != 0)
 		goto out;
 
 	/* Ensure offset + len does not wrap around. */
 	uval = *inoffp;
 	uval += len;
 	if (uval > INT64_MAX)
 		len = INT64_MAX - *inoffp;
 	uval = *outoffp;
 	uval += len;
 	if (uval > INT64_MAX)
 		len = INT64_MAX - *outoffp;
 	if (len == 0)
 		goto out;
 
 	/* The copy operates on the vnodes VOP_GETLOWVNODE() hands back. */
 	error = VOP_GETLOWVNODE(invp, &invpl, FREAD);
 	if (error != 0)
 		goto out;
 	error = VOP_GETLOWVNODE(outvp, &outvpl, FWRITE);
 	if (error != 0)
 		goto out1;
 
 	inmp = invpl->v_mount;
 	outmp = outvpl->v_mount;
 	/* error is 0 here, so this returns success with *lenp == 0. */
 	if (inmp == NULL || outmp == NULL)
 		goto out2;
 
 	/*
 	 * Busy both mounts.  The second is tried without sleeping while
 	 * the first is held; if that fails, the first is released, the
 	 * second is waited for and released again, and the sequence is
 	 * restarted.
 	 */
 	for (;;) {
 		error = vfs_busy(inmp, 0);
 		if (error != 0)
 			goto out2;
 		if (inmp == outmp)
 			break;
 		error = vfs_busy(outmp, MBF_NOWAIT);
 		if (error != 0) {
 			vfs_unbusy(inmp);
 			error = vfs_busy(outmp, 0);
 			if (error == 0) {
 				vfs_unbusy(outmp);
 				continue;
 			}
 			goto out2;
 		}
 		break;
 	}
 
 	/*
 	 * If the two vnodes are for the same file system type, call
 	 * VOP_COPY_FILE_RANGE(), otherwise call vn_generic_copy_file_range()
 	 * which can handle copies across multiple file system types.
 	 */
 	*lenp = len;
 	if (inmp == outmp || inmp->mnt_vfc == outmp->mnt_vfc)
 		error = VOP_COPY_FILE_RANGE(invpl, inoffp, outvpl, outoffp,
 		    lenp, flags, incred, outcred, fsize_td);
 	else
 		error = ENOSYS;
 	/* ENOSYS from the VOP also selects the generic implementation. */
 	if (error == ENOSYS)
 		error = vn_generic_copy_file_range(invpl, inoffp, outvpl,
 		    outoffp, lenp, flags, incred, outcred, fsize_td);
 	/* When both mounts are the same it was busied only once. */
 	vfs_unbusy(outmp);
 	if (inmp != outmp)
 		vfs_unbusy(inmp);
 out2:
 	if (outvpl != NULL)
 		vrele(outvpl);
 out1:
 	if (invpl != NULL)
 		vrele(invpl);
 out:
 	return (error);
 }
 
 /*
  * Test len bytes of data starting at dat for all bytes == 0.
  * Return true if all bytes are zero, false otherwise.
  * Expects dat to be well aligned.
  */
 static bool
 mem_iszero(void *dat, int len)
 {
 	const u_int *wordp;
 	const char *bytep;
 	int idx;
 
 	wordp = dat;
 	while (len > 0) {
 		if (len < sizeof(*wordp)) {
 			/* Fewer bytes than one word remain: check each one. */
 			bytep = (const char *)wordp;
 			for (idx = 0; idx < len; idx++) {
 				if (bytep[idx] != '\0')
 					return (false);
 			}
 		} else if (*wordp != 0) {
 			return (false);
 		}
 		/* After the partial tail this goes negative and ends the loop. */
 		len -= sizeof(*wordp);
 		wordp++;
 	}
 	return (true);
 }
 
 /*
  * Look for a hole in the output file and, if found, adjust *outoffp
  * and *xferp to skip past the hole.
  * *xferp is the entire hole length to be written and xfer2 is how many bytes
  * to be written as 0's upon return.
  *
  * *dataoffp and *holeoffp hold the data and hole offsets found by the
  * last FIOSEEKDATA / FIOSEEKHOLE probe and persist between calls.  They
  * are refreshed only when *holeoffp is 0 or no longer beyond *outoffp.
  * If the probe fails, *holeoffp is set to -1 and xfer2 is returned
  * unchanged.
  *
  * Returns the number of bytes to write at the (possibly advanced)
  * *outoffp, or 0 with *xferp == 0 when the whole range is a hole.
  */
 static off_t
 vn_skip_hole(struct vnode *outvp, off_t xfer2, off_t *outoffp, off_t *xferp,
     off_t *dataoffp, off_t *holeoffp, struct ucred *cred)
 {
 	int error;
 	off_t delta;
 
 	if (*holeoffp == 0 || *holeoffp <= *outoffp) {
 		/* Find the next data region, then the hole that follows it. */
 		*dataoffp = *outoffp;
 		error = VOP_IOCTL(outvp, FIOSEEKDATA, dataoffp, 0, cred,
 		    curthread);
 		if (error == 0) {
 			*holeoffp = *dataoffp;
 			error = VOP_IOCTL(outvp, FIOSEEKHOLE, holeoffp, 0, cred,
 			    curthread);
 		}
 		if (error != 0 || *holeoffp == *dataoffp) {
 			/*
 			 * Since outvp is unlocked, it may be possible for
 			 * another thread to do a truncate(), lseek(), write()
 			 * creating a hole at startoff between the above
 			 * VOP_IOCTL() calls, if the other thread does not do
 			 * rangelocking.
 			 * If that happens, *holeoffp == *dataoffp and finding
 			 * the hole has failed, so disable vn_skip_hole().
 			 */
 			*holeoffp = -1;	/* Disable use of vn_skip_hole(). */
 			return (xfer2);
 		}
 		KASSERT(*dataoffp >= *outoffp,
 		    ("vn_skip_hole: dataoff=%jd < outoff=%jd",
 		    (intmax_t)*dataoffp, (intmax_t)*outoffp));
 		KASSERT(*holeoffp > *dataoffp,
 		    ("vn_skip_hole: holeoff=%jd <= dataoff=%jd",
 		    (intmax_t)*holeoffp, (intmax_t)*dataoffp));
 	}
 
 	/*
 	 * If there is a hole before the data starts, advance *outoffp and
 	 * *xferp past the hole.
 	 */
 	if (*dataoffp > *outoffp) {
 		delta = *dataoffp - *outoffp;
 		if (delta >= *xferp) {
 			/* Entire *xferp is a hole. */
 			*outoffp += *xferp;
 			*xferp = 0;
 			return (0);
 		}
 		*xferp -= delta;
 		*outoffp += delta;
 		/* The remaining transfer may now be shorter than xfer2. */
 		xfer2 = MIN(xfer2, *xferp);
 	}
 
 	/*
 	 * If a hole starts before the end of this xfer2, reduce this xfer2 so
 	 * that the write ends at the start of the hole.
 	 * *holeoffp should always be greater than *outoffp, but for the
 	 * non-INVARIANTS case, check this to make sure xfer2 remains a sane
 	 * value.
 	 */
 	if (*holeoffp > *outoffp && *holeoffp < *outoffp + xfer2)
 		xfer2 = *holeoffp - *outoffp;
 	return (xfer2);
 }
 
 /*
  * Write an xfer sized chunk to outvp in blksize blocks from dat.
  * dat is a maximum of blksize in length and can be written repeatedly in
  * the chunk.
  * If growfile == true, just grow the file via vn_truncate_locked() instead
  * of doing actual writes.
  * If checkhole == true, a hole is being punched, so skip over any hole
  * already in the output file.
  *
  * outoff is the starting offset in outvp.  Returns 0 on success or the
  * first error from vn_start_write(), vn_lock(), vn_truncate_locked() or
  * vn_rdwr().  With growfile set the loop body runs once.
  */
 static int
 vn_write_outvp(struct vnode *outvp, char *dat, off_t outoff, off_t xfer,
     u_long blksize, bool growfile, bool checkhole, struct ucred *cred)
 {
 	struct mount *mp;
 	off_t dataoff, holeoff, xfer2;
 	int error;
 
 	/*
 	 * Loop around doing writes of blksize until write has been completed.
 	 * Lock/unlock on each loop iteration so that a bwillwrite() can be
 	 * done for each iteration, since the xfer argument can be very
 	 * large if there is a large hole to punch in the output file.
 	 */
 	error = 0;
 	/* 0 makes the first vn_skip_hole() call probe for data and hole. */
 	holeoff = 0;
 	do {
 		xfer2 = MIN(xfer, blksize);
 		if (checkhole) {
 			/*
 			 * Punching a hole.  Skip writing if there is
 			 * already a hole in the output file.
 			 */
 			xfer2 = vn_skip_hole(outvp, xfer2, &outoff, &xfer,
 			    &dataoff, &holeoff, cred);
 			if (xfer == 0)
 				break;
 			/* vn_skip_hole() sets holeoff to -1 to disable itself. */
 			if (holeoff < 0)
 				checkhole = false;
 			KASSERT(xfer2 > 0, ("vn_write_outvp: xfer2=%jd",
 			    (intmax_t)xfer2));
 		}
 		bwillwrite();
 		mp = NULL;
 		error = vn_start_write(outvp, &mp, V_WAIT);
 		if (error != 0)
 			break;
 		if (growfile) {
 			error = vn_lock(outvp, LK_EXCLUSIVE);
 			if (error == 0) {
 				/* Extend the file to outoff + xfer without writing. */
 				error = vn_truncate_locked(outvp, outoff + xfer,
 				    false, cred);
 				VOP_UNLOCK(outvp);
 			}
 		} else {
 			error = vn_lock(outvp, vn_lktype_write(mp, outvp));
 			if (error == 0) {
 				error = vn_rdwr(UIO_WRITE, outvp, dat, xfer2,
 				    outoff, UIO_SYSSPACE, IO_NODELOCKED,
 				    curthread->td_ucred, cred, NULL, curthread);
 				/* Advanced even when vn_rdwr() reports an error. */
 				outoff += xfer2;
 				xfer -= xfer2;
 				VOP_UNLOCK(outvp);
 			}
 		}
 		if (mp != NULL)
 			vn_finished_write(mp);
 	} while (!growfile && xfer > 0 && error == 0);
 	return (error);
 }
 
 /*
  * Copy a byte range of one file to another.  This function can handle the
  * case where invp and outvp are on different file systems.
  * It can also be called by a VOP_COPY_FILE_RANGE() to do the work, if there
  * is no better file system specific way to do it.
  *
  * invp and outvp are the input and output vnodes.  Both are locked and
  * unlocked internally around each operation, so presumably the caller
  * passes them unlocked (confirm against callers).
  * *inoffp and *outoffp are the starting offsets and are advanced by the
  * number of bytes processed.
  * On entry *lenp is the number of bytes requested; on return it holds the
  * number of bytes processed.  That can be less than requested when EOF is
  * reached on invp, when a signal is pending, when the
  * COPY_FILE_RANGE_TIMEO1SEC time limit expires, or when vn_rlimit_fsizex()
  * clips the request.
  * flags: only COPY_FILE_RANGE_TIMEO1SEC is examined here.
  * incred and outcred are passed to the operations done on invp and outvp
  * respectively.
  * fsize_td: when not NULL, vn_rlimit_fsizex() is applied for that thread.
  *
  * Returns 0 or an errno value.  Stopping early because of a pending signal
  * or the time limit is not reported as an error; the caller sees it only
  * as a short count in *lenp.
  */
 int
 vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp,
     struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags,
     struct ucred *incred, struct ucred *outcred, struct thread *fsize_td)
 {
 	struct mount *mp;
 	off_t startoff, endoff, xfer, xfer2;
 	u_long blksize;
 	int error, interrupted;
 	bool cantseek, readzeros, eof, lastblock, holetoeof;
 	ssize_t aresid, r = 0;
 	size_t copylen, len, savlen;
 	off_t insize, outsize;
 	char *dat;
 	long holein, holeout;
 	struct timespec curts, endts;
 
 	holein = holeout = 0;
 	/* savlen - len is the byte count reported through *lenp at "out". */
 	savlen = len = *lenp;
 	error = 0;
 	interrupted = 0;
 	/* Keep dat NULL so that the free() at "out" is safe on early exits. */
 	dat = NULL;
 
 	/* Fetch the input file's minimum hole size and its current size. */
 	error = vn_lock(invp, LK_SHARED);
 	if (error != 0)
 		goto out;
 	if (VOP_PATHCONF(invp, _PC_MIN_HOLE_SIZE, &holein) != 0)
 		holein = 0;
 	error = vn_getsize_locked(invp, &insize, incred);
 	VOP_UNLOCK(invp);
 	if (error != 0)
 		goto out;
 
 	mp = NULL;
 	error = vn_start_write(outvp, &mp, V_WAIT);
 	if (error == 0)
 		error = vn_lock(outvp, LK_EXCLUSIVE);
 	if (error == 0) {
 		/*
 		 * If fsize_td != NULL, do a vn_rlimit_fsizex() call,
 		 * now that outvp is locked.
 		 */
 		if (fsize_td != NULL) {
 			struct uio io;
 
 			io.uio_offset = *outoffp;
 			io.uio_resid = len;
 			error = vn_rlimit_fsizex(outvp, &io, 0, &r, fsize_td);
 			/* Continue with whatever length the limit check left. */
 			len = savlen = io.uio_resid;
 			/*
 			 * No need to call vn_rlimit_fsizex_res before return,
 			 * since the uio is local.
 			 */
 		}
 		if (VOP_PATHCONF(outvp, _PC_MIN_HOLE_SIZE, &holeout) != 0)
 			holeout = 0;
 		/*
 		 * Holes that are past EOF do not need to be written as a block
 		 * of zero bytes.  So, truncate the output file as far as
 		 * possible and then use size to decide if writing 0
 		 * bytes is necessary in the loop below.
 		 */
 		if (error == 0)
 			error = vn_getsize_locked(outvp, &outsize, outcred);
 		/*
 		 * Truncate only when the output's current EOF lies inside
 		 * both the requested range and the range the input can
 		 * supply.  The OFF_MAX comparisons guard the additions
 		 * against overflow.
 		 */
 		if (error == 0 && outsize > *outoffp &&
 		    *outoffp <= OFF_MAX - len && outsize <= *outoffp + len &&
 		    *inoffp < insize &&
 		    *outoffp <= OFF_MAX - (insize - *inoffp) &&
 		    outsize <= *outoffp + (insize - *inoffp)) {
 #ifdef MAC
 			error = mac_vnode_check_write(curthread->td_ucred,
 			    outcred, outvp);
 			if (error == 0)
 #endif
 				error = vn_truncate_locked(outvp, *outoffp,
 				    false, outcred);
 			if (error == 0)
 				outsize = *outoffp;
 		}
 		VOP_UNLOCK(outvp);
 	}
 	if (mp != NULL)
 		vn_finished_write(mp);
 	if (error != 0)
 		goto out;
 
 	if (holein == 0 && holeout > 0) {
 		/*
 		 * For this special case, the input data will be scanned
 		 * for blocks of all 0 bytes.  For these blocks, the
 		 * write can be skipped for the output file to create
 		 * an unallocated region.
 		 * Therefore, use the appropriate size for the output file.
 		 */
 		blksize = holeout;
 		if (blksize <= 512) {
 			/*
 			 * Use f_iosize, since ZFS reports a _PC_MIN_HOLE_SIZE
 			 * of 512, although it actually only creates
 			 * unallocated regions for blocks >= f_iosize.
 			 */
 			blksize = outvp->v_mount->mnt_stat.f_iosize;
 		}
 	} else {
 		/*
 		 * Use the larger of the two f_iosize values.  If they are
 		 * not the same size, one will normally be an exact multiple of
 		 * the other, since they are both likely to be a power of 2.
 		 */
 		blksize = MAX(invp->v_mount->mnt_stat.f_iosize,
 		    outvp->v_mount->mnt_stat.f_iosize);
 	}
 
 	/* Clip to sane limits. */
 	if (blksize < 4096)
 		blksize = 4096;
 	else if (blksize > maxphys)
 		blksize = maxphys;
 	/* Bounce buffer for one block; released at "out". */
 	dat = malloc(blksize, M_TEMP, M_WAITOK);
 
 	/*
 	 * If VOP_IOCTL(FIOSEEKHOLE) works for invp, use it and FIOSEEKDATA
 	 * to find holes.  Otherwise, just scan the read block for all 0s
 	 * in the inner loop where the data copying is done.
 	 * Note that some file systems such as NFSv3, NFSv4.0 and NFSv4.1 may
 	 * support holes on the server, but do not support FIOSEEKHOLE.
 	 * The kernel flag COPY_FILE_RANGE_TIMEO1SEC is used to indicate
 	 * that this function should return after 1 second with a partial
 	 * completion.
 	 */
 	if ((flags & COPY_FILE_RANGE_TIMEO1SEC) != 0) {
 		getnanouptime(&endts);
 		endts.tv_sec++;
 	} else
 		timespecclear(&endts);	/* A cleared endts means no limit. */
 	holetoeof = eof = false;
 	while (len > 0 && error == 0 && !eof && interrupted == 0) {
 		endoff = 0;			/* To shut up compilers. */
 		cantseek = true;
 		startoff = *inoffp;
 		copylen = len;
 
 		/*
 		 * Find the next data area.  If there is just a hole to EOF,
 		 * FIOSEEKDATA should fail with ENXIO.
 		 * (I do not know if any file system will report a hole to
 		 *  EOF via FIOSEEKHOLE, but I am pretty sure FIOSEEKDATA
 		 *  will fail for those file systems.)
 		 *
 		 * For input files that don't support FIOSEEKDATA/FIOSEEKHOLE,
 		 * the code just falls through to the inner copy loop.
 		 */
 		/*
 		 * Start from an error so that, unless a seek below succeeds,
 		 * the "cannot seek" branch further down is taken.
 		 */
 		error = EINVAL;
 		if (holein > 0) {
 			error = VOP_IOCTL(invp, FIOSEEKDATA, &startoff, 0,
 			    incred, curthread);
 			if (error == ENXIO) {
 				startoff = endoff = insize;
 				eof = holetoeof = true;
 				error = 0;
 			}
 		}
 		if (error == 0 && !holetoeof) {
 			endoff = startoff;
 			error = VOP_IOCTL(invp, FIOSEEKHOLE, &endoff, 0,
 			    incred, curthread);
 			/*
 			 * Since invp is unlocked, it may be possible for
 			 * another thread to do a truncate(), lseek(), write()
 			 * creating a hole at startoff between the above
 			 * VOP_IOCTL() calls, if the other thread does not do
 			 * rangelocking.
 			 * If that happens, startoff == endoff and finding
 			 * the hole has failed, so set an error.
 			 */
 			if (error == 0 && startoff == endoff)
 				error = EINVAL; /* Any error. Reset to 0. */
 		}
 		if (error == 0) {
 			/* [startoff, endoff) is the next data area of invp. */
 			if (startoff > *inoffp) {
 				/* Found hole before data block. */
 				xfer = MIN(startoff - *inoffp, len);
 				if (*outoffp < outsize) {
 					/* Must write 0s to punch hole. */
 					xfer2 = MIN(outsize - *outoffp,
 					    xfer);
 					memset(dat, 0, MIN(xfer2, blksize));
 					error = vn_write_outvp(outvp, dat,
 					    *outoffp, xfer2, blksize, false,
 					    holeout > 0, outcred);
 				}
 
 				if (error == 0 && *outoffp + xfer >
 				    outsize && (xfer == len || holetoeof)) {
 					/* Grow output file (hole at end). */
 					error = vn_write_outvp(outvp, dat,
 					    *outoffp, xfer, blksize, true,
 					    false, outcred);
 				}
 				if (error == 0) {
 					*inoffp += xfer;
 					*outoffp += xfer;
 					len -= xfer;
 					/*
 					 * Check for a signal or the time limit
 					 * only after some progress was made.
 					 */
 					if (len < savlen) {
 						interrupted = sig_intr();
 						if (timespecisset(&endts) &&
 						    interrupted == 0) {
 							getnanouptime(&curts);
 							if (timespeccmp(&curts,
 							    &endts, >=))
 								interrupted =
 								    EINTR;
 						}
 					}
 				}
 			}
 			copylen = MIN(len, endoff - startoff);
 			cantseek = false;
 		} else {
 			/* Seeking failed; copy the rest by reading it all. */
 			cantseek = true;
 			startoff = *inoffp;
 			copylen = len;
 			error = 0;
 		}
 
 		xfer = blksize;
 		if (cantseek) {
 			/*
 			 * Set first xfer to end at a block boundary, so that
 			 * holes are more likely detected in the loop below via
 			 * the for all bytes 0 method.
 			 */
 			xfer -= (*inoffp % blksize);
 		}
 		/* Loop copying the data block. */
 		while (copylen > 0 && error == 0 && !eof && interrupted == 0) {
 			if (copylen < xfer)
 				xfer = copylen;
 			error = vn_lock(invp, LK_SHARED);
 			if (error != 0)
 				goto out;
 			error = vn_rdwr(UIO_READ, invp, dat, xfer,
 			    startoff, UIO_SYSSPACE, IO_NODELOCKED,
 			    curthread->td_ucred, incred, &aresid,
 			    curthread);
 			VOP_UNLOCK(invp);
 			lastblock = false;
 			if (error == 0 && aresid > 0) {
 				/* Stop the copy at EOF on the input file. */
 				xfer -= aresid;
 				eof = true;
 				lastblock = true;
 			}
 			if (error == 0) {
 				/*
 				 * Skip the write for holes past the initial EOF
 				 * of the output file, unless this is the last
 				 * write of the output file at EOF.
 				 */
 				readzeros = cantseek ? mem_iszero(dat, xfer) :
 				    false;
 				if (xfer == len)
 					lastblock = true;
 				if (!cantseek || *outoffp < outsize ||
 				    lastblock || !readzeros)
 					error = vn_write_outvp(outvp, dat,
 					    *outoffp, xfer, blksize,
 					    readzeros && lastblock &&
 					    *outoffp >= outsize, false,
 					    outcred);
 				if (error == 0) {
 					*inoffp += xfer;
 					startoff += xfer;
 					*outoffp += xfer;
 					copylen -= xfer;
 					len -= xfer;
 					/* Same signal/time check as above. */
 					if (len < savlen) {
 						interrupted = sig_intr();
 						if (timespecisset(&endts) &&
 						    interrupted == 0) {
 							getnanouptime(&curts);
 							if (timespeccmp(&curts,
 							    &endts, >=))
 								interrupted =
 								    EINTR;
 						}
 					}
 				}
 			}
 			/* Only the first transfer may be a partial block. */
 			xfer = blksize;
 		}
 	}
 out:
 	/* Report how many bytes were processed, even on error. */
 	*lenp = savlen - len;
 	free(dat, M_TEMP);
 	return (error);
 }
 
 /*
  * Allocate backing store for the range [offset, offset + len) of the
  * regular file behind fp.  Returns ENODEV for anything but a VREG vnode,
  * otherwise 0 or the first error from vn_start_write(), vn_lock(), the MAC
  * write check or VOP_ALLOCATE().
  */
 static int
 vn_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	off_t prev_off, prev_len;
 	int error;
 #ifdef AUDIT
 	bool vnode_audited = false;
 #endif
 
 	vp = fp->f_vnode;
 	if (vp->v_type != VREG)
 		return (ENODEV);
 
 	/*
 	 * Allocating blocks may take a long time.  VOP_ALLOCATE() updates
 	 * offset and len, and this loop repeats until len reaches 0, taking
 	 * and dropping the write reference and vnode lock on every pass.
 	 */
 	for (;;) {
 		prev_off = offset;
 		prev_len = len;
 
 		bwillwrite();
 		mp = NULL;
 		error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH);
 		if (error != 0)
 			return (error);
 		error = vn_lock(vp, LK_EXCLUSIVE);
 		if (error != 0) {
 			vn_finished_write(mp);
 			return (error);
 		}
 #ifdef AUDIT
 		/* Audit the vnode on the first pass only. */
 		if (!vnode_audited) {
 			AUDIT_ARG_VNODE1(vp);
 			vnode_audited = true;
 		}
 #endif
 #ifdef MAC
 		error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
 		if (error == 0)
 #endif
 			error = VOP_ALLOCATE(vp, &offset, &len, 0,
 			    td->td_ucred);
 		VOP_UNLOCK(vp);
 		vn_finished_write(mp);
 
 		/* The end of the range must not move between passes. */
 		if (prev_off + prev_len != offset + len) {
 			panic("offset + len changed from %jx/%jx to %jx/%jx",
 			    prev_off, prev_len, offset, len);
 		}
 		if (error != 0 || len == 0)
 			return (error);
 		KASSERT(prev_len > len, ("Iteration did not make progress?"));
 		maybe_yield();
 	}
 }
 
 /*
  * Common worker for vn_deallocate() and vn_fspacectl(): repeatedly call
  * VOP_DEALLOCATE() on [*offset, *offset + *length) until the length is
  * used up or an error occurs.  On return *offset and *length hold whatever
  * VOP_DEALLOCATE() left in them.
  *
  * ioflag decides the locking done here: the range lock is taken unless
  * IO_NODELOCKED or IO_RANGELOCKED is set, and the write reference plus
  * vnode lock are taken per pass unless IO_NODELOCKED is set.
  */
 static int
 vn_deallocate_impl(struct vnode *vp, off_t *offset, off_t *length, int flags,
     int ioflag, struct ucred *cred, struct ucred *active_cred,
     struct ucred *file_cred)
 {
 	struct mount *mp;
 	void *rl_cookie;
 	off_t cur_off, remaining;
 	int error;
 	bool need_lock;
 #ifdef AUDIT
 	bool vnode_audited = false;
 #endif
 
 	mp = NULL;
 	error = 0;
 	cur_off = *offset;
 	remaining = *length;
 	need_lock = (ioflag & IO_NODELOCKED) == 0;
 
 	if ((ioflag & (IO_NODELOCKED | IO_RANGELOCKED)) != 0)
 		rl_cookie = NULL;
 	else
 		rl_cookie = vn_rangelock_wlock(vp, cur_off,
 		    cur_off + remaining);
 
 	/*
 	 * Try to deallocate the longest range in one pass.  A pass that
 	 * would take too long returns a partial result, and the residue is
 	 * handled by the next pass.
 	 */
 	while (remaining > 0 && error == 0) {
 		if (need_lock) {
 			bwillwrite();
 			error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH);
 			if (error != 0)
 				break;
 			vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY);
 		}
 #ifdef AUDIT
 		/* Audit the vnode on the first pass only. */
 		if (!vnode_audited) {
 			AUDIT_ARG_VNODE1(vp);
 			vnode_audited = true;
 		}
 #endif
 
 #ifdef MAC
 		if ((ioflag & IO_NOMACCHECK) == 0)
 			error = mac_vnode_check_write(active_cred, file_cred,
 			    vp);
 #endif
 		if (error == 0)
 			error = VOP_DEALLOCATE(vp, &cur_off, &remaining, flags,
 			    ioflag, cred);
 
 		if (need_lock) {
 			VOP_UNLOCK(vp);
 			if (mp != NULL) {
 				vn_finished_write(mp);
 				mp = NULL;
 			}
 		}
 		if (error == 0 && remaining != 0)
 			maybe_yield();
 	}
 
 	if (rl_cookie != NULL)
 		vn_rangelock_unlock(vp, rl_cookie);
 	*offset = cur_off;
 	*length = remaining;
 	return (error);
 }
 
 /*
  * Deallocation entry point for callers other than a direct user request.
  *
  * Returns EINVAL if the range is negative, empty or would overflow off_t,
  * or if flags is non-zero, and ENODEV if vp is not a regular file.
  * Otherwise the work is handed to vn_deallocate_impl(), using file_cred as
  * the operation credential unless it is NOCRED, in which case active_cred
  * is used.
  */
 int
 vn_deallocate(struct vnode *vp, off_t *offset, off_t *length, int flags,
     int ioflag, struct ucred *active_cred, struct ucred *file_cred)
 {
 	struct ucred *cred;
 
 	/* Reject a negative offset first so the subtraction below is safe. */
 	if (*offset < 0 || *length <= 0)
 		return (EINVAL);
 	if (*length > OFF_MAX - *offset)
 		return (EINVAL);
 	if (flags != 0)
 		return (EINVAL);
 	if (vp->v_type != VREG)
 		return (ENODEV);
 
 	if (file_cred != NOCRED)
 		cred = file_cred;
 	else
 		cred = active_cred;
 	return (vn_deallocate_impl(vp, offset, length, flags, ioflag, cred,
 	    active_cred, file_cred));
 }
 
 /*
  * Space-control handler for vnode-backed files.  Only SPACECTL_DEALLOC is
  * handled: the range is passed to vn_deallocate_impl() with the I/O flags
  * derived from fp.  Returns ENODEV when the vnode is not a regular file.
  * The argument checks are assertions only, so the caller is expected to
  * have validated cmd, flags and the range already.
  */
 static int
 vn_fspacectl(struct file *fp, int cmd, off_t *offset, off_t *length, int flags,
     struct ucred *active_cred, struct thread *td)
 {
 	struct vnode *vp;
 	int ioflag;
 
 	KASSERT(cmd == SPACECTL_DEALLOC, ("vn_fspacectl: Invalid cmd"));
 	KASSERT((flags & ~SPACECTL_F_SUPPORTED) == 0,
 	    ("vn_fspacectl: non-zero flags"));
 	KASSERT(*offset >= 0 && *length > 0 && *length <= OFF_MAX - *offset,
 	    ("vn_fspacectl: offset/length overflow or underflow"));
 
 	vp = fp->f_vnode;
 	if (vp->v_type != VREG)
 		return (ENODEV);
 
 	ioflag = get_write_ioflag(fp);
 
 	/* Any command other than deallocation is a kernel bug. */
 	if (cmd != SPACECTL_DEALLOC)
 		panic("vn_fspacectl: unknown cmd %d", cmd);
 
 	return (vn_deallocate_impl(vp, offset, length, flags, ioflag,
 	    active_cred, active_cred, fp->f_cred));
 }
 
 /*
  * Compile-time check that _GENERIC_MAXDIRSIZ equals sizeof(struct dirent).
  * Keep this assert as long as sizeof(struct dirent) is used as the maximum
  * entry size.
  */
 _Static_assert(_GENERIC_MAXDIRSIZ == sizeof(struct dirent),
     "'struct dirent' size must be a multiple of its alignment "
     "(see _GENERIC_DIRLEN())");
 
 /*
  * Returns successive directory entries through some caller's provided buffer.
  *
  * This function automatically refills the provided buffer with calls to
  * VOP_READDIR() (after MAC permission checks).
  *
  * 'td' is used for credentials and passed to uiomove().  'dirbuf' is the
  * caller's buffer to fill and 'dirbuflen' its allocated size.  'dirbuf' must
  * be properly aligned to access 'struct dirent' structures and 'dirbuflen'
  * must be at least GENERIC_MAXDIRSIZ to avoid VOP_READDIR() returning
  * EINVAL (the latter is not a strong guarantee (yet); but EINVAL will always
  * be returned if this requirement is not verified).  '*dpp' points to the
  * current directory entry in the buffer and '*len' contains the remaining
  * valid bytes in 'dirbuf' after 'dpp' (including the pointed entry).
  *
  * At first call (or when restarting the read), '*len' must have been set to 0,
  * '*off' to 0 (or any valid start offset) and '*eofflag' to 0.  There are no
  * more entries as soon as '*len' is 0 after a call that returned 0.  Calling
  * this function again after such a condition is considered an error and
  * EINVAL will be returned.  Other possible error codes are those of
  * VOP_READDIR(), EINTEGRITY if the returned entries do not pass coherency
  * tests, or EINVAL (bad call).  All errors are unrecoverable, i.e., the state
  * ('*len', '*off' and '*eofflag') must be re-initialized before a subsequent
  * call.  On error or at end of directory, '*dpp' is reset to NULL.
  *
  * '*len', '*off' and '*eofflag' are internal state the caller should not
  * tamper with except as explained above.  '*off' is the next directory offset
  * to read from to refill the buffer.  '*eofflag' is set to 0 or 1 by the last
  * internal call to VOP_READDIR() that returned without error, indicating
  * whether it reached the end of the directory, and to 2 by this function after
  * all entries have been read.
  *
  * 'vp' must be a locked directory vnode; both conditions are asserted.
  */
 int
 vn_dir_next_dirent(struct vnode *vp, struct thread *td,
     char *dirbuf, size_t dirbuflen,
     struct dirent **dpp, size_t *len, off_t *off, int *eofflag)
 {
 	struct dirent *dp = NULL;
 	int reclen;
 	int error;
 	struct uio uio;
 	struct iovec iov;
 
 	ASSERT_VOP_LOCKED(vp, "vnode not locked");
 	VNASSERT(vp->v_type == VDIR, vp, ("vnode is not a directory"));
 	MPASS2((uintptr_t)dirbuf < (uintptr_t)dirbuf + dirbuflen,
 	    "Address space overflow");
 
 	if (__predict_false(dirbuflen < GENERIC_MAXDIRSIZ)) {
 		/* Don't take any chances in this case */
 		error = EINVAL;
 		goto out;
 	}
 
 	/* Entries remain in the buffer: step past the current one. */
 	if (*len != 0) {
 		dp = *dpp;
 
 		/*
 		 * The caller continued to call us after an error (we set dp to
 		 * NULL in a previous iteration).  Bail out right now.
 		 */
 		if (__predict_false(dp == NULL))
 			return (EINVAL);
 
 		MPASS(*len <= dirbuflen);
 		MPASS2((uintptr_t)dirbuf <= (uintptr_t)dp &&
 		    (uintptr_t)dp + *len <= (uintptr_t)dirbuf + dirbuflen,
 		    "Filled range not inside buffer");
 
 		reclen = dp->d_reclen;
 		if (reclen >= *len) {
 			/* End of buffer reached */
 			*len = 0;
 		} else {
 			dp = (struct dirent *)((char *)dp + reclen);
 			*len -= reclen;
 		}
 	}
 
 	/* Buffer empty (first call or just exhausted): try to refill it. */
 	if (*len == 0) {
 		dp = NULL;
 
 		/* Have to refill. */
 		switch (*eofflag) {
 		case 0:
 			break;
 
 		case 1:
 			/* Nothing more to read. */
 			*eofflag = 2; /* Remember the caller reached EOF. */
 			goto success;
 
 		default:
 			/* The caller didn't test for EOF. */
 			error = EINVAL;
 			goto out;
 		}
 
 		/* Describe the whole caller buffer as one kernel-space iovec. */
 		iov.iov_base = dirbuf;
 		iov.iov_len = dirbuflen;
 
 		uio.uio_iov = &iov;
 		uio.uio_iovcnt = 1;
 		uio.uio_offset = *off;
 		uio.uio_resid = dirbuflen;
 		uio.uio_segflg = UIO_SYSSPACE;
 		uio.uio_rw = UIO_READ;
 		uio.uio_td = td;
 
 #ifdef MAC
 		error = mac_vnode_check_readdir(td->td_ucred, vp);
 		if (error == 0)
 #endif
 			error = VOP_READDIR(vp, &uio, td->td_ucred, eofflag,
 			    NULL, NULL);
 		if (error != 0)
 			goto out;
 
 		/* Bytes filled in, and where the next refill starts. */
 		*len = dirbuflen - uio.uio_resid;
 		*off = uio.uio_offset;
 
 		if (*len == 0) {
 			/* Sanity check on INVARIANTS. */
 			MPASS(*eofflag != 0);
 			*eofflag = 1;
 			goto success;
 		}
 
 		/*
 		 * Normalize the flag returned by VOP_READDIR(), since we use 2
 		 * as a sentinel value.
 		 */
 		if (*eofflag != 0)
 			*eofflag = 1;
 
 		dp = (struct dirent *)dirbuf;
 	}
 
 	/* Coherency test: the entry must be at least the minimum size. */
 	if (__predict_false(*len < GENERIC_MINDIRSIZ ||
 	    dp->d_reclen < GENERIC_MINDIRSIZ)) {
 		error = EINTEGRITY;
 		dp = NULL;
 		goto out;
 	}
 
 success:
 	error = 0;
 out:
 	/* dp is NULL here on every error path and at end of directory. */
 	*dpp = dp;
 	return (error);
 }
 
 /*
  * Tells whether the locked directory 'vp' is empty.
  *
  * Returns 0 when the directory holds nothing but '.', '..' and whiteout
  * entries, and ENOTEMPTY as soon as any other entry is seen.  Any other
  * value is a genuine error that prevented the check.
  */
 int
 vn_dir_check_empty(struct vnode *vp)
 {
 	struct thread *const td = curthread;
 	struct vattr va;
 	struct dirent *dp;
 	char *dirbuf;
 	size_t dirbuflen, len;
 	off_t off;
 	int eofflag, error;
 
 	ASSERT_VOP_LOCKED(vp, "vfs_emptydir");
 	VNPASS(vp->v_type == VDIR, vp);
 
 	error = VOP_GETATTR(vp, &va, td->td_ucred);
 	if (error != 0)
 		return (error);
 
 	/* Use the largest of DEV_BSIZE, one dirent and the fs block size. */
 	dirbuflen = max(DEV_BSIZE, GENERIC_MAXDIRSIZ);
 	dirbuflen = MAX(dirbuflen, va.va_blocksize);
 	dirbuf = malloc(dirbuflen, M_TEMP, M_WAITOK);
 
 	/* Initial iteration state expected by vn_dir_next_dirent(). */
 	len = 0;
 	off = 0;
 	eofflag = 0;
 
 	for (;;) {
 		error = vn_dir_next_dirent(vp, td, dirbuf, dirbuflen,
 		    &dp, &len, &off, &eofflag);
 		/* Stop on error, or at EOF (len == 0 with error == 0). */
 		if (error != 0 || len == 0)
 			break;
 
 		/*
 		 * Skip whiteouts.  Unionfs operates on filesystems only and
 		 * not on hierarchies, so these whiteouts would be shadowed on
 		 * the system hierarchy but not for a union using the
 		 * filesystem of their directories as the upper layer.
 		 * Additionally, unionfs currently transparently exposes
 		 * union-specific metadata of its upper layer, meaning that
 		 * whiteouts can be seen through the union view in empty
 		 * directories.  Taking into account these whiteouts would then
 		 * prevent mounting another filesystem on such effectively
 		 * empty directories.
 		 */
 		if (dp->d_type == DT_WHT)
 			continue;
 
 		/*
 		 * Any entry whose name is not exactly '.' or '..' indicates
 		 * the directory is not empty.
 		 */
 		if ((dp->d_namlen != 1 && dp->d_namlen != 2) ||
 		    dp->d_name[0] != '.' ||
 		    (dp->d_namlen == 2 && dp->d_name[1] != '.')) {
 			error = ENOTEMPTY;
 			break;
 		}
 	}
 
 	free(dirbuf, M_TEMP);
 	return (error);
 }
 
 
 static u_long vn_lock_pair_pause_cnt;
 SYSCTL_ULONG(_debug, OID_AUTO, vn_lock_pair_pause, CTLFLAG_RD,
     &vn_lock_pair_pause_cnt, 0,
     "Count of vn_lock_pair deadlocks");
 
 u_int vn_lock_pair_pause_max;
 SYSCTL_UINT(_debug, OID_AUTO, vn_lock_pair_pause_max, CTLFLAG_RW,
     &vn_lock_pair_pause_max, 0,
     "Max ticks for vn_lock_pair deadlock avoidance sleep");
 
 static void
 vn_lock_pair_pause(const char *wmesg)
 {
 	atomic_add_long(&vn_lock_pair_pause_cnt, 1);
 	pause(wmesg, prng32_bounded(vn_lock_pair_pause_max));
 }
 
 /*
  * Lock pair of (possibly same) vnodes vp1, vp2, avoiding lock order
  * reversal.  vp1_locked indicates whether vp1 is locked; if not, vp1
  * must be unlocked.  Same for vp2 and vp2_locked.  One of the vnodes
  * can be NULL.
  *
  * The function returns with both vnodes exclusively or shared locked,
  * according to corresponding lkflags, and guarantees that it does not
  * create lock order reversal with other threads during its execution.
  * Both vnodes could be unlocked temporarily (and reclaimed).
  *
  * If requesting shared locking, locked vnode lock must not be recursed.
  *
  * Exactly one of LK_SHARED and LK_EXCLUSIVE must be specified.
  * LK_NODDLKTREAT can be optionally passed.
  *
  * If vp1 == vp2, only one, most exclusive, lock is obtained on it.
  */
 void
 vn_lock_pair(struct vnode *vp1, bool vp1_locked, int lkflags1,
     struct vnode *vp2, bool vp2_locked, int lkflags2)
 {
 	int error, locked1;
 
 	/*
 	 * Each lkflags must request exactly one of LK_SHARED and
 	 * LK_EXCLUSIVE, and may add nothing but LK_NODDLKTREAT.
 	 */
 	MPASS(((lkflags1 & LK_SHARED) != 0) ^ ((lkflags1 & LK_EXCLUSIVE) != 0));
 	MPASS((lkflags1 & ~(LK_SHARED | LK_EXCLUSIVE | LK_NODDLKTREAT)) == 0);
 	MPASS(((lkflags2 & LK_SHARED) != 0) ^ ((lkflags2 & LK_EXCLUSIVE) != 0));
 	MPASS((lkflags2 & ~(LK_SHARED | LK_EXCLUSIVE | LK_NODDLKTREAT)) == 0);
 
 	if (vp1 == NULL && vp2 == NULL)
 		return;
 
 	/* Same vnode twice: a single lock in the stronger mode suffices. */
 	if (vp1 == vp2) {
 		MPASS(vp1_locked == vp2_locked);
 
 		/* Select the most exclusive mode for lock. */
 		if ((lkflags1 & LK_TYPE_MASK) != (lkflags2 & LK_TYPE_MASK))
 			lkflags1 = (lkflags1 & ~LK_SHARED) | LK_EXCLUSIVE;
 
 		if (vp1_locked) {
 			ASSERT_VOP_LOCKED(vp1, "vp1");
 
 			/* No need to relock if any lock is exclusive. */
 			if ((vp1->v_vnlock->lock_object.lo_flags &
 			    LK_NOSHARE) != 0)
 				return;
 
 			/*
 			 * Keep the current lock when it already matches the
 			 * requested mode; otherwise drop it and relock below.
 			 */
 			locked1 = VOP_ISLOCKED(vp1);
 			if (((lkflags1 & LK_SHARED) != 0 &&
 			    locked1 != LK_EXCLUSIVE) ||
 			    ((lkflags1 & LK_EXCLUSIVE) != 0 &&
 			    locked1 == LK_EXCLUSIVE))
 				return;
 			VOP_UNLOCK(vp1);
 		}
 
 		ASSERT_VOP_UNLOCKED(vp1, "vp1");
 		vn_lock(vp1, lkflags1 | LK_RETRY);
 		return;
 	}		
 
 	if (vp1 != NULL) {
 		/* A lock that cannot be shared is requested exclusively. */
 		if ((lkflags1 & LK_SHARED) != 0 &&
 		    (vp1->v_vnlock->lock_object.lo_flags & LK_NOSHARE) != 0)
 			lkflags1 = (lkflags1 & ~LK_SHARED) | LK_EXCLUSIVE;
 		if (vp1_locked && VOP_ISLOCKED(vp1) != LK_EXCLUSIVE) {
 			ASSERT_VOP_LOCKED(vp1, "vp1");
 			/*
 			 * Held in a non-exclusive mode but exclusive is
 			 * wanted: drop it so that it is reacquired below.
 			 */
 			if ((lkflags1 & LK_EXCLUSIVE) != 0) {
 				VOP_UNLOCK(vp1);
 				ASSERT_VOP_UNLOCKED(vp1,
 				    "vp1 shared recursed");
 				vp1_locked = false;
 			}
 		} else if (!vp1_locked)
 			ASSERT_VOP_UNLOCKED(vp1, "vp1");
 	} else {
 		/* A NULL vnode counts as locked so the loop below skips it. */
 		vp1_locked = true;
 	}
 
 	/* Same preparation for vp2. */
 	if (vp2 != NULL) {
 		if ((lkflags2 & LK_SHARED) != 0 &&
 		    (vp2->v_vnlock->lock_object.lo_flags & LK_NOSHARE) != 0)
 			lkflags2 = (lkflags2 & ~LK_SHARED) | LK_EXCLUSIVE;
 		if (vp2_locked && VOP_ISLOCKED(vp2) != LK_EXCLUSIVE) {
 			ASSERT_VOP_LOCKED(vp2, "vp2");
 			if ((lkflags2 & LK_EXCLUSIVE) != 0) {
 				VOP_UNLOCK(vp2);
 				ASSERT_VOP_UNLOCKED(vp2,
 				    "vp2 shared recursed");
 				vp2_locked = false;
 			}
 		} else if (!vp2_locked)
 			ASSERT_VOP_UNLOCKED(vp2, "vp2");
 	} else {
 		vp2_locked = true;
 	}
 
 	/* Neither is held: start by blocking on vp1. */
 	if (!vp1_locked && !vp2_locked) {
 		vn_lock(vp1, lkflags1 | LK_RETRY);
 		vp1_locked = true;
 	}
 
 	/*
 	 * While holding one vnode, try the other with LK_NOWAIT.  If that
 	 * fails, release the held one, pause, block on the other and retry
 	 * in the opposite order.
 	 */
 	while (!vp1_locked || !vp2_locked) {
 		if (vp1_locked && vp2 != NULL) {
 			if (vp1 != NULL) {
 				error = VOP_LOCK1(vp2, lkflags2 | LK_NOWAIT,
 				    __FILE__, __LINE__);
 				if (error == 0)
 					break;
 				VOP_UNLOCK(vp1);
 				vp1_locked = false;
 				vn_lock_pair_pause("vlp1");
 			}
 			vn_lock(vp2, lkflags2 | LK_RETRY);
 			vp2_locked = true;
 		}
 		if (vp2_locked && vp1 != NULL) {
 			if (vp2 != NULL) {
 				error = VOP_LOCK1(vp1, lkflags1 | LK_NOWAIT,
 				    __FILE__, __LINE__);
 				if (error == 0)
 					break;
 				VOP_UNLOCK(vp2);
 				vp2_locked = false;
 				vn_lock_pair_pause("vlp2");
 			}
 			vn_lock(vp1, lkflags1 | LK_RETRY);
 			vp1_locked = true;
 		}
 	}
 	/* Check the postcondition for each non-NULL vnode. */
 	if (vp1 != NULL) {
 		if (lkflags1 == LK_EXCLUSIVE)
 			ASSERT_VOP_ELOCKED(vp1, "vp1 ret");
 		else
 			ASSERT_VOP_LOCKED(vp1, "vp1 ret");
 	}
 	if (vp2 != NULL) {
 		if (lkflags2 == LK_EXCLUSIVE)
 			ASSERT_VOP_ELOCKED(vp2, "vp2 ret");
 		else
 			ASSERT_VOP_LOCKED(vp2, "vp2 ret");
 	}
 }
 
 /*
  * Choose the vnode lock type for a write: LK_SHARED when the relevant
  * mount passes MNT_SHARED_WRITES(), LK_EXCLUSIVE otherwise.  The relevant
  * mount is 'mp' when it is not NULL, else vp->v_mount.
  */
 int
 vn_lktype_write(struct mount *mp, struct vnode *vp)
 {
 	struct mount *eff_mp;
 
 	eff_mp = (mp != NULL) ? mp : vp->v_mount;
 	return (MNT_SHARED_WRITES(eff_mp) ? LK_SHARED : LK_EXCLUSIVE);
 }
 
 int
 vn_cmp(struct file *fp1, struct file *fp2, struct thread *td)
 {
 	if (fp2->f_type != DTYPE_VNODE)
 		return (3);
 	return (kcmp_cmp((uintptr_t)fp1->f_vnode, (uintptr_t)fp2->f_vnode));
 }
diff --git a/sys/sys/uio.h b/sys/sys/uio.h
index 6250b3cfbf03..e09a41ab1045 100644
--- a/sys/sys/uio.h
+++ b/sys/sys/uio.h
@@ -1,113 +1,115 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1993, 1994
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)uio.h	8.5 (Berkeley) 2/22/94
  */
 
 #ifndef _SYS_UIO_H_
 #define	_SYS_UIO_H_
 
 #include <sys/cdefs.h>
 #include <sys/_types.h>
 #include <sys/_iovec.h>
 #include <sys/_uio.h>
 
 #ifndef _SSIZE_T_DECLARED
 typedef	__ssize_t	ssize_t;
 #define	_SSIZE_T_DECLARED
 #endif
 
 #ifndef _OFF_T_DECLARED
 typedef	__off_t	off_t;
 #define	_OFF_T_DECLARED
 #endif
 
 #ifdef _KERNEL
 
 /*
  * Kernel-only (_KERNEL) description of one scatter/gather I/O request:
  * the iovec array to transfer, the offset in the target object, the bytes
  * still to process, and the address space, direction and owning thread.
  */
 struct uio {
 	struct	iovec *uio_iov;		/* scatter/gather list */
 	int	uio_iovcnt;		/* length of scatter/gather list */
 	off_t	uio_offset;		/* offset in target object */
 	ssize_t	uio_resid;		/* remaining bytes to process */
 	enum	uio_seg uio_segflg;	/* address space */
 	enum	uio_rw uio_rw;		/* operation */
 	struct	thread *uio_td;		/* owner */
 };
 
 /*
  * Limits
  *
  * N.B.: UIO_MAXIOV must be no less than IOV_MAX from <sys/syslimits.h>
  * which in turn must be no less than _XOPEN_IOV_MAX from <limits.h>.  If
  * we ever make this tunable (probably pointless), then IOV_MAX should be
  * removed from <sys/syslimits.h> and applications would be expected to use
  * sysconf(3) to find out the correct value, or else assume the worst
  * (_XOPEN_IOV_MAX).  Perhaps UIO_MAXIOV should be simply defined as
  * IOV_MAX.
  */
 #define UIO_MAXIOV	1024		/* max 1K of iov's */
 
 struct vm_object;
 struct vm_page;
 struct bus_dma_segment;
 
+struct uio *allocuio(u_int iovcnt);
+void	freeuio(struct uio *uio);
 struct uio *cloneuio(struct uio *uiop);
 int	copyiniov(const struct iovec *iovp, u_int iovcnt, struct iovec **iov,
 	    int error);
 int	copyinuio(const struct iovec *iovp, u_int iovcnt, struct uio **uiop);
 int	copyout_map(struct thread *td, vm_offset_t *addr, size_t sz);
 int	copyout_unmap(struct thread *td, vm_offset_t addr, size_t sz);
 int	physcopyin(void *src, vm_paddr_t dst, size_t len);
 int	physcopyout(vm_paddr_t src, void *dst, size_t len);
 int	physcopyin_vlist(struct bus_dma_segment *src, off_t offset,
 	    vm_paddr_t dst, size_t len);
 int	physcopyout_vlist(vm_paddr_t src, struct bus_dma_segment *dst,
 	    off_t offset, size_t len);
 int	uiomove(void *cp, int n, struct uio *uio);
 int	uiomove_frombuf(void *buf, int buflen, struct uio *uio);
 int	uiomove_fromphys(struct vm_page *ma[], vm_offset_t offset, int n,
 	    struct uio *uio);
 int	uiomove_nofault(void *cp, int n, struct uio *uio);
 int	uiomove_object(struct vm_object *obj, off_t obj_size, struct uio *uio);
 
 #else /* !_KERNEL */
 
 __BEGIN_DECLS
 ssize_t	readv(int, const struct iovec *, int);
 ssize_t	writev(int, const struct iovec *, int);
 #if __BSD_VISIBLE
 ssize_t	preadv(int, const struct iovec *, int, off_t);
 ssize_t	pwritev(int, const struct iovec *, int, off_t);
 #endif
 __END_DECLS
 
 #endif /* _KERNEL */
 
 #endif /* !_SYS_UIO_H_ */