diff --git a/sys/amd64/sgx/sgx_linux.c b/sys/amd64/sgx/sgx_linux.c
index 60ebec0e0278..c2975b0d2c84 100644
--- a/sys/amd64/sgx/sgx_linux.c
+++ b/sys/amd64/sgx/sgx_linux.c
@@ -1,115 +1,116 @@
 /*-
  * Copyright (c) 2017 Ruslan Bukin <br@bsdpad.com>
  * All rights reserved.
  *
  * This software was developed by BAE Systems, the University of Cambridge
  * Computer Laboratory, and Memorial University under DARPA/AFRL contract
  * FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent Computing
  * (TC) research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/file.h>
 #include <sys/proc.h>
 
 #include <machine/sgx.h>
 #include <machine/../linux/linux.h>
 #include <machine/../linux/linux_proto.h>
 #include <compat/linux/linux_ioctl.h>
 
 #include <amd64/sgx/sgxvar.h>
 
 #include <sys/ioccom.h>
 
 /*
  * Low 16 bits of SGX_IOC_ENCLAVE_CREATE and SGX_IOC_ENCLAVE_INIT.  They are
  * used as the lower and upper bound entries of sgx_linux_handler.
  */
 #define	SGX_LINUX_IOCTL_MIN	(SGX_IOC_ENCLAVE_CREATE & 0xffff)
 #define	SGX_LINUX_IOCTL_MAX	(SGX_IOC_ENCLAVE_INIT & 0xffff)
 
 /*
  * Linux ioctl entry point for SGX.
  *
  * Replaces the Linux direction bits of the command with IOC_IN/IOC_OUT,
  * copies the argument in from userspace when the command carries input and
  * forwards the request to the descriptor's ioctl method.
  *
  * Returns 0 or an errno value; EINVAL when the parameter length encoded in
  * the command exceeds SGX_IOCTL_MAX_DATA_LEN.
  *
  * NOTE(review): nothing is copied back to userspace for LINUX_IOC_OUT
  * commands, and 'data' is passed uninitialized when LINUX_IOC_IN is clear --
  * confirm this is intended.
  */
 static int
 sgx_linux_ioctl(struct thread *td, struct linux_ioctl_args *args)
 {
 	uint8_t data[SGX_IOCTL_MAX_DATA_LEN];
 	cap_rights_t rights;
 	struct file *fp;
 	u_long cmd;
 	int error;
 	int len;
 
 	/* Look up the file, requiring CAP_IOCTL; released by fdrop() at 'out'. */
-	error = fget(td, args->fd, cap_rights_init(&rights, CAP_IOCTL), &fp);
+	error = fget(td, args->fd, cap_rights_init_one(&rights, CAP_IOCTL),
+	    &fp);
 	if (error != 0)
 		return (error);
 
 	/* Keep the original command; args->cmd is rewritten in place below. */
 	cmd = args->cmd;
 
 	args->cmd &= ~(LINUX_IOC_IN | LINUX_IOC_OUT);
 	if ((cmd & LINUX_IOC_IN) != 0)
 		args->cmd |= IOC_IN;
 	if ((cmd & LINUX_IOC_OUT) != 0)
 		args->cmd |= IOC_OUT;
 
 	/* Reject arguments that would not fit in the on-stack buffer. */
 	len = IOCPARM_LEN(cmd);
 	if (len > SGX_IOCTL_MAX_DATA_LEN) {
 		error = EINVAL;
 		goto out;
 	}
 
 	if ((cmd & LINUX_IOC_IN) != 0) {
 		error = copyin((void *)args->arg, data, len);
 		if (error != 0)
 			goto out;
 	}
 
 	error = fo_ioctl(fp, args->cmd, (caddr_t)data, td->td_ucred, td);
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Handler descriptor handed to the Linux ioctl layer: the handler function
  * followed by the SGX_LINUX_IOCTL_MIN/MAX command bounds.
  * NOTE(review): positional initializer -- member order is assumed to be
  * (function, low, high); confirm against struct linux_ioctl_handler.
  */
 static struct linux_ioctl_handler sgx_linux_handler = {
 	sgx_linux_ioctl,
 	SGX_LINUX_IOCTL_MIN,
 	SGX_LINUX_IOCTL_MAX,
 };
 
 /*
  * Register the handler at SI_SUB_KLD/SI_ORDER_MIDDLE and unregister it
  * through the matching SYSUNINIT.
  */
 SYSINIT(sgx_linux_register, SI_SUB_KLD, SI_ORDER_MIDDLE,
     linux_ioctl_register_handler, &sgx_linux_handler);
 SYSUNINIT(sgx_linux_unregister, SI_SUB_KLD, SI_ORDER_MIDDLE,
     linux_ioctl_unregister_handler, &sgx_linux_handler);
 
 /*
  * Module event handler.  No event type requires any work here, so every
  * event is reported as successful.
  */
 static int
 sgx_linux_modevent(module_t mod, int type, void *data)
 {
 	/* Nothing to set up or tear down for any event type. */
 	return (0);
 }
 
 /*
  * Declare the sgx_linux module and its dependency on linux64.
  * NOTE(review): the three "1" arguments are presumably the minimum,
  * preferred and maximum linux64 module versions -- confirm.
  */
 DEV_MODULE(sgx_linux, sgx_linux_modevent, NULL);
 MODULE_DEPEND(sgx_linux, linux64, 1, 1, 1);
diff --git a/sys/compat/freebsd32/freebsd32_ioctl.c b/sys/compat/freebsd32/freebsd32_ioctl.c
index db6946246b90..b3ed457f7132 100644
--- a/sys/compat/freebsd32/freebsd32_ioctl.c
+++ b/sys/compat/freebsd32/freebsd32_ioctl.c
@@ -1,247 +1,247 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2008 David E. O'Brien
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the author nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/cdio.h>
 #include <sys/fcntl.h>
 #include <sys/filio.h>
 #include <sys/file.h>
 #include <sys/ioccom.h>
 #include <sys/malloc.h>
 #include <sys/memrange.h>
 #include <sys/pciio.h>
 #include <sys/proc.h>
 #include <sys/syscall.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/systm.h>
 #include <sys/uio.h>
 
 #include <compat/freebsd32/freebsd32.h>
 #include <compat/freebsd32/freebsd32_ioctl.h>
 #include <compat/freebsd32/freebsd32_misc.h>
 #include <compat/freebsd32/freebsd32_proto.h>
 
 /* Compile-time check of the 32-bit mem_range_op layout copied to/from userspace. */
 CTASSERT(sizeof(struct mem_range_op32) == 12);
 
 static int
 freebsd32_ioctl_memrange(struct thread *td,
     struct freebsd32_ioctl_args *uap, struct file *fp)
 {
 	struct mem_range_op mro;
 	struct mem_range_op32 mro32;
 	int error;
 	u_long com;
 
 	if ((error = copyin(uap->data, &mro32, sizeof(mro32))) != 0)
 		return (error);
 
 	PTRIN_CP(mro32, mro, mo_desc);
 	CP(mro32, mro, mo_arg[0]);
 	CP(mro32, mro, mo_arg[1]);
 
 	com = 0;
 	switch (uap->com) {
 	case MEMRANGE_GET32:
 		com = MEMRANGE_GET;
 		break;
 
 	case MEMRANGE_SET32:
 		com = MEMRANGE_SET;
 		break;
 
 	default:
 		panic("%s: unknown MEMRANGE %#x", __func__, uap->com);
 	}
 
 	if ((error = fo_ioctl(fp, com, (caddr_t)&mro, td->td_ucred, td)) != 0)
 		return (error);
 
 	if ( (com & IOC_OUT) ) {
 		CP(mro, mro32, mo_arg[0]);
 		CP(mro, mro32, mo_arg[1]);
 
 		error = copyout(&mro32, uap->data, sizeof(mro32));
 	}
 
 	return (error);
 }
 
 static int
 freebsd32_ioctl_barmmap(struct thread *td,
     struct freebsd32_ioctl_args *uap, struct file *fp)
 {
 	struct pci_bar_mmap32 pbm32;
 	struct pci_bar_mmap pbm;
 	int error;
 
 	error = copyin(uap->data, &pbm32, sizeof(pbm32));
 	if (error != 0)
 		return (error);
 	PTRIN_CP(pbm32, pbm, pbm_map_base);
 	CP(pbm32, pbm, pbm_sel);
 	CP(pbm32, pbm, pbm_reg);
 	CP(pbm32, pbm, pbm_flags);
 	CP(pbm32, pbm, pbm_memattr);
 	pbm.pbm_bar_length = PAIR32TO64(uint64_t, pbm32.pbm_bar_length);
 	error = fo_ioctl(fp, PCIOCBARMMAP, (caddr_t)&pbm, td->td_ucred, td);
 	if (error == 0) {
 		PTROUT_CP(pbm, pbm32, pbm_map_base);
 		CP(pbm, pbm32, pbm_map_length);
 #if BYTE_ORDER == LITTLE_ENDIAN
 		pbm32.pbm_bar_length1 = pbm.pbm_bar_length;
 		pbm32.pbm_bar_length2 = pbm.pbm_bar_length >> 32;
 #else
 		pbm32.pbm_bar_length1 = pbm.pbm_bar_length >> 32;
 		pbm32.pbm_bar_length2 = pbm.pbm_bar_length;
 #endif
 		CP(pbm, pbm32, pbm_bar_off);
 		error = copyout(&pbm32, uap->data, sizeof(pbm32));
 	}
 	return (error);
 }
 
 static int
 freebsd32_ioctl_sg(struct thread *td,
     struct freebsd32_ioctl_args *uap, struct file *fp)
 {
 	struct sg_io_hdr io;
 	struct sg_io_hdr32 io32;
 	int error;
 
 	if ((error = copyin(uap->data, &io32, sizeof(io32))) != 0)
 		return (error);
 
 	CP(io32, io, interface_id);
 	CP(io32, io, dxfer_direction);
 	CP(io32, io, cmd_len);
 	CP(io32, io, mx_sb_len);
 	CP(io32, io, iovec_count);
 	CP(io32, io, dxfer_len);
 	PTRIN_CP(io32, io, dxferp);
 	PTRIN_CP(io32, io, cmdp);
 	PTRIN_CP(io32, io, sbp);
 	CP(io32, io, timeout);
 	CP(io32, io, flags);
 	CP(io32, io, pack_id);
 	PTRIN_CP(io32, io, usr_ptr);
 	CP(io32, io, status);
 	CP(io32, io, masked_status);
 	CP(io32, io, msg_status);
 	CP(io32, io, sb_len_wr);
 	CP(io32, io, host_status);
 	CP(io32, io, driver_status);
 	CP(io32, io, resid);
 	CP(io32, io, duration);
 	CP(io32, io, info);
 
 	if ((error = fo_ioctl(fp, SG_IO, (caddr_t)&io, td->td_ucred, td)) != 0)
 		return (error);
 
 	CP(io, io32, interface_id);
 	CP(io, io32, dxfer_direction);
 	CP(io, io32, cmd_len);
 	CP(io, io32, mx_sb_len);
 	CP(io, io32, iovec_count);
 	CP(io, io32, dxfer_len);
 	PTROUT_CP(io, io32, dxferp);
 	PTROUT_CP(io, io32, cmdp);
 	PTROUT_CP(io, io32, sbp);
 	CP(io, io32, timeout);
 	CP(io, io32, flags);
 	CP(io, io32, pack_id);
 	PTROUT_CP(io, io32, usr_ptr);
 	CP(io, io32, status);
 	CP(io, io32, masked_status);
 	CP(io, io32, msg_status);
 	CP(io, io32, sb_len_wr);
 	CP(io, io32, host_status);
 	CP(io, io32, driver_status);
 	CP(io, io32, resid);
 	CP(io, io32, duration);
 	CP(io, io32, info);
 
 	error = copyout(&io32, uap->data, sizeof(io32));
 
 	return (error);
 }
 
 /*
  * 32-bit ioctl(2) entry point.
  *
  * Commands whose argument layout differs for 32-bit callers (MEMRANGE_*32,
  * SG_IO_32, PCIOCBARMMAP_32) are handed to a translating helper; everything
  * else is passed to sys_ioctl() with only the data pointer converted.
  *
  * Returns 0 or an errno value; EBADF when the file is open for neither
  * reading nor writing.
  */
 int
 freebsd32_ioctl(struct thread *td, struct freebsd32_ioctl_args *uap)
 {
 	struct ioctl_args ap /*{
 		int	fd;
 		u_long	com;
 		caddr_t	data;
 	}*/ ;
 	struct file *fp;
 	cap_rights_t rights;
 	int error;
 
 	/* Look up the file, requiring CAP_IOCTL; every path below drops it. */
-	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_IOCTL), &fp);
+	error = fget(td, uap->fd, cap_rights_init_one(&rights, CAP_IOCTL), &fp);
 	if (error != 0)
 		return (error);
 	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
 		fdrop(fp, td);
 		return (EBADF);
 	}
 
 	switch (uap->com) {
 	case MEMRANGE_GET32:	/* FALLTHROUGH */
 	case MEMRANGE_SET32:
 		error = freebsd32_ioctl_memrange(td, uap, fp);
 		break;
 
 	case SG_IO_32:
 		error = freebsd32_ioctl_sg(td, uap, fp);
 		break;
 
 	case PCIOCBARMMAP_32:
 		error = freebsd32_ioctl_barmmap(td, uap, fp);
 		break;
 
 	default:
 		/*
 		 * No translation needed: release our reference and let
 		 * sys_ioctl() handle the request from the descriptor number.
 		 */
 		fdrop(fp, td);
 		ap.fd = uap->fd;
 		ap.com = uap->com;
 		PTRIN_CP(*uap, ap, data);
 		return sys_ioctl(td, &ap);
 	}
 
 	fdrop(fp, td);
 	return (error);
 }
diff --git a/sys/compat/freebsd32/freebsd32_misc.c b/sys/compat/freebsd32/freebsd32_misc.c
index 14afd433d9f1..b7db1c4468d7 100644
--- a/sys/compat/freebsd32/freebsd32_misc.c
+++ b/sys/compat/freebsd32/freebsd32_misc.c
@@ -1,3833 +1,3833 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2002 Doug Rabson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ktrace.h"
 
 #define __ELF_WORD_SIZE 32
 
 #ifdef COMPAT_FREEBSD11
 #define	_WANT_FREEBSD11_KEVENT
 #endif
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/capsicum.h>
 #include <sys/clock.h>
 #include <sys/exec.h>
 #include <sys/fcntl.h>
 #include <sys/filedesc.h>
 #include <sys/imgact.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/file.h>		/* Must come after sys/malloc.h */
 #include <sys/imgact.h>
 #include <sys/mbuf.h>
 #include <sys/mman.h>
 #include <sys/module.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/procctl.h>
 #include <sys/ptrace.h>
 #include <sys/reboot.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/selinfo.h>
 #include <sys/eventvar.h>	/* Must come after sys/selinfo.h */
 #include <sys/pipe.h>		/* Must come after sys/selinfo.h */
 #include <sys/signal.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/systm.h>
 #include <sys/thr.h>
 #include <sys/timex.h>
 #include <sys/unistd.h>
 #include <sys/ucontext.h>
 #include <sys/umtx.h>
 #include <sys/vnode.h>
 #include <sys/wait.h>
 #include <sys/ipc.h>
 #include <sys/msg.h>
 #include <sys/sem.h>
 #include <sys/shm.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #ifdef INET
 #include <netinet/in.h>
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 
 #include <machine/cpu.h>
 #include <machine/elf.h>
 #ifdef __amd64__
 #include <machine/md_var.h>
 #endif
 
 #include <security/audit/audit.h>
 
 #include <compat/freebsd32/freebsd32_util.h>
 #include <compat/freebsd32/freebsd32.h>
 #include <compat/freebsd32/freebsd32_ipc.h>
 #include <compat/freebsd32/freebsd32_misc.h>
 #include <compat/freebsd32/freebsd32_signal.h>
 #include <compat/freebsd32/freebsd32_proto.h>
 
 /* Advertise the compat_freebsd_32bit kernel feature. */
 FEATURE(compat_freebsd_32bit, "Compatible with 32-bit FreeBSD");
 
 /*
  * ptrace I/O descriptor with 32-bit offset, address and length members.
  * NOTE(review): presumably the layout of struct ptrace_io_desc as seen by
  * 32-bit processes -- confirm against sys/ptrace.h.
  */
 struct ptrace_io_desc32 {
 	int		piod_op;
 	uint32_t	piod_offs;
 	uint32_t	piod_addr;
 	uint32_t	piod_len;
 };
 
 /*
  * ptrace syscall-return record with 32-bit return values.
  * NOTE(review): presumably the 32-bit layout of struct ptrace_sc_ret --
  * confirm against sys/ptrace.h.
  */
 struct ptrace_sc_ret32 {
 	uint32_t	sr_retval[2];
 	int		sr_error;
 };
 
 /*
  * ptrace VM map entry with 32-bit start, end, offset and path members.
  * NOTE(review): presumably the 32-bit layout of struct ptrace_vm_entry, with
  * pve_path holding a 32-bit user pointer -- confirm against sys/ptrace.h.
  */
 struct ptrace_vm_entry32 {
 	int		pve_entry;
 	int		pve_timestamp;
 	uint32_t	pve_start;
 	uint32_t	pve_end;
 	uint32_t	pve_offset;
 	u_int		pve_prot;
 	u_int		pve_pathlen;
 	int32_t		pve_fileid;
 	u_int		pve_fsid;
 	uint32_t	pve_path;
 };
 
 /*
  * Compile-time checks of the sizes of the 32-bit compat structures.  Several
  * of them are only checked on amd64; struct kevent32 is 56 bytes on amd64
  * and 64 bytes on every other architecture.
  */
 #ifdef __amd64__
 CTASSERT(sizeof(struct timeval32) == 8);
 CTASSERT(sizeof(struct timespec32) == 8);
 CTASSERT(sizeof(struct itimerval32) == 16);
 CTASSERT(sizeof(struct bintime32) == 12);
 #endif
 CTASSERT(sizeof(struct statfs32) == 256);
 #ifdef __amd64__
 CTASSERT(sizeof(struct rusage32) == 72);
 #endif
 CTASSERT(sizeof(struct sigaltstack32) == 12);
 #ifdef __amd64__
 CTASSERT(sizeof(struct kevent32) == 56);
 #else
 CTASSERT(sizeof(struct kevent32) == 64);
 #endif
 CTASSERT(sizeof(struct iovec32) == 8);
 CTASSERT(sizeof(struct msghdr32) == 28);
 #ifdef __amd64__
 CTASSERT(sizeof(struct stat32) == 208);
 CTASSERT(sizeof(struct freebsd11_stat32) == 96);
 #endif
 CTASSERT(sizeof(struct sigaction32) == 24);
 
 /* Forward declarations of file-local helpers. */
 static int freebsd32_kevent_copyout(void *arg, struct kevent *kevp, int count);
 static int freebsd32_kevent_copyin(void *arg, struct kevent *kevp, int count);
 static int freebsd32_user_clock_nanosleep(struct thread *td, clockid_t clock_id,
     int flags, const struct timespec32 *ua_rqtp, struct timespec32 *ua_rmtp);
 
 /*
  * Fill the 32-bit rusage structure *s32 from the native structure *s, one
  * field at a time: the two timevals through TV_CP() and every counter
  * through CP().
  */
 void
 freebsd32_rusage_out(const struct rusage *s, struct rusage32 *s32)
 {
 
 	TV_CP(*s, *s32, ru_utime);
 	TV_CP(*s, *s32, ru_stime);
 	CP(*s, *s32, ru_maxrss);
 	CP(*s, *s32, ru_ixrss);
 	CP(*s, *s32, ru_idrss);
 	CP(*s, *s32, ru_isrss);
 	CP(*s, *s32, ru_minflt);
 	CP(*s, *s32, ru_majflt);
 	CP(*s, *s32, ru_nswap);
 	CP(*s, *s32, ru_inblock);
 	CP(*s, *s32, ru_oublock);
 	CP(*s, *s32, ru_msgsnd);
 	CP(*s, *s32, ru_msgrcv);
 	CP(*s, *s32, ru_nsignals);
 	CP(*s, *s32, ru_nvcsw);
 	CP(*s, *s32, ru_nivcsw);
 }
 
 int
 freebsd32_wait4(struct thread *td, struct freebsd32_wait4_args *uap)
 {
 	int error, status;
 	struct rusage32 ru32;
 	struct rusage ru, *rup;
 
 	if (uap->rusage != NULL)
 		rup = &ru;
 	else
 		rup = NULL;
 	error = kern_wait(td, uap->pid, &status, uap->options, rup);
 	if (error)
 		return (error);
 	if (uap->status != NULL)
 		error = copyout(&status, uap->status, sizeof(status));
 	if (uap->rusage != NULL && error == 0) {
 		freebsd32_rusage_out(&ru, &ru32);
 		error = copyout(&ru32, uap->rusage, sizeof(ru32));
 	}
 	return (error);
 }
 
 int
 freebsd32_wait6(struct thread *td, struct freebsd32_wait6_args *uap)
 {
 	struct wrusage32 wru32;
 	struct __wrusage wru, *wrup;
 	struct siginfo32 si32;
 	struct __siginfo si, *sip;
 	int error, status;
 
 	if (uap->wrusage != NULL)
 		wrup = &wru;
 	else
 		wrup = NULL;
 	if (uap->info != NULL) {
 		sip = &si;
 		bzero(sip, sizeof(*sip));
 	} else
 		sip = NULL;
 	error = kern_wait6(td, uap->idtype, PAIR32TO64(id_t, uap->id),
 	    &status, uap->options, wrup, sip);
 	if (error != 0)
 		return (error);
 	if (uap->status != NULL)
 		error = copyout(&status, uap->status, sizeof(status));
 	if (uap->wrusage != NULL && error == 0) {
 		freebsd32_rusage_out(&wru.wru_self, &wru32.wru_self);
 		freebsd32_rusage_out(&wru.wru_children, &wru32.wru_children);
 		error = copyout(&wru32, uap->wrusage, sizeof(wru32));
 	}
 	if (uap->info != NULL && error == 0) {
 		siginfo_to_siginfo32 (&si, &si32);
 		error = copyout(&si32, uap->info, sizeof(si32));
 	}
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD4
 /*
  * Convert a native struct statfs into the FreeBSD 4 32-bit layout.
  *
  * 'in' is modified: statfs_scale_blocks() is applied to it with an
  * INT32_MAX limit before anything is copied.  'out' is zeroed first, the
  * 64-bit counters are clamped to INT32_MAX and the name strings are
  * truncated to the smaller of the native and FreeBSD 4 name lengths.
  */
 static void
 copy_statfs(struct statfs *in, struct statfs32 *out)
 {
 
 	statfs_scale_blocks(in, INT32_MAX);
 	bzero(out, sizeof(*out));
 
 	/* Fields copied as they are. */
 	CP(*in, *out, f_bsize);
 	CP(*in, *out, f_blocks);
 	CP(*in, *out, f_bfree);
 	CP(*in, *out, f_bavail);
 	CP(*in, *out, f_fsid);
 	CP(*in, *out, f_owner);
 	CP(*in, *out, f_type);
 	CP(*in, *out, f_flags);
 
 	/* Counters clamped to INT32_MAX. */
 	out->f_iosize = MIN(in->f_iosize, INT32_MAX);
 	out->f_files = MIN(in->f_files, INT32_MAX);
 	out->f_ffree = MIN(in->f_ffree, INT32_MAX);
 	out->f_syncwrites = MIN(in->f_syncwrites, INT32_MAX);
 	out->f_asyncwrites = MIN(in->f_asyncwrites, INT32_MAX);
 	out->f_syncreads = MIN(in->f_syncreads, INT32_MAX);
 	out->f_asyncreads = MIN(in->f_asyncreads, INT32_MAX);
 
 	/* Name strings. */
 	strlcpy(out->f_fstypename, in->f_fstypename, MFSNAMELEN);
 	strlcpy(out->f_mntonname, in->f_mntonname,
 	    min(MNAMELEN, FREEBSD4_MNAMELEN));
 	strlcpy(out->f_mntfromname, in->f_mntfromname,
 	    min(MNAMELEN, FREEBSD4_MNAMELEN));
 }
 #endif
 
 #ifdef COMPAT_FREEBSD4
 /*
  * FreeBSD 4 getfsstat(2) for 32-bit callers.
  *
  * The number of struct statfs32 records that fit in the user buffer decides
  * how large a native buffer kern_getfsstat() is asked to fill; each native
  * record is then converted and copied out, advancing uap->buf.  On success
  * the count reported by kern_getfsstat() becomes the syscall return value.
  */
 int
 freebsd4_freebsd32_getfsstat(struct thread *td,
     struct freebsd4_freebsd32_getfsstat_args *uap)
 {
 	struct statfs32 compat_sfs;
 	struct statfs *native_buf;
 	size_t count, i, native_size;
 	int error;
 
 	count = uap->bufsize / sizeof(struct statfs32);
 	native_size = count * sizeof(struct statfs);
 	error = kern_getfsstat(td, &native_buf, native_size, &count,
 	    UIO_SYSSPACE, uap->mode);
 	if (native_size > 0) {
 		for (i = 0; i < count && error == 0; i++) {
 			copy_statfs(&native_buf[i], &compat_sfs);
 			error = copyout(&compat_sfs, uap->buf,
 			    sizeof(compat_sfs));
 			uap->buf++;
 		}
 		free(native_buf, M_STATFS);
 	}
 	if (error == 0)
 		td->td_retval[0] = count;
 	return (error);
 }
 #endif
 
 #ifdef COMPAT_FREEBSD10
 /*
  * FreeBSD 10 pipe(2) for 32-bit callers: the argument structure is passed
  * to the native freebsd10_pipe() through a pointer cast.
  */
 int
 freebsd10_freebsd32_pipe(struct thread *td,
     struct freebsd10_freebsd32_pipe_args *uap)
 {
 	struct freebsd10_pipe_args *native_uap;
 
 	native_uap = (struct freebsd10_pipe_args *)uap;
 	return (freebsd10_pipe(td, native_uap));
 }
 #endif
 
 int
 freebsd32_sigaltstack(struct thread *td,
 		      struct freebsd32_sigaltstack_args *uap)
 {
 	struct sigaltstack32 s32;
 	struct sigaltstack ss, oss, *ssp;
 	int error;
 
 	if (uap->ss != NULL) {
 		error = copyin(uap->ss, &s32, sizeof(s32));
 		if (error)
 			return (error);
 		PTRIN_CP(s32, ss, ss_sp);
 		CP(s32, ss, ss_size);
 		CP(s32, ss, ss_flags);
 		ssp = &ss;
 	} else
 		ssp = NULL;
 	error = kern_sigaltstack(td, ssp, &oss);
 	if (error == 0 && uap->oss != NULL) {
 		PTROUT_CP(oss, s32, ss_sp);
 		CP(oss, s32, ss_size);
 		CP(oss, s32, ss_flags);
 		error = copyout(&s32, uap->oss, sizeof(s32));
 	}
 	return (error);
 }
 
 /*
  * 32-bit flavour of exec_copyin_args(): the argv and envv vectors hold
  * 32-bit user pointers, so each entry is fetched as a u_int32_t and
  * converted with PTRIN() before the string it names is added.
  *
  * Returns 0 on success, EFAULT if argv is NULL, or the error of the failing
  * step; on any failure after allocation the argument storage is released.
  */
 int
 freebsd32_exec_copyin_args(struct image_args *args, const char *fname,
     enum uio_seg segflg, u_int32_t *argv, u_int32_t *envv)
 {
 	u_int32_t *cursor, uptr32;
 	int error;
 
 	bzero(args, sizeof(*args));
 	if (argv == NULL)
 		return (EFAULT);
 
 	/*
 	 * Demand-paged memory for the file name, argument and environment
 	 * strings.
 	 */
 	error = exec_alloc_args(args);
 	if (error != 0)
 		return (error);
 
 	/* File name first. */
 	error = exec_args_add_fname(args, fname, segflg);
 	if (error != 0)
 		goto fail;
 
 	/* Arguments: walk the vector up to its zero terminator. */
 	for (cursor = argv;; cursor++) {
 		error = copyin(cursor, &uptr32, sizeof(uptr32));
 		if (error != 0)
 			goto fail;
 		if (uptr32 == 0)
 			break;
 		error = exec_args_add_arg(args, PTRIN(uptr32), UIO_USERSPACE);
 		if (error != 0)
 			goto fail;
 	}
 
 	/* Environment strings, when a vector was supplied. */
 	if (envv != NULL) {
 		for (cursor = envv;; cursor++) {
 			error = copyin(cursor, &uptr32, sizeof(uptr32));
 			if (error != 0)
 				goto fail;
 			if (uptr32 == 0)
 				break;
 			error = exec_args_add_env(args, PTRIN(uptr32),
 			    UIO_USERSPACE);
 			if (error != 0)
 				goto fail;
 		}
 	}
 
 	return (0);
 
 fail:
 	exec_free_args(args);
 	return (error);
 }
 
 int
 freebsd32_execve(struct thread *td, struct freebsd32_execve_args *uap)
 {
 	struct image_args eargs;
 	struct vmspace *oldvmspace;
 	int error;
 
 	error = pre_execve(td, &oldvmspace);
 	if (error != 0)
 		return (error);
 	error = freebsd32_exec_copyin_args(&eargs, uap->fname, UIO_USERSPACE,
 	    uap->argv, uap->envv);
 	if (error == 0)
 		error = kern_execve(td, &eargs, NULL, oldvmspace);
 	post_execve(td, error, oldvmspace);
 	AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td);
 	return (error);
 }
 
 int
 freebsd32_fexecve(struct thread *td, struct freebsd32_fexecve_args *uap)
 {
 	struct image_args eargs;
 	struct vmspace *oldvmspace;
 	int error;
 
 	error = pre_execve(td, &oldvmspace);
 	if (error != 0)
 		return (error);
 	error = freebsd32_exec_copyin_args(&eargs, NULL, UIO_SYSSPACE,
 	    uap->argv, uap->envv);
 	if (error == 0) {
 		eargs.fd = uap->fd;
 		error = kern_execve(td, &eargs, NULL, oldvmspace);
 	}
 	post_execve(td, error, oldvmspace);
 	AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td);
 	return (error);
 }
 
 int
 freebsd32_mknodat(struct thread *td, struct freebsd32_mknodat_args *uap)
 {
 
 	return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE,
 	    uap->mode, PAIR32TO64(dev_t, uap->dev)));
 }
 
 int
 freebsd32_mprotect(struct thread *td, struct freebsd32_mprotect_args *uap)
 {
 	int prot;
 
 	prot = uap->prot;
 #if defined(__amd64__)
 	if (i386_read_exec && (prot & PROT_READ) != 0)
 		prot |= PROT_EXEC;
 #endif
 	return (kern_mprotect(td, (uintptr_t)PTRIN(uap->addr), uap->len,
 	    prot));
 }
 
 int
 freebsd32_mmap(struct thread *td, struct freebsd32_mmap_args *uap)
 {
 	int prot;
 
 	prot = uap->prot;
 #if defined(__amd64__)
 	if (i386_read_exec && (prot & PROT_READ))
 		prot |= PROT_EXEC;
 #endif
 
 	return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, prot,
 	    uap->flags, uap->fd, PAIR32TO64(off_t, uap->pos)));
 }
 
 #ifdef COMPAT_FREEBSD6
 /*
  * FreeBSD 6 mmap(2) for 32-bit callers.  The file offset is reassembled
  * from its two 32-bit halves with PAIR32TO64().  On amd64, when
  * i386_read_exec is set, PROT_READ is widened to include PROT_EXEC.
  */
 int
 freebsd6_freebsd32_mmap(struct thread *td,
     struct freebsd6_freebsd32_mmap_args *uap)
 {
 	off_t pos;
 	int prot;
 
 	prot = uap->prot;
 #if defined(__amd64__)
 	if ((prot & PROT_READ) != 0 && i386_read_exec)
 		prot |= PROT_EXEC;
 #endif
 	pos = PAIR32TO64(off_t, uap->pos);
 
 	return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, prot,
 	    uap->flags, uap->fd, pos));
 }
 #endif
 
 int
 freebsd32_setitimer(struct thread *td, struct freebsd32_setitimer_args *uap)
 {
 	struct itimerval itv, oitv, *itvp;	
 	struct itimerval32 i32;
 	int error;
 
 	if (uap->itv != NULL) {
 		error = copyin(uap->itv, &i32, sizeof(i32));
 		if (error)
 			return (error);
 		TV_CP(i32, itv, it_interval);
 		TV_CP(i32, itv, it_value);
 		itvp = &itv;
 	} else
 		itvp = NULL;
 	error = kern_setitimer(td, uap->which, itvp, &oitv);
 	if (error || uap->oitv == NULL)
 		return (error);
 	TV_CP(oitv, i32, it_interval);
 	TV_CP(oitv, i32, it_value);
 	return (copyout(&i32, uap->oitv, sizeof(i32)));
 }
 
 int
 freebsd32_getitimer(struct thread *td, struct freebsd32_getitimer_args *uap)
 {
 	struct itimerval itv;
 	struct itimerval32 i32;
 	int error;
 
 	error = kern_getitimer(td, uap->which, &itv);
 	if (error || uap->itv == NULL)
 		return (error);
 	TV_CP(itv, i32, it_interval);
 	TV_CP(itv, i32, it_value);
 	return (copyout(&i32, uap->itv, sizeof(i32)));
 }
 
 int
 freebsd32_select(struct thread *td, struct freebsd32_select_args *uap)
 {
 	struct timeval32 tv32;
 	struct timeval tv, *tvp;
 	int error;
 
 	if (uap->tv != NULL) {
 		error = copyin(uap->tv, &tv32, sizeof(tv32));
 		if (error)
 			return (error);
 		CP(tv32, tv, tv_sec);
 		CP(tv32, tv, tv_usec);
 		tvp = &tv;
 	} else
 		tvp = NULL;
 	/*
 	 * XXX Do pointers need PTRIN()?
 	 */
 	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
 	    sizeof(int32_t) * 8));
 }
 
 int
 freebsd32_pselect(struct thread *td, struct freebsd32_pselect_args *uap)
 {
 	struct timespec32 ts32;
 	struct timespec ts;
 	struct timeval tv, *tvp;
 	sigset_t set, *uset;
 	int error;
 
 	if (uap->ts != NULL) {
 		error = copyin(uap->ts, &ts32, sizeof(ts32));
 		if (error != 0)
 			return (error);
 		CP(ts32, ts, tv_sec);
 		CP(ts32, ts, tv_nsec);
 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
 		tvp = &tv;
 	} else
 		tvp = NULL;
 	if (uap->sm != NULL) {
 		error = copyin(uap->sm, &set, sizeof(set));
 		if (error != 0)
 			return (error);
 		uset = &set;
 	} else
 		uset = NULL;
 	/*
 	 * XXX Do pointers need PTRIN()?
 	 */
 	error = kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
 	    uset, sizeof(int32_t) * 8);
 	return (error);
 }
 
 /*
  * Copy 'count' items into the destination list pointed to by uap->eventlist.
  *
  * Each native kevent is converted into a struct kevent32: the 64-bit data
  * value and every 64-bit ext[] word are split into two 32-bit halves in an
  * order that depends on the byte order.  On success uap->eventlist is
  * advanced past the entries written, so that a later call continues where
  * this one stopped.
  *
  * 'arg' is the struct freebsd32_kevent_args of the system call.
  * Returns 0 or the error from copyout().
  */
 static int
 freebsd32_kevent_copyout(void *arg, struct kevent *kevp, int count)
 {
 	struct freebsd32_kevent_args *uap;
 	struct kevent32	ks32[KQ_NEVENTS];
 	uint64_t e;
 	int i, j, error;
 
 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
 	uap = (struct freebsd32_kevent_args *)arg;
 
 	/*
 	 * struct kevent32 is larger on non-amd64 platforms than the fields
 	 * assigned below account for.  Clear the staging array so that no
 	 * uninitialized kernel stack bytes are copied out to userspace.
 	 */
 	bzero(ks32, sizeof(ks32));
 
 	for (i = 0; i < count; i++) {
 		CP(kevp[i], ks32[i], ident);
 		CP(kevp[i], ks32[i], filter);
 		CP(kevp[i], ks32[i], flags);
 		CP(kevp[i], ks32[i], fflags);
 #if BYTE_ORDER == LITTLE_ENDIAN
 		ks32[i].data1 = kevp[i].data;
 		ks32[i].data2 = kevp[i].data >> 32;
 #else
 		ks32[i].data1 = kevp[i].data >> 32;
 		ks32[i].data2 = kevp[i].data;
 #endif
 		PTROUT_CP(kevp[i], ks32[i], udata);
 		for (j = 0; j < nitems(kevp->ext); j++) {
 			e = kevp[i].ext[j];
 #if BYTE_ORDER == LITTLE_ENDIAN
 			ks32[i].ext64[2 * j] = e;
 			ks32[i].ext64[2 * j + 1] = e >> 32;
 #else
 			ks32[i].ext64[2 * j] = e >> 32;
 			ks32[i].ext64[2 * j + 1] = e;
 #endif
 		}
 	}
 	error = copyout(ks32, uap->eventlist, count * sizeof *ks32);
 	if (error == 0)
 		uap->eventlist += count;
 	return (error);
 }
 
 /*
  * Copy 'count' items from the list pointed to by uap->changelist.
  */
 static int
 freebsd32_kevent_copyin(void *arg, struct kevent *kevp, int count)
 {
 	struct freebsd32_kevent_args *uap;
 	struct kevent32	ks32[KQ_NEVENTS];
 	uint64_t e;
 	int i, j, error;
 
 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
 	uap = (struct freebsd32_kevent_args *)arg;
 
 	error = copyin(uap->changelist, ks32, count * sizeof *ks32);
 	if (error)
 		goto done;
 	uap->changelist += count;
 
 	for (i = 0; i < count; i++) {
 		CP(ks32[i], kevp[i], ident);
 		CP(ks32[i], kevp[i], filter);
 		CP(ks32[i], kevp[i], flags);
 		CP(ks32[i], kevp[i], fflags);
 		kevp[i].data = PAIR32TO64(uint64_t, ks32[i].data);
 		PTRIN_CP(ks32[i], kevp[i], udata);
 		for (j = 0; j < nitems(kevp->ext); j++) {
 #if BYTE_ORDER == LITTLE_ENDIAN
 			e = ks32[i].ext64[2 * j + 1];
 			e <<= 32;
 			e += ks32[i].ext64[2 * j];
 #else
 			e = ks32[i].ext64[2 * j];
 			e <<= 32;
 			e += ks32[i].ext64[2 * j + 1];
 #endif
 			kevp[i].ext[j] = e;
 		}
 	}
 done:
 	return (error);
 }
 
 int
 freebsd32_kevent(struct thread *td, struct freebsd32_kevent_args *uap)
 {
 	struct timespec32 ts32;
 	struct timespec ts, *tsp;
 	struct kevent_copyops k_ops = {
 		.arg = uap,
 		.k_copyout = freebsd32_kevent_copyout,
 		.k_copyin = freebsd32_kevent_copyin,
 	};
 #ifdef KTRACE
 	struct kevent32 *eventlist = uap->eventlist;
 #endif
 	int error;
 
 	if (uap->timeout) {
 		error = copyin(uap->timeout, &ts32, sizeof(ts32));
 		if (error)
 			return (error);
 		CP(ts32, ts, tv_sec);
 		CP(ts32, ts, tv_nsec);
 		tsp = &ts;
 	} else
 		tsp = NULL;
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_STRUCT_ARRAY))
 		ktrstructarray("kevent32", UIO_USERSPACE, uap->changelist,
 		    uap->nchanges, sizeof(struct kevent32));
 #endif
 	error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
 	    &k_ops, tsp);
 #ifdef KTRACE
 	if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY))
 		ktrstructarray("kevent32", UIO_USERSPACE, eventlist,
 		    td->td_retval[0], sizeof(struct kevent32));
 #endif
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD11
 static int
 freebsd32_kevent11_copyout(void *arg, struct kevent *kevp, int count)
 {
 	struct freebsd11_freebsd32_kevent_args *uap;
 	struct kevent32_freebsd11 ks32[KQ_NEVENTS];
 	int i, error;
 
 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
 	uap = (struct freebsd11_freebsd32_kevent_args *)arg;
 
 	for (i = 0; i < count; i++) {
 		CP(kevp[i], ks32[i], ident);
 		CP(kevp[i], ks32[i], filter);
 		CP(kevp[i], ks32[i], flags);
 		CP(kevp[i], ks32[i], fflags);
 		CP(kevp[i], ks32[i], data);
 		PTROUT_CP(kevp[i], ks32[i], udata);
 	}
 	error = copyout(ks32, uap->eventlist, count * sizeof *ks32);
 	if (error == 0)
 		uap->eventlist += count;
 	return (error);
 }
 
 /*
  * Copy 'count' items from the list pointed to by uap->changelist.
  */
 static int
 freebsd32_kevent11_copyin(void *arg, struct kevent *kevp, int count)
 {
 	struct freebsd11_freebsd32_kevent_args *uap;
 	struct kevent32_freebsd11 ks32[KQ_NEVENTS];
 	int i, j, error;
 
 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
 	uap = (struct freebsd11_freebsd32_kevent_args *)arg;
 
 	error = copyin(uap->changelist, ks32, count * sizeof *ks32);
 	if (error)
 		goto done;
 	uap->changelist += count;
 
 	for (i = 0; i < count; i++) {
 		CP(ks32[i], kevp[i], ident);
 		CP(ks32[i], kevp[i], filter);
 		CP(ks32[i], kevp[i], flags);
 		CP(ks32[i], kevp[i], fflags);
 		CP(ks32[i], kevp[i], data);
 		PTRIN_CP(ks32[i], kevp[i], udata);
 		for (j = 0; j < nitems(kevp->ext); j++)
 			kevp[i].ext[j] = 0;
 	}
 done:
 	return (error);
 }
 
 /*
  * FreeBSD 11 flavour of the 32-bit kevent(2) syscall.  Converts the optional
  * 32-bit timeout, then lets kern_kevent() drive the copy-in/copy-out
  * callbacks above.  With KTRACE the change list and the returned events are
  * traced as "kevent32_freebsd11" arrays.
  */
 int
 freebsd11_freebsd32_kevent(struct thread *td,
     struct freebsd11_freebsd32_kevent_args *uap)
 {
 	struct kevent_copyops k_ops = {
 		.arg = uap,
 		.k_copyout = freebsd32_kevent11_copyout,
 		.k_copyin = freebsd32_kevent11_copyin,
 	};
 	struct timespec ts, *tsp;
 	struct timespec32 ts32;
 #ifdef KTRACE
 	/* Saved now because the copy-out callback advances uap->eventlist. */
 	struct kevent32_freebsd11 *eventlist = uap->eventlist;
 #endif
 	int error;
 
 	tsp = NULL;
 	if (uap->timeout != NULL) {
 		error = copyin(uap->timeout, &ts32, sizeof(ts32));
 		if (error != 0)
 			return (error);
 		CP(ts32, ts, tv_sec);
 		CP(ts32, ts, tv_nsec);
 		tsp = &ts;
 	}
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_STRUCT_ARRAY))
 		ktrstructarray("kevent32_freebsd11", UIO_USERSPACE,
 		    uap->changelist, uap->nchanges,
 		    sizeof(struct kevent32_freebsd11));
 #endif
 	error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
 	    &k_ops, tsp);
 #ifdef KTRACE
 	if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY))
 		ktrstructarray("kevent32_freebsd11", UIO_USERSPACE,
 		    eventlist, td->td_retval[0],
 		    sizeof(struct kevent32_freebsd11));
 #endif
 	return (error);
 }
 #endif
 
 int
 freebsd32_gettimeofday(struct thread *td,
 		       struct freebsd32_gettimeofday_args *uap)
 {
 	struct timeval atv;
 	struct timeval32 atv32;
 	struct timezone rtz;
 	int error = 0;
 
 	if (uap->tp) {
 		microtime(&atv);
 		CP(atv, atv32, tv_sec);
 		CP(atv, atv32, tv_usec);
 		error = copyout(&atv32, uap->tp, sizeof (atv32));
 	}
 	if (error == 0 && uap->tzp != NULL) {
 		rtz.tz_minuteswest = 0;
 		rtz.tz_dsttime = 0;
 		error = copyout(&rtz, uap->tzp, sizeof (rtz));
 	}
 	return (error);
 }
 
 int
 freebsd32_getrusage(struct thread *td, struct freebsd32_getrusage_args *uap)
 {
 	struct rusage32 s32;
 	struct rusage s;
 	int error;
 
 	error = kern_getrusage(td, uap->who, &s);
 	if (error == 0) {
 		freebsd32_rusage_out(&s, &s32);
 		error = copyout(&s32, uap->rusage, sizeof(s32));
 	}
 	return (error);
 }
 
 /*
  * Fill a 32-bit ptrace_lwpinfo from the native one.  The destination is
  * zeroed first, so anything not explicitly assigned below reads as zero.
  */
 static void
 ptrace_lwpinfo_to32(const struct ptrace_lwpinfo *pl,
     struct ptrace_lwpinfo32 *pl32)
 {
 
 	memset(pl32, 0, sizeof(*pl32));
 
 	/* Thread identity and state. */
 	pl32->pl_lwpid = pl->pl_lwpid;
 	pl32->pl_event = pl->pl_event;
 	pl32->pl_flags = pl->pl_flags;
 	pl32->pl_child_pid = pl->pl_child_pid;
 	strcpy(pl32->pl_tdname, pl->pl_tdname);
 
 	/* Signal state. */
 	pl32->pl_sigmask = pl->pl_sigmask;
 	pl32->pl_siglist = pl->pl_siglist;
 	siginfo_to_siginfo32(&pl->pl_siginfo, &pl32->pl_siginfo);
 
 	/* Syscall information. */
 	pl32->pl_syscall_code = pl->pl_syscall_code;
 	pl32->pl_syscall_narg = pl->pl_syscall_narg;
 }
 
 /*
  * Fill a 32-bit ptrace_sc_ret from the native one: both return values and
  * the error code.  The destination is zeroed first.
  */
 static void
 ptrace_sc_ret_to32(const struct ptrace_sc_ret *psr,
     struct ptrace_sc_ret32 *psr32)
 {
 
 	memset(psr32, 0, sizeof(*psr32));
 	psr32->sr_error = psr->sr_error;
 	psr32->sr_retval[0] = psr->sr_retval[0];
 	psr32->sr_retval[1] = psr->sr_retval[1];
 }
 
 /*
  * 32-bit compat entry point for ptrace(2).
  *
  * Requests whose argument is staged locally use two unions: 'r' holds the
  * native form handed to kern_ptrace() and 'r32' holds the 32-bit form
  * exchanged with userspace.  Every other request passes uap->addr straight
  * through to kern_ptrace().
  *
  * Returns 0 or an errno value from copyin(), copyout() or kern_ptrace();
  * EINVAL is returned here for a bad 'data' size on PT_LWPINFO and
  * PT_SET_EVENT_MASK.
  */
 int
 freebsd32_ptrace(struct thread *td, struct freebsd32_ptrace_args *uap)
 {
 	union {
 		struct ptrace_io_desc piod;
 		struct ptrace_lwpinfo pl;
 		struct ptrace_vm_entry pve;
 		struct dbreg32 dbreg;
 		struct fpreg32 fpreg;
 		struct reg32 reg;
 		register_t args[nitems(td->td_sa.args)];
 		struct ptrace_sc_ret psr;
 		int ptevents;
 	} r;
 	union {
 		struct ptrace_io_desc32 piod;
 		struct ptrace_lwpinfo32 pl;
 		struct ptrace_vm_entry32 pve;
 		uint32_t args[nitems(td->td_sa.args)];
 		struct ptrace_sc_ret32 psr;
 	} r32;
 	void *addr;
 	int data, error = 0, i;
 
 	AUDIT_ARG_PID(uap->pid);
 	AUDIT_ARG_CMD(uap->req);
 	AUDIT_ARG_VALUE(uap->data);
 	/* Default: kern_ptrace() works on the local native union. */
 	addr = &r;
 	data = uap->data;
 	/* Stage 1: prepare the native argument for the request. */
 	switch (uap->req) {
 	case PT_GET_EVENT_MASK:
 	case PT_GET_SC_ARGS:
 	case PT_GET_SC_RET:
 		/* Output-only requests: nothing to copy in. */
 		break;
 	case PT_LWPINFO:
 		if (uap->data > sizeof(r32.pl))
 			return (EINVAL);
 
 		/*
 		 * Pass size of native structure in 'data'.  Truncate
 		 * if necessary to avoid siginfo.
 		 */
 		data = sizeof(r.pl);
 		if (uap->data < offsetof(struct ptrace_lwpinfo32, pl_siginfo) +
 		    sizeof(struct siginfo32))
 			data = offsetof(struct ptrace_lwpinfo, pl_siginfo);
 		break;
 	case PT_GETREGS:
 		/* Zero the buffer that is later copied out in full. */
 		bzero(&r.reg, sizeof(r.reg));
 		break;
 	case PT_GETFPREGS:
 		bzero(&r.fpreg, sizeof(r.fpreg));
 		break;
 	case PT_GETDBREGS:
 		bzero(&r.dbreg, sizeof(r.dbreg));
 		break;
 	case PT_SETREGS:
 		error = copyin(uap->addr, &r.reg, sizeof(r.reg));
 		break;
 	case PT_SETFPREGS:
 		error = copyin(uap->addr, &r.fpreg, sizeof(r.fpreg));
 		break;
 	case PT_SETDBREGS:
 		error = copyin(uap->addr, &r.dbreg, sizeof(r.dbreg));
 		break;
 	case PT_SET_EVENT_MASK:
 		if (uap->data != sizeof(r.ptevents))
 			error = EINVAL;
 		else
 			error = copyin(uap->addr, &r.ptevents, uap->data);
 		break;
 	case PT_IO:
 		/* Widen the 32-bit I/O descriptor into the native one. */
 		error = copyin(uap->addr, &r32.piod, sizeof(r32.piod));
 		if (error)
 			break;
 		CP(r32.piod, r.piod, piod_op);
 		PTRIN_CP(r32.piod, r.piod, piod_offs);
 		PTRIN_CP(r32.piod, r.piod, piod_addr);
 		CP(r32.piod, r.piod, piod_len);
 		break;
 	case PT_VM_ENTRY:
 		/* Widen the 32-bit VM entry request into the native one. */
 		error = copyin(uap->addr, &r32.pve, sizeof(r32.pve));
 		if (error)
 			break;
 
 		CP(r32.pve, r.pve, pve_entry);
 		CP(r32.pve, r.pve, pve_timestamp);
 		CP(r32.pve, r.pve, pve_start);
 		CP(r32.pve, r.pve, pve_end);
 		CP(r32.pve, r.pve, pve_offset);
 		CP(r32.pve, r.pve, pve_prot);
 		CP(r32.pve, r.pve, pve_pathlen);
 		CP(r32.pve, r.pve, pve_fileid);
 		CP(r32.pve, r.pve, pve_fsid);
 		PTRIN_CP(r32.pve, r.pve, pve_path);
 		break;
 	default:
 		/* No conversion needed: hand the user address through. */
 		addr = uap->addr;
 		break;
 	}
 	if (error)
 		return (error);
 
 	error = kern_ptrace(td, uap->req, uap->pid, addr, data);
 	if (error)
 		return (error);
 
 	/* Stage 2: convert results back and copy them out. */
 	switch (uap->req) {
 	case PT_VM_ENTRY:
 		/*
 		 * r32.pve still holds the copied-in request, so pve_path is
 		 * written back as the caller supplied it.
 		 */
 		CP(r.pve, r32.pve, pve_entry);
 		CP(r.pve, r32.pve, pve_timestamp);
 		CP(r.pve, r32.pve, pve_start);
 		CP(r.pve, r32.pve, pve_end);
 		CP(r.pve, r32.pve, pve_offset);
 		CP(r.pve, r32.pve, pve_prot);
 		CP(r.pve, r32.pve, pve_pathlen);
 		CP(r.pve, r32.pve, pve_fileid);
 		CP(r.pve, r32.pve, pve_fsid);
 		error = copyout(&r32.pve, uap->addr, sizeof(r32.pve));
 		break;
 	case PT_IO:
 		/* Only piod_len is updated; other fields keep input values. */
 		CP(r.piod, r32.piod, piod_len);
 		error = copyout(&r32.piod, uap->addr, sizeof(r32.piod));
 		break;
 	case PT_GETREGS:
 		error = copyout(&r.reg, uap->addr, sizeof(r.reg));
 		break;
 	case PT_GETFPREGS:
 		error = copyout(&r.fpreg, uap->addr, sizeof(r.fpreg));
 		break;
 	case PT_GETDBREGS:
 		error = copyout(&r.dbreg, uap->addr, sizeof(r.dbreg));
 		break;
 	case PT_GET_EVENT_MASK:
 		/* NB: The size in uap->data is validated in kern_ptrace(). */
 		error = copyout(&r.ptevents, uap->addr, uap->data);
 		break;
 	case PT_LWPINFO:
 		/* uap->data was checked against sizeof(r32.pl) above. */
 		ptrace_lwpinfo_to32(&r.pl, &r32.pl);
 		error = copyout(&r32.pl, uap->addr, uap->data);
 		break;
 	case PT_GET_SC_ARGS:
 		/* Narrow each native argument word to 32 bits. */
 		for (i = 0; i < nitems(r.args); i++)
 			r32.args[i] = (uint32_t)r.args[i];
 		error = copyout(r32.args, uap->addr, MIN(uap->data,
 		    sizeof(r32.args)));
 		break;
 	case PT_GET_SC_RET:
 		ptrace_sc_ret_to32(&r.psr, &r32.psr);
 		error = copyout(&r32.psr, uap->addr, MIN(uap->data,
 		    sizeof(r32.psr)));
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Build a native struct uio from a user array of 32-bit iovecs.
  *
  * The uio and its iovec array share one M_IOV allocation, which the caller
  * releases with free(*uiop, M_IOV).  Returns EINVAL when iovcnt exceeds
  * UIO_MAXIOV or the summed lengths would exceed INT_MAX, or the copyin()
  * error.  *uiop is NULL on any failure.
  */
 int
 freebsd32_copyinuio(struct iovec32 *iovp, u_int iovcnt, struct uio **uiop)
 {
 	struct iovec32 src;
 	struct iovec *vec;
 	struct uio *u;
 	u_int veclen;
 	int error, idx;
 
 	*uiop = NULL;
 	if (iovcnt > UIO_MAXIOV)
 		return (EINVAL);
 	veclen = iovcnt * sizeof(struct iovec);
 	u = malloc(veclen + sizeof(*u), M_IOV, M_WAITOK);
 	/* The iovec array lives directly behind the uio header. */
 	vec = (struct iovec *)(u + 1);
 
 	/* Pass 1: fetch and widen every 32-bit iovec. */
 	for (idx = 0; idx < iovcnt; idx++) {
 		error = copyin(&iovp[idx], &src, sizeof(struct iovec32));
 		if (error != 0)
 			goto fail;
 		vec[idx].iov_base = PTRIN(src.iov_base);
 		vec[idx].iov_len = src.iov_len;
 	}
 
 	u->uio_iov = vec;
 	u->uio_iovcnt = iovcnt;
 	u->uio_segflg = UIO_USERSPACE;
 	u->uio_offset = -1;
 	u->uio_resid = 0;
 
 	/* Pass 2: total the lengths, refusing anything beyond INT_MAX. */
 	for (idx = 0; idx < iovcnt; idx++) {
 		if (vec[idx].iov_len > INT_MAX - u->uio_resid) {
 			error = EINVAL;
 			goto fail;
 		}
 		u->uio_resid += vec[idx].iov_len;
 	}
 	*uiop = u;
 	return (0);
 
 fail:
 	free(u, M_IOV);
 	return (error);
 }
 
 int
 freebsd32_readv(struct thread *td, struct freebsd32_readv_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_readv(td, uap->fd, auio);
 	free(auio, M_IOV);
 	return (error);
 }
 
 int
 freebsd32_writev(struct thread *td, struct freebsd32_writev_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_writev(td, uap->fd, auio);
 	free(auio, M_IOV);
 	return (error);
 }
 
 int
 freebsd32_preadv(struct thread *td, struct freebsd32_preadv_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_preadv(td, uap->fd, auio, PAIR32TO64(off_t,uap->offset));
 	free(auio, M_IOV);
 	return (error);
 }
 
 int
 freebsd32_pwritev(struct thread *td, struct freebsd32_pwritev_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_pwritev(td, uap->fd, auio, PAIR32TO64(off_t,uap->offset));
 	free(auio, M_IOV);
 	return (error);
 }
 
 /*
  * Copy in an array of 32-bit iovecs and widen it into a freshly allocated
  * native array, returned through *iovp (caller frees with M_IOV).
  *
  * 'error' is the errno the caller wants back when iovcnt exceeds
  * UIO_MAXIOV.  A copyin() failure returns that failure instead.  *iovp is
  * NULL whenever a non-zero value is returned for those two cases.
  */
 int
 freebsd32_copyiniov(struct iovec32 *iovp32, u_int iovcnt, struct iovec **iovp,
     int error)
 {
 	struct iovec32 src;
 	struct iovec *vec;
 	u_int veclen;
 	int idx;
 
 	*iovp = NULL;
 	if (iovcnt > UIO_MAXIOV)
 		return (error);
 	veclen = iovcnt * sizeof(struct iovec);
 	vec = malloc(veclen, M_IOV, M_WAITOK);
 	idx = 0;
 	while (idx < iovcnt) {
 		error = copyin(&iovp32[idx], &src, sizeof(struct iovec32));
 		if (error != 0) {
 			free(vec, M_IOV);
 			return (error);
 		}
 		vec[idx].iov_base = PTRIN(src.iov_base);
 		vec[idx].iov_len = src.iov_len;
 		idx++;
 	}
 	*iovp = vec;
 	return (0);
 }
 
 static int
 freebsd32_copyinmsghdr(struct msghdr32 *msg32, struct msghdr *msg)
 {
 	struct msghdr32 m32;
 	int error;
 
 	error = copyin(msg32, &m32, sizeof(m32));
 	if (error)
 		return (error);
 	msg->msg_name = PTRIN(m32.msg_name);
 	msg->msg_namelen = m32.msg_namelen;
 	msg->msg_iov = PTRIN(m32.msg_iov);
 	msg->msg_iovlen = m32.msg_iovlen;
 	msg->msg_control = PTRIN(m32.msg_control);
 	msg->msg_controllen = m32.msg_controllen;
 	msg->msg_flags = m32.msg_flags;
 	return (0);
 }
 
 static int
 freebsd32_copyoutmsghdr(struct msghdr *msg, struct msghdr32 *msg32)
 {
 	struct msghdr32 m32;
 	int error;
 
 	m32.msg_name = PTROUT(msg->msg_name);
 	m32.msg_namelen = msg->msg_namelen;
 	m32.msg_iov = PTROUT(msg->msg_iov);
 	m32.msg_iovlen = msg->msg_iovlen;
 	m32.msg_control = PTROUT(msg->msg_control);
 	m32.msg_controllen = msg->msg_controllen;
 	m32.msg_flags = msg->msg_flags;
 	error = copyout(&m32, msg32, sizeof(m32));
 	return (error);
 }
 
 /*
  * Control-message layout helpers for the 32-bit ABI.
  *
  * FREEBSD32_ALIGNBYTES is the alignment mask: sizeof(int) - 1, except on
  * mips where it is sizeof(long) - 1.
  */
 #ifndef __mips__
 #define FREEBSD32_ALIGNBYTES	(sizeof(int) - 1)
 #else
 #define FREEBSD32_ALIGNBYTES	(sizeof(long) - 1)
 #endif
 /* Round 'p' up to the next multiple of FREEBSD32_ALIGNBYTES + 1. */
 #define FREEBSD32_ALIGN(p)	\
 	(((u_long)(p) + FREEBSD32_ALIGNBYTES) & ~FREEBSD32_ALIGNBYTES)
 /* Space taken by a control message with 'l' payload bytes, padding included. */
 #define	FREEBSD32_CMSG_SPACE(l)	\
 	(FREEBSD32_ALIGN(sizeof(struct cmsghdr)) + FREEBSD32_ALIGN(l))
 
 /* Address of the payload that follows the aligned header of 'cmsg'. */
 #define	FREEBSD32_CMSG_DATA(cmsg)	((unsigned char *)(cmsg) + \
 				 FREEBSD32_ALIGN(sizeof(struct cmsghdr)))
 
 /*
  * Convert the payload of a control message in place to its 32-bit form.
  *
  * Only the SOL_SOCKET timestamp types (SCM_TIMESTAMP, SCM_BINTIME,
  * SCM_REALTIME, SCM_MONOTONIC) are rewritten.  For every other message the
  * payload is left untouched.  Returns the payload length the 32-bit caller
  * should see: the converted size, or 'datalen' when nothing was converted.
  */
 static size_t
 freebsd32_cmsg_convert(const struct cmsghdr *cm, void *data, socklen_t datalen)
 {
 	union {
 		struct timespec ts;
 		struct timeval tv;
 		struct bintime bt;
 	} *native;
 	union {
 		struct timespec32 ts;
 		struct timeval32 tv;
 		struct bintime32 bt;
 	} compat;
 	size_t outlen;
 
 	native = data;
 	outlen = 0;
 	if (cm->cmsg_level == SOL_SOCKET) {
 		switch (cm->cmsg_type) {
 		case SCM_TIMESTAMP:
 			TV_CP(*native, compat, tv);
 			outlen = sizeof(compat.tv);
 			break;
 		case SCM_BINTIME:
 			BT_CP(*native, compat, bt);
 			outlen = sizeof(compat.bt);
 			break;
 		case SCM_REALTIME:
 		case SCM_MONOTONIC:
 			TS_CP(*native, compat, ts);
 			outlen = sizeof(compat.ts);
 			break;
 		default:
 			break;
 		}
 	}
 
 	/* Nothing converted: the payload keeps its native length. */
 	if (outlen == 0)
 		return (datalen);
 
 	KASSERT((datalen >= outlen), ("corrupted cmsghdr"));
 
 	bcopy(&compat, data, outlen);
 	return (outlen);
 }
 
 /*
  * Copy the control messages held in the 'control' mbuf chain out to the
  * 32-bit user buffer described by msg->msg_control / msg->msg_controllen.
  *
  * Each payload is passed through freebsd32_cmsg_convert() and each header
  * is written with a cmsg_len computed for the 32-bit alignment.  On return
  * msg->msg_controllen holds the sum of FREEBSD32_CMSG_SPACE() over the
  * messages fully copied out.  MSG_CTRUNC is set in msg->msg_flags when the
  * user buffer runs out.
  *
  * Returns 0, EINVAL for a message that does not fit in its mbuf, or a
  * copyout() error.
  */
 static int
 freebsd32_copy_msg_out(struct msghdr *msg, struct mbuf *control)
 {
 	struct cmsghdr *cm;
 	void *data;
 	socklen_t clen, datalen, datalen_out, oldclen;
 	int error;
 	caddr_t ctlbuf;
 	int len, maxlen, copylen;
 	struct mbuf *m;
 	error = 0;
 
 	/* 'len' tracks the space left in the user's control buffer. */
 	len    = msg->msg_controllen;
 	maxlen = msg->msg_controllen;
 	msg->msg_controllen = 0;
 
 	ctlbuf = msg->msg_control;
 	for (m = control; m != NULL && len > 0; m = m->m_next) {
 		cm = mtod(m, struct cmsghdr *);
 		clen = m->m_len;
 		while (cm != NULL) {
 			/* The message must lie entirely inside this mbuf. */
 			if (sizeof(struct cmsghdr) > clen ||
 			    cm->cmsg_len > clen) {
 				error = EINVAL;
 				break;
 			}
 
 			data   = CMSG_DATA(cm);
 			datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
 			datalen_out = freebsd32_cmsg_convert(cm, data, datalen);
 
 			/*
 			 * Copy out the message header.  Preserve the native
 			 * message size in case we need to inspect the message
 			 * contents later.
 			 */
 			copylen = sizeof(struct cmsghdr);
 			if (len < copylen) {
 				msg->msg_flags |= MSG_CTRUNC;
 				m_dispose_extcontrolm(m);
 				goto exit;
 			}
 			oldclen = cm->cmsg_len;
 			cm->cmsg_len = FREEBSD32_ALIGN(sizeof(struct cmsghdr)) +
 			    datalen_out;
 			error = copyout(cm, ctlbuf, copylen);
 			cm->cmsg_len = oldclen;
 			if (error != 0)
 				goto exit;
 
 			ctlbuf += FREEBSD32_ALIGN(copylen);
 			len    -= FREEBSD32_ALIGN(copylen);
 
 			copylen = datalen_out;
 			if (len < copylen) {
 				msg->msg_flags |= MSG_CTRUNC;
 				m_dispose_extcontrolm(m);
 				break;
 			}
 
 			/* Copy out the message data. */
 			error = copyout(data, ctlbuf, copylen);
 			if (error)
 				goto exit;
 
 			ctlbuf += FREEBSD32_ALIGN(copylen);
 			len    -= FREEBSD32_ALIGN(copylen);
 
 			/*
 			 * Step to the next message using the native
 			 * CMSG_SPACE(), since the mbuf uses native layout.
 			 */
 			if (CMSG_SPACE(datalen) < clen) {
 				clen -= CMSG_SPACE(datalen);
 				cm = (struct cmsghdr *)
 				    ((caddr_t)cm + CMSG_SPACE(datalen));
 			} else {
 				clen = 0;
 				cm = NULL;
 			}
 
 			msg->msg_controllen +=
 			    FREEBSD32_CMSG_SPACE(datalen_out);
 		}
 	}
 	/* User buffer exactly exhausted while mbufs remain: truncated. */
 	if (len == 0 && m != NULL) {
 		msg->msg_flags |= MSG_CTRUNC;
 		m_dispose_extcontrolm(m);
 	}
 
 exit:
 	return (error);
 }
 
 int
 freebsd32_recvmsg(td, uap)
 	struct thread *td;
 	struct freebsd32_recvmsg_args /* {
 		int	s;
 		struct	msghdr32 *msg;
 		int	flags;
 	} */ *uap;
 {
 	struct msghdr msg;
 	struct msghdr32 m32;
 	struct iovec *uiov, *iov;
 	struct mbuf *control = NULL;
 	struct mbuf **controlp;
 
 	int error;
 	error = copyin(uap->msg, &m32, sizeof(m32));
 	if (error)
 		return (error);
 	error = freebsd32_copyinmsghdr(uap->msg, &msg);
 	if (error)
 		return (error);
 	error = freebsd32_copyiniov(PTRIN(m32.msg_iov), m32.msg_iovlen, &iov,
 	    EMSGSIZE);
 	if (error)
 		return (error);
 	msg.msg_flags = uap->flags;
 	uiov = msg.msg_iov;
 	msg.msg_iov = iov;
 
 	controlp = (msg.msg_control != NULL) ?  &control : NULL;
 	error = kern_recvit(td, uap->s, &msg, UIO_USERSPACE, controlp);
 	if (error == 0) {
 		msg.msg_iov = uiov;
 
 		if (control != NULL)
 			error = freebsd32_copy_msg_out(&msg, control);
 		else
 			msg.msg_controllen = 0;
 
 		if (error == 0)
 			error = freebsd32_copyoutmsghdr(&msg, uap->msg);
 	}
 	free(iov, M_IOV);
 
 	if (control != NULL) {
 		if (error != 0)
 			m_dispose_extcontrolm(control);
 		m_freem(control);
 	}
 
 	return (error);
 }
 
 /*
  * Copy-in the array of control messages constructed using alignment
  * and padding suitable for a 32-bit environment and construct an
  * mbuf using alignment and padding suitable for a 64-bit kernel.
  * The alignment and padding are defined indirectly by CMSG_DATA(),
  * CMSG_SPACE() and CMSG_LEN().
  *
  * On success returns 0 and stores the new MT_CONTROL mbuf in *mp.
  * Returns EINVAL when 'buflen' or the converted length exceeds MCLBYTES,
  * or when a message header is malformed; otherwise the copyin() error.
  * The temporary kernel copy of the user buffer is freed on every path
  * that allocated it.
  */
 static int
 freebsd32_copyin_control(struct mbuf **mp, caddr_t buf, u_int buflen)
 {
 	struct cmsghdr *cm;
 	struct mbuf *m;
 	void *in, *in1, *md;
 	u_int msglen, outlen;
 	int error;
 
 	if (buflen > MCLBYTES)
 		return (EINVAL);
 
 	/* Work on a kernel snapshot of the user's control buffer. */
 	in = malloc(buflen, M_TEMP, M_WAITOK);
 	error = copyin(buf, in, buflen);
 	if (error != 0)
 		goto out;
 
 	/*
 	 * Make a pass over the input buffer to determine the amount of space
 	 * required for 64 bit-aligned copies of the control messages.
 	 */
 	in1 = in;
 	outlen = 0;
 	while (buflen > 0) {
 		if (buflen < sizeof(*cm)) {
 			error = EINVAL;
 			break;
 		}
 		cm = (struct cmsghdr *)in1;
 		if (cm->cmsg_len < FREEBSD32_ALIGN(sizeof(*cm))) {
 			error = EINVAL;
 			break;
 		}
 		msglen = FREEBSD32_ALIGN(cm->cmsg_len);
 		/* Second test catches wrap-around in the alignment above. */
 		if (msglen > buflen || msglen < cm->cmsg_len) {
 			error = EINVAL;
 			break;
 		}
 		buflen -= msglen;
 
 		in1 = (char *)in1 + msglen;
 		outlen += CMSG_ALIGN(sizeof(*cm)) +
 		    CMSG_ALIGN(msglen - FREEBSD32_ALIGN(sizeof(*cm)));
 	}
 	if (error == 0 && outlen > MCLBYTES) {
 		/*
 		 * XXXMJ This implies that the upper limit on 32-bit aligned
 		 * control messages is less than MCLBYTES, and so we are not
 		 * perfectly compatible.  However, there is no platform
 		 * guarantee that mbuf clusters larger than MCLBYTES can be
 		 * allocated.
 		 */
 		error = EINVAL;
 	}
 	if (error != 0)
 		goto out;
 
 	m = m_get2(outlen, M_WAITOK, MT_CONTROL, 0);
 	m->m_len = outlen;
 	md = mtod(m, void *);
 
 	/*
 	 * Make a second pass over input messages, copying them into the output
 	 * buffer.
 	 */
 	in1 = in;
 	while (outlen > 0) {
 		/* Copy the message header and align the length field. */
 		cm = md;
 		memcpy(cm, in1, sizeof(*cm));
 		msglen = cm->cmsg_len - FREEBSD32_ALIGN(sizeof(*cm));
 		cm->cmsg_len = CMSG_ALIGN(sizeof(*cm)) + msglen;
 
 		/* Copy the message body. */
 		in1 = (char *)in1 + FREEBSD32_ALIGN(sizeof(*cm));
 		md = (char *)md + CMSG_ALIGN(sizeof(*cm));
 		memcpy(md, in1, msglen);
 		in1 = (char *)in1 + FREEBSD32_ALIGN(msglen);
 		md = (char *)md + CMSG_ALIGN(msglen);
 		KASSERT(outlen >= CMSG_ALIGN(sizeof(*cm)) + CMSG_ALIGN(msglen),
 		    ("outlen %u underflow, msglen %u", outlen, msglen));
 		outlen -= CMSG_ALIGN(sizeof(*cm)) + CMSG_ALIGN(msglen);
 	}
 
 	*mp = m;
 out:
 	free(in, M_TEMP);
 	return (error);
 }
 
 int
 freebsd32_sendmsg(struct thread *td,
 		  struct freebsd32_sendmsg_args *uap)
 {
 	struct msghdr msg;
 	struct msghdr32 m32;
 	struct iovec *iov;
 	struct mbuf *control = NULL;
 	struct sockaddr *to = NULL;
 	int error;
 
 	error = copyin(uap->msg, &m32, sizeof(m32));
 	if (error)
 		return (error);
 	error = freebsd32_copyinmsghdr(uap->msg, &msg);
 	if (error)
 		return (error);
 	error = freebsd32_copyiniov(PTRIN(m32.msg_iov), m32.msg_iovlen, &iov,
 	    EMSGSIZE);
 	if (error)
 		return (error);
 	msg.msg_iov = iov;
 	if (msg.msg_name != NULL) {
 		error = getsockaddr(&to, msg.msg_name, msg.msg_namelen);
 		if (error) {
 			to = NULL;
 			goto out;
 		}
 		msg.msg_name = to;
 	}
 
 	if (msg.msg_control) {
 		if (msg.msg_controllen < sizeof(struct cmsghdr)) {
 			error = EINVAL;
 			goto out;
 		}
 
 		error = freebsd32_copyin_control(&control, msg.msg_control,
 		    msg.msg_controllen);
 		if (error)
 			goto out;
 
 		msg.msg_control = NULL;
 		msg.msg_controllen = 0;
 	}
 
 	error = kern_sendit(td, uap->s, &msg, uap->flags, control,
 	    UIO_USERSPACE);
 
 out:
 	free(iov, M_IOV);
 	if (to)
 		free(to, M_SONAME);
 	return (error);
 }
 
 int
 freebsd32_recvfrom(struct thread *td,
 		   struct freebsd32_recvfrom_args *uap)
 {
 	struct msghdr msg;
 	struct iovec aiov;
 	int error;
 
 	if (uap->fromlenaddr) {
 		error = copyin(PTRIN(uap->fromlenaddr), &msg.msg_namelen,
 		    sizeof(msg.msg_namelen));
 		if (error)
 			return (error);
 	} else {
 		msg.msg_namelen = 0;
 	}
 
 	msg.msg_name = PTRIN(uap->from);
 	msg.msg_iov = &aiov;
 	msg.msg_iovlen = 1;
 	aiov.iov_base = PTRIN(uap->buf);
 	aiov.iov_len = uap->len;
 	msg.msg_control = NULL;
 	msg.msg_flags = uap->flags;
 	error = kern_recvit(td, uap->s, &msg, UIO_USERSPACE, NULL);
 	if (error == 0 && uap->fromlenaddr)
 		error = copyout(&msg.msg_namelen, PTRIN(uap->fromlenaddr),
 		    sizeof (msg.msg_namelen));
 	return (error);
 }
 
 int
 freebsd32_settimeofday(struct thread *td,
 		       struct freebsd32_settimeofday_args *uap)
 {
 	struct timeval32 tv32;
 	struct timeval tv, *tvp;
 	struct timezone tz, *tzp;
 	int error;
 
 	if (uap->tv) {
 		error = copyin(uap->tv, &tv32, sizeof(tv32));
 		if (error)
 			return (error);
 		CP(tv32, tv, tv_sec);
 		CP(tv32, tv, tv_usec);
 		tvp = &tv;
 	} else
 		tvp = NULL;
 	if (uap->tzp) {
 		error = copyin(uap->tzp, &tz, sizeof(tz));
 		if (error)
 			return (error);
 		tzp = &tz;
 	} else
 		tzp = NULL;
 	return (kern_settimeofday(td, tvp, tzp));
 }
 
 int
 freebsd32_utimes(struct thread *td, struct freebsd32_utimes_args *uap)
 {
 	struct timeval32 s32[2];
 	struct timeval s[2], *sp;
 	int error;
 
 	if (uap->tptr != NULL) {
 		error = copyin(uap->tptr, s32, sizeof(s32));
 		if (error)
 			return (error);
 		CP(s32[0], s[0], tv_sec);
 		CP(s32[0], s[0], tv_usec);
 		CP(s32[1], s[1], tv_sec);
 		CP(s32[1], s[1], tv_usec);
 		sp = s;
 	} else
 		sp = NULL;
 	return (kern_utimesat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    sp, UIO_SYSSPACE));
 }
 
 int
 freebsd32_lutimes(struct thread *td, struct freebsd32_lutimes_args *uap)
 {
 	struct timeval32 s32[2];
 	struct timeval s[2], *sp;
 	int error;
 
 	if (uap->tptr != NULL) {
 		error = copyin(uap->tptr, s32, sizeof(s32));
 		if (error)
 			return (error);
 		CP(s32[0], s[0], tv_sec);
 		CP(s32[0], s[0], tv_usec);
 		CP(s32[1], s[1], tv_sec);
 		CP(s32[1], s[1], tv_usec);
 		sp = s;
 	} else
 		sp = NULL;
 	return (kern_lutimes(td, uap->path, UIO_USERSPACE, sp, UIO_SYSSPACE));
 }
 
 int
 freebsd32_futimes(struct thread *td, struct freebsd32_futimes_args *uap)
 {
 	struct timeval32 s32[2];
 	struct timeval s[2], *sp;
 	int error;
 
 	if (uap->tptr != NULL) {
 		error = copyin(uap->tptr, s32, sizeof(s32));
 		if (error)
 			return (error);
 		CP(s32[0], s[0], tv_sec);
 		CP(s32[0], s[0], tv_usec);
 		CP(s32[1], s[1], tv_sec);
 		CP(s32[1], s[1], tv_usec);
 		sp = s;
 	} else
 		sp = NULL;
 	return (kern_futimes(td, uap->fd, sp, UIO_SYSSPACE));
 }
 
 int
 freebsd32_futimesat(struct thread *td, struct freebsd32_futimesat_args *uap)
 {
 	struct timeval32 s32[2];
 	struct timeval s[2], *sp;
 	int error;
 
 	if (uap->times != NULL) {
 		error = copyin(uap->times, s32, sizeof(s32));
 		if (error)
 			return (error);
 		CP(s32[0], s[0], tv_sec);
 		CP(s32[0], s[0], tv_usec);
 		CP(s32[1], s[1], tv_sec);
 		CP(s32[1], s[1], tv_usec);
 		sp = s;
 	} else
 		sp = NULL;
 	return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
 		sp, UIO_SYSSPACE));
 }
 
 int
 freebsd32_futimens(struct thread *td, struct freebsd32_futimens_args *uap)
 {
 	struct timespec32 ts32[2];
 	struct timespec ts[2], *tsp;
 	int error;
 
 	if (uap->times != NULL) {
 		error = copyin(uap->times, ts32, sizeof(ts32));
 		if (error)
 			return (error);
 		CP(ts32[0], ts[0], tv_sec);
 		CP(ts32[0], ts[0], tv_nsec);
 		CP(ts32[1], ts[1], tv_sec);
 		CP(ts32[1], ts[1], tv_nsec);
 		tsp = ts;
 	} else
 		tsp = NULL;
 	return (kern_futimens(td, uap->fd, tsp, UIO_SYSSPACE));
 }
 
 int
 freebsd32_utimensat(struct thread *td, struct freebsd32_utimensat_args *uap)
 {
 	struct timespec32 ts32[2];
 	struct timespec ts[2], *tsp;
 	int error;
 
 	if (uap->times != NULL) {
 		error = copyin(uap->times, ts32, sizeof(ts32));
 		if (error)
 			return (error);
 		CP(ts32[0], ts[0], tv_sec);
 		CP(ts32[0], ts[0], tv_nsec);
 		CP(ts32[1], ts[1], tv_sec);
 		CP(ts32[1], ts[1], tv_nsec);
 		tsp = ts;
 	} else
 		tsp = NULL;
 	return (kern_utimensat(td, uap->fd, uap->path, UIO_USERSPACE,
 	    tsp, UIO_SYSSPACE, uap->flag));
 }
 
 int
 freebsd32_adjtime(struct thread *td, struct freebsd32_adjtime_args *uap)
 {
 	struct timeval32 tv32;
 	struct timeval delta, olddelta, *deltap;
 	int error;
 
 	if (uap->delta) {
 		error = copyin(uap->delta, &tv32, sizeof(tv32));
 		if (error)
 			return (error);
 		CP(tv32, delta, tv_sec);
 		CP(tv32, delta, tv_usec);
 		deltap = &delta;
 	} else
 		deltap = NULL;
 	error = kern_adjtime(td, deltap, &olddelta);
 	if (uap->olddelta && error == 0) {
 		CP(olddelta, tv32, tv_sec);
 		CP(olddelta, tv32, tv_usec);
 		error = copyout(&tv32, uap->olddelta, sizeof(tv32));
 	}
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD4
 int
 freebsd4_freebsd32_statfs(struct thread *td, struct freebsd4_freebsd32_statfs_args *uap)
 {
 	struct statfs32 s32;
 	struct statfs *sp;
 	int error;
 
 	sp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = kern_statfs(td, uap->path, UIO_USERSPACE, sp);
 	if (error == 0) {
 		copy_statfs(sp, &s32);
 		error = copyout(&s32, uap->buf, sizeof(s32));
 	}
 	free(sp, M_STATFS);
 	return (error);
 }
 #endif
 
 #ifdef COMPAT_FREEBSD4
 int
 freebsd4_freebsd32_fstatfs(struct thread *td, struct freebsd4_freebsd32_fstatfs_args *uap)
 {
 	struct statfs32 s32;
 	struct statfs *sp;
 	int error;
 
 	sp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = kern_fstatfs(td, uap->fd, sp);
 	if (error == 0) {
 		copy_statfs(sp, &s32);
 		error = copyout(&s32, uap->buf, sizeof(s32));
 	}
 	free(sp, M_STATFS);
 	return (error);
 }
 #endif
 
 #ifdef COMPAT_FREEBSD4
 int
 freebsd4_freebsd32_fhstatfs(struct thread *td, struct freebsd4_freebsd32_fhstatfs_args *uap)
 {
 	struct statfs32 s32;
 	struct statfs *sp;
 	fhandle_t fh;
 	int error;
 
 	if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
 		return (error);
 	sp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = kern_fhstatfs(td, fh, sp);
 	if (error == 0) {
 		copy_statfs(sp, &s32);
 		error = copyout(&s32, uap->buf, sizeof(s32));
 	}
 	free(sp, M_STATFS);
 	return (error);
 }
 #endif
 
 int
 freebsd32_pread(struct thread *td, struct freebsd32_pread_args *uap)
 {
 
 	return (kern_pread(td, uap->fd, uap->buf, uap->nbyte,
 	    PAIR32TO64(off_t, uap->offset)));
 }
 
 int
 freebsd32_pwrite(struct thread *td, struct freebsd32_pwrite_args *uap)
 {
 
 	return (kern_pwrite(td, uap->fd, uap->buf, uap->nbyte,
 	    PAIR32TO64(off_t, uap->offset)));
 }
 
 #ifdef COMPAT_43
 int
 ofreebsd32_lseek(struct thread *td, struct ofreebsd32_lseek_args *uap)
 {
 
 	return (kern_lseek(td, uap->fd, uap->offset, uap->whence));
 }
 #endif
 
 /*
  * 32-bit lseek(2).  After kern_lseek() the 64-bit result is read back from
  * td_uretoff.tdu_off and split across the two 32-bit return slots.
  */
 int
 freebsd32_lseek(struct thread *td, struct freebsd32_lseek_args *uap)
 {
 	off_t newpos;
 	int error;
 
 	error = kern_lseek(td, uap->fd, PAIR32TO64(off_t, uap->offset),
 	    uap->whence);
 	/* Read the full result first; the stores below split it in two. */
 	newpos = td->td_uretoff.tdu_off;
 	td->td_retval[RETVAL_LO] = newpos & 0xffffffff;	/* %eax */
 	td->td_retval[RETVAL_HI] = newpos >> 32;	/* %edx */
 	return (error);
 }
 
 int
 freebsd32_truncate(struct thread *td, struct freebsd32_truncate_args *uap)
 {
 
 	return (kern_truncate(td, uap->path, UIO_USERSPACE,
 	    PAIR32TO64(off_t, uap->length)));
 }
 
 int
 freebsd32_ftruncate(struct thread *td, struct freebsd32_ftruncate_args *uap)
 {
 
 	return (kern_ftruncate(td, uap->fd, PAIR32TO64(off_t, uap->length)));
 }
 
 #ifdef COMPAT_43
 int
 ofreebsd32_getdirentries(struct thread *td,
     struct ofreebsd32_getdirentries_args *uap)
 {
 	struct ogetdirentries_args ap;
 	int error;
 	long loff;
 	int32_t loff_cut;
 
 	ap.fd = uap->fd;
 	ap.buf = uap->buf;
 	ap.count = uap->count;
 	ap.basep = NULL;
 	error = kern_ogetdirentries(td, &ap, &loff);
 	if (error == 0) {
 		loff_cut = loff;
 		error = copyout(&loff_cut, uap->basep, sizeof(int32_t));
 	}
 	return (error);
 }
 #endif
 
 #if defined(COMPAT_FREEBSD11)
 int
 freebsd11_freebsd32_getdirentries(struct thread *td,
     struct freebsd11_freebsd32_getdirentries_args *uap)
 {
 	long base;
 	int32_t base32;
 	int error;
 
 	error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count,
 	    &base, NULL);
 	if (error)
 		return (error);
 	if (uap->basep != NULL) {
 		base32 = base;
 		error = copyout(&base32, uap->basep, sizeof(int32_t));
 	}
 	return (error);
 }
 
 int
 freebsd11_freebsd32_getdents(struct thread *td,
     struct freebsd11_freebsd32_getdents_args *uap)
 {
 	struct freebsd11_freebsd32_getdirentries_args ap;
 
 	ap.fd = uap->fd;
 	ap.buf = uap->buf;
 	ap.count = uap->count;
 	ap.basep = NULL;
 	return (freebsd11_freebsd32_getdirentries(td, &ap));
 }
 #endif /* COMPAT_FREEBSD11 */
 
 #ifdef COMPAT_FREEBSD6
 /* versions with the 'int pad' argument */
 int
 freebsd6_freebsd32_pread(struct thread *td, struct freebsd6_freebsd32_pread_args *uap)
 {
 
 	return (kern_pread(td, uap->fd, uap->buf, uap->nbyte,
 	    PAIR32TO64(off_t, uap->offset)));
 }
 
 int
 freebsd6_freebsd32_pwrite(struct thread *td, struct freebsd6_freebsd32_pwrite_args *uap)
 {
 
 	return (kern_pwrite(td, uap->fd, uap->buf, uap->nbyte,
 	    PAIR32TO64(off_t, uap->offset)));
 }
 
 /*
  * FreeBSD 6 32-bit lseek.  After kern_lseek() the 64-bit result is split
  * across the two 32-bit return slots.
  *
  * The result is read through td_uretoff.tdu_off, the same accessor
  * freebsd32_lseek() uses, rather than by casting td_retval to an off_t
  * pointer.
  */
 int
 freebsd6_freebsd32_lseek(struct thread *td, struct freebsd6_freebsd32_lseek_args *uap)
 {
 	int error;
 	off_t pos;
 
 	error = kern_lseek(td, uap->fd, PAIR32TO64(off_t, uap->offset),
 	    uap->whence);
 	/* Expand the quad return into two parts for eax and edx */
 	pos = td->td_uretoff.tdu_off;
 	td->td_retval[RETVAL_LO] = pos & 0xffffffff;	/* %eax */
 	td->td_retval[RETVAL_HI] = pos >> 32;		/* %edx */
 	return (error);
 }
 
 int
 freebsd6_freebsd32_truncate(struct thread *td, struct freebsd6_freebsd32_truncate_args *uap)
 {
 
 	return (kern_truncate(td, uap->path, UIO_USERSPACE,
 	    PAIR32TO64(off_t, uap->length)));
 }
 
 int
 freebsd6_freebsd32_ftruncate(struct thread *td, struct freebsd6_freebsd32_ftruncate_args *uap)
 {
 
 	return (kern_ftruncate(td, uap->fd, PAIR32TO64(off_t, uap->length)));
 }
 #endif /* COMPAT_FREEBSD6 */
 
 /*
  * 32-bit layout of the sendfile header/trailer descriptor, as copied in by
  * freebsd32_do_sendfile().  The two address fields are 32-bit user
  * addresses of iovec32 arrays.
  */
 struct sf_hdtr32 {
 	uint32_t headers;	/* user address of the header iovec32 array */
 	int hdr_cnt;		/* number of header iovecs */
 	uint32_t trailers;	/* user address of the trailer iovec32 array */
 	int trl_cnt;		/* number of trailer iovecs */
 };
 
 /*
  * Common implementation of the 32-bit sendfile(2) entry points.
  *
  * A negative offset is rejected with EINVAL.  When 'hdtr' is supplied, its
  * header and trailer iovec arrays are converted into uios.  A non-zero
  * 'compat' selects the FreeBSD 4 rule that nbytes includes the header
  * size (only compiled in under COMPAT_FREEBSD4).  Any header/trailer uio
  * that was built is freed before returning.
  */
 static int
 freebsd32_do_sendfile(struct thread *td,
     struct freebsd32_sendfile_args *uap, int compat)
 {
 	struct sf_hdtr32 hdtr32;
 	struct sf_hdtr hdtr;
 	struct uio *hdr_uio, *trl_uio;
 	struct file *fp;
 	cap_rights_t rights;
 	struct iovec32 *iov32;
 	off_t offset, sbytes;
 	int error;
 
 	offset = PAIR32TO64(off_t, uap->offset);
 	if (offset < 0)
 		return (EINVAL);
 
 	hdr_uio = trl_uio = NULL;
 
 	if (uap->hdtr != NULL) {
 		error = copyin(uap->hdtr, &hdtr32, sizeof(hdtr32));
 		if (error)
 			goto out;
 		PTRIN_CP(hdtr32, hdtr, headers);
 		CP(hdtr32, hdtr, hdr_cnt);
 		PTRIN_CP(hdtr32, hdtr, trailers);
 		CP(hdtr32, hdtr, trl_cnt);
 
 		if (hdtr.headers != NULL) {
 			iov32 = PTRIN(hdtr32.headers);
 			error = freebsd32_copyinuio(iov32,
 			    hdtr32.hdr_cnt, &hdr_uio);
 			if (error)
 				goto out;
 #ifdef COMPAT_FREEBSD4
 			/*
 			 * In FreeBSD < 5.0 the nbytes to send also included
 			 * the header.  If compat is specified subtract the
 			 * header size from nbytes.
 			 */
 			if (compat) {
 				if (uap->nbytes > hdr_uio->uio_resid)
 					uap->nbytes -= hdr_uio->uio_resid;
 				else
 					uap->nbytes = 0;
 			}
 #endif
 		}
 		if (hdtr.trailers != NULL) {
 			iov32 = PTRIN(hdtr32.trailers);
 			error = freebsd32_copyinuio(iov32,
 			    hdtr32.trl_cnt, &trl_uio);
 			if (error)
 				goto out;
 		}
 	}
 
 	AUDIT_ARG_FD(uap->fd);
 
 	/* The source descriptor must allow CAP_PREAD. */
 	if ((error = fget_read(td, uap->fd,
-	    cap_rights_init(&rights, CAP_PREAD), &fp)) != 0)
+	    cap_rights_init_one(&rights, CAP_PREAD), &fp)) != 0)
 		goto out;
 
 	error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, offset,
 	    uap->nbytes, &sbytes, uap->flags, td);
 	fdrop(fp, td);
 
 	/*
 	 * sbytes is copied out whether or not fo_sendfile() succeeded.
 	 * NOTE(review): the copyout() result is discarded, so a bad
 	 * 'sbytes' pointer is not reported to the caller -- confirm this is
 	 * intended.
 	 */
 	if (uap->sbytes != NULL)
 		copyout(&sbytes, uap->sbytes, sizeof(off_t));
 
 out:
 	if (hdr_uio)
 		free(hdr_uio, M_IOV);
 	if (trl_uio)
 		free(trl_uio, M_IOV);
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD4
 /*
  * FreeBSD 4 32-bit sendfile(2): same as freebsd32_sendfile() but with the
  * compat flag set.
  */
 int
 freebsd4_freebsd32_sendfile(struct thread *td,
     struct freebsd4_freebsd32_sendfile_args *uap)
 {
 	struct freebsd32_sendfile_args *args;
 
 	args = (struct freebsd32_sendfile_args *)uap;
 	return (freebsd32_do_sendfile(td, args, 1));
 }
 #endif
 
 /*
  * 32-bit sendfile(2): current semantics (compat flag clear).
  */
 int
 freebsd32_sendfile(struct thread *td, struct freebsd32_sendfile_args *uap)
 {
 	int error;
 
 	error = freebsd32_do_sendfile(td, uap, 0);
 	return (error);
 }
 
 /*
  * Convert a native struct stat into struct stat32.  Padding, spare and
  * (where defined) the *_ext members are written as zero.
  */
 static void
 copy_stat(struct stat *in, struct stat32 *out)
 {
 
 	/* Identity and ownership. */
 	CP(*in, *out, st_dev);
 	CP(*in, *out, st_ino);
 	CP(*in, *out, st_mode);
 	CP(*in, *out, st_nlink);
 	CP(*in, *out, st_uid);
 	CP(*in, *out, st_gid);
 	CP(*in, *out, st_rdev);
 	CP(*in, *out, st_flags);
 	CP(*in, *out, st_gen);
 
 	/* Sizes. */
 	CP(*in, *out, st_size);
 	CP(*in, *out, st_blocks);
 	CP(*in, *out, st_blksize);
 
 	/* Timestamps. */
 	TS_CP(*in, *out, st_atim);
 	TS_CP(*in, *out, st_mtim);
 	TS_CP(*in, *out, st_ctim);
 	TS_CP(*in, *out, st_birthtim);
 #ifdef __STAT32_TIME_T_EXT
 	out->st_atim_ext = 0;
 	out->st_mtim_ext = 0;
 	out->st_ctim_ext = 0;
 	out->st_btim_ext = 0;
 #endif
 
 	/* Fields with no native counterpart. */
 	out->st_padding0 = 0;
 	out->st_padding1 = 0;
 	memset(out->st_spare, 0, sizeof(out->st_spare));
 }
 
 #ifdef COMPAT_43
 static void
 copy_ostat(struct stat *in, struct ostat32 *out)
 {
 
 	bzero(out, sizeof(*out));
 	CP(*in, *out, st_dev);
 	CP(*in, *out, st_ino);
 	CP(*in, *out, st_mode);
 	CP(*in, *out, st_nlink);
 	CP(*in, *out, st_uid);
 	CP(*in, *out, st_gid);
 	CP(*in, *out, st_rdev);
 	out->st_size = MIN(in->st_size, INT32_MAX);
 	TS_CP(*in, *out, st_atim);
 	TS_CP(*in, *out, st_mtim);
 	TS_CP(*in, *out, st_ctim);
 	CP(*in, *out, st_blksize);
 	CP(*in, *out, st_blocks);
 	CP(*in, *out, st_flags);
 	CP(*in, *out, st_gen);
 }
 #endif
 
 #ifdef COMPAT_43
 int
 ofreebsd32_stat(struct thread *td, struct ofreebsd32_stat_args *uap)
 {
 	struct stat sb;
 	struct ostat32 sb32;
 	int error;
 
 	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    &sb, NULL);
 	if (error)
 		return (error);
 	copy_ostat(&sb, &sb32);
 	error = copyout(&sb32, uap->ub, sizeof (sb32));
 	return (error);
 }
 #endif
 
 int
 freebsd32_fstat(struct thread *td, struct freebsd32_fstat_args *uap)
 {
 	struct stat ub;
 	struct stat32 ub32;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error)
 		return (error);
 	copy_stat(&ub, &ub32);
 	error = copyout(&ub32, uap->ub, sizeof(ub32));
 	return (error);
 }
 
 #ifdef COMPAT_43
 int
 ofreebsd32_fstat(struct thread *td, struct ofreebsd32_fstat_args *uap)
 {
 	struct stat ub;
 	struct ostat32 ub32;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error)
 		return (error);
 	copy_ostat(&ub, &ub32);
 	error = copyout(&ub32, uap->ub, sizeof(ub32));
 	return (error);
 }
 #endif
 
 int
 freebsd32_fstatat(struct thread *td, struct freebsd32_fstatat_args *uap)
 {
 	struct stat ub;
 	struct stat32 ub32;
 	int error;
 
 	error = kern_statat(td, uap->flag, uap->fd, uap->path, UIO_USERSPACE,
 	    &ub, NULL);
 	if (error)
 		return (error);
 	copy_stat(&ub, &ub32);
 	error = copyout(&ub32, uap->buf, sizeof(ub32));
 	return (error);
 }
 
 #ifdef COMPAT_43
 int
 ofreebsd32_lstat(struct thread *td, struct ofreebsd32_lstat_args *uap)
 {
 	struct stat sb;
 	struct ostat32 sb32;
 	int error;
 
 	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
 	    UIO_USERSPACE, &sb, NULL);
 	if (error)
 		return (error);
 	copy_ostat(&sb, &sb32);
 	error = copyout(&sb32, uap->ub, sizeof (sb32));
 	return (error);
 }
 #endif
 
 int
 freebsd32_fhstat(struct thread *td, struct freebsd32_fhstat_args *uap)
 {
 	struct stat sb;
 	struct stat32 sb32;
 	struct fhandle fh;
 	int error;
 
 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
         if (error != 0)
                 return (error);
 	error = kern_fhstat(td, fh, &sb);
 	if (error != 0)
 		return (error);
 	copy_stat(&sb, &sb32);
 	error = copyout(&sb32, uap->sb, sizeof (sb32));
 	return (error);
 }
 
 #if defined(COMPAT_FREEBSD11)
 extern int ino64_trunc_error;
 
 static int
 freebsd11_cvtstat32(struct stat *in, struct freebsd11_stat32 *out)
 {
 
 	CP(*in, *out, st_ino);
 	if (in->st_ino != out->st_ino) {
 		switch (ino64_trunc_error) {
 		default:
 		case 0:
 			break;
 		case 1:
 			return (EOVERFLOW);
 		case 2:
 			out->st_ino = UINT32_MAX;
 			break;
 		}
 	}
 	CP(*in, *out, st_nlink);
 	if (in->st_nlink != out->st_nlink) {
 		switch (ino64_trunc_error) {
 		default:
 		case 0:
 			break;
 		case 1:
 			return (EOVERFLOW);
 		case 2:
 			out->st_nlink = UINT16_MAX;
 			break;
 		}
 	}
 	out->st_dev = in->st_dev;
 	if (out->st_dev != in->st_dev) {
 		switch (ino64_trunc_error) {
 		default:
 			break;
 		case 1:
 			return (EOVERFLOW);
 		}
 	}
 	CP(*in, *out, st_mode);
 	CP(*in, *out, st_uid);
 	CP(*in, *out, st_gid);
 	out->st_rdev = in->st_rdev;
 	if (out->st_rdev != in->st_rdev) {
 		switch (ino64_trunc_error) {
 		default:
 			break;
 		case 1:
 			return (EOVERFLOW);
 		}
 	}
 	TS_CP(*in, *out, st_atim);
 	TS_CP(*in, *out, st_mtim);
 	TS_CP(*in, *out, st_ctim);
 	CP(*in, *out, st_size);
 	CP(*in, *out, st_blocks);
 	CP(*in, *out, st_blksize);
 	CP(*in, *out, st_flags);
 	CP(*in, *out, st_gen);
 	TS_CP(*in, *out, st_birthtim);
 	out->st_lspare = 0;
 	bzero((char *)&out->st_birthtim + sizeof(out->st_birthtim),
 	    sizeof(*out) - offsetof(struct freebsd11_stat32,
 	    st_birthtim) - sizeof(out->st_birthtim));
 	return (0);
 }
 
 int
 freebsd11_freebsd32_stat(struct thread *td,
     struct freebsd11_freebsd32_stat_args *uap)
 {
 	struct stat sb;
 	struct freebsd11_stat32 sb32;
 	int error;
 
 	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    &sb, NULL);
 	if (error != 0)
 		return (error);
 	error = freebsd11_cvtstat32(&sb, &sb32);
 	if (error == 0)
 		error = copyout(&sb32, uap->ub, sizeof (sb32));
 	return (error);
 }
 
 int
 freebsd11_freebsd32_fstat(struct thread *td,
     struct freebsd11_freebsd32_fstat_args *uap)
 {
 	struct stat sb;
 	struct freebsd11_stat32 sb32;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &sb);
 	if (error != 0)
 		return (error);
 	error = freebsd11_cvtstat32(&sb, &sb32);
 	if (error == 0)
 		error = copyout(&sb32, uap->ub, sizeof (sb32));
 	return (error);
 }
 
 int
 freebsd11_freebsd32_fstatat(struct thread *td,
     struct freebsd11_freebsd32_fstatat_args *uap)
 {
 	struct stat sb;
 	struct freebsd11_stat32 sb32;
 	int error;
 
 	error = kern_statat(td, uap->flag, uap->fd, uap->path, UIO_USERSPACE,
 	    &sb, NULL);
 	if (error != 0)
 		return (error);
 	error = freebsd11_cvtstat32(&sb, &sb32);
 	if (error == 0)
 		error = copyout(&sb32, uap->buf, sizeof (sb32));
 	return (error);
 }
 
 int
 freebsd11_freebsd32_lstat(struct thread *td,
     struct freebsd11_freebsd32_lstat_args *uap)
 {
 	struct stat sb;
 	struct freebsd11_stat32 sb32;
 	int error;
 
 	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
 	    UIO_USERSPACE, &sb, NULL);
 	if (error != 0)
 		return (error);
 	error = freebsd11_cvtstat32(&sb, &sb32);
 	if (error == 0)
 		error = copyout(&sb32, uap->ub, sizeof (sb32));
 	return (error);
 }
 
 int
 freebsd11_freebsd32_fhstat(struct thread *td,
     struct freebsd11_freebsd32_fhstat_args *uap)
 {
 	struct stat sb;
 	struct freebsd11_stat32 sb32;
 	struct fhandle fh;
 	int error;
 
 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
         if (error != 0)
                 return (error);
 	error = kern_fhstat(td, fh, &sb);
 	if (error != 0)
 		return (error);
 	error = freebsd11_cvtstat32(&sb, &sb32);
 	if (error == 0)
 		error = copyout(&sb32, uap->sb, sizeof (sb32));
 	return (error);
 }
 #endif
 
 /*
  * __sysctl for 32-bit processes.
  *
  * The MIB name (2..CTL_MAXNAME integers) is copied in, the 32-bit old
  * length is fetched from uap->oldlenp when that pointer is set, and the
  * request is handed to userland_sysctl() with SCTL_MASK32.  On success
  * the resulting length is stored back through uap->oldlenp.
  *
  * Returns EINVAL for a bad name length, EFAULT when the old length
  * cannot be read or written, otherwise the error from copyin() or
  * userland_sysctl().
  */
 int
 freebsd32___sysctl(struct thread *td, struct freebsd32___sysctl_args *uap)
 {
 	int error, name[CTL_MAXNAME];
 	size_t j, oldlen;
 	uint32_t tmp;
 
 	if (uap->namelen > CTL_MAXNAME || uap->namelen < 2)
 		return (EINVAL);
 	error = copyin(uap->name, name, uap->namelen * sizeof(int));
 	if (error)
 		return (error);
 	oldlen = 0;
 	if (uap->oldlenp) {
 		/* Do not touch tmp unless the fetch succeeded. */
 		if (fueword32(uap->oldlenp, &tmp) != 0)
 			return (EFAULT);
 		oldlen = tmp;
 	}
 	error = userland_sysctl(td, name, uap->namelen,
 	    uap->old, &oldlen, 1,
 	    uap->new, uap->newlen, &j, SCTL_MASK32);
 	if (error)
 		return (error);
 	/*
 	 * A failed store of the result length must not be reported as
 	 * success; a non-zero suword32() result is mapped to EFAULT, as
 	 * is done for other suword32() calls in this file.
 	 */
 	if (uap->oldlenp && suword32(uap->oldlenp, j) != 0)
 		return (EFAULT);
 	return (0);
 }
 
 /*
  * __sysctlbyname for 32-bit processes.
  *
  * The 32-bit old length is fetched from uap->oldlenp when that pointer
  * is set, the request is handed to kern___sysctlbyname() with
  * SCTL_MASK32, and on success the resulting length is stored back
  * through uap->oldlenp.
  *
  * Returns EFAULT when the old length cannot be read or written,
  * otherwise the error from kern___sysctlbyname().
  */
 int
 freebsd32___sysctlbyname(struct thread *td,
     struct freebsd32___sysctlbyname_args *uap)
 {
 	size_t oldlen, rv;
 	int error;
 	uint32_t tmp;
 
 	oldlen = 0;
 	if (uap->oldlenp != NULL) {
 		/* Do not touch tmp unless the fetch succeeded. */
 		if (fueword32(uap->oldlenp, &tmp) != 0)
 			return (EFAULT);
 		oldlen = tmp;
 	}
 	error = kern___sysctlbyname(td, uap->name, uap->namelen, uap->old,
 	    &oldlen, uap->new, uap->newlen, &rv, SCTL_MASK32, 1);
 	if (error != 0)
 		return (error);
 	/*
 	 * A non-zero suword32() result is only a failure indication (it is
 	 * mapped to EFAULT elsewhere in this file), so translate it rather
 	 * than returning the raw value as the system call error.
 	 */
 	if (uap->oldlenp != NULL && suword32(uap->oldlenp, rv) != 0)
 		return (EFAULT);
 	return (0);
 }
 
 /*
  * jail for 32-bit processes.
  *
  * The leading 32-bit version word of the user structure selects the
  * layout that is copied in and widened into a native struct jail:
  *	0	single IPv4 jail (struct jail32_v0);
  *	2	multi-IPv4/IPv6/no-IP jail (struct jail32);
  * every other version, including 1, is rejected with EINVAL.
  */
 int
 freebsd32_jail(struct thread *td, struct freebsd32_jail_args *uap)
 {
 	struct jail32_v0 j32_v0;
 	struct jail32 j32;
 	struct jail j;
 	uint32_t version;
 	int error;
 
 	error = copyin(uap->jail, &version, sizeof(uint32_t));
 	if (error != 0)
 		return (error);
 
 	if (version == 0) {
 		/* FreeBSD single IPv4 jails. */
 		bzero(&j, sizeof(struct jail));
 		error = copyin(uap->jail, &j32_v0, sizeof(struct jail32_v0));
 		if (error != 0)
 			return (error);
 		CP(j32_v0, j, version);
 		PTRIN_CP(j32_v0, j, path);
 		PTRIN_CP(j32_v0, j, hostname);
 		j.ip4s = htonl(j32_v0.ip_number);	/* jail_v0 is host order */
 	} else if (version == 2) {
 		/* JAIL_API_VERSION: FreeBSD multi-IPv4/IPv6,noIP jails. */
 		error = copyin(uap->jail, &j32, sizeof(struct jail32));
 		if (error != 0)
 			return (error);
 		CP(j32, j, version);
 		PTRIN_CP(j32, j, path);
 		PTRIN_CP(j32, j, hostname);
 		PTRIN_CP(j32, j, jailname);
 		CP(j32, j, ip4s);
 		CP(j32, j, ip6s);
 		PTRIN_CP(j32, j, ip4);
 		PTRIN_CP(j32, j, ip6);
 	} else {
 		/*
 		 * Version 1 was used by multi-IPv4 jail implementations
 		 * that never made it into the official kernel; anything
 		 * newer than 2 is not supported either.
 		 */
 		return (EINVAL);
 	}
 	return (kern_jail(td, &j));
 }
 
 int
 freebsd32_jail_set(struct thread *td, struct freebsd32_jail_set_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	/* Check that we have an even number of iovecs. */
 	if (uap->iovcnt & 1)
 		return (EINVAL);
 
 	error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_jail_set(td, auio, uap->flags);
 	free(auio, M_IOV);
 	return (error);
 }
 
 int
 freebsd32_jail_get(struct thread *td, struct freebsd32_jail_get_args *uap)
 {
 	struct iovec32 iov32;
 	struct uio *auio;
 	int error, i;
 
 	/* Check that we have an even number of iovecs. */
 	if (uap->iovcnt & 1)
 		return (EINVAL);
 
 	error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_jail_get(td, auio, uap->flags);
 	if (error == 0)
 		for (i = 0; i < uap->iovcnt; i++) {
 			PTROUT_CP(auio->uio_iov[i], iov32, iov_base);
 			CP(auio->uio_iov[i], iov32, iov_len);
 			error = copyout(&iov32, uap->iovp + i, sizeof(iov32));
 			if (error != 0)
 				break;
 		}
 	free(auio, M_IOV);
 	return (error);
 }
 
 int
 freebsd32_sigaction(struct thread *td, struct freebsd32_sigaction_args *uap)
 {
 	struct sigaction32 s32;
 	struct sigaction sa, osa, *sap;
 	int error;
 
 	if (uap->act) {
 		error = copyin(uap->act, &s32, sizeof(s32));
 		if (error)
 			return (error);
 		sa.sa_handler = PTRIN(s32.sa_u);
 		CP(s32, sa, sa_flags);
 		CP(s32, sa, sa_mask);
 		sap = &sa;
 	} else
 		sap = NULL;
 	error = kern_sigaction(td, uap->sig, sap, &osa, 0);
 	if (error == 0 && uap->oact != NULL) {
 		s32.sa_u = PTROUT(osa.sa_handler);
 		CP(osa, s32, sa_flags);
 		CP(osa, s32, sa_mask);
 		error = copyout(&s32, uap->oact, sizeof(s32));
 	}
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD4
 int
 freebsd4_freebsd32_sigaction(struct thread *td,
 			     struct freebsd4_freebsd32_sigaction_args *uap)
 {
 	struct sigaction32 s32;
 	struct sigaction sa, osa, *sap;
 	int error;
 
 	if (uap->act) {
 		error = copyin(uap->act, &s32, sizeof(s32));
 		if (error)
 			return (error);
 		sa.sa_handler = PTRIN(s32.sa_u);
 		CP(s32, sa, sa_flags);
 		CP(s32, sa, sa_mask);
 		sap = &sa;
 	} else
 		sap = NULL;
 	error = kern_sigaction(td, uap->sig, sap, &osa, KSA_FREEBSD4);
 	if (error == 0 && uap->oact != NULL) {
 		s32.sa_u = PTROUT(osa.sa_handler);
 		CP(osa, s32, sa_flags);
 		CP(osa, s32, sa_mask);
 		error = copyout(&s32, uap->oact, sizeof(s32));
 	}
 	return (error);
 }
 #endif
 
 #ifdef COMPAT_43
 /*
  * 32-bit user layout of the old-style sigaction structure, as copied in
  * and out by ofreebsd32_sigaction().
  */
 struct osigaction32 {
 	u_int32_t	sa_u;		/* handler, as a 32-bit pointer value */
 	osigset_t	sa_mask;	/* old-format signal mask */
 	int		sa_flags;	/* action flags */
 };
 
 /*
  * The old signal calls below reject signal numbers that are not in the
  * range 1 .. ONSIG - 1.
  */
 #define	ONSIG	32
 
 int
 ofreebsd32_sigaction(struct thread *td,
 			     struct ofreebsd32_sigaction_args *uap)
 {
 	struct osigaction32 s32;
 	struct sigaction sa, osa, *sap;
 	int error;
 
 	if (uap->signum <= 0 || uap->signum >= ONSIG)
 		return (EINVAL);
 
 	if (uap->nsa) {
 		error = copyin(uap->nsa, &s32, sizeof(s32));
 		if (error)
 			return (error);
 		sa.sa_handler = PTRIN(s32.sa_u);
 		CP(s32, sa, sa_flags);
 		OSIG2SIG(s32.sa_mask, sa.sa_mask);
 		sap = &sa;
 	} else
 		sap = NULL;
 	error = kern_sigaction(td, uap->signum, sap, &osa, KSA_OSIGSET);
 	if (error == 0 && uap->osa != NULL) {
 		s32.sa_u = PTROUT(osa.sa_handler);
 		CP(osa, s32, sa_flags);
 		SIG2OSIG(osa.sa_mask, s32.sa_mask);
 		error = copyout(&s32, uap->osa, sizeof(s32));
 	}
 	return (error);
 }
 
 int
 ofreebsd32_sigprocmask(struct thread *td,
 			       struct ofreebsd32_sigprocmask_args *uap)
 {
 	sigset_t set, oset;
 	int error;
 
 	OSIG2SIG(uap->mask, set);
 	error = kern_sigprocmask(td, uap->how, &set, &oset, SIGPROCMASK_OLD);
 	SIG2OSIG(oset, td->td_retval[0]);
 	return (error);
 }
 
 /*
  * Old-style sigpending for 32-bit processes.  The union of the process
  * and thread signal lists is taken under the process lock and returned
  * in old format through td_retval[0].  Always returns 0.
  */
 int
 ofreebsd32_sigpending(struct thread *td,
     struct ofreebsd32_sigpending_args *uap)
 {
 	struct proc *p;
 	sigset_t pending;
 
 	p = td->td_proc;
 	PROC_LOCK(p);
 	pending = p->p_siglist;
 	SIGSETOR(pending, td->td_siglist);
 	PROC_UNLOCK(p);
 
 	SIG2OSIG(pending, td->td_retval[0]);
 	return (0);
 }
 
 /*
  * 32-bit user layout of the sigvec structure, as copied in and out by
  * ofreebsd32_sigvec().
  */
 struct sigvec32 {
 	u_int32_t	sv_handler;	/* handler, as a 32-bit pointer value */
 	int		sv_mask;	/* old-format signal mask */
 	int		sv_flags;	/* flags */
 };
 
 int
 ofreebsd32_sigvec(struct thread *td,
 			  struct ofreebsd32_sigvec_args *uap)
 {
 	struct sigvec32 vec;
 	struct sigaction sa, osa, *sap;
 	int error;
 
 	if (uap->signum <= 0 || uap->signum >= ONSIG)
 		return (EINVAL);
 
 	if (uap->nsv) {
 		error = copyin(uap->nsv, &vec, sizeof(vec));
 		if (error)
 			return (error);
 		sa.sa_handler = PTRIN(vec.sv_handler);
 		OSIG2SIG(vec.sv_mask, sa.sa_mask);
 		sa.sa_flags = vec.sv_flags;
 		sa.sa_flags ^= SA_RESTART;
 		sap = &sa;
 	} else
 		sap = NULL;
 	error = kern_sigaction(td, uap->signum, sap, &osa, KSA_OSIGSET);
 	if (error == 0 && uap->osv != NULL) {
 		vec.sv_handler = PTROUT(osa.sa_handler);
 		SIG2OSIG(osa.sa_mask, vec.sv_mask);
 		vec.sv_flags = osa.sa_flags;
 		vec.sv_flags &= ~SA_NOCLDWAIT;
 		vec.sv_flags ^= SA_RESTART;
 		error = copyout(&vec, uap->osv, sizeof(vec));
 	}
 	return (error);
 }
 
 /*
  * Old-style sigblock for 32-bit processes: block the signals in
  * uap->mask and return the previous mask, in old format, through
  * td_retval[0].  Always returns 0.
  */
 int
 ofreebsd32_sigblock(struct thread *td,
     struct ofreebsd32_sigblock_args *uap)
 {
 	sigset_t toblock, prevmask;
 
 	OSIG2SIG(uap->mask, toblock);
 	kern_sigprocmask(td, SIG_BLOCK, &toblock, &prevmask, 0);
 	SIG2OSIG(prevmask, td->td_retval[0]);
 	return (0);
 }
 
 /*
  * Old-style sigsetmask for 32-bit processes: install uap->mask as the
  * signal mask and return the previous mask, in old format, through
  * td_retval[0].  Always returns 0.
  */
 int
 ofreebsd32_sigsetmask(struct thread *td,
     struct ofreebsd32_sigsetmask_args *uap)
 {
 	sigset_t newmask, prevmask;
 
 	OSIG2SIG(uap->mask, newmask);
 	kern_sigprocmask(td, SIG_SETMASK, &newmask, &prevmask, 0);
 	SIG2OSIG(prevmask, td->td_retval[0]);
 	return (0);
 }
 
 int
 ofreebsd32_sigsuspend(struct thread *td,
 			      struct ofreebsd32_sigsuspend_args *uap)
 {
 	sigset_t mask;
 
 	OSIG2SIG(uap->mask, mask);
 	return (kern_sigsuspend(td, mask));
 }
 
 /*
  * 32-bit user layout of the sigstack structure, as copied in and out by
  * ofreebsd32_sigstack().
  */
 struct sigstack32 {
 	u_int32_t	ss_sp;		/* stack pointer, as a 32-bit value */
 	int		ss_onstack;	/* on-stack indicator */
 };
 
 int
 ofreebsd32_sigstack(struct thread *td,
 			    struct ofreebsd32_sigstack_args *uap)
 {
 	struct sigstack32 s32;
 	struct sigstack nss, oss;
 	int error = 0, unss;
 
 	if (uap->nss != NULL) {
 		error = copyin(uap->nss, &s32, sizeof(s32));
 		if (error)
 			return (error);
 		nss.ss_sp = PTRIN(s32.ss_sp);
 		CP(s32, nss, ss_onstack);
 		unss = 1;
 	} else {
 		unss = 0;
 	}
 	oss.ss_sp = td->td_sigstk.ss_sp;
 	oss.ss_onstack = sigonstack(cpu_getstack(td));
 	if (unss) {
 		td->td_sigstk.ss_sp = nss.ss_sp;
 		td->td_sigstk.ss_size = 0;
 		td->td_sigstk.ss_flags |= (nss.ss_onstack & SS_ONSTACK);
 		td->td_pflags |= TDP_ALTSTACK;
 	}
 	if (uap->oss != NULL) {
 		s32.ss_sp = PTROUT(oss.ss_sp);
 		CP(oss, s32, ss_onstack);
 		error = copyout(&s32, uap->oss, sizeof(s32));
 	}
 	return (error);
 }
 #endif
 
 int
 freebsd32_nanosleep(struct thread *td, struct freebsd32_nanosleep_args *uap)
 {
 
 	return (freebsd32_user_clock_nanosleep(td, CLOCK_REALTIME,
 	    TIMER_RELTIME, uap->rqtp, uap->rmtp));
 }
 
 /*
  * clock_nanosleep for 32-bit processes: runs the shared helper and
  * passes its result through kern_posix_error().
  */
 int
 freebsd32_clock_nanosleep(struct thread *td,
     struct freebsd32_clock_nanosleep_args *uap)
 {
 
 	return (kern_posix_error(td,
 	    freebsd32_user_clock_nanosleep(td, uap->clock_id, uap->flags,
 	    uap->rqtp, uap->rmtp)));
 }
 
 /*
  * Common body of the 32-bit nanosleep calls.  The requested time is
  * copied in from ua_rqtp and widened.  When the sleep is interrupted
  * (EINTR), the request was not TIMER_ABSTIME and ua_rmtp is set, the
  * remaining time is copied out; a failure of that copyout replaces the
  * EINTR result.
  */
 static int
 freebsd32_user_clock_nanosleep(struct thread *td, clockid_t clock_id,
     int flags, const struct timespec32 *ua_rqtp, struct timespec32 *ua_rmtp)
 {
 	struct timespec32 left32, req32;
 	struct timespec left, req;
 	int cperr, error;
 
 	error = copyin(ua_rqtp, &req32, sizeof(req32));
 	if (error != 0)
 		return (error);
 	CP(req32, req, tv_sec);
 	CP(req32, req, tv_nsec);
 
 	error = kern_clock_nanosleep(td, clock_id, flags, &req, &left);
 	if (error != EINTR || ua_rmtp == NULL ||
 	    (flags & TIMER_ABSTIME) != 0)
 		return (error);
 
 	CP(left, left32, tv_sec);
 	CP(left, left32, tv_nsec);
 	cperr = copyout(&left32, ua_rmtp, sizeof(left32));
 	return (cperr != 0 ? cperr : error);
 }
 
 int
 freebsd32_clock_gettime(struct thread *td,
 			struct freebsd32_clock_gettime_args *uap)
 {
 	struct timespec	ats;
 	struct timespec32 ats32;
 	int error;
 
 	error = kern_clock_gettime(td, uap->clock_id, &ats);
 	if (error == 0) {
 		CP(ats, ats32, tv_sec);
 		CP(ats, ats32, tv_nsec);
 		error = copyout(&ats32, uap->tp, sizeof(ats32));
 	}
 	return (error);
 }
 
 int
 freebsd32_clock_settime(struct thread *td,
 			struct freebsd32_clock_settime_args *uap)
 {
 	struct timespec	ats;
 	struct timespec32 ats32;
 	int error;
 
 	error = copyin(uap->tp, &ats32, sizeof(ats32));
 	if (error)
 		return (error);
 	CP(ats32, ats, tv_sec);
 	CP(ats32, ats, tv_nsec);
 
 	return (kern_clock_settime(td, uap->clock_id, &ats));
 }
 
 int
 freebsd32_clock_getres(struct thread *td,
 		       struct freebsd32_clock_getres_args *uap)
 {
 	struct timespec	ts;
 	struct timespec32 ts32;
 	int error;
 
 	if (uap->tp == NULL)
 		return (0);
 	error = kern_clock_getres(td, uap->clock_id, &ts);
 	if (error == 0) {
 		CP(ts, ts32, tv_sec);
 		CP(ts, ts32, tv_nsec);
 		error = copyout(&ts32, uap->tp, sizeof(ts32));
 	}
 	return (error);
 }
 
 /*
  * ktimer_create for 32-bit processes.  An optional 32-bit sigevent is
  * copied in and converted; the new timer id is copied to uap->timerid.
  * When that copyout fails the freshly created timer is deleted again.
  */
 int
 freebsd32_ktimer_create(struct thread *td,
     struct freebsd32_ktimer_create_args *uap)
 {
 	struct sigevent32 ev32;
 	struct sigevent ev, *evp;
 	int error, id;
 
 	evp = NULL;
 	if (uap->evp != NULL) {
 		error = copyin(uap->evp, &ev32, sizeof(ev32));
 		if (error != 0)
 			return (error);
 		error = convert_sigevent32(&ev32, &ev);
 		if (error != 0)
 			return (error);
 		evp = &ev;
 	}
 
 	error = kern_ktimer_create(td, uap->clock_id, evp, &id, -1);
 	if (error != 0)
 		return (error);
 
 	error = copyout(&id, uap->timerid, sizeof(int));
 	if (error != 0)
 		kern_ktimer_delete(td, id);
 	return (error);
 }
 
 int
 freebsd32_ktimer_settime(struct thread *td,
     struct freebsd32_ktimer_settime_args *uap)
 {
 	struct itimerspec32 val32, oval32;
 	struct itimerspec val, oval, *ovalp;
 	int error;
 
 	error = copyin(uap->value, &val32, sizeof(val32));
 	if (error != 0)
 		return (error);
 	ITS_CP(val32, val);
 	ovalp = uap->ovalue != NULL ? &oval : NULL;
 	error = kern_ktimer_settime(td, uap->timerid, uap->flags, &val, ovalp);
 	if (error == 0 && uap->ovalue != NULL) {
 		ITS_CP(oval, oval32);
 		error = copyout(&oval32, uap->ovalue, sizeof(oval32));
 	}
 	return (error);
 }
 
 int
 freebsd32_ktimer_gettime(struct thread *td,
     struct freebsd32_ktimer_gettime_args *uap)
 {
 	struct itimerspec32 val32;
 	struct itimerspec val;
 	int error;
 
 	error = kern_ktimer_gettime(td, uap->timerid, &val);
 	if (error == 0) {
 		ITS_CP(val, val32);
 		error = copyout(&val32, uap->value, sizeof(val32));
 	}
 	return (error);
 }
 
 /*
  * clock_getcpuclockid2 for 32-bit processes: the 64-bit id is
  * reassembled from its two 32-bit halves and the resulting clock id is
  * copied to uap->clock_id.
  */
 int
 freebsd32_clock_getcpuclockid2(struct thread *td,
     struct freebsd32_clock_getcpuclockid2_args *uap)
 {
 	clockid_t cpuclock;
 	int error;
 
 	error = kern_clock_getcpuclockid2(td, PAIR32TO64(id_t, uap->id),
 	    uap->which, &cpuclock);
 	if (error != 0)
 		return (error);
 	return (copyout(&cpuclock, uap->clock_id, sizeof(clockid_t)));
 }
 
 int
 freebsd32_thr_new(struct thread *td,
 		  struct freebsd32_thr_new_args *uap)
 {
 	struct thr_param32 param32;
 	struct thr_param param;
 	int error;
 
 	if (uap->param_size < 0 ||
 	    uap->param_size > sizeof(struct thr_param32))
 		return (EINVAL);
 	bzero(&param, sizeof(struct thr_param));
 	bzero(&param32, sizeof(struct thr_param32));
 	error = copyin(uap->param, &param32, uap->param_size);
 	if (error != 0)
 		return (error);
 	param.start_func = PTRIN(param32.start_func);
 	param.arg = PTRIN(param32.arg);
 	param.stack_base = PTRIN(param32.stack_base);
 	param.stack_size = param32.stack_size;
 	param.tls_base = PTRIN(param32.tls_base);
 	param.tls_size = param32.tls_size;
 	param.child_tid = PTRIN(param32.child_tid);
 	param.parent_tid = PTRIN(param32.parent_tid);
 	param.flags = param32.flags;
 	param.rtp = PTRIN(param32.rtp);
 	param.spare[0] = PTRIN(param32.spare[0]);
 	param.spare[1] = PTRIN(param32.spare[1]);
 	param.spare[2] = PTRIN(param32.spare[2]);
 
 	return (kern_thr_new(td, &param));
 }
 
 int
 freebsd32_thr_suspend(struct thread *td, struct freebsd32_thr_suspend_args *uap)
 {
 	struct timespec32 ts32;
 	struct timespec ts, *tsp;
 	int error;
 
 	error = 0;
 	tsp = NULL;
 	if (uap->timeout != NULL) {
 		error = copyin((const void *)uap->timeout, (void *)&ts32,
 		    sizeof(struct timespec32));
 		if (error != 0)
 			return (error);
 		ts.tv_sec = ts32.tv_sec;
 		ts.tv_nsec = ts32.tv_nsec;
 		tsp = &ts;
 	}
 	return (kern_thr_suspend(td, tsp));
 }
 
 /*
  * Convert a native siginfo_t into the 32-bit struct siginfo32.  The
  * destination is zeroed first, then the listed members are copied;
  * si_addr is narrowed via uintptr_t and only the integer view of
  * si_value is carried over.
  */
 void
 siginfo_to_siginfo32(const siginfo_t *src, struct siginfo32 *dst)
 {
 
 	bzero(dst, sizeof(*dst));
 
 	/* What happened. */
 	dst->si_signo = src->si_signo;
 	dst->si_code = src->si_code;
 	dst->si_errno = src->si_errno;
 	dst->si_status = src->si_status;
 
 	/* Who sent it. */
 	dst->si_pid = src->si_pid;
 	dst->si_uid = src->si_uid;
 
 	/* Payload. */
 	dst->si_addr = (uintptr_t)src->si_addr;
 	dst->si_value.sival_int = src->si_value.sival_int;
 	dst->si_timerid = src->si_timerid;
 	dst->si_overrun = src->si_overrun;
 }
 
 #ifndef _FREEBSD32_SYSPROTO_H_
 /*
  * Argument layout for freebsd32_sigqueue(); only defined here when
  * _FREEBSD32_SYSPROTO_H_ has not been defined.
  */
 struct freebsd32_sigqueue_args {
         pid_t pid;		/* target passed to kern_sigqueue() */
         int signum;		/* signal number to queue */
         /* union sigval32 */ int value;
 };
 #endif
 int
 freebsd32_sigqueue(struct thread *td, struct freebsd32_sigqueue_args *uap)
 {
 	union sigval sv;
 
 	/*
 	 * On 32-bit ABIs, sival_int and sival_ptr are the same.
 	 * On 64-bit little-endian ABIs, the low bits are the same.
 	 * In 64-bit big-endian ABIs, sival_int overlaps with
 	 * sival_ptr's HIGH bits.  We choose to support sival_int
 	 * rather than sival_ptr in this case as it seems to be
 	 * more common.
 	 */
 	bzero(&sv, sizeof(sv));
 	sv.sival_int = uap->value;
 
 	return (kern_sigqueue(td, uap->pid, uap->signum, &sv));
 }
 
 int
 freebsd32_sigtimedwait(struct thread *td, struct freebsd32_sigtimedwait_args *uap)
 {
 	struct timespec32 ts32;
 	struct timespec ts;
 	struct timespec *timeout;
 	sigset_t set;
 	ksiginfo_t ksi;
 	struct siginfo32 si32;
 	int error;
 
 	if (uap->timeout) {
 		error = copyin(uap->timeout, &ts32, sizeof(ts32));
 		if (error)
 			return (error);
 		ts.tv_sec = ts32.tv_sec;
 		ts.tv_nsec = ts32.tv_nsec;
 		timeout = &ts;
 	} else
 		timeout = NULL;
 
 	error = copyin(uap->set, &set, sizeof(set));
 	if (error)
 		return (error);
 
 	error = kern_sigtimedwait(td, set, &ksi, timeout);
 	if (error)
 		return (error);
 
 	if (uap->info) {
 		siginfo_to_siginfo32(&ksi.ksi_info, &si32);
 		error = copyout(&si32, uap->info, sizeof(struct siginfo32));
 	}
 
 	if (error == 0)
 		td->td_retval[0] = ksi.ksi_signo;
 	return (error);
 }
 
 /*
  * MPSAFE
  */
 int
 freebsd32_sigwaitinfo(struct thread *td, struct freebsd32_sigwaitinfo_args *uap)
 {
 	ksiginfo_t ksi;
 	struct siginfo32 si32;
 	sigset_t set;
 	int error;
 
 	error = copyin(uap->set, &set, sizeof(set));
 	if (error)
 		return (error);
 
 	error = kern_sigtimedwait(td, set, &ksi, NULL);
 	if (error)
 		return (error);
 
 	if (uap->info) {
 		siginfo_to_siginfo32(&ksi.ksi_info, &si32);
 		error = copyout(&si32, uap->info, sizeof(struct siginfo32));
 	}	
 	if (error == 0)
 		td->td_retval[0] = ksi.ksi_signo;
 	return (error);
 }
 
 int
 freebsd32_cpuset_setid(struct thread *td,
     struct freebsd32_cpuset_setid_args *uap)
 {
 
 	return (kern_cpuset_setid(td, uap->which,
 	    PAIR32TO64(id_t, uap->id), uap->setid));
 }
 
 int
 freebsd32_cpuset_getid(struct thread *td,
     struct freebsd32_cpuset_getid_args *uap)
 {
 
 	return (kern_cpuset_getid(td, uap->level, uap->which,
 	    PAIR32TO64(id_t, uap->id), uap->setid));
 }
 
 int
 freebsd32_cpuset_getaffinity(struct thread *td,
     struct freebsd32_cpuset_getaffinity_args *uap)
 {
 
 	return (kern_cpuset_getaffinity(td, uap->level, uap->which,
 	    PAIR32TO64(id_t,uap->id), uap->cpusetsize, uap->mask));
 }
 
 int
 freebsd32_cpuset_setaffinity(struct thread *td,
     struct freebsd32_cpuset_setaffinity_args *uap)
 {
 
 	return (kern_cpuset_setaffinity(td, uap->level, uap->which,
 	    PAIR32TO64(id_t,uap->id), uap->cpusetsize, uap->mask));
 }
 
 int
 freebsd32_cpuset_getdomain(struct thread *td,
     struct freebsd32_cpuset_getdomain_args *uap)
 {
 
 	return (kern_cpuset_getdomain(td, uap->level, uap->which,
 	    PAIR32TO64(id_t,uap->id), uap->domainsetsize, uap->mask, uap->policy));
 }
 
 int
 freebsd32_cpuset_setdomain(struct thread *td,
     struct freebsd32_cpuset_setdomain_args *uap)
 {
 
 	return (kern_cpuset_setdomain(td, uap->level, uap->which,
 	    PAIR32TO64(id_t,uap->id), uap->domainsetsize, uap->mask, uap->policy));
 }
 
 int
 freebsd32_nmount(struct thread *td,
     struct freebsd32_nmount_args /* {
     	struct iovec *iovp;
     	unsigned int iovcnt;
     	int flags;
     } */ *uap)
 {
 	struct uio *auio;
 	uint64_t flags;
 	int error;
 
 	/*
 	 * Mount flags are now 64-bits. On 32-bit archtectures only
 	 * 32-bits are passed in, but from here on everything handles
 	 * 64-bit flags correctly.
 	 */
 	flags = uap->flags;
 
 	AUDIT_ARG_FFLAGS(flags);
 
 	/*
 	 * Filter out MNT_ROOTFS.  We do not want clients of nmount() in
 	 * userspace to set this flag, but we must filter it out if we want
 	 * MNT_UPDATE on the root file system to work.
 	 * MNT_ROOTFS should only be set by the kernel when mounting its
 	 * root file system.
 	 */
 	flags &= ~MNT_ROOTFS;
 
 	/*
 	 * check that we have an even number of iovec's
 	 * and that we have at least two options.
 	 */
 	if ((uap->iovcnt & 1) || (uap->iovcnt < 4))
 		return (EINVAL);
 
 	error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = vfs_donmount(td, flags, auio);
 
 	free(auio, M_IOV);
 	return error;
 }
 
 #if 0
 /*
  * Skeleton for a 32-bit wrapper; it sits inside "#if 0" and is never
  * compiled.  It shows the usual shape: copy the 32-bit structure in,
  * translate it, call the native kern_xxx(), translate the result and
  * copy it back out.
  *
  * NOTE(review): as written, p32 is never assigned before it is used as
  * the copyout destination; a real wrapper would copy out to the user
  * pointer taken from uap.
  */
 int
 freebsd32_xxx(struct thread *td, struct freebsd32_xxx_args *uap)
 {
 	struct yyy32 *p32, s32;
 	struct yyy *p = NULL, s;
 	struct xxx_arg ap;
 	int error;
 
 	/* Optional input: only translated when the caller supplied it. */
 	if (uap->zzz) {
 		error = copyin(uap->zzz, &s32, sizeof(s32));
 		if (error)
 			return (error);
 		/* translate in */
 		p = &s;
 	}
 	error = kern_xxx(td, p);
 	if (error)
 		return (error);
 	if (uap->zzz) {
 		/* translate out */
 		error = copyout(&s32, p32, sizeof(s32));
 	}
 	return (error);
 }
 #endif
 
 int
 syscall32_module_handler(struct module *mod, int what, void *arg)
 {
 
 	return (kern_syscall_module_handler(freebsd32_sysent, mod, what, arg));
 }
 
 int
 syscall32_helper_register(struct syscall_helper_data *sd, int flags)
 {
 
 	return (kern_syscall_helper_register(freebsd32_sysent, sd, flags));
 }
 
 int
 syscall32_helper_unregister(struct syscall_helper_data *sd)
 {
 
 	return (kern_syscall_helper_unregister(freebsd32_sysent, sd));
 }
 
 /*
  * Build the initial user stack contents for a 32-bit image.
  *
  * Working downwards from the ps_strings address (sv_psstrings), the
  * following items are placed in user memory, in this order:
  *	- the signal trampoline, only when sv_sigcode_base is 0;
  *	- the executable path, only when both execpath and auxargs are set;
  *	- the stack-protector canary;
  *	- the page sizes array, narrowed to 32-bit entries;
  *	- the argument and environment strings;
  *	- the gap requested by exec_stackgap();
  *	- room for the ELF auxargs array, when auxargs is set;
  *	- the argv[] and envv[] pointer vectors, each NULL terminated.
  *
  * The ps_strings structure is filled in with the vector addresses and
  * counts, and *stack_base is set to the start of the argv[] vector.
  * imgp->execpathp, canary, canarylen, pagesizes, pagesizeslen, argv,
  * envv and ps_strings are updated along the way.
  *
  * Returns 0 on success, the copyout() / sv_copyout_auxargs() error, or
  * EFAULT when a suword32() store fails.
  */
 int
 freebsd32_copyout_strings(struct image_params *imgp, uintptr_t *stack_base)
 {
 	int argc, envc, i;
 	u_int32_t *vectp;
 	char *stringp;
 	uintptr_t destp, ustringp;
 	struct freebsd32_ps_strings *arginfo;
 	char canary[sizeof(long) * 8];
 	int32_t pagesizes32[MAXPAGESIZES];
 	size_t execpath_len;
 	int error, szsigcode;
 
 	/*
 	 * Calculate string base and vector table pointers.
 	 * Also deal with signal trampoline code for this exec type.
 	 */
 	if (imgp->execpath != NULL && imgp->auxargs != NULL)
 		execpath_len = strlen(imgp->execpath) + 1;
 	else
 		execpath_len = 0;
 	arginfo = (struct freebsd32_ps_strings *)curproc->p_sysent->
 	    sv_psstrings;
 	imgp->ps_strings = arginfo;
 	/* The trampoline is only copied when no sigcode base is set. */
 	if (imgp->proc->p_sysent->sv_sigcode_base == 0)
 		szsigcode = *(imgp->proc->p_sysent->sv_szsigcode);
 	else
 		szsigcode = 0;
 	destp =	(uintptr_t)arginfo;
 
 	/*
 	 * install sigcode
 	 */
 	if (szsigcode != 0) {
 		destp -= szsigcode;
 		destp = rounddown2(destp, sizeof(uint32_t));
 		error = copyout(imgp->proc->p_sysent->sv_sigcode, (void *)destp,
 		    szsigcode);
 		if (error != 0)
 			return (error);
 	}
 
 	/*
 	 * Copy the image path for the rtld.
 	 */
 	if (execpath_len != 0) {
 		destp -= execpath_len;
 		imgp->execpathp = (void *)destp;
 		error = copyout(imgp->execpath, imgp->execpathp, execpath_len);
 		if (error != 0)
 			return (error);
 	}
 
 	/*
 	 * Prepare the canary for SSP.
 	 */
 	arc4rand(canary, sizeof(canary), 0);
 	destp -= sizeof(canary);
 	imgp->canary = (void *)destp;
 	error = copyout(canary, imgp->canary, sizeof(canary));
 	if (error != 0)
 		return (error);
 	imgp->canarylen = sizeof(canary);
 
 	/*
 	 * Prepare the pagesizes array.
 	 */
 	for (i = 0; i < MAXPAGESIZES; i++)
 		pagesizes32[i] = (uint32_t)pagesizes[i];
 	destp -= sizeof(pagesizes32);
 	destp = rounddown2(destp, sizeof(uint32_t));
 	imgp->pagesizes = (void *)destp;
 	error = copyout(pagesizes32, imgp->pagesizes, sizeof(pagesizes32));
 	if (error != 0)
 		return (error);
 	imgp->pagesizeslen = sizeof(pagesizes32);
 
 	/*
 	 * Allocate room for the argument and environment strings.
 	 */
 	destp -= ARG_MAX - imgp->args->stringspace;
 	destp = rounddown2(destp, sizeof(uint32_t));
 	ustringp = destp;
 
 	exec_stackgap(imgp, &destp);
 
 	if (imgp->auxargs) {
 		/*
 		 * Allocate room on the stack for the ELF auxargs
 		 * array.  It has up to AT_COUNT entries.
 		 */
 		destp -= AT_COUNT * sizeof(Elf32_Auxinfo);
 		destp = rounddown2(destp, sizeof(uint32_t));
 	}
 
 	vectp = (uint32_t *)destp;
 
 	/*
 	 * Allocate room for the argv[] and env vectors including the
 	 * terminating NULL pointers.
 	 */
 	vectp -= imgp->args->argc + 1 + imgp->args->envc + 1;
 
 	/*
 	 * vectp also becomes our initial stack base
 	 */
 	*stack_base = (uintptr_t)vectp;
 
 	stringp = imgp->args->begin_argv;
 	argc = imgp->args->argc;
 	envc = imgp->args->envc;
 	/*
 	 * Copy out strings - arguments and environment.
 	 */
 	error = copyout(stringp, (void *)ustringp,
 	    ARG_MAX - imgp->args->stringspace);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Fill in "ps_strings" struct for ps, w, etc.
 	 */
 	imgp->argv = vectp;
 	if (suword32(&arginfo->ps_argvstr, (u_int32_t)(intptr_t)vectp) != 0 ||
 	    suword32(&arginfo->ps_nargvstr, argc) != 0)
 		return (EFAULT);
 
 	/*
 	 * Fill in argument portion of vector table.
 	 */
 	for (; argc > 0; --argc) {
 		if (suword32(vectp++, ustringp) != 0)
 			return (EFAULT);
 		/*
 		 * Walk the kernel copy of the string to advance the user
 		 * address past it, including the terminating NUL.
 		 */
 		while (*stringp++ != 0)
 			ustringp++;
 		ustringp++;
 	}
 
 	/* a null vector table pointer separates the argp's from the envp's */
 	if (suword32(vectp++, 0) != 0)
 		return (EFAULT);
 
 	imgp->envv = vectp;
 	if (suword32(&arginfo->ps_envstr, (u_int32_t)(intptr_t)vectp) != 0 ||
 	    suword32(&arginfo->ps_nenvstr, envc) != 0)
 		return (EFAULT);
 
 	/*
 	 * Fill in environment portion of vector table.
 	 */
 	for (; envc > 0; --envc) {
 		if (suword32(vectp++, ustringp) != 0)
 			return (EFAULT);
 		while (*stringp++ != 0)
 			ustringp++;
 		ustringp++;
 	}
 
 	/* end of vector table is a null pointer */
 	if (suword32(vectp, 0) != 0)
 		return (EFAULT);
 
 	/* The auxargs are written right after the terminating NULL. */
 	if (imgp->auxargs) {
 		vectp++;
 		error = imgp->sysent->sv_copyout_auxargs(imgp,
 		    (uintptr_t)vectp);
 		if (error != 0)
 			return (error);
 	}
 
 	return (0);
 }
 
 int
 freebsd32_kldstat(struct thread *td, struct freebsd32_kldstat_args *uap)
 {
 	struct kld_file_stat *stat;
 	struct kld32_file_stat *stat32;
 	int error, version;
 
 	if ((error = copyin(&uap->stat->version, &version, sizeof(version)))
 	    != 0)
 		return (error);
 	if (version != sizeof(struct kld32_file_stat_1) &&
 	    version != sizeof(struct kld32_file_stat))
 		return (EINVAL);
 
 	stat = malloc(sizeof(*stat), M_TEMP, M_WAITOK | M_ZERO);
 	stat32 = malloc(sizeof(*stat32), M_TEMP, M_WAITOK | M_ZERO);
 	error = kern_kldstat(td, uap->fileid, stat);
 	if (error == 0) {
 		bcopy(&stat->name[0], &stat32->name[0], sizeof(stat->name));
 		CP(*stat, *stat32, refs);
 		CP(*stat, *stat32, id);
 		PTROUT_CP(*stat, *stat32, address);
 		CP(*stat, *stat32, size);
 		bcopy(&stat->pathname[0], &stat32->pathname[0],
 		    sizeof(stat->pathname));
 		stat32->version  = version;
 		error = copyout(stat32, uap->stat, version);
 	}
 	free(stat, M_TEMP);
 	free(stat32, M_TEMP);
 	return (error);
 }
 
 /*
  * 32-bit compat posix_fallocate: reassemble the split 64-bit offset and
  * length, then run the native implementation and pass its result through
  * kern_posix_error().
  */
 int
 freebsd32_posix_fallocate(struct thread *td,
     struct freebsd32_posix_fallocate_args *uap)
 {
 
 	return (kern_posix_error(td, kern_posix_fallocate(td, uap->fd,
 	    PAIR32TO64(off_t, uap->offset), PAIR32TO64(off_t, uap->len))));
 }
 
 /*
  * 32-bit compat posix_fadvise: reassemble the split 64-bit offset and
  * length, then run the native implementation and pass its result through
  * kern_posix_error().
  */
 int
 freebsd32_posix_fadvise(struct thread *td,
     struct freebsd32_posix_fadvise_args *uap)
 {
 
 	return (kern_posix_error(td, kern_posix_fadvise(td, uap->fd,
 	    PAIR32TO64(off_t, uap->offset), PAIR32TO64(off_t, uap->len),
 	    uap->advice)));
 }
 
 /*
  * Convert a 32-bit sigevent into the native structure.
  *
  * Only the fields relevant to the notification type are copied.
  * Returns 0 on success or EINVAL for an unknown sigev_notify value.
  */
 int
 convert_sigevent32(struct sigevent32 *sig32, struct sigevent *sig)
 {
 
 	CP(*sig32, *sig, sigev_notify);
 	switch (sig->sigev_notify) {
 	case SIGEV_NONE:
 		return (0);
 	case SIGEV_THREAD_ID:
 	case SIGEV_SIGNAL:
 		/* Thread-directed signals also carry the target thread id. */
 		if (sig->sigev_notify == SIGEV_THREAD_ID) {
 			CP(*sig32, *sig, sigev_notify_thread_id);
 		}
 		CP(*sig32, *sig, sigev_signo);
 		PTRIN_CP(*sig32, *sig, sigev_value.sival_ptr);
 		return (0);
 	case SIGEV_KEVENT:
 		CP(*sig32, *sig, sigev_notify_kqueue);
 		CP(*sig32, *sig, sigev_notify_kevent_flags);
 		PTRIN_CP(*sig32, *sig, sigev_value.sival_ptr);
 		return (0);
 	default:
 		return (EINVAL);
 	}
 }
 
 /*
  * 32-bit compat procctl.
  *
  * Machine-dependent commands (>= PROC_PROCCTL_MD_MIN) are handed straight
  * to cpu_procctl().  For the rest, the first switch stages the command's
  * argument in kernel memory (copying it in where the command takes input),
  * kern_procctl() performs the operation, and the second switch copies
  * results back to the caller for the commands that return data.
  */
 int
 freebsd32_procctl(struct thread *td, struct freebsd32_procctl_args *uap)
 {
 	void *data;
 	/* Native argument buffers handed to kern_procctl(). */
 	union {
 		struct procctl_reaper_status rs;
 		struct procctl_reaper_pids rp;
 		struct procctl_reaper_kill rk;
 	} x;
 	/* 32-bit layouts that need conversion before use. */
 	union {
 		struct procctl_reaper_pids32 rp;
 	} x32;
 	int error, error1, flags, signum;
 
 	if (uap->com >= PROC_PROCCTL_MD_MIN)
 		return (cpu_procctl(td, uap->idtype, PAIR32TO64(id_t, uap->id),
 		    uap->com, PTRIN(uap->data)));
 
 	switch (uap->com) {
 	/* Commands that take an int of flags as input. */
 	case PROC_ASLR_CTL:
 	case PROC_PROTMAX_CTL:
 	case PROC_SPROTECT:
 	case PROC_STACKGAP_CTL:
 	case PROC_TRACE_CTL:
 	case PROC_TRAPCAP_CTL:
 		error = copyin(PTRIN(uap->data), &flags, sizeof(flags));
 		if (error != 0)
 			return (error);
 		data = &flags;
 		break;
 	/* Commands that take no argument; a non-NULL pointer is rejected. */
 	case PROC_REAP_ACQUIRE:
 	case PROC_REAP_RELEASE:
 		if (uap->data != NULL)
 			return (EINVAL);
 		data = NULL;
 		break;
 	/* Output only: the buffer is copied out after the call. */
 	case PROC_REAP_STATUS:
 		data = &x.rs;
 		break;
 	/* Convert the 32-bit request (count and pids pointer) to native. */
 	case PROC_REAP_GETPIDS:
 		error = copyin(uap->data, &x32.rp, sizeof(x32.rp));
 		if (error != 0)
 			return (error);
 		CP(x32.rp, x.rp, rp_count);
 		PTRIN_CP(x32.rp, x.rp, rp_pids);
 		data = &x.rp;
 		break;
 	/* Copied in without conversion and copied back out after the call. */
 	case PROC_REAP_KILL:
 		error = copyin(uap->data, &x.rk, sizeof(x.rk));
 		if (error != 0)
 			return (error);
 		data = &x.rk;
 		break;
 	/* Status queries: flags is output only. */
 	case PROC_ASLR_STATUS:
 	case PROC_PROTMAX_STATUS:
 	case PROC_STACKGAP_STATUS:
 	case PROC_TRACE_STATUS:
 	case PROC_TRAPCAP_STATUS:
 		data = &flags;
 		break;
 	case PROC_PDEATHSIG_CTL:
 		error = copyin(uap->data, &signum, sizeof(signum));
 		if (error != 0)
 			return (error);
 		data = &signum;
 		break;
 	case PROC_PDEATHSIG_STATUS:
 		data = &signum;
 		break;
 	default:
 		return (EINVAL);
 	}
 	error = kern_procctl(td, uap->idtype, PAIR32TO64(id_t, uap->id),
 	    uap->com, data);
 	/* Return results to the caller for commands that produce output. */
 	switch (uap->com) {
 	case PROC_REAP_STATUS:
 		if (error == 0)
 			error = copyout(&x.rs, uap->data, sizeof(x.rs));
 		break;
 	case PROC_REAP_KILL:
 		/*
 		 * The kill request is copied back even when kern_procctl()
 		 * failed; a copyout failure is reported only if the
 		 * operation itself succeeded.
 		 */
 		error1 = copyout(&x.rk, uap->data, sizeof(x.rk));
 		if (error == 0)
 			error = error1;
 		break;
 	case PROC_ASLR_STATUS:
 	case PROC_PROTMAX_STATUS:
 	case PROC_STACKGAP_STATUS:
 	case PROC_TRACE_STATUS:
 	case PROC_TRAPCAP_STATUS:
 		if (error == 0)
 			error = copyout(&flags, uap->data, sizeof(flags));
 		break;
 	case PROC_PDEATHSIG_STATUS:
 		if (error == 0)
 			error = copyout(&signum, uap->data, sizeof(signum));
 		break;
 	}
 	return (error);
 }
 
 /*
  * 32-bit compat fcntl: widen the argument to long before calling the
  * native implementation.
  */
 int
 freebsd32_fcntl(struct thread *td, struct freebsd32_fcntl_args *uap)
 {
 	long tmp;
 
 	/* Default conversion of the argument as declared. */
 	tmp = uap->arg;
 
 	switch (uap->cmd) {
 	/*
 	 * For operations that interpret arg as flags or as a pointer,
 	 * convert it as an unsigned value instead.
 	 */
 	case F_GETLK:
 	case F_SETLK:
 	case F_SETLKW:
 	case F_SETLK_REMOTE:
 	case F_OGETLK:
 	case F_OSETLK:
 	case F_OSETLKW:
 	case F_SETFD:
 	case F_SETFL:
 		tmp = (unsigned int)(uap->arg);
 		break;
 	default:
 		break;
 	}
 	return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, tmp));
 }
 
 /*
  * 32-bit compat ppoll: fetch the optional timeout (converting it from
  * timespec32) and the optional signal mask, then call kern_poll().
  */
 int
 freebsd32_ppoll(struct thread *td, struct freebsd32_ppoll_args *uap)
 {
 	struct timespec32 ts32;
 	struct timespec ts, *tsp;
 	sigset_t set, *ssp;
 	int error;
 
 	/* Both arguments are optional; NULL is passed through unchanged. */
 	tsp = NULL;
 	ssp = NULL;
 
 	if (uap->ts != NULL) {
 		error = copyin(uap->ts, &ts32, sizeof(ts32));
 		if (error != 0)
 			return (error);
 		CP(ts32, ts, tv_sec);
 		CP(ts32, ts, tv_nsec);
 		tsp = &ts;
 	}
 	if (uap->set != NULL) {
 		error = copyin(uap->set, &set, sizeof(set));
 		if (error != 0)
 			return (error);
 		ssp = &set;
 	}
 
 	return (kern_poll(td, uap->fds, uap->nfds, tsp, ssp));
 }
 
 /*
  * 32-bit compat sched_rr_get_interval: query the native interval and
  * copy it out as a timespec32.
  */
 int
 freebsd32_sched_rr_get_interval(struct thread *td,
     struct freebsd32_sched_rr_get_interval_args *uap)
 {
 	struct timespec ts;
 	struct timespec32 ts32;
 	int error;
 
 	error = kern_sched_rr_get_interval(td, uap->pid, &ts);
 	if (error != 0)
 		return (error);
 
 	CP(ts, ts32, tv_sec);
 	CP(ts, ts32, tv_nsec);
 	return (copyout(&ts32, uap->interval, sizeof(ts32)));
 }
 
 /*
  * Copy every field of a native timex into its 32-bit counterpart.
  */
 static void
 timex_to_32(struct timex32 *dst, struct timex *src)
 {
 	dst->modes = src->modes;
 	dst->offset = src->offset;
 	dst->freq = src->freq;
 	dst->maxerror = src->maxerror;
 	dst->esterror = src->esterror;
 	dst->status = src->status;
 	dst->constant = src->constant;
 	dst->precision = src->precision;
 	dst->tolerance = src->tolerance;
 	dst->ppsfreq = src->ppsfreq;
 	dst->jitter = src->jitter;
 	dst->shift = src->shift;
 	dst->stabil = src->stabil;
 	dst->jitcnt = src->jitcnt;
 	dst->calcnt = src->calcnt;
 	dst->errcnt = src->errcnt;
 	dst->stbcnt = src->stbcnt;
 }
 
 /*
  * Copy every field of a 32-bit timex into its native counterpart.
  */
 static void
 timex_from_32(struct timex *dst, struct timex32 *src)
 {
 	dst->modes = src->modes;
 	dst->offset = src->offset;
 	dst->freq = src->freq;
 	dst->maxerror = src->maxerror;
 	dst->esterror = src->esterror;
 	dst->status = src->status;
 	dst->constant = src->constant;
 	dst->precision = src->precision;
 	dst->tolerance = src->tolerance;
 	dst->ppsfreq = src->ppsfreq;
 	dst->jitter = src->jitter;
 	dst->shift = src->shift;
 	dst->stabil = src->stabil;
 	dst->jitcnt = src->jitcnt;
 	dst->calcnt = src->calcnt;
 	dst->errcnt = src->errcnt;
 	dst->stbcnt = src->stbcnt;
 }
 
 /*
  * 32-bit compat ntp_adjtime: convert the caller's timex32, run the
  * native call, and copy the updated structure back.  The syscall return
  * value is set only when every step succeeded.
  */
 int
 freebsd32_ntp_adjtime(struct thread *td, struct freebsd32_ntp_adjtime_args *uap)
 {
 	struct timex tx;
 	struct timex32 tx32;
 	int error, retval;
 
 	error = copyin(uap->tp, &tx32, sizeof(tx32));
 	if (error != 0)
 		return (error);
 
 	timex_from_32(&tx, &tx32);
 	error = kern_ntp_adjtime(td, &tx, &retval);
 	if (error != 0)
 		return (error);
 
 	timex_to_32(&tx32, &tx);
 	error = copyout(&tx32, uap->tp, sizeof(tx32));
 	if (error != 0)
 		return (error);
 
 	td->td_retval[0] = retval;
 	return (0);
 }
diff --git a/sys/compat/linux/linux_event.c b/sys/compat/linux/linux_event.c
index b4b4be1f7b49..dfb4588392cc 100644
--- a/sys/compat/linux/linux_event.c
+++ b/sys/compat/linux/linux_event.c
@@ -1,1047 +1,1048 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2007 Roman Divacky
  * Copyright (c) 2014 Dmitry Chagin
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/callout.h>
 #include <sys/capsicum.h>
 #include <sys/types.h>
 #include <sys/user.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/errno.h>
 #include <sys/event.h>
 #include <sys/poll.h>
 #include <sys/proc.h>
 #include <sys/selinfo.h>
 #include <sys/specialfd.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/timespec.h>
 #include <sys/eventfd.h>
 
 #ifdef COMPAT_LINUX32
 #include <machine/../linux32/linux.h>
 #include <machine/../linux32/linux32_proto.h>
 #else
 #include <machine/../linux/linux.h>
 #include <machine/../linux/linux_proto.h>
 #endif
 
 #include <compat/linux/linux_emul.h>
 #include <compat/linux/linux_event.h>
 #include <compat/linux/linux_file.h>
 #include <compat/linux/linux_timer.h>
 #include <compat/linux/linux_util.h>
 
 /*
  * epoll defines 'struct epoll_event' with the field 'data' as 64 bits
  * on all architectures. But on 32 bit architectures BSD 'struct kevent' only
  * has 32 bit opaque pointer as 'udata' field. So we can't pass epoll supplied
  * data verbatim. Therefore we allocate a 64-bit memory block to pass
  * user supplied data for every file descriptor.
  */
 
 typedef uint64_t	epoll_udata_t;
 
 /*
  * Table of epoll user data kept in the process emulation data
  * (pem->epoll) and indexed by file descriptor.  It is allocated with
  * EPOLL_SIZE(fd), so udata[] is usable up to index fdc; the declared
  * length of 1 is only a placeholder.
  */
 struct epoll_emuldata {
 	uint32_t	fdc;		/* epoll udata max index */
 	epoll_udata_t	udata[1];	/* epoll user data vector */
 };
 
 #define	EPOLL_DEF_SZ		16
 #define	EPOLL_SIZE(fdn)			\
 	(sizeof(struct epoll_emuldata)+(fdn) * sizeof(epoll_udata_t))
 
 /*
  * Event record exchanged with Linux userspace by epoll_ctl and
  * epoll_wait.  The structure is declared packed on amd64 only.
  */
 struct epoll_event {
 	uint32_t	events;		/* LINUX_EPOLL* event mask */
 	epoll_udata_t	data;		/* opaque user data */
 }
 #if defined(__amd64__)
 __attribute__((packed))
 #endif
 ;
 
 #define	LINUX_MAX_EVENTS	(INT_MAX / sizeof(struct epoll_event))
 
 static void	epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata);
 static int	epoll_to_kevent(struct thread *td, int fd,
 		    struct epoll_event *l_event, struct kevent *kevent,
 		    int *nkevents);
 static void	kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event);
 static int	epoll_kev_copyout(void *arg, struct kevent *kevp, int count);
 static int	epoll_kev_copyin(void *arg, struct kevent *kevp, int count);
 static int	epoll_register_kevent(struct thread *td, struct file *epfp,
 		    int fd, int filter, unsigned int flags);
 static int	epoll_fd_registered(struct thread *td, struct file *epfp,
 		    int fd);
 static int	epoll_delete_all_events(struct thread *td, struct file *epfp,
 		    int fd);
 
 /* State for epoll_kev_copyin(): cursor into an in-kernel kevent array. */
 struct epoll_copyin_args {
 	struct kevent	*changelist;	/* next kevent to hand out */
 };
 
 /* State for epoll_kev_copyout(): progress of the copy to userspace. */
 struct epoll_copyout_args {
 	struct epoll_event	*leventlist;	/* next user slot to fill */
 	struct proc		*p;		/* process owning the udata */
 	uint32_t		count;		/* events copied out so far */
 	int			error;		/* first copyout error seen */
 };
 
 /* timerfd */
 typedef uint64_t	timerfd_t;
 
 static fo_rdwr_t	timerfd_read;
 static fo_ioctl_t	timerfd_ioctl;
 static fo_poll_t	timerfd_poll;
 static fo_kqfilter_t	timerfd_kqfilter;
 static fo_stat_t	timerfd_stat;
 static fo_close_t	timerfd_close;
 static fo_fill_kinfo_t	timerfd_fill_kinfo;
 
 /*
  * File operations for timerfd descriptors.  Operations with no timerfd
  * implementation are routed to the generic invfo_* handlers.
  */
 static struct fileops timerfdops = {
 	.fo_read = timerfd_read,
 	.fo_write = invfo_rdwr,
 	.fo_truncate = invfo_truncate,
 	.fo_ioctl = timerfd_ioctl,
 	.fo_poll = timerfd_poll,
 	.fo_kqfilter = timerfd_kqfilter,
 	.fo_stat = timerfd_stat,
 	.fo_close = timerfd_close,
 	.fo_chmod = invfo_chmod,
 	.fo_chown = invfo_chown,
 	.fo_sendfile = invfo_sendfile,
 	.fo_fill_kinfo = timerfd_fill_kinfo,
 	.fo_flags = DFLAG_PASSABLE
 };
 
 static void	filt_timerfddetach(struct knote *kn);
 static int	filt_timerfdread(struct knote *kn, long hint);
 
 /* kqueue filter operations installed by timerfd_kqfilter() for EVFILT_READ. */
 static struct filterops timerfd_rfiltops = {
 	.f_isfd = 1,
 	.f_detach = filt_timerfddetach,
 	.f_event = filt_timerfdread
 };
 
 /* Per-descriptor state of a timerfd, stored in the file's f_data. */
 struct timerfd {
 	clockid_t	tfd_clockid;	/* clock chosen at creation */
 	struct itimerspec tfd_time;	/* armed expiry time and interval */
 	struct callout	tfd_callout;	/* runs linux_timerfd_expire() */
 	timerfd_t	tfd_count;	/* expirations since last read */
 	bool		tfd_canceled;	/* set when settime disarms the timer */
 	struct selinfo	tfd_sel;	/* poll/kqueue waiters */
 	struct mtx	tfd_lock;	/* mutex for callout, knlist and state */
 };
 
 static void	linux_timerfd_expire(void *);
 static void	linux_timerfd_curval(struct timerfd *, struct itimerspec *);
 
 /*
  * Record the epoll user data for descriptor fd in the process's epoll
  * table, creating the table or growing it when fd exceeds the current
  * maximum index.
  */
 static void
 epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata)
 {
 	struct linux_pemuldata *pem;
 	struct epoll_emuldata *vec;
 
 	pem = pem_find(td->td_proc);
 	KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
 
 	LINUX_PEM_XLOCK(pem);
 	vec = pem->epoll;
 	if (vec == NULL) {
 		/* First use: size the table for this descriptor. */
 		vec = malloc(EPOLL_SIZE(fd), M_EPOLL, M_WAITOK);
 		vec->fdc = fd;
 		pem->epoll = vec;
 	} else if (fd > vec->fdc) {
 		/* Table too small: extend it up to the new index. */
 		vec = realloc(vec, EPOLL_SIZE(fd), M_EPOLL, M_WAITOK);
 		vec->fdc = fd;
 		pem->epoll = vec;
 	}
 	vec->udata[fd] = udata;
 	LINUX_PEM_XUNLOCK(pem);
 }
 
 /*
  * Shared body of epoll_create and epoll_create1: create a kqueue with
  * the given flags and, on success, make sure the epoll user data table
  * covers at least EPOLL_DEF_SZ.
  */
 static int
 epoll_create_common(struct thread *td, int flags)
 {
 	int error;
 
 	error = kern_kqueue(td, flags, NULL);
 	if (error == 0)
 		epoll_fd_install(td, EPOLL_DEF_SZ, 0);
 
 	return (error);
 }
 
 #ifdef LINUX_LEGACY_SYSCALLS
 /*
  * Legacy epoll_create.  The size argument is only validated: it must be
  * positive, and is otherwise ignored, as Linux does.
  */
 int
 linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args)
 {
 
 	return (args->size > 0 ? epoll_create_common(td, 0) : EINVAL);
 }
 #endif
 
 /*
  * epoll_create1: only LINUX_O_CLOEXEC is accepted, and it is translated
  * to the native O_CLOEXEC.
  */
 int
 linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args)
 {
 	int flags;
 
 	if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0)
 		return (EINVAL);
 
 	flags = (args->flags & LINUX_O_CLOEXEC) != 0 ? O_CLOEXEC : 0;
 
 	return (epoll_create_common(td, flags));
 }
 
 /*
  * Translate one epoll event request into kevent change records.
  *
  * Records are written to kevent[] and *nkevents is advanced for each of
  * them.  An event mask containing bits outside LINUX_EPOLL_EVSUP yields
  * EINVAL (after the records have been written), and the first such
  * request in a process is also logged.
  */
 static int
 epoll_to_kevent(struct thread *td, int fd, struct epoll_event *l_event,
     struct kevent *kevent, int *nkevents)
 {
 	struct linux_pemuldata *pem;
 	uint32_t levents;
 	unsigned short kev_flags;
 	int first_report;
 
 	levents = l_event->events;
 
 	/* Flags describing how the event is registered. */
 	kev_flags = EV_ADD | EV_ENABLE;
 	if ((levents & LINUX_EPOLLONESHOT) != 0)
 		kev_flags |= EV_DISPATCH;
 	if ((levents & LINUX_EPOLLET) != 0)
 		kev_flags |= EV_CLEAR;
 	if ((levents & LINUX_EPOLLERR) != 0)
 		kev_flags |= EV_ERROR;
 	if ((levents & LINUX_EPOLLRDHUP) != 0)
 		kev_flags |= EV_EOF;
 
 	/* One record per requested direction. */
 	if ((levents & LINUX_EPOLL_EVRD) != 0) {
 		EV_SET(kevent, fd, EVFILT_READ, kev_flags, 0, 0, 0);
 		kevent++;
 		++(*nkevents);
 	}
 	if ((levents & LINUX_EPOLL_EVWR) != 0) {
 		EV_SET(kevent, fd, EVFILT_WRITE, kev_flags, 0, 0, 0);
 		kevent++;
 		++(*nkevents);
 	}
 	/* An empty mask is legal: register a disabled read filter. */
 	if ((levents & (LINUX_EPOLL_EVRD | LINUX_EPOLL_EVWR)) == 0) {
 		EV_SET(kevent, fd, EVFILT_READ, EV_ADD|EV_DISABLE, 0, 0, 0);
 		kevent++;
 		++(*nkevents);
 	}
 
 	if ((levents & ~(LINUX_EPOLL_EVSUP)) == 0)
 		return (0);
 
 	/* Unsupported bits: complain once per process, then fail. */
 	pem = pem_find(td->td_proc);
 	KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
 	KASSERT(pem->epoll != NULL, ("epoll proc epolldata not found.\n"));
 
 	LINUX_PEM_XLOCK(pem);
 	first_report = (pem->flags & LINUX_XUNSUP_EPOLL) == 0;
 	if (first_report)
 		pem->flags |= LINUX_XUNSUP_EPOLL;
 	LINUX_PEM_XUNLOCK(pem);
 
 	if (first_report)
 		linux_msg(td, "epoll_ctl unsupported flags: 0x%x",
 		    levents);
 	return (EINVAL);
 }
 
 /*
  * Translate a kevent into an epoll event mask.  A kevent flagged
  * EV_ERROR is reported as LINUX_EPOLLERR; otherwise the mask follows the
  * filter type.  Filters other than read and write leave l_event->events
  * untouched.
  */
 static void
 kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event)
 {
 
 	if ((kevent->flags & EV_ERROR) != 0) {
 		l_event->events = LINUX_EPOLLERR;
 		return;
 	}
 
 	/* XXX EPOLLPRI, EPOLLHUP */
 	if (kevent->filter == EVFILT_READ) {
 		l_event->events = LINUX_EPOLLIN;
 		if ((kevent->flags & EV_EOF) != 0)
 			l_event->events |= LINUX_EPOLLRDHUP;
 	} else if (kevent->filter == EVFILT_WRITE) {
 		l_event->events = LINUX_EPOLLOUT;
 	}
 }
 
 /*
  * Copyout callback used by kevent. This converts kevent
  * events to epoll events and copies them back to the
  * userspace. This is also called on error on registering
  * of the filter.
  */
 static int
 epoll_kev_copyout(void *arg, struct kevent *kevp, int count)
 {
 	struct epoll_copyout_args *args;
 	struct linux_pemuldata *pem;
 	struct epoll_emuldata *emd;
 	struct epoll_event *eep;
 	int error, fd, i;
 
 	args = (struct epoll_copyout_args*) arg;
 	eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO);
 
 	pem = pem_find(args->p);
 	KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
 	LINUX_PEM_SLOCK(pem);
 	emd = pem->epoll;
 	KASSERT(emd != NULL, ("epoll proc epolldata not found.\n"));
 
 	for (i = 0; i < count; i++) {
 		kevent_to_epoll(&kevp[i], &eep[i]);
 
 		fd = kevp[i].ident;
 		KASSERT(fd <= emd->fdc, ("epoll user data vector"
 						    " is too small.\n"));
 		eep[i].data = emd->udata[fd];
 	}
 	LINUX_PEM_SUNLOCK(pem);
 
 	error = copyout(eep, args->leventlist, count * sizeof(*eep));
 	if (error == 0) {
 		args->leventlist += count;
 		args->count += count;
 	} else if (args->error == 0)
 		args->error = error;
 
 	free(eep, M_EPOLL);
 	return (error);
 }
 
 /*
  * Copyin callback used by kevent. This copies already
  * converted filters from kernel memory to the kevent
  * internal kernel memory. Hence the memcpy instead of
  * copyin.
  */
 static int
 epoll_kev_copyin(void *arg, struct kevent *kevp, int count)
 {
 	struct epoll_copyin_args *args;
 
 	args = (struct epoll_copyin_args*) arg;
 
 	memcpy(kevp, args->changelist, count * sizeof(*kevp));
 	args->changelist += count;
 
 	return (0);
 }
 
 /*
  * epoll_ctl: load an epoll filter, convert it to kevent filters and
  * register them with the kqueue behind the epoll descriptor.
  *
  * For ADD and MOD the user's event is copied in first.  Both the epoll
  * descriptor and the target descriptor are referenced for the duration
  * of the call and released at the leave0/leave1 labels.
  */
 int
 linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args)
 {
 	struct file *epfp, *fp;
 	struct epoll_copyin_args ciargs;
 	struct kevent kev[2];
 	struct kevent_copyops k_ops = { &ciargs,
 					NULL,
 					epoll_kev_copyin};
 	struct epoll_event le;
 	cap_rights_t rights;
 	int nchanges = 0;
 	int error;
 
 	/* DEL takes no event argument. */
 	if (args->op != LINUX_EPOLL_CTL_DEL) {
 		error = copyin(args->event, &le, sizeof(le));
 		if (error != 0)
 			return (error);
 	}
 
 	error = fget(td, args->epfd,
-	    cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &epfp);
+	    cap_rights_init_one(&rights, CAP_KQUEUE_CHANGE), &epfp);
 	if (error != 0)
 		return (error);
 	/* The epoll descriptor must be backed by a kqueue. */
 	if (epfp->f_type != DTYPE_KQUEUE) {
 		error = EINVAL;
 		goto leave1;
 	}
 
 	 /* Protect user data vector from incorrectly supplied fd. */
-	error = fget(td, args->fd, cap_rights_init(&rights, CAP_POLL_EVENT), &fp);
+	error = fget(td, args->fd,
+		     cap_rights_init_one(&rights, CAP_POLL_EVENT), &fp);
 	if (error != 0)
 		goto leave1;
 
 	/* An epoll descriptor may not be registered with itself. */
 	if (epfp == fp) {
 		error = EINVAL;
 		goto leave0;
 	}
 
 	ciargs.changelist = kev;
 
 	/* Build the kevent change list from the epoll event. */
 	if (args->op != LINUX_EPOLL_CTL_DEL) {
 		error = epoll_to_kevent(td, args->fd, &le, kev, &nchanges);
 		if (error != 0)
 			goto leave0;
 	}
 
 	switch (args->op) {
 	case LINUX_EPOLL_CTL_MOD:
 		/* MOD is implemented as delete followed by re-registration. */
 		error = epoll_delete_all_events(td, epfp, args->fd);
 		if (error != 0)
 			goto leave0;
 		break;
 
 	case LINUX_EPOLL_CTL_ADD:
 		/* Adding a descriptor that is already registered fails. */
 		if (epoll_fd_registered(td, epfp, args->fd)) {
 			error = EEXIST;
 			goto leave0;
 		}
 		break;
 
 	case LINUX_EPOLL_CTL_DEL:
 		/* CTL_DEL means unregister this fd with this epoll */
 		error = epoll_delete_all_events(td, epfp, args->fd);
 		goto leave0;
 
 	default:
 		error = EINVAL;
 		goto leave0;
 	}
 
 	/* Save the user data before the filters can fire. */
 	epoll_fd_install(td, args->fd, le.data);
 
 	error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL);
 
 leave0:
 	fdrop(fp, td);
 
 leave1:
 	fdrop(epfp, td);
 	return (error);
 }
 
 /*
  * Wait for a filter to be triggered on the epoll file descriptor.
  *
  * Shared body of epoll_wait and epoll_pwait.  maxevents must be in the
  * range 1..LINUX_MAX_EVENTS.  A non-negative timeout is in milliseconds;
  * any negative value blocks indefinitely.  When uset is non-NULL it is
  * installed as the signal mask for the duration of the wait.  On success
  * the number of events copied to `events' is the syscall return value.
  */
 static int
 linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events,
     int maxevents, int timeout, sigset_t *uset)
 {
 	struct epoll_copyout_args coargs;
 	struct kevent_copyops k_ops = { &coargs,
 					epoll_kev_copyout,
 					NULL};
 	struct timespec ts, *tsp;
 	cap_rights_t rights;
 	struct file *epfp;
 	sigset_t omask;
 	int error, error1;
 
 	if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS)
 		return (EINVAL);
 
 	error = fget(td, epfd,
-	    cap_rights_init(&rights, CAP_KQUEUE_EVENT), &epfp);
+	    cap_rights_init_one(&rights, CAP_KQUEUE_EVENT), &epfp);
 	if (error != 0)
 		return (error);
 	if (epfp->f_type != DTYPE_KQUEUE) {
 		error = EINVAL;
 		goto leave;
 	}
 	if (uset != NULL) {
 		error = kern_sigprocmask(td, SIG_SETMASK, uset,
 		    &omask, 0);
 		if (error != 0)
 			goto leave;
 		/*
 		 * NOTE(review): TDP_OLDMASK is set here while the previous
 		 * mask is kept only in the local omask -- confirm that this
 		 * matches what ast() expects to restore.
 		 */
 		td->td_pflags |= TDP_OLDMASK;
 		/*
 		 * Make sure that ast() is called on return to
 		 * usermode and TDP_OLDMASK is cleared, restoring old
 		 * sigmask.
 		 */
 		thread_lock(td);
 		td->td_flags |= TDF_ASTPENDING;
 		thread_unlock(td);
 	}
 
 	coargs.leventlist = events;
 	coargs.p = td->td_proc;
 	coargs.count = 0;
 	coargs.error = 0;
 
 	/*
 	 * Linux epoll_wait(2) man page states that timeout of -1 causes caller
 	 * to block indefinitely. Real implementation does it if any negative
 	 * timeout value is passed.
 	 */
 	if (timeout >= 0) {
 		/* Convert from milliseconds to timespec. */
 		ts.tv_sec = timeout / 1000;
 		ts.tv_nsec = (timeout % 1000) * 1000000;
 		tsp = &ts;
 	} else {
 		tsp = NULL;
 	}
 
 	error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp);
 	if (error == 0 && coargs.error != 0)
 		error = coargs.error;
 
 	/*
 	 * kern_kevent might return ENOMEM which is not expected from epoll_wait.
 	 * Maybe we should translate that but I don't think it matters at all.
 	 */
 	if (error == 0)
 		td->td_retval[0] = coargs.count;
 
 	if (uset != NULL) {
 		/*
 		 * Put the caller's mask back, but do not let a successful
 		 * restore hide an earlier failure (for example EINTR from
 		 * the wait itself).
 		 */
 		error1 = kern_sigprocmask(td, SIG_SETMASK, &omask,
 		    NULL, 0);
 		if (error == 0)
 			error = error1;
 	}
 leave:
 	fdrop(epfp, td);
 	return (error);
 }
 
 #ifdef LINUX_LEGACY_SYSCALLS
 /*
  * Legacy epoll_wait: the common wait path without a signal mask.
  */
 int
 linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args)
 {
 	int error;
 
 	error = linux_epoll_wait_common(td, args->epfd, args->events,
 	    args->maxevents, args->timeout, NULL);
 	return (error);
 }
 #endif
 
 /*
  * epoll_pwait: like epoll_wait, with an optional Linux signal mask that
  * is validated, copied in and converted to the native representation.
  */
 int
 linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args)
 {
 	sigset_t mask, *pmask;
 	l_sigset_t lmask;
 	int error;
 
 	pmask = NULL;
 	if (args->mask != NULL) {
 		/* Only the exact Linux sigset size is accepted. */
 		if (args->sigsetsize != sizeof(l_sigset_t))
 			return (EINVAL);
 		error = copyin(args->mask, &lmask, sizeof(l_sigset_t));
 		if (error != 0)
 			return (error);
 		linux_to_bsd_sigset(&lmask, &mask);
 		pmask = &mask;
 	}
 
 	return (linux_epoll_wait_common(td, args->epfd, args->events,
 	    args->maxevents, args->timeout, pmask));
 }
 
 static int
 epoll_register_kevent(struct thread *td, struct file *epfp, int fd, int filter,
     unsigned int flags)
 {
 	struct epoll_copyin_args ciargs;
 	struct kevent kev;
 	struct kevent_copyops k_ops = { &ciargs,
 					NULL,
 					epoll_kev_copyin};
 
 	ciargs.changelist = &kev;
 	EV_SET(&kev, fd, filter, flags, 0, 0, 0);
 
 	return (kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL));
 }
 
 /*
  * Report whether fd already has a read or write filter registered with
  * the epoll descriptor.  Returns 1 if so, 0 otherwise.
  */
 static int
 epoll_fd_registered(struct thread *td, struct file *epfp, int fd)
 {
 	/*
 	 * Probe with empty filter flags so that already registered events
 	 * are not modified by accident. When an event is re-registered:
 	 * 1. If the event does not exist, kevent() does nothing and
 	 *    returns ENOENT.
 	 * 2. If the event does exist, its enabled/disabled state is
 	 *    preserved but the fflags, data and udata fields are
 	 *    overwritten. So we can not set socket lowats or store the
 	 *    user's context pointer in udata.
 	 */
 	if (epoll_register_kevent(td, epfp, fd, EVFILT_READ, 0) != ENOENT)
 		return (1);
 	if (epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, 0) != ENOENT)
 		return (1);
 
 	return (0);
 }
 
 /*
  * Delete both the read and the write filter of fd from the epoll
  * descriptor.  Both deletions are always attempted.
  */
 static int
 epoll_delete_all_events(struct thread *td, struct file *epfp, int fd)
 {
 	int rd_error, wr_error;
 
 	rd_error = epoll_register_kevent(td, epfp, fd, EVFILT_READ, EV_DELETE);
 	wr_error = epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, EV_DELETE);
 
 	/* Succeed if either deletion worked; else report the write error. */
 	if (rd_error == 0)
 		return (0);
 	return (wr_error);
 }
 
 #ifdef LINUX_LEGACY_SYSCALLS
 /*
  * Legacy eventfd: create an eventfd with the given initial value and no
  * flags.
  */
 int
 linux_eventfd(struct thread *td, struct linux_eventfd_args *args)
 {
 	struct specialfd_eventfd ae;
 	int error;
 
 	bzero(&ae, sizeof(ae));
 	ae.initval = args->initval;
 
 	error = kern_specialfd(td, SPECIALFD_EVENTFD, &ae);
 	return (error);
 }
 #endif
 
 /*
  * eventfd2: create an eventfd, translating the Linux CLOEXEC, NONBLOCK
  * and SEMAPHORE flags to their EFD_* equivalents.  Any other flag bit is
  * rejected with EINVAL.
  */
 int
 linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args)
 {
 	struct specialfd_eventfd ae;
 	int flags;
 
 	if ((args->flags & ~(LINUX_O_CLOEXEC | LINUX_O_NONBLOCK |
 	    LINUX_EFD_SEMAPHORE)) != 0)
 		return (EINVAL);
 
 	flags = 0;
 	flags |= (args->flags & LINUX_O_CLOEXEC) != 0 ? EFD_CLOEXEC : 0;
 	flags |= (args->flags & LINUX_O_NONBLOCK) != 0 ? EFD_NONBLOCK : 0;
 	flags |= (args->flags & LINUX_EFD_SEMAPHORE) != 0 ? EFD_SEMAPHORE : 0;
 
 	bzero(&ae, sizeof(ae));
 	ae.initval = args->initval;
 	ae.flags = flags;
 	return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae));
 }
 
 int
 linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args)
 {
 	struct filedesc *fdp;
 	struct timerfd *tfd;
 	struct file *fp;
 	clockid_t clockid;
 	int fflags, fd, error;
 
 	if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0)
 		return (EINVAL);
 
 	error = linux_to_native_clockid(&clockid, args->clockid);
 	if (error != 0)
 		return (error);
 	if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
 		return (EINVAL);
 
 	fflags = 0;
 	if ((args->flags & LINUX_TFD_CLOEXEC) != 0)
 		fflags |= O_CLOEXEC;
 
 	fdp = td->td_proc->p_fd;
 	error = falloc(td, &fp, &fd, fflags);
 	if (error != 0)
 		return (error);
 
 	tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO);
 	tfd->tfd_clockid = clockid;
 	mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF);
 
 	callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0);
 	knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock);
 
 	fflags = FREAD;
 	if ((args->flags & LINUX_O_NONBLOCK) != 0)
 		fflags |= FNONBLOCK;
 
 	finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops);
 	fdrop(fp, td);
 
 	td->td_retval[0] = fd;
 	return (error);
 }
 
 /*
  * Close handler for timerfd descriptors: stop the timer, wait for a
  * running expiry handler to finish, then tear down and free the state.
  * Returns EINVAL if the file is not a timerfd.
  */
 static int
 timerfd_close(struct file *fp, struct thread *td)
 {
 	struct timerfd *tfd;
 
 	tfd = fp->f_data;
 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
 		return (EINVAL);
 
 	timespecclear(&tfd->tfd_time.it_value);
 	timespecclear(&tfd->tfd_time.it_interval);
 
 	/*
 	 * The callout was initialised with tfd_lock, so the handler needs
 	 * that mutex to run.  callout_drain() waits for the handler and
 	 * must therefore be called without tfd_lock held, otherwise the
 	 * two can deadlock.
 	 */
 	callout_drain(&tfd->tfd_callout);
 
 	seldrain(&tfd->tfd_sel);
 	knlist_destroy(&tfd->tfd_sel.si_note);
 
 	fp->f_ops = &badfileops;
 	mtx_destroy(&tfd->tfd_lock);
 	free(tfd, M_EPOLL);
 
 	return (0);
 }
 
 /*
  * Read handler for timerfd descriptors.
  *
  * Returns the number of expirations since the previous read as a
  * timerfd_t and resets the counter.  With no expirations pending the
  * call fails with EAGAIN on a non-blocking file or sleeps otherwise.
  * A canceled timer yields ECANCELED; a buffer smaller than timerfd_t
  * yields EINVAL.
  */
 static int
 timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct timerfd *tfd;
 	timerfd_t count;
 	int error;
 
 	tfd = fp->f_data;
 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
 		return (EINVAL);
 
 	if (uio->uio_resid < sizeof(timerfd_t))
 		return (EINVAL);
 
 	mtx_lock(&tfd->tfd_lock);
 	for (;;) {
 		if (tfd->tfd_canceled) {
 			tfd->tfd_count = 0;
 			mtx_unlock(&tfd->tfd_lock);
 			return (ECANCELED);
 		}
 		if (tfd->tfd_count != 0)
 			break;
 		if ((fp->f_flag & FNONBLOCK) != 0) {
 			mtx_unlock(&tfd->tfd_lock);
 			return (EAGAIN);
 		}
 		/* Sleep until an expiry wakes us, then check state again. */
 		error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0);
 		if (error != 0) {
 			mtx_unlock(&tfd->tfd_lock);
 			return (error);
 		}
 	}
 
 	/* Take the counter under the lock, copy it out after unlocking. */
 	count = tfd->tfd_count;
 	tfd->tfd_count = 0;
 	mtx_unlock(&tfd->tfd_lock);
 
 	return (uiomove(&count, sizeof(timerfd_t), uio));
 }
 
 /*
  * Poll handler for timerfd descriptors.  The descriptor is readable when
  * at least one expiration is pending; otherwise the thread is recorded
  * for a later wakeup.  Returns POLLERR if the file is not a timerfd.
  */
 static int
 timerfd_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 	struct timerfd *tfd;
 	int revents;
 
 	tfd = fp->f_data;
 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
 		return (POLLERR);
 
 	revents = 0;
 	mtx_lock(&tfd->tfd_lock);
 	if (tfd->tfd_count > 0)
 		revents = events & (POLLIN|POLLRDNORM);
 	if (revents == 0)
 		selrecord(td, &tfd->tfd_sel);
 	mtx_unlock(&tfd->tfd_lock);
 
 	return (revents);
 }
 
 /*
  * kqueue attach handler for timerfd descriptors.  Only EVFILT_READ is
  * supported; anything else, or a file that is not a timerfd, yields
  * EINVAL.
  */
 static int
 timerfd_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct timerfd *tfd;
 
 	tfd = fp->f_data;
 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
 		return (EINVAL);
 
 	if (kn->kn_filter != EVFILT_READ)
 		return (EINVAL);
 
 	kn->kn_fop = &timerfd_rfiltops;
 	kn->kn_hook = tfd;
 	knlist_add(&tfd->tfd_sel.si_note, kn, 0);
 
 	return (0);
 }
 
 /*
  * kqueue detach handler: remove the knote from the timerfd's note list
  * while holding the timerfd lock.
  */
 static void
 filt_timerfddetach(struct knote *kn)
 {
 	struct timerfd *tfd;
 
 	tfd = kn->kn_hook;
 
 	mtx_lock(&tfd->tfd_lock);
 	knlist_remove(&tfd->tfd_sel.si_note, kn, 1);
 	mtx_unlock(&tfd->tfd_lock);
 }
 
 /*
  * kqueue event handler: the read filter fires while at least one
  * expiration is pending.
  */
 static int
 filt_timerfdread(struct knote *kn, long hint)
 {
 	struct timerfd *tfd;
 
 	tfd = kn->kn_hook;
 	return (tfd->tfd_count != 0);
 }
 
 /*
  * ioctl handler for timerfd descriptors.  FIONBIO and FIOASYNC are
  * accepted and do nothing here; every other command yields ENOTTY.
  */
 static int
 timerfd_ioctl(struct file *fp, u_long cmd, void *data,
     struct ucred *active_cred, struct thread *td)
 {
 
 	if (fp->f_data == NULL || fp->f_type != DTYPE_LINUXTFD)
 		return (EINVAL);
 
 	if (cmd == FIONBIO || cmd == FIOASYNC)
 		return (0);
 
 	return (ENOTTY);
 }
 
 /* stat handler for timerfd descriptors: not implemented, always ENXIO. */
 static int
 timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (ENXIO);
 }
 
 /* kinfo handler for timerfd descriptors: reports the type as unknown. */
 static int
 timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 
 	kif->kf_type = KF_TYPE_UNKNOWN;
 	return (0);
 }
 
 /*
  * Read the current time of the clock the timerfd was created with:
  * wall-clock time for CLOCK_REALTIME, uptime for anything else.
  */
 static void
 linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts)
 {
 
 	if (tfd->tfd_clockid != CLOCK_REALTIME) {
 		/* CLOCK_MONOTONIC */
 		getnanouptime(ts);
 		return;
 	}
 	getnanotime(ts);
 }
 
 static void
 linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots)
 {
 	struct timespec cts;
 
 	linux_timerfd_clocktime(tfd, &cts);
 	*ots = tfd->tfd_time;
 	if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) {
 		timespecsub(&ots->it_value, &cts, &ots->it_value);
 		if (ots->it_value.tv_sec < 0 ||
 		    (ots->it_value.tv_sec == 0 &&
 		     ots->it_value.tv_nsec == 0)) {
 			ots->it_value.tv_sec  = 0;
 			ots->it_value.tv_nsec = 1;
 		}
 	}
 }
 
 /*
  * timerfd_gettime: copy the timer's current setting, converted to the
  * Linux itimerspec layout, to args->old_value.  Returns EINVAL when the
  * descriptor is not a timerfd.
  */
 int
 linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args)
 {
 	struct l_itimerspec lots;
 	struct itimerspec ots;
 	struct timerfd *tfd;
 	struct file *fp;
 	int error;
 
 	error = fget(td, args->fd, &cap_read_rights, &fp);
 	if (error != 0)
 		return (error);
 
 	tfd = fp->f_data;
 	if (fp->f_type == DTYPE_LINUXTFD && tfd != NULL) {
 		/* Snapshot under the lock, convert and copy out without it. */
 		mtx_lock(&tfd->tfd_lock);
 		linux_timerfd_curval(tfd, &ots);
 		mtx_unlock(&tfd->tfd_lock);
 
 		error = native_to_linux_itimerspec(&lots, &ots);
 		if (error == 0)
 			error = copyout(&lots, args->old_value, sizeof(lots));
 	} else {
 		error = EINVAL;
 	}
 
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * timerfd_settime: arm or disarm the timer.
  *
  * The new setting is copied in and converted before the descriptor is
  * looked up.  A zero it_value disarms the timer and marks it canceled;
  * otherwise the callout is scheduled for the requested expiry.  When
  * args->old_value is non-NULL the previous setting is returned to the
  * caller.
  */
 int
 linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args)
 {
 	struct l_itimerspec lots;
 	struct itimerspec nts, ots;
 	struct timespec cts, ts;
 	struct timerfd *tfd;
 	struct timeval tv;
 	struct file *fp;
 	int error;
 
 	if ((args->flags & ~LINUX_TFD_SETTIME_FLAGS) != 0)
 		return (EINVAL);
 
 	error = copyin(args->new_value, &lots, sizeof(lots));
 	if (error != 0)
 		return (error);
 	error = linux_to_native_itimerspec(&nts, &lots);
 	if (error != 0)
 		return (error);
 
 	error = fget(td, args->fd, &cap_write_rights, &fp);
 	if (error != 0)
 		return (error);
 	tfd = fp->f_data;
 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
 		error = EINVAL;
 		goto out;
 	}
 
 	mtx_lock(&tfd->tfd_lock);
 	/* An interval is meaningless without an initial expiry. */
 	if (!timespecisset(&nts.it_value))
 		timespecclear(&nts.it_interval);
 	/* Capture the old setting before it is overwritten. */
 	if (args->old_value != NULL)
 		linux_timerfd_curval(tfd, &ots);
 
 	tfd->tfd_time = nts;
 	if (timespecisset(&nts.it_value)) {
 		linux_timerfd_clocktime(tfd, &cts);
 		/* ts becomes the delay from now until the first expiry. */
 		ts = nts.it_value;
 		if ((args->flags & LINUX_TFD_TIMER_ABSTIME) == 0) {
 			/* Relative: store the absolute expiry time. */
 			timespecadd(&tfd->tfd_time.it_value, &cts,
 				&tfd->tfd_time.it_value);
 		} else {
 			/* Absolute: derive the delay from the current time. */
 			timespecsub(&ts, &cts, &ts);
 		}
 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
 		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
 			linux_timerfd_expire, tfd);
 		tfd->tfd_canceled = false;
 	} else {
 		/* Disarm; timerfd_read() returns ECANCELED while this is set. */
 		tfd->tfd_canceled = true;
 		callout_stop(&tfd->tfd_callout);
 	}
 	mtx_unlock(&tfd->tfd_lock);
 
 	if (args->old_value != NULL) {
 		error = native_to_linux_itimerspec(&lots, &ots);
 		if (error == 0)
 			error = copyout(&lots, args->old_value, sizeof(lots));
 	}
 
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Callout handler for a timerfd.
  *
  * If the expiry time has been reached, the expiration counter is bumped
  * and all waiters (kqueue, select/poll, sleeping readers) are notified;
  * a periodic timer is advanced by its interval and re-armed, a one-shot
  * timer is cleared.  If the handler ran early, the callout is simply
  * re-armed for the remaining time.
  *
  * NOTE(review): the callout is initialised with tfd_lock, so this
  * presumably runs with that mutex held (KNOTE_LOCKED relies on it) --
  * confirm.
  */
 static void
 linux_timerfd_expire(void *arg)
 {
 	struct timespec cts, ts;
 	struct timeval tv;
 	struct timerfd *tfd;
 
 	tfd = (struct timerfd *)arg;
 
 	linux_timerfd_clocktime(tfd, &cts);
 	if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) {
 		if (timespecisset(&tfd->tfd_time.it_interval))
 			/* periodic timer: next expiry is one interval later */
 			timespecadd(&tfd->tfd_time.it_value,
 				    &tfd->tfd_time.it_interval,
 				    &tfd->tfd_time.it_value);
 		else
 			/* single shot timer */
 			timespecclear(&tfd->tfd_time.it_value);
 		/* Re-arm only if an expiry time is still set. */
 		if (timespecisset(&tfd->tfd_time.it_value)) {
 			timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
 			TIMESPEC_TO_TIMEVAL(&tv, &ts);
 			callout_reset(&tfd->tfd_callout, tvtohz(&tv),
 				linux_timerfd_expire, tfd);
 		}
 		tfd->tfd_count++;
 		KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0);
 		selwakeup(&tfd->tfd_sel);
 		wakeup(&tfd->tfd_count);
 	} else if (timespecisset(&tfd->tfd_time.it_value)) {
 		/* Fired before the expiry time: wait for the remainder. */
 		timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
 		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
 		    linux_timerfd_expire, tfd);
 	}
 }
diff --git a/sys/dev/aac/aac_linux.c b/sys/dev/aac/aac_linux.c
index 02babb5c0576..3f2d8a827441 100644
--- a/sys/dev/aac/aac_linux.c
+++ b/sys/dev/aac/aac_linux.c
@@ -1,96 +1,97 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2002 Scott Long
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Linux ioctl handler for the aac device driver
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/file.h>
 #include <sys/proc.h>
 #ifdef __amd64__
 #include <machine/../linux32/linux.h>
 #include <machine/../linux32/linux32_proto.h>
 #else
 #include <machine/../linux/linux.h>
 #include <machine/../linux/linux_proto.h>
 #endif
 #include <compat/linux/linux_ioctl.h>
 
 /* There are multiple ioctl number ranges that need to be handled */
 #define AAC_LINUX_IOCTL_MIN  0x0000
 #define AAC_LINUX_IOCTL_MAX  0x21ff
 
 /*
  * Handler descriptor: aac_linux_ioctl is the handler function for the
  * command range [AAC_LINUX_IOCTL_MIN, AAC_LINUX_IOCTL_MAX].
  */
 static linux_ioctl_function_t aac_linux_ioctl;
 static struct linux_ioctl_handler aac_linux_handler = {aac_linux_ioctl,
 						       AAC_LINUX_IOCTL_MIN,
 						       AAC_LINUX_IOCTL_MAX};
 
 /*
  * Hook linux_ioctl_register_handler()/linux_ioctl_unregister_handler() up to
  * the descriptor above at SI_SUB_KLD, SI_ORDER_MIDDLE.
  */
 SYSINIT  (aac_linux_register,   SI_SUB_KLD, SI_ORDER_MIDDLE,
 	  linux_ioctl_register_handler, &aac_linux_handler);
 SYSUNINIT(aac_linux_unregister, SI_SUB_KLD, SI_ORDER_MIDDLE,
 	  linux_ioctl_unregister_handler, &aac_linux_handler);
 
 /*
  * Module event handler.  Every event is accepted without any action; the
  * handler registration itself is done through SYSINIT/SYSUNINIT.
  */
 static int
 aac_linux_modevent(module_t mod, int type, void *data)
 {
 	/* Do we care about any specific load/unload actions? */
 	return (0);
 }
 
 /* Declare the module and its dependency on the linux module. */
 DEV_MODULE(aac_linux, aac_linux_modevent, NULL);
 MODULE_DEPEND(aac_linux, linux, 1, 1, 1);
 
 /*
  * Linux ioctl entry point referenced by aac_linux_handler.
  *
  * Looks up args->fd (requiring the CAP_IOCTL capability right), forwards
  * args->cmd and args->arg unchanged to the file's ioctl method through
  * fo_ioctl(), and drops the file reference again.
  *
  * Returns the fget() error if the descriptor lookup fails, otherwise the
  * result of fo_ioctl().
  */
 static int
 aac_linux_ioctl(struct thread *td, struct linux_ioctl_args *args)
 {
 	cap_rights_t rights;
 	struct file *fp;
 	u_long cmd;
 	int error;
 
-	error = fget(td, args->fd, cap_rights_init(&rights, CAP_IOCTL), &fp);
+	error = fget(td, args->fd, cap_rights_init_one(&rights, CAP_IOCTL),
+	    &fp);
 	if (error != 0)
 		return (error);
 	cmd = args->cmd;
 
 	/*
 	 * Pass the ioctl off to our standard handler.
 	 */
 	error = (fo_ioctl(fp, cmd, (caddr_t)args->arg, td->td_ucred, td));
 	fdrop(fp, td);
 	return (error);
 }
diff --git a/sys/dev/aacraid/aacraid_linux.c b/sys/dev/aacraid/aacraid_linux.c
index b58b8bacd7f3..7592a0d5a8cb 100644
--- a/sys/dev/aacraid/aacraid_linux.c
+++ b/sys/dev/aacraid/aacraid_linux.c
@@ -1,100 +1,100 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2002 Scott Long
  * Copyright (c) 2002-2010 Adaptec, Inc.
  * Copyright (c) 2010-2012 PMC-Sierra, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Linux ioctl handler for the aacraid device driver
  */
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/file.h>
 #include <sys/proc.h>
 #ifdef __amd64__
 #include <machine/../linux32/linux.h>
 #include <machine/../linux32/linux32_proto.h>
 #else
 #include <machine/../linux/linux.h>
 #include <machine/../linux/linux_proto.h>
 #endif
 #include <compat/linux/linux_ioctl.h>
 
 /* There are multiple ioctl number ranges that need to be handled */
 #define AAC_LINUX_IOCTL_MIN  0x0000
 #define AAC_LINUX_IOCTL_MAX  0x21ff
 
 /*
  * Handler descriptor: aacraid_linux_ioctl is the handler function for the
  * command range [AAC_LINUX_IOCTL_MIN, AAC_LINUX_IOCTL_MAX].
  */
 static linux_ioctl_function_t aacraid_linux_ioctl;
 static struct linux_ioctl_handler aacraid_linux_handler = {aacraid_linux_ioctl,
 						       AAC_LINUX_IOCTL_MIN,
 						       AAC_LINUX_IOCTL_MAX};
 
 /*
  * Hook linux_ioctl_register_handler()/linux_ioctl_unregister_handler() up to
  * the descriptor above at SI_SUB_KLD, SI_ORDER_MIDDLE.
  */
 SYSINIT  (aacraid_linux_register,   SI_SUB_KLD, SI_ORDER_MIDDLE,
 	  linux_ioctl_register_handler, &aacraid_linux_handler);
 SYSUNINIT(aacraid_linux_unregister, SI_SUB_KLD, SI_ORDER_MIDDLE,
 	  linux_ioctl_unregister_handler, &aacraid_linux_handler);
 
 /*
  * Module event handler.  Every event is accepted without any action; the
  * handler registration itself is done through SYSINIT/SYSUNINIT.
  */
 static int
 aacraid_linux_modevent(module_t mod, int type, void *data)
 {
 	/* Do we care about any specific load/unload actions? */
 	return (0);
 }
 
 /* Declare the module and its dependency on the linux module. */
 DEV_MODULE(aacraid_linux, aacraid_linux_modevent, NULL);
 MODULE_DEPEND(aacraid_linux, linux, 1, 1, 1);
 
 /*
  * Linux ioctl entry point referenced by aacraid_linux_handler.
  *
  * Looks up args->fd (requiring the CAP_IOCTL capability right), forwards
  * args->cmd and args->arg unchanged to the file's ioctl method through
  * fo_ioctl(), and drops the file reference again.
  *
  * Returns the fget() error if the descriptor lookup fails, otherwise the
  * result of fo_ioctl().
  */
 static int
 aacraid_linux_ioctl(struct thread *td, struct linux_ioctl_args *args)
 {
 	struct file *fp;
 	cap_rights_t rights;
 	u_long cmd;
 	int error;
 
 	if ((error = fget(td, args->fd,
-	    cap_rights_init(&rights, CAP_IOCTL),
+	    cap_rights_init_one(&rights, CAP_IOCTL),
 	    &fp)) != 0) {
 		return (error);
 	}
 	cmd = args->cmd;
 
 	/*
 	 * Pass the ioctl off to our standard handler.
 	 */
 	error = (fo_ioctl(fp, cmd, (caddr_t)args->arg, td->td_ucred, td));
 	fdrop(fp, td);
 	return (error);
 }
diff --git a/sys/dev/amr/amr_linux.c b/sys/dev/amr/amr_linux.c
index 967907cc6846..9e0a5cac693e 100644
--- a/sys/dev/amr/amr_linux.c
+++ b/sys/dev/amr/amr_linux.c
@@ -1,87 +1,87 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2005 Paul Saab
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/file.h>
 #include <sys/proc.h>
 
 #if defined(__amd64__) /* Assume amd64 wants 32 bit Linux */
 #include <machine/../linux32/linux.h>
 #include <machine/../linux32/linux32_proto.h>
 #else
 #include <machine/../linux/linux.h>
 #include <machine/../linux/linux_proto.h>
 #endif
 #include <compat/linux/linux_ioctl.h>
 
 /* There are multiple ioctl number ranges that need to be handled */
 #define AMR_LINUX_IOCTL_MIN  0x6d00
 #define AMR_LINUX_IOCTL_MAX  0x6d01
 
 /*
  * Handler descriptor: amr_linux_ioctl is the handler function for the
  * command range [AMR_LINUX_IOCTL_MIN, AMR_LINUX_IOCTL_MAX].
  */
 static linux_ioctl_function_t amr_linux_ioctl;
 static struct linux_ioctl_handler amr_linux_handler = {amr_linux_ioctl,
 						       AMR_LINUX_IOCTL_MIN,
 						       AMR_LINUX_IOCTL_MAX};
 
 /*
  * Hook linux_ioctl_register_handler()/linux_ioctl_unregister_handler() up to
  * the descriptor above at SI_SUB_KLD, SI_ORDER_MIDDLE.
  */
 SYSINIT  (amr_register,   SI_SUB_KLD, SI_ORDER_MIDDLE,
 	  linux_ioctl_register_handler, &amr_linux_handler);
 SYSUNINIT(amr_unregister, SI_SUB_KLD, SI_ORDER_MIDDLE,
 	  linux_ioctl_unregister_handler, &amr_linux_handler);
 
 /*
  * Module event handler.  Every event is accepted without any action; the
  * handler registration itself is done through SYSINIT/SYSUNINIT.
  */
 static int
 amr_linux_modevent(module_t mod, int cmd, void *data)
 {
 	return (0);
 }
 
 /*
  * Declare the module and its dependency on the linux module.
  * NOTE(review): the module is declared as "amr_linux" but the dependency is
  * attached to the name "amr" -- confirm that this is intended.
  */
 DEV_MODULE(amr_linux, amr_linux_modevent, NULL);
 MODULE_DEPEND(amr, linux, 1, 1, 1);
 
 /*
  * Linux ioctl entry point referenced by amr_linux_handler.
  *
  * Looks up args->fd (requiring the CAP_IOCTL capability right), forwards
  * args->cmd and args->arg unchanged to the file's ioctl method through
  * fo_ioctl(), and drops the file reference again.  Note that the thread
  * argument is named p here.
  *
  * Returns the fget() error if the descriptor lookup fails, otherwise the
  * result of fo_ioctl().
  */
 static int
 amr_linux_ioctl(struct thread *p, struct linux_ioctl_args *args)
 {
 	cap_rights_t rights;
 	struct file *fp;
 	int error;
 
-	error = fget(p, args->fd, cap_rights_init(&rights, CAP_IOCTL), &fp);
+	error = fget(p, args->fd, cap_rights_init_one(&rights, CAP_IOCTL), &fp);
 	if (error != 0)
 		return (error);
 	error = fo_ioctl(fp, args->cmd, (caddr_t)args->arg, p->td_ucred, p);
 	fdrop(fp, p);
 	return (error);
 }
diff --git a/sys/dev/cxgbe/cxgbei/icl_cxgbei.c b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c
index f5d4f03476cb..5588d3cb9511 100644
--- a/sys/dev/cxgbe/cxgbei/icl_cxgbei.c
+++ b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c
@@ -1,1098 +1,1098 @@
 /*-
  * Copyright (c) 2012 The FreeBSD Foundation
  * Copyright (c) 2015 Chelsio Communications, Inc.
  * All rights reserved.
  *
  * This software was developed by Edward Tomasz Napierala under sponsorship
  * from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 /*
  * cxgbei implementation of iSCSI Common Layer kobj(9) interface.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #ifdef TCP_OFFLOAD
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/condvar.h>
 #include <sys/conf.h>
 #include <sys/file.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/module.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/sx.h>
 #include <sys/uio.h>
 #include <machine/bus.h>
 #include <vm/uma.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_var.h>
 #include <netinet/toecore.h>
 
 #include <dev/iscsi/icl.h>
 #include <dev/iscsi/iscsi_proto.h>
 #include <icl_conn_if.h>
 
 #include <cam/scsi/scsi_all.h>
 #include <cam/scsi/scsi_da.h>
 #include <cam/ctl/ctl_io.h>
 #include <cam/ctl/ctl.h>
 #include <cam/ctl/ctl_backend.h>
 #include <cam/ctl/ctl_error.h>
 #include <cam/ctl/ctl_frontend.h>
 #include <cam/ctl/ctl_debug.h>
 #include <cam/ctl/ctl_ha.h>
 #include <cam/ctl/ctl_ioctl.h>
 
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 #include <cam/cam_xpt.h>
 #include <cam/cam_debug.h>
 #include <cam/cam_sim.h>
 #include <cam/cam_xpt_sim.h>
 #include <cam/cam_xpt_periph.h>
 #include <cam/cam_periph.h>
 #include <cam/cam_compat.h>
 #include <cam/scsi/scsi_message.h>
 
 #include "common/common.h"
 #include "common/t4_tcb.h"
 #include "tom/t4_tom.h"
 #include "cxgbei.h"
 
 /*
  * Driver tunables.  Each one is an int exported with CTLFLAG_RWTUN beneath
  * the cxgbei node that is created under _kern_icl.
  */
 SYSCTL_NODE(_kern_icl, OID_AUTO, cxgbei, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "Chelsio iSCSI offload");
 static int coalesce = 1;
 SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, coalesce, CTLFLAG_RWTUN,
 	&coalesce, 0, "Try to coalesce PDUs before sending");
 static int partial_receive_len = 128 * 1024;
 SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, partial_receive_len, CTLFLAG_RWTUN,
     &partial_receive_len, 0, "Minimum read size for partially received "
     "data segment");
 /* Lower bound for the send buffer reserved in icl_cxgbei_setsockopt(). */
 static int sendspace = 1048576;
 SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, sendspace, CTLFLAG_RWTUN,
     &sendspace, 0, "Default send socket buffer size");
 /* Lower bound for the receive buffer reserved in icl_cxgbei_setsockopt(). */
 static int recvspace = 1048576;
 SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, recvspace, CTLFLAG_RWTUN,
     &recvspace, 0, "Default receive socket buffer size");
 
 /* UMA zone that the DDP page pod reservations (prsv) are allocated from. */
 static uma_zone_t prsv_zone;
 /*
  * Connection counter: acquired in icl_cxgbei_new_conn() and released in
  * icl_cxgbei_conn_free() through the refcount(9) helpers.
  */
 static volatile u_int icl_cxgbei_ncons;
 
 /*
  * Wrappers around the connection mutex.  ic_lock is a pointer to a mutex
  * that is supplied by whoever creates the connection (it is stored by
  * icl_cxgbei_new_conn()).  The macro argument is parenthesized so that any
  * pointer-valued expression, not just a plain identifier, may be passed.
  */
 #define ICL_CONN_LOCK(X)		mtx_lock((X)->ic_lock)
 #define ICL_CONN_UNLOCK(X)		mtx_unlock((X)->ic_lock)
 #define ICL_CONN_LOCK_ASSERT(X)		mtx_assert((X)->ic_lock, MA_OWNED)
 #define ICL_CONN_LOCK_ASSERT_NOT(X)	mtx_assert((X)->ic_lock, MA_NOTOWNED)
 
 /* PDU allocation helpers with external linkage. */
 struct icl_pdu *icl_cxgbei_new_pdu(int);
 void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *, struct icl_conn *);
 
 /*
  * Forward declarations of the icl_conn kobj method implementations.  All of
  * them are file-local except icl_cxgbei_conn_pdu_free, which is declared
  * without static.
  */
 static icl_conn_new_pdu_t	icl_cxgbei_conn_new_pdu;
 icl_conn_pdu_free_t	icl_cxgbei_conn_pdu_free;
 static icl_conn_pdu_data_segment_length_t
 				    icl_cxgbei_conn_pdu_data_segment_length;
 static icl_conn_pdu_append_data_t	icl_cxgbei_conn_pdu_append_data;
 static icl_conn_pdu_get_data_t	icl_cxgbei_conn_pdu_get_data;
 static icl_conn_pdu_queue_t	icl_cxgbei_conn_pdu_queue;
 static icl_conn_handoff_t	icl_cxgbei_conn_handoff;
 static icl_conn_free_t		icl_cxgbei_conn_free;
 static icl_conn_close_t		icl_cxgbei_conn_close;
 static icl_conn_task_setup_t	icl_cxgbei_conn_task_setup;
 static icl_conn_task_done_t	icl_cxgbei_conn_task_done;
 static icl_conn_transfer_setup_t	icl_cxgbei_conn_transfer_setup;
 static icl_conn_transfer_done_t	icl_cxgbei_conn_transfer_done;
 
 /* Maps each icl_conn interface method to its cxgbei implementation. */
 static kobj_method_t icl_cxgbei_methods[] = {
 	KOBJMETHOD(icl_conn_new_pdu, icl_cxgbei_conn_new_pdu),
 	KOBJMETHOD(icl_conn_pdu_free, icl_cxgbei_conn_pdu_free),
 	KOBJMETHOD(icl_conn_pdu_data_segment_length,
 	    icl_cxgbei_conn_pdu_data_segment_length),
 	KOBJMETHOD(icl_conn_pdu_append_data, icl_cxgbei_conn_pdu_append_data),
 	KOBJMETHOD(icl_conn_pdu_get_data, icl_cxgbei_conn_pdu_get_data),
 	KOBJMETHOD(icl_conn_pdu_queue, icl_cxgbei_conn_pdu_queue),
 	KOBJMETHOD(icl_conn_handoff, icl_cxgbei_conn_handoff),
 	KOBJMETHOD(icl_conn_free, icl_cxgbei_conn_free),
 	KOBJMETHOD(icl_conn_close, icl_cxgbei_conn_close),
 	KOBJMETHOD(icl_conn_task_setup, icl_cxgbei_conn_task_setup),
 	KOBJMETHOD(icl_conn_task_done, icl_cxgbei_conn_task_done),
 	KOBJMETHOD(icl_conn_transfer_setup, icl_cxgbei_conn_transfer_setup),
 	KOBJMETHOD(icl_conn_transfer_done, icl_cxgbei_conn_transfer_done),
 	{ 0, 0 }
 };
 
 /* kobj class; instances are sized to hold a struct icl_cxgbei_conn. */
 DEFINE_CLASS(icl_cxgbei, icl_cxgbei_methods, sizeof(struct icl_cxgbei_conn));
 
 /*
  * Free a PDU together with its AHS, data and BHS mbuf chains.
  *
  * ic must be the connection recorded in the PDU (asserted).  With DIAGNOSTIC
  * the connection's outstanding PDU count is dropped as well.
  */
 void
 icl_cxgbei_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip)
 {
 #ifdef INVARIANTS
 	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
 #endif
 
 	MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
 	MPASS(ic == ip->ip_conn);
 	MPASS(ip->ip_bhs_mbuf != NULL);
 
 	/*
 	 * The BHS mbuf must go last: it provides the storage for the PDU
 	 * structure, and ip is still being dereferenced until then.
 	 */
 	m_freem(ip->ip_ahs_mbuf);
 	m_freem(ip->ip_data_mbuf);
 	m_freem(ip->ip_bhs_mbuf);	/* storage for icl_cxgbei_pdu itself */
 
 #ifdef DIAGNOSTIC
 	if (__predict_true(ic != NULL))
 		refcount_release(&ic->ic_outstanding_pdus);
 #endif
 }
 
 /*
  * Allocate a PDU that is not yet associated with a connection.
  *
  * One packet-header mbuf is obtained with m_gethdr(flags, MT_DATA) and its
  * data area is carved up to hold a zeroed struct icl_cxgbei_pdu followed by
  * a zeroed BHS (both placed with roundup2()).  The mbuf is then pointed at
  * the BHS so that it describes exactly one BHS worth of data.
  *
  * Returns the icl_pdu embedded in the new structure, or NULL if the mbuf
  * allocation fails.
  */
 struct icl_pdu *
 icl_cxgbei_new_pdu(int flags)
 {
 	struct icl_cxgbei_pdu *icp;
 	struct iscsi_bhs *bhs;
 	struct mbuf *m;
 	uintptr_t addr;
 
 	m = m_gethdr(flags, MT_DATA);
 	if (__predict_false(m == NULL))
 		return (NULL);
 
 	/* The driver's PDU structure comes first in the mbuf's storage. */
 	addr = roundup2(mtod(m, uintptr_t), _Alignof(struct icl_cxgbei_pdu));
 	icp = (struct icl_cxgbei_pdu *)addr;
 	bzero(icp, sizeof(*icp));
 	icp->icp_signature = CXGBEI_PDU_SIGNATURE;
 	icp->ip.ip_bhs_mbuf = m;
 
 	/* The BHS is placed right behind it. */
 	addr = roundup2((uintptr_t)(icp + 1), _Alignof(struct iscsi_bhs *));
 	bhs = (struct iscsi_bhs *)addr;
 	icp->ip.ip_bhs = bhs;
 #ifdef INVARIANTS
 	/* Everything must fit entirely in the mbuf. */
 	addr = (uintptr_t)(bhs + 1);
 	MPASS(addr <= (uintptr_t)m + MSIZE);
 #endif
 	bzero(bhs, sizeof(*bhs));
 
 	/* Make the mbuf cover the BHS and nothing else. */
 	m->m_data = (void *)bhs;
 	m->m_len = sizeof(struct iscsi_bhs);
 	m->m_pkthdr.len = m->m_len;
 
 	return (&icp->ip);
 }
 
 /*
  * Associate a PDU with connection ic.  With DIAGNOSTIC the connection's
  * outstanding PDU count is raised; it is dropped again when the PDU is
  * freed or finalized.
  */
 void
 icl_cxgbei_new_pdu_set_conn(struct icl_pdu *ip, struct icl_conn *ic)
 {
 
 	ip->ip_conn = ic;
 #ifdef DIAGNOSTIC
 	refcount_acquire(&ic->ic_outstanding_pdus);
 #endif
 }
 
 /*
  * icl_conn_new_pdu method: allocate an icl_pdu with an empty BHS for the
  * caller to fill in and tie it to connection ic.  Returns NULL if the
  * allocation fails.
  */
 static struct icl_pdu *
 icl_cxgbei_conn_new_pdu(struct icl_conn *ic, int flags)
 {
 	struct icl_pdu *pdu = icl_cxgbei_new_pdu(flags);
 
 	if (__predict_true(pdu != NULL))
 		icl_cxgbei_new_pdu_set_conn(pdu, ic);
 
 	return (pdu);
 }
 
 static size_t
 icl_pdu_data_segment_length(const struct icl_pdu *request)
 {
 	uint32_t len = 0;
 
 	len += request->ip_bhs->bhs_data_segment_len[0];
 	len <<= 8;
 	len += request->ip_bhs->bhs_data_segment_len[1];
 	len <<= 8;
 	len += request->ip_bhs->bhs_data_segment_len[2];
 
 	return (len);
 }
 
 /*
  * icl_conn_pdu_data_segment_length method.  The connection is not needed;
  * the length is decoded straight from the BHS of request.
  */
 size_t
 icl_cxgbei_conn_pdu_data_segment_length(struct icl_conn *ic,
     const struct icl_pdu *request)
 {
 
 	return (icl_pdu_data_segment_length(request));
 }
 
 /*
  * Convert a fully built PDU into a single mbuf chain.
  *
  * The data segment (if any) is zero-padded to a multiple of 4 bytes, the
  * data segment length is written into the BHS, the data mbufs are linked
  * behind the BHS mbuf and the ULP submode is recorded in the head mbuf.
  * ULP_CRC_DATA is masked out of the submode for PDUs without data.
  *
  * Returns the BHS mbuf, which heads the chain.  The icp/ip structures must
  * not be used by the caller afterwards.
  */
 static struct mbuf *
 finalize_pdu(struct icl_cxgbei_conn *icc, struct icl_cxgbei_pdu *icp)
 {
 	struct icl_pdu *ip = &icp->ip;
 	uint8_t ulp_submode, padding;
 	struct mbuf *m, *last;
 	struct iscsi_bhs *bhs;
 
 	/*
 	 * Fix up the data segment mbuf first.
 	 */
 	m = ip->ip_data_mbuf;
 	ulp_submode = icc->ulp_submode;
 	if (m) {
 		last = m_last(m);
 
 		/*
 		 * Round up the data segment to a 4B boundary.  Pad with 0 if
 		 * necessary.  There will definitely be room in the mbuf.
 		 */
 		padding = roundup2(ip->ip_data_len, 4) - ip->ip_data_len;
 		if (padding) {
 			bzero(mtod(last, uint8_t *) + last->m_len, padding);
 			last->m_len += padding;
 		}
 	} else {
 		MPASS(ip->ip_data_len == 0);
 		ulp_submode &= ~ULP_CRC_DATA;
 		padding = 0;
 	}
 
 	/*
 	 * Now the header mbuf that has the BHS.
 	 */
 	m = ip->ip_bhs_mbuf;
 	MPASS(m->m_pkthdr.len == sizeof(struct iscsi_bhs));
 	MPASS(m->m_len == sizeof(struct iscsi_bhs));
 
 	/* The length in the BHS is the unpadded one, MSB first. */
 	bhs = ip->ip_bhs;
 	bhs->bhs_data_segment_len[2] = ip->ip_data_len;
 	bhs->bhs_data_segment_len[1] = ip->ip_data_len >> 8;
 	bhs->bhs_data_segment_len[0] = ip->ip_data_len >> 16;
 
 	/* "Convert" PDU to mbuf chain.  Do not use icp/ip after this. */
 	m->m_pkthdr.len = sizeof(struct iscsi_bhs) + ip->ip_data_len + padding;
 	m->m_next = ip->ip_data_mbuf;
 	set_mbuf_ulp_submode(m, ulp_submode);
 #ifdef INVARIANTS
 	/* Wipe the PDU so that any later use of it trips an assertion. */
 	bzero(icp, sizeof(*icp));
 #endif
 #ifdef DIAGNOSTIC
 	refcount_release(&icc->ic.ic_outstanding_pdus);
 #endif
 
 	return (m);
 }
 
 /*
  * icl_conn_pdu_append_data method: append len bytes starting at addr to the
  * data segment of ip.
  *
  * The first call allocates a 16KB jumbo cluster mbuf (M_NOWAIT) to start the
  * data chain; the bytes are then copied in with m_append().
  *
  * Returns 0 on success and ENOMEM if either the initial mbuf or the space
  * needed by m_append() cannot be allocated.  After a failed m_append()
  * ip_data_len is resynchronized with what is actually in the chain.
  * Failing with M_WAITOK set in flags is not implemented.
  */
 int
 icl_cxgbei_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *ip,
     const void *addr, size_t len, int flags)
 {
 	struct mbuf *m;
 #ifdef INVARIANTS
 	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
 #endif
 
 	MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
 	MPASS(ic == ip->ip_conn);
 	KASSERT(len > 0, ("%s: len is %jd", __func__, (intmax_t)len));
 
 	m = ip->ip_data_mbuf;
 	if (m == NULL) {
 		m = m_getjcl(M_NOWAIT, MT_DATA, 0, MJUM16BYTES);
 		if (__predict_false(m == NULL))
 			return (ENOMEM);
 
 		ip->ip_data_mbuf = m;
 	}
 
 	/* m_append() returns non-zero on success. */
 	if (__predict_true(m_append(m, len, addr) != 0)) {
 		ip->ip_data_len += len;
 		MPASS(ip->ip_data_len <= ic->ic_max_data_segment_length);
 		return (0);
 	}
 
 	if (flags & M_WAITOK) {
 		CXGBE_UNIMPLEMENTED("fail safe append");
 	}
 	ip->ip_data_len = m_length(m, NULL);
 	/* Report a proper errno, like the allocation failure above. */
 	return (ENOMEM);
 }
 
 /*
  * icl_conn_pdu_get_data method: copy len bytes at offset off of the PDU's
  * data segment to addr.  Nothing is copied for PDUs flagged ICPF_RX_DDP.
  */
 void
 icl_cxgbei_conn_pdu_get_data(struct icl_conn *ic, struct icl_pdu *ip,
     size_t off, void *addr, size_t len)
 {
 	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
 
 	/* data that was DDP'ed is in place already, no need to copy */
 	if ((icp->icp_flags & ICPF_RX_DDP) == 0)
 		m_copydata(ip->ip_data_mbuf, off, len, addr);
 }
 
 /*
  * icl_conn_pdu_queue method: hand a PDU over for transmission.
  *
  * Must be called with the connection lock held (asserted).  The PDU is
  * consumed in every case: it is freed if the connection is disconnecting,
  * has no socket or the socket is not writeable, and its mbuf chain is freed
  * if the inpcb was dropped or the toepcb is no longer attached.  Otherwise
  * the finalized chain is put on the toepcb's ulp_pduq and t4_push_pdus() is
  * called.
  */
 void
 icl_cxgbei_conn_pdu_queue(struct icl_conn *ic, struct icl_pdu *ip)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
 	struct socket *so = ic->ic_socket;
 	struct toepcb *toep = icc->toep;
 	struct inpcb *inp;
 	struct mbuf *m;
 
 	MPASS(ic == ip->ip_conn);
 	MPASS(ip->ip_bhs_mbuf != NULL);
 	/* The kernel doesn't generate PDUs with AHS. */
 	MPASS(ip->ip_ahs_mbuf == NULL && ip->ip_ahs_len == 0);
 
 	ICL_CONN_LOCK_ASSERT(ic);
 	/* NOTE: sowriteable without so_snd lock is a mostly harmless race. */
 	if (ic->ic_disconnecting || so == NULL || !sowriteable(so)) {
 		icl_cxgbei_conn_pdu_free(ic, ip);
 		return;
 	}
 
 	/* From here on only the mbuf chain exists; icp/ip are gone. */
 	m = finalize_pdu(icc, icp);
 	M_ASSERTPKTHDR(m);
 	MPASS((m->m_pkthdr.len & 3) == 0);
 
 	/*
 	 * Do not get inp from toep->inp as the toepcb might have detached
 	 * already.
 	 */
 	inp = sotoinpcb(so);
 	INP_WLOCK(inp);
 	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) ||
 	    __predict_false((toep->flags & TPF_ATTACHED) == 0))
 		m_freem(m);
 	else {
 		mbufq_enqueue(&toep->ulp_pduq, m);
 		t4_push_pdus(icc->sc, toep, 0);
 	}
 	INP_WUNLOCK(inp);
 }
 
 /*
  * Create a new, not yet connected, cxgbei connection object.
  *
  * name is stored as the connection name and lock becomes the connection
  * lock; both are only referenced, not copied.  The global connection count
  * is raised.  The allocation is made with M_WAITOK.
  */
 static struct icl_conn *
 icl_cxgbei_new_conn(const char *name, struct mtx *lock)
 {
 	struct icl_cxgbei_conn *icc;
 	struct icl_conn *ic;
 
 	refcount_acquire(&icl_cxgbei_ncons);
 
 	icc = (struct icl_cxgbei_conn *)kobj_create(&icl_cxgbei_class, M_CXGBE,
 	    M_WAITOK | M_ZERO);
 	ic = &icc->ic;
 
 	/* Driver private state. */
 	STAILQ_INIT(&icc->rcvd_pdus);
 	icc->icc_signature = CXGBEI_CONN_SIGNATURE;
 
 	/* Generic icl_conn state. */
 	ic->ic_name = name;
 	ic->ic_offload = "cxgbei";
 	ic->ic_unmapped = false;
 	ic->ic_lock = lock;
 	/* This is a stop-gap value that will be corrected during handoff. */
 	ic->ic_max_data_segment_length = 16384;
 #ifdef DIAGNOSTIC
 	refcount_init(&ic->ic_outstanding_pdus, 0);
 #endif
 
 	/* XXXNP: review.  Most of these icl_conn fields aren't really used */
 	STAILQ_INIT(&ic->ic_to_send);
 	cv_init(&ic->ic_send_cv, "icl_cxgbei_tx");
 	cv_init(&ic->ic_receive_cv, "icl_cxgbei_rx");
 
 	CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc);
 
 	return (ic);
 }
 
 /*
  * icl_conn_free method: destroy a connection object created by
  * icl_cxgbei_new_conn() and drop the global connection count.
  */
 void
 icl_cxgbei_conn_free(struct icl_conn *ic)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 
 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
 
 	CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc);
 
 	/* The condition variables live inside the object freed below. */
 	cv_destroy(&ic->ic_send_cv);
 	cv_destroy(&ic->ic_receive_cv);
 
 	kobj_delete((struct kobj *)icc, M_CXGBE);
 	refcount_release(&icl_cxgbei_ncons);
 }
 
 static int
 icl_cxgbei_setsockopt(struct icl_conn *ic, struct socket *so, int sspace,
     int rspace)
 {
 	struct sockopt opt;
 	int error, one = 1, ss, rs;
 
 	ss = max(sendspace, sspace);
 	rs = max(recvspace, rspace);
 
 	error = soreserve(so, ss, rs);
 	if (error != 0) {
 		icl_cxgbei_conn_close(ic);
 		return (error);
 	}
 	SOCKBUF_LOCK(&so->so_snd);
 	so->so_snd.sb_flags |= SB_AUTOSIZE;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	SOCKBUF_LOCK(&so->so_rcv);
 	so->so_rcv.sb_flags |= SB_AUTOSIZE;
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	/*
 	 * Disable Nagle.
 	 */
 	bzero(&opt, sizeof(opt));
 	opt.sopt_dir = SOPT_SET;
 	opt.sopt_level = IPPROTO_TCP;
 	opt.sopt_name = TCP_NODELAY;
 	opt.sopt_val = &one;
 	opt.sopt_valsize = sizeof(one);
 	error = sosetopt(so, &opt);
 	if (error != 0) {
 		icl_cxgbei_conn_close(ic);
 		return (error);
 	}
 
 	return (0);
 }
 
 /*
  * Request/response structure used to find out the adapter offloading a socket.
  * It is passed as the opaque argument to find_offload_adapter(); sc must be
  * NULL on entry and is left NULL if no adapter matches.
  */
 struct find_ofld_adapter_rr {
 	struct socket *so;	/* request: socket to look for */
 	struct adapter *sc;	/* result */
 };
 
 /*
  * t4_iterate() callback: record sc in the request if it is the adapter that
  * offloads the request's socket.
  *
  * An adapter matches when the socket's tcpcb has TF_TOE set and its tod is
  * the one embedded in the adapter's tom_softc.  Dropped and timewait inpcbs
  * never match.  arg is a struct find_ofld_adapter_rr.
  */
 static void
 find_offload_adapter(struct adapter *sc, void *arg)
 {
 	struct find_ofld_adapter_rr *req = arg;
 	struct socket *so = req->so;
 	struct tom_data *td = sc->tom_softc;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 
 	/* Non-TCP were filtered out earlier. */
 	MPASS(so->so_proto->pr_protocol == IPPROTO_TCP);
 
 	/* Done if a match was found already or TOE isn't enabled here. */
 	if (req->sc != NULL || td == NULL)
 		return;
 
 	inp = sotoinpcb(so);
 	INP_WLOCK(inp);
 	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
 		tp = intotcpcb(inp);
 		if ((tp->t_flags & TF_TOE) != 0 && tp->tod == &td->tod)
 			req->sc = sc;	/* Found. */
 	}
 	INP_WUNLOCK(inp);
 }
 
 /*
  * Send a FLOWC work request that carries a single parameter,
  * FW_FLOWC_MNEM_TXDATAPLEN_MAX, set to maxlen.
  *
  * The request is charged against the toepcb's tx credits and uses up one
  * tx descriptor.  Panics if the work request cannot be allocated.
  *
  * XXXNP: move this to t4_tom.
  */
 static void
 send_iscsi_flowc_wr(struct adapter *sc, struct toepcb *toep, int maxlen)
 {
 	const u_int nparams = 1;
 	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 	struct fw_flowc_wr *flowc;
 	struct wrqe *wr;
 	u_int flowclen;
 
 	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
 
 	wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 
 	/* Fill in the work request. */
 	flowc = wrtod(wr);
 	memset(flowc, 0, wr->wr_len);
 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 	    V_FW_FLOWC_WR_NPARAMS(nparams));
 	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
 	    V_FW_WR_FLOWID(toep->tid));
 	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_TXDATAPLEN_MAX;
 	flowc->mnemval[0].val = htobe32(maxlen);
 
 	/* Account for the credits and the tx descriptor it consumes. */
 	txsd->plen = 0;
 	txsd->tx_credits = howmany(flowclen, 16);
 	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
 	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
 	toep->tx_credits -= txsd->tx_credits;
 	toep->txsd_avail--;
 	toep->txsd_pidx++;
 	if (__predict_false(toep->txsd_pidx == toep->txsd_total))
 		toep->txsd_pidx = 0;
 
 	t4_wrq_tx(sc, wr);
 }
 
 /*
  * Program the ULP type/raw fields of the connection's TCB for iSCSI.  The
  * value is ULP_MODE_ISCSI, with ULP_CRC_HEADER and/or ULP_CRC_DATA (shifted
  * left by 4) added when hcrc and/or dcrc are non-zero.
  */
 static void
 set_ulp_mode_iscsi(struct adapter *sc, struct toepcb *toep, int hcrc, int dcrc)
 {
 	uint64_t mask, val;
 
 	val = ULP_MODE_ISCSI;
 	if (hcrc != 0)
 		val |= ULP_CRC_HEADER << 4;
 	if (dcrc != 0)
 		val |= ULP_CRC_DATA << 4;
 
 	CTR4(KTR_CXGBE, "%s: tid %u, ULP_MODE_ISCSI, CRC hdr=%d data=%d",
 	    __func__, toep->tid, hcrc, dcrc);
 
 	mask = V_TCB_ULP_TYPE(M_TCB_ULP_TYPE) | V_TCB_ULP_RAW(M_TCB_ULP_RAW);
 	t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_ULP_TYPE, mask, val,
 	    0, 0);
 }
 
 /*
  * icl_conn_handoff method: take over the TCP socket behind descriptor fd of
  * the current thread and set it up for offloaded iSCSI.
  *
  * The descriptor must refer to a SOCK_STREAM/IPPROTO_TCP socket (EINVAL
  * otherwise) and the connection must not have a socket yet (EBUSY).  The
  * socket is detached from the file, the adapter offloading it is looked up
  * (EINVAL if there is none), the connection and toepcb are configured for
  * ULP_MODE_ISCSI and finally the socket buffers/options are set up.  EBUSY
  * is returned if the inpcb was dropped or is in timewait.
  *
  * Must be called without the connection lock held (asserted).
  *
  * XXXNP: Who is responsible for cleaning up the socket if this returns with an
  * error?  Review all error paths.
  *
  * XXXNP: What happens to the socket's fd reference if the operation is
  * successful, and how does that affect the socket's life cycle?
  */
 int
 icl_cxgbei_conn_handoff(struct icl_conn *ic, int fd)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 	struct cxgbei_data *ci;
 	struct find_ofld_adapter_rr fa;
 	struct file *fp;
 	struct socket *so;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct toepcb *toep;
 	cap_rights_t rights;
 	int error;
 
 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
 	ICL_CONN_LOCK_ASSERT_NOT(ic);
 
 	/*
 	 * Steal the socket from userland.
 	 */
 	error = fget(curthread, fd,
-	    cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp);
+	    cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp);
 	if (error != 0)
 		return (error);
 	if (fp->f_type != DTYPE_SOCKET) {
 		fdrop(fp, curthread);
 		return (EINVAL);
 	}
 	so = fp->f_data;
 	if (so->so_type != SOCK_STREAM ||
 	    so->so_proto->pr_protocol != IPPROTO_TCP) {
 		fdrop(fp, curthread);
 		return (EINVAL);
 	}
 
 	ICL_CONN_LOCK(ic);
 	if (ic->ic_socket != NULL) {
 		ICL_CONN_UNLOCK(ic);
 		fdrop(fp, curthread);
 		return (EBUSY);
 	}
 	ic->ic_disconnecting = false;
 	ic->ic_socket = so;
 	/* Neuter the file so that it no longer refers to the socket. */
 	fp->f_ops = &badfileops;
 	fp->f_data = NULL;
 	fdrop(fp, curthread);
 	ICL_CONN_UNLOCK(ic);
 
 	/*
 	 * Find the adapter offloading this socket.
 	 * NOTE(review): on failure ic->ic_socket stays set to the stolen
 	 * socket; see the XXXNP notes above the function.
 	 */
 	fa.sc = NULL;
 	fa.so = so;
 	t4_iterate(find_offload_adapter, &fa);
 	if (fa.sc == NULL)
 		return (EINVAL);
 	icc->sc = fa.sc;
 	ci = icc->sc->iscsi_ulp_softc;
 
 	inp = sotoinpcb(so);
 	INP_WLOCK(inp);
 	tp = intotcpcb(inp);
 	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))
 		error = EBUSY;
 	else {
 		/*
 		 * socket could not have been "unoffloaded" if here.
 		 */
 		MPASS(tp->t_flags & TF_TOE);
 		MPASS(tp->tod != NULL);
 		MPASS(tp->t_toe != NULL);
 		toep = tp->t_toe;
 		MPASS(toep->vi->adapter == icc->sc);
 		icc->toep = toep;
 		icc->cwt = cxgbei_select_worker_thread(icc);
 
 		/*
 		 * We maintain the _send_ DSL in this field just to have a
 		 * convenient way to assert that the kernel never sends
 		 * oversized PDUs.  This field is otherwise unused in the driver
 		 * or the kernel.
 		 */
 		ic->ic_max_data_segment_length = ci->max_tx_pdu_len -
 		    ISCSI_BHS_SIZE;
 
 		/* Each enabled digest takes away from the usable DSL. */
 		icc->ulp_submode = 0;
 		if (ic->ic_header_crc32c) {
 			icc->ulp_submode |= ULP_CRC_HEADER;
 			ic->ic_max_data_segment_length -=
 			    ISCSI_HEADER_DIGEST_SIZE;
 		}
 		if (ic->ic_data_crc32c) {
 			icc->ulp_submode |= ULP_CRC_DATA;
 			ic->ic_max_data_segment_length -=
 			    ISCSI_DATA_DIGEST_SIZE;
 		}
 		so->so_options |= SO_NO_DDP;
 		toep->params.ulp_mode = ULP_MODE_ISCSI;
 		toep->ulpcb = icc;
 
 		send_iscsi_flowc_wr(icc->sc, toep, ci->max_tx_pdu_len);
 		set_ulp_mode_iscsi(icc->sc, toep, ic->ic_header_crc32c,
 		    ic->ic_data_crc32c);
 		error = 0;
 	}
 	INP_WUNLOCK(inp);
 
 	/* Closes the connection by itself if it fails. */
 	if (error == 0) {
 		error = icl_cxgbei_setsockopt(ic, so, ci->max_tx_pdu_len,
 		    ci->max_rx_pdu_len);
 	}
 
 	return (error);
 }
 
 /*
  * icl_conn_close method: tear down the connection's socket.
  *
  * Does nothing if the connection is already disconnecting or has no socket.
  * Otherwise the connection is marked as disconnecting, the toepcb (if any)
  * is detached from the connection, queued tx PDUs are drained, received
  * PDUs that were not delivered are freed and the socket is closed.
  *
  * Must be called without the connection lock held (asserted).  May sleep
  * while waiting for RXF_ACTIVE to clear.
  */
 void
 icl_cxgbei_conn_close(struct icl_conn *ic)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 	struct icl_pdu *ip;
 	struct socket *so;
 	struct sockbuf *sb;
 	struct inpcb *inp;
 	struct toepcb *toep = icc->toep;
 
 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
 	ICL_CONN_LOCK_ASSERT_NOT(ic);
 
 	ICL_CONN_LOCK(ic);
 	so = ic->ic_socket;
 	if (ic->ic_disconnecting || so == NULL) {
 		CTR4(KTR_CXGBE, "%s: icc %p (disconnecting = %d), so %p",
 		    __func__, icc, ic->ic_disconnecting, so);
 		ICL_CONN_UNLOCK(ic);
 		return;
 	}
 	ic->ic_disconnecting = true;
 
 	/* These are unused in this driver right now. */
 	MPASS(STAILQ_EMPTY(&ic->ic_to_send));
 	MPASS(ic->ic_receive_pdu == NULL);
 
 #ifdef DIAGNOSTIC
 	KASSERT(ic->ic_outstanding_pdus == 0,
 	    ("destroying session with %d outstanding PDUs",
 	     ic->ic_outstanding_pdus));
 #endif
 	ICL_CONN_UNLOCK(ic);
 
 	CTR3(KTR_CXGBE, "%s: tid %d, icc %p", __func__, toep ? toep->tid : -1,
 	    icc);
 	inp = sotoinpcb(so);
 	sb = &so->so_rcv;
 	INP_WLOCK(inp);
 	if (toep != NULL) {	/* NULL if connection was never offloaded. */
 		toep->ulpcb = NULL;
 		mbufq_drain(&toep->ulp_pduq);
 		SOCKBUF_LOCK(sb);
 		if (icc->rx_flags & RXF_ACTIVE) {
 			volatile u_int *p = &icc->rx_flags;
 
 			/*
 			 * Drop both locks and poll, one tick at a time, until
 			 * RXF_ACTIVE is cleared (presumably by the receive
 			 * side -- confirm), then take the locks again.
 			 */
 			SOCKBUF_UNLOCK(sb);
 			INP_WUNLOCK(inp);
 
 			while (*p & RXF_ACTIVE)
 				pause("conclo", 1);
 
 			INP_WLOCK(inp);
 			SOCKBUF_LOCK(sb);
 		}
 
 		/* Throw away received PDUs that were never picked up. */
 		while (!STAILQ_EMPTY(&icc->rcvd_pdus)) {
 			ip = STAILQ_FIRST(&icc->rcvd_pdus);
 			STAILQ_REMOVE_HEAD(&icc->rcvd_pdus, ip_next);
 			icl_cxgbei_conn_pdu_free(ic, ip);
 		}
 		SOCKBUF_UNLOCK(sb);
 	}
 	INP_WUNLOCK(inp);
 
 	ICL_CONN_LOCK(ic);
 	ic->ic_socket = NULL;
 	ICL_CONN_UNLOCK(ic);
 
 	/*
 	 * XXXNP: we should send RST instead of FIN when PDUs held in various
 	 * queues were purged instead of delivered reliably but soabort isn't
 	 * really general purpose and wouldn't do the right thing here.
 	 */
 	soclose(so);
 }
 
 /*
  * Choose the ITT for an I/O described by a CAM SCSI CCB and, when possible,
  * set up DDP for it.
  *
  * DDP is attempted only for CAM_DIR_IN transfers of at least
  * ci->ddp_threshold bytes.  On success *ittp receives the reservation's tag
  * (big-endian) and *arg the page pod reservation.  In every other case,
  * including any failure while reserving or writing page pods, the function
  * falls back to the no_ddp path: *ittp is rewritten to a tag carrying
  * pr->pr_invalid_bit and *arg is left NULL.
  *
  * Always returns 0; DDP setup failures are only counted in
  * ci->ddp_setup_error.
  */
 int
 icl_cxgbei_conn_task_setup(struct icl_conn *ic, struct icl_pdu *ip,
     struct ccb_scsiio *csio, uint32_t *ittp, void **arg)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 	struct toepcb *toep = icc->toep;
 	struct adapter *sc = icc->sc;
 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
 	struct ppod_region *pr = &ci->pr;
 	struct ppod_reservation *prsv;
 	uint32_t itt;
 	int rc = 0;
 
 	/* This is for the offload driver's state.  Must not be set already. */
 	MPASS(arg != NULL);
 	MPASS(*arg == NULL);
 
 	if ((csio->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_IN ||
 	    csio->dxfer_len < ci->ddp_threshold) {
 no_ddp:
 		/*
 		 * No DDP for this I/O.  Allocate an ITT (based on the one
 		 * passed in) that cannot be a valid hardware DDP tag in the
 		 * iSCSI region.
 		 */
 		itt = *ittp & M_PPOD_TAG;
 		itt = V_PPOD_TAG(itt) | pr->pr_invalid_bit;
 		*ittp = htobe32(itt);
 		MPASS(*arg == NULL);	/* State is maintained for DDP only. */
 		/* rc is non-zero only when we got here via a failed DDP setup. */
 		if (rc != 0)
 			counter_u64_add(ci->ddp_setup_error, 1);
 		return (0);
 	}
 
 	/*
 	 * Reserve resources for DDP, update the itt that should be used in the
 	 * PDU, and save DDP specific state for this I/O in *arg.
 	 */
 
 	prsv = uma_zalloc(prsv_zone, M_NOWAIT);
 	if (prsv == NULL) {
 		rc = ENOMEM;
 		goto no_ddp;
 	}
 
 	/* XXX add support for all CAM_DATA_ types */
 	MPASS((csio->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_VADDR);
 	rc = t4_alloc_page_pods_for_buf(pr, (vm_offset_t)csio->data_ptr,
 	    csio->dxfer_len, prsv);
 	if (rc != 0) {
 		uma_zfree(prsv_zone, prsv);
 		goto no_ddp;
 	}
 
 	rc = t4_write_page_pods_for_buf(sc, toep->ofld_txq, toep->tid, prsv,
 	    (vm_offset_t)csio->data_ptr, csio->dxfer_len);
 	if (rc != 0) {
 		/* Undo the page pod allocation before releasing prsv itself. */
 		t4_free_page_pods(prsv);
 		uma_zfree(prsv_zone, prsv);
 		goto no_ddp;
 	}
 
 	*ittp = htobe32(prsv->prsv_tag);
 	*arg = prsv;
 	counter_u64_add(ci->ddp_setup_ok, 1);
 	return (0);
 }
 
 /*
  * Release per-I/O DDP state.  A non-NULL arg is treated as a page pod
  * reservation: its page pods are freed and the reservation is returned to
  * prsv_zone.  A NULL arg means there is nothing to release.
  */
 void
 icl_cxgbei_conn_task_done(struct icl_conn *ic, void *arg)
 {
 	struct ppod_reservation *prsv = arg;
 
 	if (prsv == NULL)
 		return;
 
 	t4_free_page_pods(prsv);
 	uma_zfree(prsv_zone, prsv);
 }
 
 /*
  * Accessors for the per-I/O pointers kept in the CTL I/O header's private
  * area: the CTL_PRIV_FRONTEND slot is used for the request PDU and the
  * CTL_PRIV_FRONTEND2 slot for the page pod reservation.  Both expand to
  * lvalues, so they can be assigned to as well as read.
  */
 /* XXXNP: PDU should be passed in as parameter, like on the initiator. */
 #define io_to_request_pdu(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr)
 #define io_to_ppod_reservation(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND2].ptr)
 
 /*
  * Choose the TTT for a data transfer belonging to a CTL I/O and, when
  * possible, set up (or keep using) DDP for it.
  *
  * When no data has been received yet (ext_data_filled == 0) a new page pod
  * reservation is attempted, provided the data remaining after the first
  * burst is at least ci->ddp_threshold and the buffer is either a plain
  * pointer or a single-entry S/G list.  On success the reservation is stored
  * in the I/O, *tttp receives its tag (big-endian) and *arg is set to ctsio.
  *
  * When the I/O is already in progress and has a reservation, the alias
  * field of the existing tag is incremented and the updated tag is returned
  * in *tttp.
  *
  * In all other cases the no_ddp path rewrites *tttp to a tag carrying
  * pr->pr_invalid_bit and leaves *arg NULL.
  *
  * Always returns 0; DDP setup failures are only counted in
  * ci->ddp_setup_error.
  */
 int
 icl_cxgbei_conn_transfer_setup(struct icl_conn *ic, union ctl_io *io,
     uint32_t *tttp, void **arg)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 	struct toepcb *toep = icc->toep;
 	struct ctl_scsiio *ctsio = &io->scsiio;
 	struct adapter *sc = icc->sc;
 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
 	struct ppod_region *pr = &ci->pr;
 	struct ppod_reservation *prsv;
 	uint32_t ttt;
 	int xferlen, rc = 0, alias;
 
 	/* This is for the offload driver's state.  Must not be set already. */
 	MPASS(arg != NULL);
 	MPASS(*arg == NULL);
 
 	if (ctsio->ext_data_filled == 0) {
 		int first_burst;
 		struct icl_pdu *ip = io_to_request_pdu(io);
 		vm_offset_t buf;
 #ifdef INVARIANTS
 		struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
 
 		MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
 		MPASS(ic == ip->ip_conn);
 		MPASS(ip->ip_bhs_mbuf != NULL);
 #endif
 		first_burst = icl_pdu_data_segment_length(ip);
 
 		/*
 		 * Note that ICL calls conn_transfer_setup even if the first
 		 * burst had everything and there's nothing left to transfer.
 		 */
 		MPASS(ctsio->kern_data_len >= first_burst);
 		xferlen = ctsio->kern_data_len;
 		if (xferlen - first_burst < ci->ddp_threshold) {
 no_ddp:
 			/*
 			 * No DDP for this transfer.  Allocate a TTT (based on
 			 * the one passed in) that cannot be a valid hardware
 			 * DDP tag in the iSCSI region.
 			 */
 			ttt = *tttp & M_PPOD_TAG;
 			ttt = V_PPOD_TAG(ttt) | pr->pr_invalid_bit;
 			*tttp = htobe32(ttt);
 			MPASS(io_to_ppod_reservation(io) == NULL);
 			/* rc is non-zero only after a failed DDP setup. */
 			if (rc != 0)
 				counter_u64_add(ci->ddp_setup_error, 1);
 			return (0);
 		}
 
 		/* Locate the data buffer; multi-entry S/G lists get no DDP. */
 		if (ctsio->kern_sg_entries == 0)
 			buf = (vm_offset_t)ctsio->kern_data_ptr;
 		else if (ctsio->kern_sg_entries == 1) {
 			struct ctl_sg_entry *sgl = (void *)ctsio->kern_data_ptr;
 
 			MPASS(sgl->len == xferlen);
 			buf = (vm_offset_t)sgl->addr;
 		} else {
 			rc = EAGAIN;	/* XXX implement */
 			goto no_ddp;
 		}
 
 
 		/*
 		 * Reserve resources for DDP, update the ttt that should be used
 		 * in the PDU, and save DDP specific state for this I/O.
 		 */
 
 		MPASS(io_to_ppod_reservation(io) == NULL);
 		prsv = uma_zalloc(prsv_zone, M_NOWAIT);
 		if (prsv == NULL) {
 			rc = ENOMEM;
 			goto no_ddp;
 		}
 
 		rc = t4_alloc_page_pods_for_buf(pr, buf, xferlen, prsv);
 		if (rc != 0) {
 			uma_zfree(prsv_zone, prsv);
 			goto no_ddp;
 		}
 
 		rc = t4_write_page_pods_for_buf(sc, toep->ofld_txq, toep->tid,
 		    prsv, buf, xferlen);
 		if (rc != 0) {
 			/* Undo the page pod allocation before freeing prsv. */
 			t4_free_page_pods(prsv);
 			uma_zfree(prsv_zone, prsv);
 			goto no_ddp;
 		}
 
 		*tttp = htobe32(prsv->prsv_tag);
 		io_to_ppod_reservation(io) = prsv;
 		*arg = ctsio;
 		counter_u64_add(ci->ddp_setup_ok, 1);
 		return (0);
 	}
 
 	/*
 	 * In the middle of an I/O.  A non-NULL page pod reservation indicates
 	 * that a DDP buffer is being used for the I/O.
 	 */
 
 	prsv = io_to_ppod_reservation(ctsio);
 	if (prsv == NULL)
 		goto no_ddp;
 
 	/*
 	 * Bump the alias field of the tag (wrapping within pr_alias_mask) and
 	 * hand out the updated tag for this transfer.
 	 */
 	alias = (prsv->prsv_tag & pr->pr_alias_mask) >> pr->pr_alias_shift;
 	alias++;
 	prsv->prsv_tag &= ~pr->pr_alias_mask;
 	prsv->prsv_tag |= alias << pr->pr_alias_shift & pr->pr_alias_mask;
 
 	*tttp = htobe32(prsv->prsv_tag);
 	*arg = ctsio;
 
 	return (0);
 }
 
 /*
  * Called when a transfer finishes.  arg is either NULL or the ctl_scsiio of
  * the I/O.  The page pod reservation attached to the I/O is released only
  * once all of the data has been received, i.e. when ext_data_filled equals
  * kern_data_len.
  */
 void
 icl_cxgbei_conn_transfer_done(struct icl_conn *ic, void *arg)
 {
 	struct ctl_scsiio *ctsio = arg;
 	struct ppod_reservation *prsv;
 
 	if (ctsio == NULL)
 		return;
 	if (ctsio->kern_data_len != ctsio->ext_data_filled)
 		return;
 
 	prsv = io_to_ppod_reservation(ctsio);
 	MPASS(prsv != NULL);
 
 	t4_free_page_pods(prsv);
 	uma_zfree(prsv_zone, prsv);
 }
 
 /*
  * Per-adapter callback that lowers the data segment length limits in *arg
  * (a struct icl_drv_limits) to what this adapter can handle.  Adapters on
  * which the iSCSI ULD is not active leave the limits untouched.
  */
 static void
 cxgbei_limits(struct adapter *sc, void *arg)
 {
 	struct icl_drv_limits *idl = arg;
 	struct cxgbei_data *ci;
 	int rx_dsl, tx_dsl;
 
 	if (begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4lims") != 0)
 		return;
 
 	if (!uld_active(sc, ULD_ISCSI))
 		goto done;
 
 	ci = sc->iscsi_ulp_softc;
 	MPASS(ci != NULL);
 
 	/*
 	 * The kernel does not support AHS, so the PDU length to data segment
 	 * length conversion only subtracts the BHS and both digests.
 	 */
 	rx_dsl = ci->max_rx_pdu_len - ISCSI_BHS_SIZE -
 	    ISCSI_HEADER_DIGEST_SIZE - ISCSI_DATA_DIGEST_SIZE;
 	tx_dsl = ci->max_tx_pdu_len - ISCSI_BHS_SIZE -
 	    ISCSI_HEADER_DIGEST_SIZE - ISCSI_DATA_DIGEST_SIZE;
 
 	if (idl->idl_max_recv_data_segment_length > rx_dsl)
 		idl->idl_max_recv_data_segment_length = rx_dsl;
 	if (idl->idl_max_send_data_segment_length > tx_dsl)
 		idl->idl_max_send_data_segment_length = tx_dsl;
 
 done:
 	end_synchronized_op(sc, LOCK_HELD);
 }
 
 /*
  * Fill in the driver limits reported to ICL.  The data segment lengths start
  * at the largest value the RFC permits and are then clipped by
  * cxgbei_limits() for each adapter visited by t4_iterate().  Always
  * returns 0.
  */
 static int
 icl_cxgbei_limits(struct icl_drv_limits *idl)
 {
 
 	/* Burst lengths: these values are somewhat arbitrary. */
 	idl->idl_first_burst_length = 8192;
 	idl->idl_max_burst_length = 2 * 1024 * 1024;
 
 	/* RFC maximum; cxgbei_limits will reduce these as needed. */
 	idl->idl_max_send_data_segment_length = (1 << 24) - 1;
 	idl->idl_max_recv_data_segment_length = (1 << 24) - 1;
 
 	t4_iterate(cxgbei_limits, idl);
 
 	return (0);
 }
 
 /*
  * Module load hook: creates the UMA zone used for page pod reservations,
  * resets the connection count and registers the "cxgbei" ICL module.
  * Returns the result of icl_register().
  *
  * NOTE(review): when icl_register() fails, prsv_zone is not destroyed
  * here -- confirm that the caller cleans it up, otherwise the zone leaks.
  */
 int
 icl_cxgbei_mod_load(void)
 {
 	int rc;
 
 	/*
 	 * Space to track pagepod reservations.
 	 */
 	prsv_zone = uma_zcreate("Pagepod reservations",
 	    sizeof(struct ppod_reservation), NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_CACHE, 0);
 
 	refcount_init(&icl_cxgbei_ncons, 0);
 
 	rc = icl_register("cxgbei", false, -100, icl_cxgbei_limits,
 	    icl_cxgbei_new_conn);
 
 	return (rc);
 }
 
 /*
  * Module unload hook.  Refuses with EBUSY while any connection is still
  * counted in icl_cxgbei_ncons; otherwise unregisters the "cxgbei" ICL module
  * and destroys the page pod reservation zone.
  */
 int
 icl_cxgbei_mod_unload(void)
 {
 
 	if (icl_cxgbei_ncons == 0) {
 		icl_unregister("cxgbei", false);
 		uma_zdestroy(prsv_zone);
 		return (0);
 	}
 
 	return (EBUSY);
 }
 #endif
diff --git a/sys/dev/filemon/filemon_wrapper.c b/sys/dev/filemon/filemon_wrapper.c
index 52dd3bb483eb..4e64d8254f8d 100644
--- a/sys/dev/filemon/filemon_wrapper.c
+++ b/sys/dev/filemon/filemon_wrapper.c
@@ -1,460 +1,460 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011, David E. O'Brien.
  * Copyright (c) 2009-2011, Juniper Networks, Inc.
  * Copyright (c) 2015-2016, EMC Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY JUNIPER NETWORKS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL JUNIPER NETWORKS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/eventhandler.h>
 #include <sys/filedesc.h>
 #include <sys/imgact.h>
 #include <sys/priv.h>
 #include <sys/sx.h>
 #include <sys/sysent.h>
 #include <sys/vnode.h>
 
 #include <machine/stdarg.h>
 
 static void filemon_output_event(struct filemon *filemon, const char *fmt, ...)
     __printflike(2, 3);
 
 static eventhandler_tag filemon_exec_tag;
 static eventhandler_tag filemon_exit_tag;
 static eventhandler_tag filemon_fork_tag;
 
 /*
  * Write len bytes of msg to the filemon's output file, if one is set.  The
  * first write error is latched in filemon->error; later errors do not
  * overwrite it.
  */
 static void
 filemon_output(struct filemon *filemon, char *msg, size_t len)
 {
 	struct iovec iov;
 	struct uio io;
 	int rv;
 
 	if (filemon->fp == NULL)
 		return;
 
 	/* Single kernel-space segment covering the whole message. */
 	iov.iov_base = msg;
 	iov.iov_len = len;
 
 	io.uio_iov = &iov;
 	io.uio_iovcnt = 1;
 	io.uio_offset = (off_t) -1;
 	io.uio_resid = len;
 	io.uio_segflg = UIO_SYSSPACE;
 	io.uio_rw = UIO_WRITE;
 	io.uio_td = curthread;
 
 	if (filemon->fp->f_type == DTYPE_VNODE)
 		bwillwrite();
 
 	rv = fo_write(filemon->fp, &io, filemon->cred, 0, curthread);
 	if (rv == 0 || filemon->error != 0)
 		return;
 	filemon->error = rv;
 }
 
 /*
  * Format an event record into filemon->msgbufr and write it out with
  * filemon_output().  A record that does not fit is written truncated
  * rather than dropped.
  */
 static void
 filemon_output_event(struct filemon *filemon, const char *fmt, ...)
 {
 	va_list args;
 	size_t n;
 
 	va_start(args, fmt);
 	n = vsnprintf(filemon->msgbufr, sizeof(filemon->msgbufr), fmt, args);
 	va_end(args);
 
 	/* Clamp to what was actually stored when the output was truncated. */
 	if (n > sizeof(filemon->msgbufr) - 1)
 		n = sizeof(filemon->msgbufr) - 1;
 
 	filemon_output(filemon, filemon->msgbufr, n);
 }
 
 /*
  * chdir(2) wrapper: performs the real system call and, on success, logs a
  * "C" record for a traced process.  A failure to copy in the path is stored
  * in filemon->error and no record is written.  Returns the result of
  * sys_chdir().
  */
 static int
 filemon_wrapper_chdir(struct thread *td, struct chdir_args *uap)
 {
 	struct filemon *filemon;
 	int error, ret;
 
 	ret = sys_chdir(td, uap);
 	if (ret != 0)
 		return (ret);
 
 	filemon = filemon_proc_get(curproc);
 	if (filemon == NULL)
 		return (ret);
 
 	error = copyinstr(uap->path, filemon->fname1,
 	    sizeof(filemon->fname1), NULL);
 	if (error != 0)
 		filemon->error = error;
 	else
 		filemon_output_event(filemon, "C %d %s\n",
 		    curproc->p_pid, filemon->fname1);
 
 	filemon_drop(filemon);
 
 	return (ret);
 }
 
 /*
  * process_exec event handler: logs an "E" record for a traced process and
  * stops tracing it when the exec changes credentials and the filemon's
  * credentials fail the PRIV_DEBUG_DIFFCRED check.
  */
 static void
 filemon_event_process_exec(void *arg __unused, struct proc *p,
     struct image_params *imgp)
 {
 	struct filemon *filemon;
 	int stop_tracing;
 
 	filemon = filemon_proc_get(p);
 	if (filemon == NULL)
 		return;
 
 	filemon_output_event(filemon, "E %d %s\n",
 	    p->p_pid,
 	    imgp->execpath != NULL ? imgp->execpath : "<unknown>");
 
 	/* Tracing ceases when the credentials changed. */
 	stop_tracing = imgp->newcred != NULL &&
 	    imgp->credential_setid &&
 	    priv_check_cred(filemon->cred, PRIV_DEBUG_DIFFCRED) != 0;
 
 	/*
 	 * p_filemon may already have become NULL, but nothing else will
 	 * re-attach the process.
 	 */
 	if (stop_tracing && p->p_filemon != NULL) {
 		KASSERT(p->p_filemon == filemon,
 		    ("%s: proc %p didn't have expected"
 		    " filemon %p", __func__, p, filemon));
 		filemon_proc_drop(p);
 	}
 
 	filemon_drop(filemon);
 }
 
 /*
  * Log the records for a successful open(2)/openat(2) by a traced process.
  *
  * upath is the user-space path, flags the open flags and fd the directory
  * descriptor (AT_FDCWD for plain open).  For a relative path with a real
  * directory descriptor an "A" record is written first and the directory's
  * path is looked up with vn_fullpath() so it can be prepended.  O_RDWR opens
  * produce an extra "R" record before the final "W"/"R" record.
  *
  * A failure to copy in the path is stored in filemon->error and no record
  * is written.
  */
 static void
 _filemon_wrapper_openat(struct thread *td, const char *upath, int flags,
     int fd)
 {
 	int error;
 	struct file *fp;
 	struct filemon *filemon;
 	char *atpath, *freepath;
 	cap_rights_t rights;
 
 	if ((filemon = filemon_proc_get(curproc)) != NULL) {
 		/* Defaults so the common cleanup below is safe on every path. */
 		atpath = "";
 		freepath = NULL;
 		fp = NULL;
 
 		if ((error = copyinstr(upath, filemon->fname1,
 		    sizeof(filemon->fname1), NULL)) != 0) {
 			filemon->error = error;
 			goto copyfail;
 		}
 
 		if (filemon->fname1[0] != '/' && fd != AT_FDCWD) {
 			/*
 			 * rats - we cannot do too much about this.
 			 * the trace should show a dir we read
 			 * recently.. output an A record as a clue
 			 * until we can do better.
 			 * XXX: This may be able to come out with
 			 * the namecache lookup now.
 			 */
 			filemon_output_event(filemon, "A %d %s\n",
 			    curproc->p_pid, filemon->fname1);
 			/*
 			 * Try to resolve the path from the vnode using the
 			 * namecache.  It may be inaccurate, but better
 			 * than nothing.
 			 */
 			if (getvnode(td, fd,
-			    cap_rights_init(&rights, CAP_LOOKUP), &fp) == 0) {
+			    cap_rights_init_one(&rights, CAP_LOOKUP), &fp) == 0) {
 				vn_fullpath(fp->f_vnode, &atpath, &freepath);
 			}
 		}
 		if (flags & O_RDWR) {
 			/*
 			 * We'll get the W record below, but need
 			 * to also output an R to distinguish from
 			 * O_WRONLY.
 			 */
 			filemon_output_event(filemon, "R %d %s%s%s\n",
 			    curproc->p_pid, atpath,
 			    atpath[0] != '\0' ? "/" : "", filemon->fname1);
 		}
 
 		/* Any access mode bit set is logged as 'W', otherwise 'R'. */
 		filemon_output_event(filemon, "%c %d %s%s%s\n",
 		    (flags & O_ACCMODE) ? 'W':'R',
 		    curproc->p_pid, atpath,
 		    atpath[0] != '\0' ? "/" : "", filemon->fname1);
 copyfail:
 		filemon_drop(filemon);
 		if (fp != NULL)
 			fdrop(fp, td);
 		free(freepath, M_TEMP);
 	}
 }
 
 /*
  * open(2) wrapper: performs the real system call and, when it succeeds,
  * logs the open relative to AT_FDCWD.  Returns the result of sys_open().
  */
 static int
 filemon_wrapper_open(struct thread *td, struct open_args *uap)
 {
 	int ret;
 
 	ret = sys_open(td, uap);
 	if (ret != 0)
 		return (ret);
 
 	_filemon_wrapper_openat(td, uap->path, uap->flags, AT_FDCWD);
 
 	return (ret);
 }
 
 /*
  * openat(2) wrapper: performs the real system call and, when it succeeds,
  * logs the open relative to the caller's directory descriptor.  Returns the
  * result of sys_openat().
  */
 static int
 filemon_wrapper_openat(struct thread *td, struct openat_args *uap)
 {
 	int ret;
 
 	ret = sys_openat(td, uap);
 	if (ret != 0)
 		return (ret);
 
 	_filemon_wrapper_openat(td, uap->path, uap->flag, uap->fd);
 
 	return (ret);
 }
 
 /*
  * rename(2) wrapper: performs the real system call and, on success, logs an
  * "M" record with both paths for a traced process.  A failure to copy in
  * either path is stored in filemon->error and no record is written.
  * Returns the result of sys_rename().
  */
 static int
 filemon_wrapper_rename(struct thread *td, struct rename_args *uap)
 {
 	struct filemon *filemon;
 	int error, ret;
 
 	ret = sys_rename(td, uap);
 	if (ret != 0)
 		return (ret);
 
 	filemon = filemon_proc_get(curproc);
 	if (filemon == NULL)
 		return (ret);
 
 	/* The second path is only copied in when the first one succeeded. */
 	error = copyinstr(uap->from, filemon->fname1,
 	    sizeof(filemon->fname1), NULL);
 	if (error == 0)
 		error = copyinstr(uap->to, filemon->fname2,
 		    sizeof(filemon->fname2), NULL);
 
 	if (error != 0)
 		filemon->error = error;
 	else
 		filemon_output_event(filemon, "M %d '%s' '%s'\n",
 		    curproc->p_pid, filemon->fname1, filemon->fname2);
 
 	filemon_drop(filemon);
 
 	return (ret);
 }
 
 /*
  * Log an "L" record with both user-space paths for a traced process.  A
  * failure to copy in either path is stored in filemon->error and no record
  * is written.
  */
 static void
 _filemon_wrapper_link(struct thread *td, const char *upath1,
     const char *upath2)
 {
 	struct filemon *filemon;
 	int error;
 
 	filemon = filemon_proc_get(curproc);
 	if (filemon == NULL)
 		return;
 
 	/* The second path is only copied in when the first one succeeded. */
 	error = copyinstr(upath1, filemon->fname1,
 	    sizeof(filemon->fname1), NULL);
 	if (error == 0)
 		error = copyinstr(upath2, filemon->fname2,
 		    sizeof(filemon->fname2), NULL);
 
 	if (error != 0)
 		filemon->error = error;
 	else
 		filemon_output_event(filemon, "L %d '%s' '%s'\n",
 		    curproc->p_pid, filemon->fname1, filemon->fname2);
 
 	filemon_drop(filemon);
 }
 
 /*
  * link(2) wrapper: performs the real system call and logs the link when it
  * succeeds.  Returns the result of sys_link().
  */
 static int
 filemon_wrapper_link(struct thread *td, struct link_args *uap)
 {
 	int ret;
 
 	ret = sys_link(td, uap);
 	if (ret != 0)
 		return (ret);
 
 	_filemon_wrapper_link(td, uap->path, uap->link);
 
 	return (ret);
 }
 
 /*
  * symlink(2) wrapper: performs the real system call and logs the link when
  * it succeeds.  Returns the result of sys_symlink().
  */
 static int
 filemon_wrapper_symlink(struct thread *td, struct symlink_args *uap)
 {
 	int ret;
 
 	ret = sys_symlink(td, uap);
 	if (ret != 0)
 		return (ret);
 
 	_filemon_wrapper_link(td, uap->path, uap->link);
 
 	return (ret);
 }
 
 /*
  * linkat(2) wrapper: performs the real system call and logs the link when
  * it succeeds.  Returns the result of sys_linkat().
  */
 static int
 filemon_wrapper_linkat(struct thread *td, struct linkat_args *uap)
 {
 	int ret;
 
 	ret = sys_linkat(td, uap);
 	if (ret != 0)
 		return (ret);
 
 	_filemon_wrapper_link(td, uap->path1, uap->path2);
 
 	return (ret);
 }
 
 /*
  * process_exit event handler: logs an "X" record with the pid, exit code and
  * signal of a traced process, then detaches the process from the filemon if
  * it is still attached to it.
  */
 static void
 filemon_event_process_exit(void *arg __unused, struct proc *p)
 {
 	struct filemon *filemon;
 
 	filemon = filemon_proc_get(p);
 	if (filemon == NULL)
 		return;
 
 	filemon_output_event(filemon, "X %d %d %d\n",
 	    p->p_pid, p->p_xexit, p->p_xsig);
 
 	/*
 	 * p_filemon may already have been dropped by
 	 * filemon_untrack_processes() while filemon_proc_get() was waiting
 	 * for the filemon lock, so only drop it if it is still ours.
 	 */
 	KASSERT(p->p_filemon == NULL || p->p_filemon == filemon,
 	    ("%s: p %p was attached while exiting, expected "
 	    "filemon %p or NULL", __func__, p, filemon));
 	if (p->p_filemon == filemon)
 		filemon_proc_drop(p);
 
 	filemon_drop(filemon);
 }
 
 /*
  * unlink(2) wrapper: performs the real system call and, on success, logs a
  * "D" record for a traced process.  A failure to copy in the path is stored
  * in filemon->error and no record is written.  Returns the result of
  * sys_unlink().
  */
 static int
 filemon_wrapper_unlink(struct thread *td, struct unlink_args *uap)
 {
 	struct filemon *filemon;
 	int error, ret;
 
 	ret = sys_unlink(td, uap);
 	if (ret != 0)
 		return (ret);
 
 	filemon = filemon_proc_get(curproc);
 	if (filemon == NULL)
 		return (ret);
 
 	error = copyinstr(uap->path, filemon->fname1,
 	    sizeof(filemon->fname1), NULL);
 	if (error != 0)
 		filemon->error = error;
 	else
 		filemon_output_event(filemon, "D %d %s\n",
 		    curproc->p_pid, filemon->fname1);
 
 	filemon_drop(filemon);
 
 	return (ret);
 }
 
 /*
  * process_fork event handler: logs an "F" record with the parent and child
  * pids when the parent is traced, and lets the child inherit the parent's
  * filemon unless the child already has one.
  */
 static void
 filemon_event_process_fork(void *arg __unused, struct proc *p1,
     struct proc *p2, int flags __unused)
 {
 	struct filemon *filemon;
 
 	filemon = filemon_proc_get(p1);
 	if (filemon == NULL)
 		return;
 
 	filemon_output_event(filemon, "F %d %d\n",
 	    p1->p_pid, p2->p_pid);
 
 	/*
 	 * The parent's p_filemon may have been changed by
 	 * filemon_untrack_processes() or filemon_ioctl(FILEMON_SET_PID)
 	 * while filemon_proc_get() was waiting for the filemon lock.  The
 	 * child inherits only if the parent is still traced by this filemon.
 	 */
 	if (p1->p_filemon == filemon) {
 		PROC_LOCK(p2);
 		/* A new filemon may have attached to the child already. */
 		if (p2->p_filemon == NULL) {
 			p2->p_filemon = filemon_acquire(filemon);
 			filemon->proccnt++;
 		}
 		PROC_UNLOCK(p2);
 	}
 
 	filemon_drop(filemon);
 }
 
 /*
  * Install the filemon hooks: point the chdir, open, openat, rename, unlink,
  * link, symlink and linkat entries of the system call table (and of the
  * 32-bit compat table when COMPAT_FREEBSD32 is defined) at the filemon
  * wrappers, then register the exec, exit and fork event handlers at
  * EVENTHANDLER_PRI_LAST.
  */
 static void
 filemon_wrapper_install(void)
 {
 
 	sysent[SYS_chdir].sy_call = (sy_call_t *) filemon_wrapper_chdir;
 	sysent[SYS_open].sy_call = (sy_call_t *) filemon_wrapper_open;
 	sysent[SYS_openat].sy_call = (sy_call_t *) filemon_wrapper_openat;
 	sysent[SYS_rename].sy_call = (sy_call_t *) filemon_wrapper_rename;
 	sysent[SYS_unlink].sy_call = (sy_call_t *) filemon_wrapper_unlink;
 	sysent[SYS_link].sy_call = (sy_call_t *) filemon_wrapper_link;
 	sysent[SYS_symlink].sy_call = (sy_call_t *) filemon_wrapper_symlink;
 	sysent[SYS_linkat].sy_call = (sy_call_t *) filemon_wrapper_linkat;
 
 #if defined(COMPAT_FREEBSD32)
 	freebsd32_sysent[FREEBSD32_SYS_chdir].sy_call = (sy_call_t *) filemon_wrapper_chdir;
 	freebsd32_sysent[FREEBSD32_SYS_open].sy_call = (sy_call_t *) filemon_wrapper_open;
 	freebsd32_sysent[FREEBSD32_SYS_openat].sy_call = (sy_call_t *) filemon_wrapper_openat;
 	freebsd32_sysent[FREEBSD32_SYS_rename].sy_call = (sy_call_t *) filemon_wrapper_rename;
 	freebsd32_sysent[FREEBSD32_SYS_unlink].sy_call = (sy_call_t *) filemon_wrapper_unlink;
 	freebsd32_sysent[FREEBSD32_SYS_link].sy_call = (sy_call_t *) filemon_wrapper_link;
 	freebsd32_sysent[FREEBSD32_SYS_symlink].sy_call = (sy_call_t *) filemon_wrapper_symlink;
 	freebsd32_sysent[FREEBSD32_SYS_linkat].sy_call = (sy_call_t *) filemon_wrapper_linkat;
 #endif	/* COMPAT_FREEBSD32 */
 
 	/* Keep the tags so the handlers can be deregistered later. */
 	filemon_exec_tag = EVENTHANDLER_REGISTER(process_exec,
 	    filemon_event_process_exec, NULL, EVENTHANDLER_PRI_LAST);
 	filemon_exit_tag = EVENTHANDLER_REGISTER(process_exit,
 	    filemon_event_process_exit, NULL, EVENTHANDLER_PRI_LAST);
 	filemon_fork_tag = EVENTHANDLER_REGISTER(process_fork,
 	    filemon_event_process_fork, NULL, EVENTHANDLER_PRI_LAST);
 }
 
 /*
  * Remove the filemon hooks: point the hooked system call table entries (and
  * the 32-bit compat entries when COMPAT_FREEBSD32 is defined) back at the
  * sys_* implementations, then deregister the exec, exit and fork event
  * handlers using the tags saved at install time.
  */
 static void
 filemon_wrapper_deinstall(void)
 {
 
 	sysent[SYS_chdir].sy_call = (sy_call_t *)sys_chdir;
 	sysent[SYS_open].sy_call = (sy_call_t *)sys_open;
 	sysent[SYS_openat].sy_call = (sy_call_t *)sys_openat;
 	sysent[SYS_rename].sy_call = (sy_call_t *)sys_rename;
 	sysent[SYS_unlink].sy_call = (sy_call_t *)sys_unlink;
 	sysent[SYS_link].sy_call = (sy_call_t *)sys_link;
 	sysent[SYS_symlink].sy_call = (sy_call_t *)sys_symlink;
 	sysent[SYS_linkat].sy_call = (sy_call_t *)sys_linkat;
 
 #if defined(COMPAT_FREEBSD32)
 	freebsd32_sysent[FREEBSD32_SYS_chdir].sy_call = (sy_call_t *)sys_chdir;
 	freebsd32_sysent[FREEBSD32_SYS_open].sy_call = (sy_call_t *)sys_open;
 	freebsd32_sysent[FREEBSD32_SYS_openat].sy_call = (sy_call_t *)sys_openat;
 	freebsd32_sysent[FREEBSD32_SYS_rename].sy_call = (sy_call_t *)sys_rename;
 	freebsd32_sysent[FREEBSD32_SYS_unlink].sy_call = (sy_call_t *)sys_unlink;
 	freebsd32_sysent[FREEBSD32_SYS_link].sy_call = (sy_call_t *)sys_link;
 	freebsd32_sysent[FREEBSD32_SYS_symlink].sy_call = (sy_call_t *)sys_symlink;
 	freebsd32_sysent[FREEBSD32_SYS_linkat].sy_call = (sy_call_t *)sys_linkat;
 #endif	/* COMPAT_FREEBSD32 */
 
 	EVENTHANDLER_DEREGISTER(process_exec, filemon_exec_tag);
 	EVENTHANDLER_DEREGISTER(process_exit, filemon_exit_tag);
 	EVENTHANDLER_DEREGISTER(process_fork, filemon_fork_tag);
 }
diff --git a/sys/dev/ipmi/ipmi_linux.c b/sys/dev/ipmi/ipmi_linux.c
index 136712e8cfb5..4b33c5b014d8 100644
--- a/sys/dev/ipmi/ipmi_linux.c
+++ b/sys/dev/ipmi/ipmi_linux.c
@@ -1,118 +1,119 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2009 IronPort Systems Inc. <ambrisko@ironport.com>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Linux ioctl handler for the ipmi device driver
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/file.h>
 #include <sys/proc.h>
 #ifdef __amd64__
 #include <machine/../linux32/linux.h>
 #include <machine/../linux32/linux32_proto.h>
 #else
 #include <machine/../linux/linux.h>
 #include <machine/../linux/linux_proto.h>
 #endif
 #include <compat/linux/linux_ioctl.h>
 #include <sys/ioccom.h>
 #include <sys/ipmi.h>
 
 /* There are multiple ioctl number ranges that need to be handled */
 #define IPMI_LINUX_IOCTL_MIN  0x690b
 #define IPMI_LINUX_IOCTL_MAX  0x6915
 
 /* Linux versions of ioctl's */
 #define L_IPMICTL_RECEIVE_MSG_TRUNC       _IOWR(IPMI_IOC_MAGIC, 11, struct ipmi_recv)
 #define L_IPMICTL_RECEIVE_MSG             _IOWR(IPMI_IOC_MAGIC, 12, struct ipmi_recv)
 #define L_IPMICTL_SEND_COMMAND            _IOW(IPMI_IOC_MAGIC, 13, struct ipmi_req)
 #define L_IPMICTL_REGISTER_FOR_CMD        _IOW(IPMI_IOC_MAGIC, 14, struct ipmi_cmdspec)
 #define L_IPMICTL_UNREGISTER_FOR_CMD      _IOW(IPMI_IOC_MAGIC, 15, struct ipmi_cmdspec)
 #define L_IPMICTL_SET_GETS_EVENTS_CMD     _IOW(IPMI_IOC_MAGIC, 16, int)
 #define L_IPMICTL_SET_MY_ADDRESS_CMD      _IOW(IPMI_IOC_MAGIC, 17, unsigned int)
 #define L_IPMICTL_GET_MY_ADDRESS_CMD      _IOW(IPMI_IOC_MAGIC, 18, unsigned int)
 #define L_IPMICTL_SET_MY_LUN_CMD          _IOW(IPMI_IOC_MAGIC, 19, unsigned int)
 #define L_IPMICTL_GET_MY_LUN_CMD          _IOW(IPMI_IOC_MAGIC, 20, unsigned int)
 
 static linux_ioctl_function_t ipmi_linux_ioctl;
 static struct linux_ioctl_handler ipmi_linux_handler = {ipmi_linux_ioctl,
 						       IPMI_LINUX_IOCTL_MIN,
 						       IPMI_LINUX_IOCTL_MAX};
 
 SYSINIT  (ipmi_linux_register,   SI_SUB_KLD, SI_ORDER_MIDDLE,
 	  linux_ioctl_register_handler, &ipmi_linux_handler);
 SYSUNINIT(ipmi_linux_unregister, SI_SUB_KLD, SI_ORDER_MIDDLE,
 	  linux_ioctl_unregister_handler, &ipmi_linux_handler);
 
 /*
  * Module event handler for ipmi_linux.  Every event type is accepted and
  * none triggers any work here; the function always returns 0.
  */
 static int
 ipmi_linux_modevent(module_t mod, int type, void *data)
 {
 
 	/* No load/unload action needs special handling at present. */
 	return (0);
 }
 
 DEV_MODULE(ipmi_linux, ipmi_linux_modevent, NULL);
 MODULE_DEPEND(ipmi_linux, linux, 1, 1, 1);
 
 /*
  * Linux ioctl handler for the ipmi device.
  *
  * Looks up the file for args->fd with the CAP_IOCTL right, translates the
  * Linux encodings of the "get my address" and "get my LUN" commands to the
  * native command numbers, and forwards the request to the file's ioctl
  * method.  All other command values are passed through unchanged.
  *
  * Returns the error from fget() if the descriptor lookup fails, otherwise
  * the result of fo_ioctl().
  */
 static int
 ipmi_linux_ioctl(struct thread *td, struct linux_ioctl_args *args)
 {
 	cap_rights_t rights;
 	struct file *fp;
 	u_long cmd;
 	int error;
 
-	error = fget(td, args->fd, cap_rights_init(&rights, CAP_IOCTL), &fp);
+	error = fget(td, args->fd, cap_rights_init_one(&rights, CAP_IOCTL),
+	    &fp);
 	if (error != 0)
 		return (error);
 	cmd = args->cmd;
 
 	/* Only these two commands are remapped to native command numbers. */
 	switch(cmd) {
 	case L_IPMICTL_GET_MY_ADDRESS_CMD:
 		cmd = IPMICTL_GET_MY_ADDRESS_CMD;
 		break;
 	case L_IPMICTL_GET_MY_LUN_CMD:
 		cmd = IPMICTL_GET_MY_LUN_CMD;
 		break;
 	}
 	/*
 	 * Pass the ioctl off to our standard handler.
 	 */
 	error = (fo_ioctl(fp, cmd, (caddr_t)args->arg, td->td_ucred, td));
 	/* Release the reference taken by fget(). */
 	fdrop(fp, td);
 	return (error);
 }
diff --git a/sys/dev/iscsi/icl_soft.c b/sys/dev/iscsi/icl_soft.c
index f8bcbb9eaa1c..bd4d9db81bb0 100644
--- a/sys/dev/iscsi/icl_soft.c
+++ b/sys/dev/iscsi/icl_soft.c
@@ -1,1635 +1,1635 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2012 The FreeBSD Foundation
  *
  * This software was developed by Edward Tomasz Napierala under sponsorship
  * from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 /*
  * Software implementation of iSCSI Common Layer kobj(9) interface.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/condvar.h>
 #include <sys/conf.h>
 #include <sys/gsb_crc32.h>
 #include <sys/file.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/module.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/sx.h>
 #include <sys/uio.h>
 #include <vm/uma.h>
 #include <netinet/in.h>
 #include <netinet/tcp.h>
 
 #include <dev/iscsi/icl.h>
 #include <dev/iscsi/iscsi_proto.h>
 #include <icl_conn_if.h>
 
 /*
  * PDU as used by the software backend: the generic icl_pdu followed by
  * bookkeeping private to this file.  The generic part has to stay first,
  * because the code casts struct icl_pdu * to struct icl_soft_pdu *.
  */
 struct icl_soft_pdu {
 	struct icl_pdu	 ip;
 
 	/* soft specific stuff goes here. */
 	/* Reference count; the callback runs once the last one is dropped. */
 	u_int		 ref_cnt;
 	/* Optional completion callback; may be NULL. */
 	icl_pdu_cb	 cb;
 	/* Last non-zero error recorded by icl_soft_pdu_done(). */
 	int		 error;
 };
 
 /* When non-zero, icl_conn_send_pdus() merges queued PDUs into one send. */
 static int coalesce = 1;
 SYSCTL_INT(_kern_icl, OID_AUTO, coalesce, CTLFLAG_RWTUN,
     &coalesce, 0, "Try to coalesce PDUs before sending");
 /* Upper bound on how much of a data segment the receiver waits for. */
 static int partial_receive_len = 128 * 1024;
 SYSCTL_INT(_kern_icl, OID_AUTO, partial_receive_len, CTLFLAG_RWTUN,
     &partial_receive_len, 0, "Minimum read size for partially received "
     "data segment");
 /* Socket buffer sizes passed to soreserve() in icl_conn_start(). */
 static int sendspace = 1048576;
 SYSCTL_INT(_kern_icl, OID_AUTO, sendspace, CTLFLAG_RWTUN,
     &sendspace, 0, "Default send socket buffer size");
 static int recvspace = 1048576;
 SYSCTL_INT(_kern_icl, OID_AUTO, recvspace, CTLFLAG_RWTUN,
     &recvspace, 0, "Default receive socket buffer size");
 
 static MALLOC_DEFINE(M_ICL_SOFT, "icl_soft", "iSCSI software backend");
 /* UMA zone from which struct icl_soft_pdu objects are allocated. */
 static uma_zone_t icl_soft_pdu_zone;
 
 /* Number of existing connections; see icl_soft_new_conn()/_conn_free(). */
 static volatile u_int	icl_ncons;
 
 /*
  * Helpers for the per-connection mutex (ic_lock).  The argument is
  * parenthesized so that any pointer-valued expression can be passed,
  * not just a plain identifier.
  */
 #define ICL_CONN_LOCK(X)		mtx_lock((X)->ic_lock)
 #define ICL_CONN_UNLOCK(X)		mtx_unlock((X)->ic_lock)
 #define ICL_CONN_LOCK_ASSERT(X)		mtx_assert((X)->ic_lock, MA_OWNED)
 #define ICL_CONN_LOCK_ASSERT_NOT(X)	mtx_assert((X)->ic_lock, MA_NOTOWNED)
 
 STAILQ_HEAD(icl_pdu_stailq, icl_pdu);
 
 static icl_conn_new_pdu_t	icl_soft_conn_new_pdu;
 static icl_conn_pdu_free_t	icl_soft_conn_pdu_free;
 static icl_conn_pdu_data_segment_length_t
 				    icl_soft_conn_pdu_data_segment_length;
 static icl_conn_pdu_append_data_t	icl_soft_conn_pdu_append_data;
 static icl_conn_pdu_get_data_t	icl_soft_conn_pdu_get_data;
 static icl_conn_pdu_queue_t	icl_soft_conn_pdu_queue;
 static icl_conn_pdu_queue_cb_t	icl_soft_conn_pdu_queue_cb;
 static icl_conn_handoff_t	icl_soft_conn_handoff;
 static icl_conn_free_t		icl_soft_conn_free;
 static icl_conn_close_t		icl_soft_conn_close;
 static icl_conn_task_setup_t	icl_soft_conn_task_setup;
 static icl_conn_task_done_t	icl_soft_conn_task_done;
 static icl_conn_transfer_setup_t	icl_soft_conn_transfer_setup;
 static icl_conn_transfer_done_t	icl_soft_conn_transfer_done;
 #ifdef ICL_KERNEL_PROXY
 static icl_conn_connect_t	icl_soft_conn_connect;
 #endif
 
 /*
  * kobj(9) method table mapping the icl_conn interface onto the software
  * implementations in this file.  The table ends with a zeroed entry.
  */
 static kobj_method_t icl_soft_methods[] = {
 	KOBJMETHOD(icl_conn_new_pdu, icl_soft_conn_new_pdu),
 	KOBJMETHOD(icl_conn_pdu_free, icl_soft_conn_pdu_free),
 	KOBJMETHOD(icl_conn_pdu_data_segment_length,
 	    icl_soft_conn_pdu_data_segment_length),
 	KOBJMETHOD(icl_conn_pdu_append_data, icl_soft_conn_pdu_append_data),
 	KOBJMETHOD(icl_conn_pdu_get_data, icl_soft_conn_pdu_get_data),
 	KOBJMETHOD(icl_conn_pdu_queue, icl_soft_conn_pdu_queue),
 	KOBJMETHOD(icl_conn_pdu_queue_cb, icl_soft_conn_pdu_queue_cb),
 	KOBJMETHOD(icl_conn_handoff, icl_soft_conn_handoff),
 	KOBJMETHOD(icl_conn_free, icl_soft_conn_free),
 	KOBJMETHOD(icl_conn_close, icl_soft_conn_close),
 	KOBJMETHOD(icl_conn_task_setup, icl_soft_conn_task_setup),
 	KOBJMETHOD(icl_conn_task_done, icl_soft_conn_task_done),
 	KOBJMETHOD(icl_conn_transfer_setup, icl_soft_conn_transfer_setup),
 	KOBJMETHOD(icl_conn_transfer_done, icl_soft_conn_transfer_done),
 #ifdef ICL_KERNEL_PROXY
 	KOBJMETHOD(icl_conn_connect, icl_soft_conn_connect),
 #endif
 	{ 0, 0 }
 };
 
 /* Instances of this class are sized as a plain struct icl_conn. */
 DEFINE_CLASS(icl_soft, icl_soft_methods, sizeof(struct icl_conn));
 
 /*
  * Flag the connection's socket as failed and invoke the ic_error hook.
  * Does nothing when no socket is attached to the connection.
  */
 static void
 icl_conn_fail(struct icl_conn *ic)
 {
 
 	if (ic->ic_socket != NULL) {
 		/*
 		 * XXX
 		 */
 		ic->ic_socket->so_error = EDOOFUS;
 		(ic->ic_error)(ic);
 	}
 }
 
 /*
  * Read exactly "len" bytes from the connection's socket without blocking
  * and return them as an mbuf chain.  Returns NULL on a socket error or
  * when fewer than "len" bytes could be read.
  */
 static struct mbuf *
 icl_conn_receive(struct icl_conn *ic, size_t len)
 {
 	struct uio req;
 	struct mbuf *chain;
 	int error, rcvflags;
 
 	memset(&req, 0, sizeof(req));
 	req.uio_resid = len;
 	rcvflags = MSG_DONTWAIT;
 
 	error = soreceive(ic->ic_socket, NULL, &req, &chain, NULL,
 	    &rcvflags);
 	if (error != 0) {
 		ICL_DEBUG("soreceive error %d", error);
 		return (NULL);
 	}
 
 	/* Anything left in uio_resid means the read came up short. */
 	if (req.uio_resid == 0)
 		return (chain);
 
 	m_freem(chain);
 	ICL_DEBUG("short read");
 	return (NULL);
 }
 
 /*
  * Read exactly "len" bytes from the connection's socket into the kernel
  * buffer "buf" without blocking.  Returns 0 on success and -1 on a socket
  * error or short read.
  */
 static int
 icl_conn_receive_buf(struct icl_conn *ic, void *buf, size_t len)
 {
 	struct iovec vec;
 	struct uio req;
 	int error, rcvflags;
 
 	vec.iov_base = buf;
 	vec.iov_len = len;
 
 	memset(&req, 0, sizeof(req));
 	req.uio_iov = &vec;
 	req.uio_iovcnt = 1;
 	req.uio_offset = 0;
 	req.uio_resid = len;
 	req.uio_segflg = UIO_SYSSPACE;
 	req.uio_rw = UIO_READ;
 
 	rcvflags = MSG_DONTWAIT;
 	error = soreceive(ic->ic_socket, NULL, &req, NULL, NULL, &rcvflags);
 	if (error != 0) {
 		ICL_DEBUG("soreceive error %d", error);
 		return (-1);
 	}
 	if (req.uio_resid != 0) {
 		ICL_DEBUG("short read");
 		return (-1);
 	}
 
 	return (0);
 }
 
 /*
  * Release a PDU that is no longer referenced: free its BHS, AHS and data
  * mbuf chains and return the PDU itself to the zone.
  */
 static void
 icl_soft_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip)
 {
 	struct icl_soft_pdu *soft;
 
 	soft = (struct icl_soft_pdu *)ip;
 	KASSERT(soft->ref_cnt == 0, ("freeing active PDU"));
 
 	m_freem(ip->ip_bhs_mbuf);
 	m_freem(ip->ip_ahs_mbuf);
 	m_freem(ip->ip_data_mbuf);
 	uma_zfree(icl_soft_pdu_zone, soft);
 #ifdef DIAGNOSTIC
 	/* Balances the refcount_acquire() in icl_soft_conn_new_pdu(). */
 	refcount_release(&ic->ic_outstanding_pdus);
 #endif
 }
 
 static void
 icl_soft_pdu_call_cb(struct icl_pdu *ip)
 {
 	struct icl_soft_pdu *isp = (struct icl_soft_pdu *)ip;
 
 	if (isp->cb != NULL)
 		isp->cb(ip, isp->error);
 #ifdef DIAGNOSTIC
 	refcount_release(&ip->ip_conn->ic_outstanding_pdus);
 #endif
 	uma_zfree(icl_soft_pdu_zone, isp);
 }
 
 static void
 icl_soft_pdu_done(struct icl_pdu *ip, int error)
 {
 	struct icl_soft_pdu *isp = (struct icl_soft_pdu *)ip;
 
 	if (error != 0)
 		isp->error = error;
 
 	m_freem(ip->ip_bhs_mbuf);
 	ip->ip_bhs_mbuf = NULL;
 	m_freem(ip->ip_ahs_mbuf);
 	ip->ip_ahs_mbuf = NULL;
 	m_freem(ip->ip_data_mbuf);
 	ip->ip_data_mbuf = NULL;
 
 	if (atomic_fetchadd_int(&isp->ref_cnt, -1) == 1)
 		icl_soft_pdu_call_cb(ip);
 }
 
 /*
  * Free routine registered through m_extaddref() in
  * icl_soft_conn_pdu_append_data(); ext_arg1 carries the owning PDU.
  */
 static void
 icl_soft_mbuf_done(struct mbuf *mb)
 {
 	struct icl_soft_pdu *owner;
 
 	owner = (struct icl_soft_pdu *)mb->m_ext.ext_arg1;
 	icl_soft_pdu_call_cb(&owner->ip);
 }
 
 /*
  * Allocate icl_pdu with empty BHS to fill up by the caller.
  */
 struct icl_pdu *
 icl_soft_conn_new_pdu(struct icl_conn *ic, int flags)
 {
 	struct icl_soft_pdu *isp;
 	struct icl_pdu *ip;
 
 #ifdef DIAGNOSTIC
 	refcount_acquire(&ic->ic_outstanding_pdus);
 #endif
 	isp = uma_zalloc(icl_soft_pdu_zone, flags | M_ZERO);
 	if (isp == NULL) {
 		ICL_WARN("failed to allocate soft PDU");
 #ifdef DIAGNOSTIC
 		refcount_release(&ic->ic_outstanding_pdus);
 #endif
 		return (NULL);
 	}
 	ip = &isp->ip;
 	ip->ip_conn = ic;
 
 	CTASSERT(sizeof(struct iscsi_bhs) <= MHLEN);
 	ip->ip_bhs_mbuf = m_gethdr(flags, MT_DATA);
 	if (ip->ip_bhs_mbuf == NULL) {
 		ICL_WARN("failed to allocate BHS mbuf");
 		icl_soft_conn_pdu_free(ic, ip);
 		return (NULL);
 	}
 	ip->ip_bhs = mtod(ip->ip_bhs_mbuf, struct iscsi_bhs *);
 	memset(ip->ip_bhs, 0, sizeof(struct iscsi_bhs));
 	ip->ip_bhs_mbuf->m_len = sizeof(struct iscsi_bhs);
 
 	return (ip);
 }
 
 static int
 icl_pdu_ahs_length(const struct icl_pdu *request)
 {
 
 	return (request->ip_bhs->bhs_total_ahs_len * 4);
 }
 
 /*
  * Decode the three-element, most-significant-first DataSegmentLength
  * field of the BHS into a host integer.
  */
 static size_t
 icl_pdu_data_segment_length(const struct icl_pdu *request)
 {
 	uint32_t len = 0;
 	int i;
 
 	for (i = 0; i < 3; i++)
 		len = (len << 8) + request->ip_bhs->bhs_data_segment_len[i];
 
 	return (len);
 }
 
 /*
  * icl_conn_pdu_data_segment_length method: returns the data segment
  * length encoded in the BHS of "request".  "ic" is unused.
  */
 size_t
 icl_soft_conn_pdu_data_segment_length(struct icl_conn *ic,
     const struct icl_pdu *request)
 {
 
 	return (icl_pdu_data_segment_length(request));
 }
 
 static void
 icl_pdu_set_data_segment_length(struct icl_pdu *response, uint32_t len)
 {
 
 	response->ip_bhs->bhs_data_segment_len[2] = len;
 	response->ip_bhs->bhs_data_segment_len[1] = len >> 8;
 	response->ip_bhs->bhs_data_segment_len[0] = len >> 16;
 }
 
 /*
  * Return how many bytes are needed to round ip_data_len up to the next
  * multiple of four (0 when it already is one).
  */
 static size_t
 icl_pdu_padding(const struct icl_pdu *ip)
 {
 
 	switch (ip->ip_data_len % 4) {
 	case 0:
 		return (0);
 	default:
 		return (4 - (ip->ip_data_len % 4));
 	}
 }
 
 static size_t
 icl_pdu_size(const struct icl_pdu *response)
 {
 	size_t len;
 
 	KASSERT(response->ip_ahs_len == 0, ("responding with AHS"));
 
 	len = sizeof(struct iscsi_bhs) + response->ip_data_len +
 	    icl_pdu_padding(response);
 	if (response->ip_conn->ic_header_crc32c)
 		len += ISCSI_HEADER_DIGEST_SIZE;
 	if (response->ip_data_len != 0 && response->ip_conn->ic_data_crc32c)
 		len += ISCSI_DATA_DIGEST_SIZE;
 
 	return (len);
 }
 
 /*
  * Read the BHS of "request" from the socket into ip_bhs and subtract its
  * size from *availablep.  Returns 0 on success, -1 on failure.
  */
 static int
 icl_pdu_receive_bhs(struct icl_pdu *request, size_t *availablep)
 {
 	int failed;
 
 	failed = icl_conn_receive_buf(request->ip_conn, request->ip_bhs,
 	    sizeof(struct iscsi_bhs));
 	if (failed) {
 		ICL_DEBUG("failed to receive BHS");
 		return (-1);
 	}
 
 	*availablep -= sizeof(struct iscsi_bhs);
 	return (0);
 }
 
 /*
  * Read the AHS announced in the BHS of "request", if any, into
  * ip_ahs_mbuf and subtract its length from *availablep.  Returns 0 on
  * success (including the no-AHS case) and -1 when the read fails.
  */
 static int
 icl_pdu_receive_ahs(struct icl_pdu *request, size_t *availablep)
 {
 	struct mbuf *ahs;
 
 	request->ip_ahs_len = icl_pdu_ahs_length(request);
 	if (request->ip_ahs_len != 0) {
 		ahs = icl_conn_receive(request->ip_conn, request->ip_ahs_len);
 		request->ip_ahs_mbuf = ahs;
 		if (ahs == NULL) {
 			ICL_DEBUG("failed to receive AHS");
 			return (-1);
 		}
 
 		*availablep -= request->ip_ahs_len;
 	}
 
 	return (0);
 }
 
 /*
  * Compute the CRC32C digest over every mbuf of the chain "m0": start from
  * 0xffffffff, feed each mbuf's m_len bytes, and invert the result.
  */
 static uint32_t
 icl_mbuf_to_crc32c(const struct mbuf *m0)
 {
 	const struct mbuf *cur = m0;
 	uint32_t crc = 0xffffffff;
 
 	while (cur != NULL) {
 		crc = calculate_crc32c(crc, mtod(cur, const void *),
 		    cur->m_len);
 		cur = cur->m_next;
 	}
 
 	return (crc ^ 0xffffffff);
 }
 
 /*
  * When header digests are enabled on the connection, read the digest
  * from the socket and compare it with the CRC32C computed over BHS + AHS.
  * Returns 0 if digests are disabled or the digest matches, -1 otherwise.
  */
 static int
 icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep)
 {
 	uint32_t wire_crc, local_crc;
 
 	if (request->ip_conn->ic_header_crc32c == false)
 		return (0);
 
 	CTASSERT(sizeof(wire_crc) == ISCSI_HEADER_DIGEST_SIZE);
 	if (icl_conn_receive_buf(request->ip_conn, &wire_crc,
 	    ISCSI_HEADER_DIGEST_SIZE) != 0) {
 		ICL_DEBUG("failed to receive header digest");
 		return (-1);
 	}
 	*availablep -= ISCSI_HEADER_DIGEST_SIZE;
 
 	/*
 	 * Temporarily chain the AHS behind the BHS so that one pass over
 	 * the chain covers the whole header, then unlink it again.
 	 */
 	request->ip_bhs_mbuf->m_next = request->ip_ahs_mbuf;
 	local_crc = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
 	request->ip_bhs_mbuf->m_next = NULL;
 
 	if (wire_crc == local_crc)
 		return (0);
 
 	ICL_WARN("header digest check failed; got 0x%x, "
 	    "should be 0x%x", wire_crc, local_crc);
 	return (-1);
 }
 
 /*
  * Return the number of bytes that should be waiting in the receive socket
  * before icl_pdu_receive_data_segment() gets called.
  *
  * The result is 0 for a PDU without a data segment, partial_receive_len
  * when more than that is still outstanding, and otherwise the outstanding
  * length rounded up to a multiple of four.
  */
 static size_t
 icl_pdu_data_segment_receive_len(const struct icl_pdu *request)
 {
 	size_t len;
 
 	len = icl_pdu_data_segment_length(request);
 	if (len == 0)
 		return (0);
 
 	/*
 	 * Account for the parts of data segment already read from
 	 * the socket buffer.
 	 */
 	KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
 	len -= request->ip_data_len;
 
 	/*
 	 * Don't always wait for the full data segment to be delivered
 	 * to the socket; this might badly affect performance due to
 	 * TCP window scaling.
 	 */
 	/* partial_receive_len is an int; it is compared against a size_t. */
 	if (len > partial_receive_len) {
 #if 0
 		ICL_DEBUG("need %zd bytes of data, limiting to %zd",
 		    len, partial_receive_len));
 #endif
 		len = partial_receive_len;
 
 		return (len);
 	}
 
 	/*
 	 * Account for padding.  Note that due to the way code is written,
 	 * the icl_pdu_receive_data_segment() must always receive padding
 	 * along with the last part of data segment, because it would be
 	 * impossible to tell whether we've already received the full data
 	 * segment including padding, or without it.
 	 */
 	if ((len % 4) != 0)
 		len += 4 - (len % 4);
 
 #if 0
 	ICL_DEBUG("need %zd bytes of data", len));
 #endif
 
 	return (len);
 }
 
 /*
  * Receive the next piece of the data segment of "request".
  *
  * *availablep is the number of readable bytes the caller accounted for;
  * it is reduced by whatever is consumed here.  *more_neededp is set to
  * true when only part of the outstanding data fit into *availablep; in
  * that case ic_receive_len is recomputed for the next call.  Padding is
  * only read together with the final piece of data.
  *
  * Returns 0 on success and -1 when the socket read fails.
  */
 static int
 icl_pdu_receive_data_segment(struct icl_pdu *request,
     size_t *availablep, bool *more_neededp)
 {
 	struct icl_conn *ic;
 	size_t len, padding = 0;
 	struct mbuf *m;
 
 	ic = request->ip_conn;
 
 	*more_neededp = false;
 	ic->ic_receive_len = 0;
 
 	len = icl_pdu_data_segment_length(request);
 	if (len == 0)
 		return (0);
 
 	/* Padding is derived from the full segment length in the BHS. */
 	if ((len % 4) != 0)
 		padding = 4 - (len % 4);
 
 	/*
 	 * Account for already received parts of data segment.
 	 */
 	KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
 	len -= request->ip_data_len;
 
 	if (len + padding > *availablep) {
 		/*
 		 * Not enough data in the socket buffer.  Receive as much
 		 * as we can.  Don't receive padding, since, obviously, it's
 		 * not the end of data segment yet.
 		 */
 #if 0
 		ICL_DEBUG("limited from %zd to %zd",
 		    len + padding, *availablep - padding));
 #endif
 		/*
 		 * NOTE(review): unsigned subtraction; it would wrap if
 		 * *availablep were smaller than padding.  Presumably the
 		 * ic_receive_len checks in the callers prevent that --
 		 * confirm, especially for very small partial_receive_len.
 		 */
 		len = *availablep - padding;
 		*more_neededp = true;
 		padding = 0;
 	}
 
 	/*
 	 * Must not try to receive padding without at least one byte
 	 * of actual data segment.
 	 */
 	if (len > 0) {
 		m = icl_conn_receive(request->ip_conn, len + padding);
 		if (m == NULL) {
 			ICL_DEBUG("failed to receive data segment");
 			return (-1);
 		}
 
 		/* Append the new piece to what has been received so far. */
 		if (request->ip_data_mbuf == NULL)
 			request->ip_data_mbuf = m;
 		else
 			m_cat(request->ip_data_mbuf, m);
 
 		/* ip_data_len counts payload only; padding is excluded. */
 		request->ip_data_len += len;
 		*availablep -= len + padding;
 	} else
 		ICL_DEBUG("len 0");
 
 	if (*more_neededp)
 		ic->ic_receive_len =
 		    icl_pdu_data_segment_receive_len(request);
 
 	return (0);
 }
 
 /*
  * When data digests are enabled and the PDU carries data, read the digest
  * from the socket and compare it with the CRC32C of the received data
  * chain.  Returns 0 when there is nothing to check or the digest matches,
  * -1 otherwise.
  */
 static int
 icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep)
 {
 	uint32_t wire_crc, local_crc;
 
 	if (request->ip_conn->ic_data_crc32c == false ||
 	    request->ip_data_len == 0)
 		return (0);
 
 	CTASSERT(sizeof(wire_crc) == ISCSI_DATA_DIGEST_SIZE);
 	if (icl_conn_receive_buf(request->ip_conn, &wire_crc,
 	    ISCSI_DATA_DIGEST_SIZE) != 0) {
 		ICL_DEBUG("failed to receive data digest");
 		return (-1);
 	}
 	*availablep -= ISCSI_DATA_DIGEST_SIZE;
 
 	/*
 	 * The ip_data_mbuf chain holds the padding as well.  The digest is
 	 * meant to cover it, so the CRC runs over the entire chain rather
 	 * than just the first ip_data_len bytes.
 	 */
 	local_crc = icl_mbuf_to_crc32c(request->ip_data_mbuf);
 	if (wire_crc == local_crc)
 		return (0);
 
 	ICL_WARN("data digest check failed; got 0x%x, "
 	    "should be 0x%x", wire_crc, local_crc);
 	return (-1);
 }
 
 /*
  * Somewhat contrary to the name, this attempts to receive only one
  * "part" of PDU at a time; call it repeatedly until it returns non-NULL.
  */
 static struct icl_pdu *
 icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
 {
 	struct icl_pdu *request;
 	struct socket *so;
 	size_t len;
 	int error;
 	bool more_needed;
 
 	so = ic->ic_socket;
 
 	if (ic->ic_receive_state == ICL_CONN_STATE_BHS) {
 		KASSERT(ic->ic_receive_pdu == NULL,
 		    ("ic->ic_receive_pdu != NULL"));
 		request = icl_soft_conn_new_pdu(ic, M_NOWAIT);
 		if (request == NULL) {
 			ICL_DEBUG("failed to allocate PDU; "
 			    "dropping connection");
 			icl_conn_fail(ic);
 			return (NULL);
 		}
 		ic->ic_receive_pdu = request;
 	} else {
 		KASSERT(ic->ic_receive_pdu != NULL,
 		    ("ic->ic_receive_pdu == NULL"));
 		request = ic->ic_receive_pdu;
 	}
 
 	if (*availablep < ic->ic_receive_len) {
 #if 0
 		ICL_DEBUG("not enough data; need %zd, "
 		    "have %zd", ic->ic_receive_len, *availablep);
 #endif
 		return (NULL);
 	}
 
 	switch (ic->ic_receive_state) {
 	case ICL_CONN_STATE_BHS:
 		//ICL_DEBUG("receiving BHS");
 		error = icl_pdu_receive_bhs(request, availablep);
 		if (error != 0) {
 			ICL_DEBUG("failed to receive BHS; "
 			    "dropping connection");
 			break;
 		}
 
 		/*
 		 * We don't enforce any limit for AHS length;
 		 * its length is stored in 8 bit field.
 		 */
 
 		len = icl_pdu_data_segment_length(request);
 		if (len > ic->ic_max_data_segment_length) {
 			ICL_WARN("received data segment "
 			    "length %zd is larger than negotiated "
 			    "MaxDataSegmentLength %zd; "
 			    "dropping connection",
 			    len, ic->ic_max_data_segment_length);
 			error = EINVAL;
 			break;
 		}
 
 		ic->ic_receive_state = ICL_CONN_STATE_AHS;
 		ic->ic_receive_len = icl_pdu_ahs_length(request);
 		break;
 
 	case ICL_CONN_STATE_AHS:
 		//ICL_DEBUG("receiving AHS");
 		error = icl_pdu_receive_ahs(request, availablep);
 		if (error != 0) {
 			ICL_DEBUG("failed to receive AHS; "
 			    "dropping connection");
 			break;
 		}
 		ic->ic_receive_state = ICL_CONN_STATE_HEADER_DIGEST;
 		if (ic->ic_header_crc32c == false)
 			ic->ic_receive_len = 0;
 		else
 			ic->ic_receive_len = ISCSI_HEADER_DIGEST_SIZE;
 		break;
 
 	case ICL_CONN_STATE_HEADER_DIGEST:
 		//ICL_DEBUG("receiving header digest");
 		error = icl_pdu_check_header_digest(request, availablep);
 		if (error != 0) {
 			ICL_DEBUG("header digest failed; "
 			    "dropping connection");
 			break;
 		}
 
 		ic->ic_receive_state = ICL_CONN_STATE_DATA;
 		ic->ic_receive_len =
 		    icl_pdu_data_segment_receive_len(request);
 		break;
 
 	case ICL_CONN_STATE_DATA:
 		//ICL_DEBUG("receiving data segment");
 		error = icl_pdu_receive_data_segment(request, availablep,
 		    &more_needed);
 		if (error != 0) {
 			ICL_DEBUG("failed to receive data segment;"
 			    "dropping connection");
 			break;
 		}
 
 		if (more_needed)
 			break;
 
 		ic->ic_receive_state = ICL_CONN_STATE_DATA_DIGEST;
 		if (request->ip_data_len == 0 || ic->ic_data_crc32c == false)
 			ic->ic_receive_len = 0;
 		else
 			ic->ic_receive_len = ISCSI_DATA_DIGEST_SIZE;
 		break;
 
 	case ICL_CONN_STATE_DATA_DIGEST:
 		//ICL_DEBUG("receiving data digest");
 		error = icl_pdu_check_data_digest(request, availablep);
 		if (error != 0) {
 			ICL_DEBUG("data digest failed; "
 			    "dropping connection");
 			break;
 		}
 
 		/*
 		 * We've received complete PDU; reset the receive state machine
 		 * and return the PDU.
 		 */
 		ic->ic_receive_state = ICL_CONN_STATE_BHS;
 		ic->ic_receive_len = sizeof(struct iscsi_bhs);
 		ic->ic_receive_pdu = NULL;
 		return (request);
 
 	default:
 		panic("invalid ic_receive_state %d\n", ic->ic_receive_state);
 	}
 
 	if (error != 0) {
 		/*
 		 * Don't free the PDU; it's pointed to by ic->ic_receive_pdu
 		 * and will get freed in icl_soft_conn_close().
 		 */
 		icl_conn_fail(ic);
 	}
 
 	return (NULL);
 }
 
 /*
  * Pull PDUs out of the socket for as long as "available" bytes cover the
  * next step of the receive state machine, handing each complete PDU to
  * the ic_receive callback.  Returns when the connection is disconnecting,
  * has failed, or there is not enough data for the next step.
  */
 static void
 icl_conn_receive_pdus(struct icl_conn *ic, size_t available)
 {
 	struct icl_pdu *response;
 	struct socket *so;
 
 	so = ic->ic_socket;
 
 	/*
 	 * This can never happen; we're careful to only mess with ic->ic_socket
 	 * pointer when the send/receive threads are not running.
 	 */
 	KASSERT(so != NULL, ("NULL socket"));
 
 	for (;;) {
 		if (ic->ic_disconnecting)
 			return;
 
 		if (so->so_error != 0) {
 			ICL_DEBUG("connection error %d; "
 			    "dropping connection", so->so_error);
 			icl_conn_fail(ic);
 			return;
 		}
 
 		/*
 		 * Loop until we have a complete PDU or there is not enough
 		 * data in the socket buffer.
 		 */
 		if (available < ic->ic_receive_len) {
 #if 0
 			ICL_DEBUG("not enough data; have %zd, "
 			    "need %zd", available,
 			    ic->ic_receive_len);
 #endif
 			return;
 		}
 
 		/* NULL means "one part received" or "failed"; re-check. */
 		response = icl_conn_receive_pdu(ic, &available);
 		if (response == NULL)
 			continue;
 
 		if (response->ip_ahs_len > 0) {
 			ICL_WARN("received PDU with unsupported "
 			    "AHS; opcode 0x%x; dropping connection",
 			    response->ip_bhs->bhs_opcode);
 			icl_soft_conn_pdu_free(ic, response);
 			icl_conn_fail(ic);
 			return;
 		}
 
 		(ic->ic_receive)(response);
 	}
 }
 
 /*
  * Body of the receive thread; "arg" is the struct icl_conn.  Loops until
  * ic_disconnecting is set: waits on ic_receive_cv while the socket holds
  * fewer than ic_receive_len bytes, then processes whatever is available.
  * On exit it clears ic_receive_running and signals ic_send_cv.
  */
 static void
 icl_receive_thread(void *arg)
 {
 	struct icl_conn *ic;
 	size_t available;
 	struct socket *so;
 
 	ic = arg;
 	so = ic->ic_socket;
 
 	for (;;) {
 		if (ic->ic_disconnecting) {
 			//ICL_DEBUG("terminating");
 			break;
 		}
 
 		/*
 		 * Set the low watermark, to be checked by
 		 * soreadable() in icl_soupcall_receive()
 		 * to avoid unnecessary wakeups until there
 		 * is enough data received to read the PDU.
 		 */
 		SOCKBUF_LOCK(&so->so_rcv);
 		available = sbavail(&so->so_rcv);
 		if (available < ic->ic_receive_len) {
 			so->so_rcv.sb_lowat = ic->ic_receive_len;
 			/* Sleeps with the receive sockbuf mutex as interlock. */
 			cv_wait(&ic->ic_receive_cv, &so->so_rcv.sb_mtx);
 		} else
 			so->so_rcv.sb_lowat = so->so_rcv.sb_hiwat + 1;
 		SOCKBUF_UNLOCK(&so->so_rcv);
 
 		/*
 		 * "available" is the value sampled before any sleep above;
 		 * it is not refreshed after waking up.
 		 */
 		icl_conn_receive_pdus(ic, available);
 	}
 
 	ICL_CONN_LOCK(ic);
 	ic->ic_receive_running = false;
 	cv_signal(&ic->ic_send_cv);
 	ICL_CONN_UNLOCK(ic);
 	kthread_exit();
 }
 
 /*
  * Receive-side socket upcall; "arg" is the struct icl_conn.  Signals
  * ic_receive_cv when soreadable() reports the socket readable.  Always
  * returns SU_OK.
  */
 static int
 icl_soupcall_receive(struct socket *so, void *arg, int waitflag)
 {
 	struct icl_conn *ic = arg;
 
 	if (soreadable(so))
 		cv_signal(&ic->ic_receive_cv);
 
 	return (SU_OK);
 }
 
 /*
  * Turn "request" into a single mbuf chain ready for sosend(): write the
  * data segment length into the BHS, append the header digest, padding and
  * data digest as configured, and link the data chain behind the BHS.
  * Returns 0 on success and 1 when an m_append() fails.
  */
 static int
 icl_pdu_finalize(struct icl_pdu *request)
 {
 	struct icl_conn *conn = request->ip_conn;
 	uint32_t crc, zeroes = 0;
 	size_t npad, total;
 	int appended;
 
 	icl_pdu_set_data_segment_length(request, request->ip_data_len);
 
 	/* Must be computed before the chains are modified below. */
 	total = icl_pdu_size(request);
 
 	if (conn->ic_header_crc32c) {
 		crc = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
 		appended = m_append(request->ip_bhs_mbuf, sizeof(crc),
 		    (void *)&crc);
 		if (appended != 1) {
 			ICL_WARN("failed to append header digest");
 			return (1);
 		}
 	}
 
 	if (request->ip_data_len != 0) {
 		npad = icl_pdu_padding(request);
 		if (npad > 0) {
 			appended = m_append(request->ip_data_mbuf, npad,
 			    (void *)&zeroes);
 			if (appended != 1) {
 				ICL_WARN("failed to append padding");
 				return (1);
 			}
 		}
 
 		/* The data digest covers the padding appended above. */
 		if (conn->ic_data_crc32c) {
 			crc = icl_mbuf_to_crc32c(request->ip_data_mbuf);
 
 			appended = m_append(request->ip_data_mbuf,
 			    sizeof(crc), (void *)&crc);
 			if (appended != 1) {
 				ICL_WARN("failed to append data digest");
 				return (1);
 			}
 		}
 
 		m_cat(request->ip_bhs_mbuf, request->ip_data_mbuf);
 		request->ip_data_mbuf = NULL;
 	}
 
 	request->ip_bhs_mbuf->m_pkthdr.len = total;
 
 	return (0);
 }
 
 /*
  * Send PDUs from "queue" for as long as they fit into the socket send
  * buffer.  Must be called without the connection lock held.
  *
  * PDUs that do not fit stay on "queue"; in that case the send buffer low
  * watermark is raised so that the send upcall fires once there is room.
  * When the "coalesce" tunable is set, consecutive PDUs that fit together
  * are merged into one sosend() call.  On a finalize or send failure the
  * affected PDUs are completed with an error and the connection is failed.
  */
 static void
 icl_conn_send_pdus(struct icl_conn *ic, struct icl_pdu_stailq *queue)
 {
 	struct icl_pdu *request, *request2;
 	struct socket *so;
 	long available, size, size2;
 	int coalesced, error;
 
 	ICL_CONN_LOCK_ASSERT_NOT(ic);
 
 	so = ic->ic_socket;
 
 	SOCKBUF_LOCK(&so->so_snd);
 	/*
 	 * Check how much space we have for transmit.  We can't just
 	 * call sosend() and retry when we get EWOULDBLOCK or EMSGSIZE,
 	 * as it always frees the mbuf chain passed to it, even in case
 	 * of error.
 	 */
 	available = sbspace(&so->so_snd);
 
 	/*
 	 * Notify the socket upcall that we don't need wakeups
 	 * for the time being.
 	 */
 	so->so_snd.sb_lowat = so->so_snd.sb_hiwat + 1;
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	while (!STAILQ_EMPTY(queue)) {
 		request = STAILQ_FIRST(queue);
 		size = icl_pdu_size(request);
 		if (available < size) {
 			/*
 			 * Set the low watermark, to be checked by
 			 * sowriteable() in icl_soupcall_send()
 			 * to avoid unnecessary wakeups until there
 			 * is enough space for the PDU to fit.
 			 */
 			SOCKBUF_LOCK(&so->so_snd);
 			available = sbspace(&so->so_snd);
 			if (available < size) {
 #if 1
 				ICL_DEBUG("no space to send; "
 				    "have %ld, need %ld",
 				    available, size);
 #endif
 				so->so_snd.sb_lowat = max(size,
 				    so->so_snd.sb_hiwat / 8);
 				SOCKBUF_UNLOCK(&so->so_snd);
 				return;
 			}
 			SOCKBUF_UNLOCK(&so->so_snd);
 		}
 		STAILQ_REMOVE_HEAD(queue, ip_next);
 		error = icl_pdu_finalize(request);
 		if (error != 0) {
 			ICL_DEBUG("failed to finalize PDU; "
 			    "dropping connection");
 			icl_soft_pdu_done(request, EIO);
 			icl_conn_fail(ic);
 			return;
 		}
 		if (coalesce) {
 			coalesced = 1;
 			for (;;) {
 				request2 = STAILQ_FIRST(queue);
 				if (request2 == NULL)
 					break;
 				size2 = icl_pdu_size(request2);
 				if (available < size + size2)
 					break;
 				STAILQ_REMOVE_HEAD(queue, ip_next);
 				error = icl_pdu_finalize(request2);
 				if (error != 0) {
 					ICL_DEBUG("failed to finalize PDU; "
 					    "dropping connection");
 					icl_soft_pdu_done(request, EIO);
 					icl_soft_pdu_done(request2, EIO);
 					icl_conn_fail(ic);
 					return;
 				}
 				m_cat(request->ip_bhs_mbuf, request2->ip_bhs_mbuf);
 				request2->ip_bhs_mbuf = NULL;
 				request->ip_bhs_mbuf->m_pkthdr.len += size2;
 				size += size2;
 				/*
 				 * request2 was already unlinked by the
 				 * STAILQ_REMOVE_HEAD() above.  "request" is
 				 * no longer on the queue either, so it must
 				 * not be used as an anchor for any further
 				 * queue manipulation here.
 				 */
 				icl_soft_pdu_done(request2, 0);
 				coalesced++;
 			}
 #if 0
 			if (coalesced > 1) {
 				ICL_DEBUG("coalesced %d PDUs into %ld bytes",
 				    coalesced, size);
 			}
 #endif
 		}
 		available -= size;
 		error = sosend(so, NULL, NULL, request->ip_bhs_mbuf,
 		    NULL, MSG_DONTWAIT, curthread);
 		request->ip_bhs_mbuf = NULL; /* Sosend consumes the mbuf. */
 		if (error != 0) {
 			ICL_DEBUG("failed to send PDU, error %d; "
 			    "dropping connection", error);
 			icl_soft_pdu_done(request, error);
 			icl_conn_fail(ic);
 			return;
 		}
 		icl_soft_pdu_done(request, 0);
 	}
 }
 
 /*
  * Body of the send thread; "arg" is the struct icl_conn.  Repeatedly
  * moves PDUs from ic_to_send to a private queue, sends them with the
  * connection lock dropped, and sleeps on ic_send_cv when there is nothing
  * more it can do.  Terminates once ic_disconnecting is set, after putting
  * unsent PDUs back on ic_to_send and clearing ic_send_running.
  */
 static void
 icl_send_thread(void *arg)
 {
 	struct icl_conn *ic;
 	struct icl_pdu_stailq queue;
 
 	ic = arg;
 
 	STAILQ_INIT(&queue);
 
 	ICL_CONN_LOCK(ic);
 	for (;;) {
 		for (;;) {
 			/*
 			 * If the local queue is empty, populate it from
 			 * the main one.  This way the icl_conn_send_pdus()
 			 * can go through all the queued PDUs without holding
 			 * any locks.
 			 */
 			if (STAILQ_EMPTY(&queue))
 				STAILQ_SWAP(&ic->ic_to_send, &queue, icl_pdu);
 
 			/* Cleared here; set again by icl_soupcall_send(). */
 			ic->ic_check_send_space = false;
 			ICL_CONN_UNLOCK(ic);
 			icl_conn_send_pdus(ic, &queue);
 			ICL_CONN_LOCK(ic);
 
 			/*
 			 * The icl_soupcall_send() was called since the last
 			 * call to sbspace(); go around;
 			 */
 			if (ic->ic_check_send_space)
 				continue;
 
 			/*
 			 * Local queue is empty, but we still have PDUs
 			 * in the main one; go around.
 			 */
 			if (STAILQ_EMPTY(&queue) &&
 			    !STAILQ_EMPTY(&ic->ic_to_send))
 				continue;
 
 			/*
 			 * There might be some stuff in the local queue,
 			 * which didn't get sent due to not having enough send
 			 * space.  Wait for socket upcall.
 			 */
 			break;
 		}
 
 		if (ic->ic_disconnecting) {
 			//ICL_DEBUG("terminating");
 			break;
 		}
 
 		/* Releases ic_lock while asleep and reacquires it on wakeup. */
 		cv_wait(&ic->ic_send_cv, ic->ic_lock);
 	}
 
 	/*
 	 * We're exiting; move PDUs back to the main queue, so they can
 	 * get freed properly.  At this point ordering doesn't matter.
 	 */
 	STAILQ_CONCAT(&ic->ic_to_send, &queue);
 
 	ic->ic_send_running = false;
 	cv_signal(&ic->ic_send_cv);
 	ICL_CONN_UNLOCK(ic);
 	kthread_exit();
 }
 
 /*
  * Send-side socket upcall; "arg" is the struct icl_conn.  When
  * sowriteable() reports the socket writeable, sets ic_check_send_space
  * under the connection lock and signals ic_send_cv.  Always returns SU_OK.
  */
 static int
 icl_soupcall_send(struct socket *so, void *arg, int waitflag)
 {
 	struct icl_conn *ic = arg;
 
 	if (sowriteable(so)) {
 		ICL_CONN_LOCK(ic);
 		ic->ic_check_send_space = true;
 		ICL_CONN_UNLOCK(ic);
 
 		cv_signal(&ic->ic_send_cv);
 	}
 
 	return (SU_OK);
 }
 
 /*
  * icl_conn_pdu_append_data method: add "len" bytes at "addr" to the data
  * segment of "request".
  *
  * With ICL_NOCOPY in "flags" the caller's buffer is attached to a new
  * mbuf as read-only external storage instead of being copied; otherwise
  * the bytes are copied into freshly allocated mbufs.  The remaining flags
  * are passed to the mbuf allocator.  "len" must be non-zero.
  *
  * Returns 0 on success and ENOMEM when mbuf allocation fails.
  */
 static int
 icl_soft_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *request,
     const void *addr, size_t len, int flags)
 {
 	struct icl_soft_pdu *isp = (struct icl_soft_pdu *)request;
 	struct mbuf *mb, *newmb;
 	size_t copylen, off = 0;
 
 	KASSERT(len > 0, ("len == 0"));
 
 	if (flags & ICL_NOCOPY) {
 		newmb = m_get(flags & ~ICL_NOCOPY, MT_DATA);
 		if (newmb == NULL) {
 			ICL_WARN("failed to allocate mbuf");
 			return (ENOMEM);
 		}
 
 		newmb->m_flags |= M_RDONLY;
 		/*
 		 * The PDU's ref_cnt is handed over as the external storage
 		 * reference count, with icl_soft_mbuf_done() as the free
 		 * routine and the PDU as its argument.
 		 */
 		m_extaddref(newmb, __DECONST(char *, addr), len, &isp->ref_cnt,
 		    icl_soft_mbuf_done, isp, NULL);
 		newmb->m_len = len;
 	} else {
 		newmb = m_getm2(NULL, len, flags, MT_DATA, 0);
 		if (newmb == NULL) {
 			ICL_WARN("failed to allocate mbuf for %zd bytes", len);
 			return (ENOMEM);
 		}
 
 		/* Fill each mbuf of the new chain up to its capacity. */
 		for (mb = newmb; mb != NULL; mb = mb->m_next) {
 			copylen = min(M_TRAILINGSPACE(mb), len - off);
 			memcpy(mtod(mb, char *), (const char *)addr + off, copylen);
 			mb->m_len = copylen;
 			off += copylen;
 		}
 		KASSERT(off == len, ("%s: off != len", __func__));
 	}
 
 	/* First piece becomes the data chain; later ones are appended. */
 	if (request->ip_data_mbuf == NULL) {
 		request->ip_data_mbuf = newmb;
 		request->ip_data_len = len;
 	} else {
 		m_cat(request->ip_data_mbuf, newmb);
 		request->ip_data_len += len;
 	}
 
 	return (0);
 }
 
 /*
  * icl_conn_pdu_get_data method: copy "len" bytes starting at offset "off"
  * of the PDU's data chain into the buffer at "addr".  "ic" is unused.
  */
 void
 icl_soft_conn_pdu_get_data(struct icl_conn *ic, struct icl_pdu *ip,
     size_t off, void *addr, size_t len)
 {
 
 	m_copydata(ip->ip_data_mbuf, off, len, addr);
 }
 
 /*
  * icl_conn_pdu_queue method: queue "ip" for transmission without a
  * completion callback.
  */
 static void
 icl_soft_conn_pdu_queue(struct icl_conn *ic, struct icl_pdu *ip)
 {
 
 	icl_soft_conn_pdu_queue_cb(ic, ip, NULL);
 }
 
 /*
  * icl_conn_pdu_queue_cb method: take a reference on "ip", record the
  * completion callback "cb" (may be NULL) and append the PDU to the
  * connection's send queue.  The connection lock must be held.
  *
  * If the connection is disconnecting or has no socket, the PDU is
  * completed immediately with ENOTCONN instead of being queued.
  */
 static void
 icl_soft_conn_pdu_queue_cb(struct icl_conn *ic, struct icl_pdu *ip,
     icl_pdu_cb cb)
 {
 	struct icl_soft_pdu *soft = (struct icl_soft_pdu *)ip;
 	bool was_empty;
 
 	ICL_CONN_LOCK_ASSERT(ic);
 	soft->ref_cnt++;
 	soft->cb = cb;
 
 	if (ic->ic_disconnecting || ic->ic_socket == NULL) {
 		ICL_DEBUG("icl_pdu_queue on closed connection");
 		icl_soft_pdu_done(ip, ENOTCONN);
 		return;
 	}
 
 	was_empty = STAILQ_EMPTY(&ic->ic_to_send);
 	STAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next);
 
 	/*
 	 * A non-empty queue means whoever queued the earlier PDU has
 	 * already signaled the send thread, so only the first PDU on an
 	 * empty queue needs to wake it up.
 	 */
 	if (was_empty)
 		cv_signal(&ic->ic_send_cv);
 }
 
 /*
  * Create a software-backend connection object named "name" that uses
  * "lock" as its connection mutex.  Also bumps the icl_ncons counter.
  */
 static struct icl_conn *
 icl_soft_new_conn(const char *name, struct mtx *lock)
 {
 	struct icl_conn *conn;
 
 	refcount_acquire(&icl_ncons);
 
 	conn = (struct icl_conn *)kobj_create(&icl_soft_class, M_ICL_SOFT,
 	    M_WAITOK | M_ZERO);
 
 	conn->ic_lock = lock;
 	conn->ic_name = name;
 	conn->ic_offload = "None";
 	conn->ic_unmapped = false;
 	conn->ic_max_data_segment_length = ICL_MAX_DATA_SEGMENT_LENGTH;
 
 	STAILQ_INIT(&conn->ic_to_send);
 	cv_init(&conn->ic_send_cv, "icl_tx");
 	cv_init(&conn->ic_receive_cv, "icl_rx");
 #ifdef DIAGNOSTIC
 	refcount_init(&conn->ic_outstanding_pdus, 0);
 #endif
 
 	return (conn);
 }
 
 /*
  * icl_conn_free method: destroy the connection's condition variables,
  * delete the kobj and drop the icl_ncons count taken in
  * icl_soft_new_conn().  With DIAGNOSTIC, asserts that no PDUs are still
  * outstanding.
  */
 void
 icl_soft_conn_free(struct icl_conn *ic)
 {
 
 #ifdef DIAGNOSTIC
 	KASSERT(ic->ic_outstanding_pdus == 0,
 	    ("destroying session with %d outstanding PDUs",
 	     ic->ic_outstanding_pdus));
 #endif
 	cv_destroy(&ic->ic_send_cv);
 	cv_destroy(&ic->ic_receive_cv);
 	kobj_delete((struct kobj *)ic, M_ICL_SOFT);
 	refcount_release(&icl_ncons);
 }
 
 /*
  * Bring a connection that already has a socket attached into service:
  * reset the receive state, size the socket buffers, set TCP_NODELAY,
  * install the socket upcalls and start the send and receive threads.
  *
  * Returns 0 on success, EINVAL if no socket is attached, or the error
  * reported by soreserve(), sosetopt() or kthread_add().  In the latter
  * three cases the connection is closed with icl_soft_conn_close() before
  * returning.
  */
 static int
 icl_conn_start(struct icl_conn *ic)
 {
 	size_t minspace;
 	struct sockopt opt;
 	int error, one = 1;
 
 	ICL_CONN_LOCK(ic);
 
 	/*
 	 * XXX: Ugly hack.
 	 */
 	if (ic->ic_socket == NULL) {
 		ICL_CONN_UNLOCK(ic);
 		return (EINVAL);
 	}
 
 	/* Start out expecting a Basic Header Segment. */
 	ic->ic_receive_state = ICL_CONN_STATE_BHS;
 	ic->ic_receive_len = sizeof(struct iscsi_bhs);
 	ic->ic_disconnecting = false;
 
 	ICL_CONN_UNLOCK(ic);
 
 	/*
 	 * For sendspace, this is required because the current code cannot
 	 * send a PDU in pieces; thus, the minimum buffer size is equal
 	 * to the maximum PDU size.  "+4" is to account for possible padding.
 	 *
 	 * What we should actually do here is to use autoscaling, but set
 	 * some minimal buffer size to "minspace".  I don't know a way to do
 	 * that, though.
 	 */
 	minspace = sizeof(struct iscsi_bhs) + ic->ic_max_data_segment_length +
 	    ISCSI_HEADER_DIGEST_SIZE + ISCSI_DATA_DIGEST_SIZE + 4;
 	if (sendspace < minspace) {
 		/* minspace is a size_t, so print it as unsigned. */
 		ICL_WARN("kern.icl.sendspace too low; must be at least %zu",
 		    minspace);
 		sendspace = minspace;
 	}
 	if (recvspace < minspace) {
 		ICL_WARN("kern.icl.recvspace too low; must be at least %zu",
 		    minspace);
 		recvspace = minspace;
 	}
 
 	error = soreserve(ic->ic_socket, sendspace, recvspace);
 	if (error != 0) {
 		ICL_WARN("soreserve failed with error %d", error);
 		icl_soft_conn_close(ic);
 		return (error);
 	}
 	ic->ic_socket->so_snd.sb_flags |= SB_AUTOSIZE;
 	ic->ic_socket->so_rcv.sb_flags |= SB_AUTOSIZE;
 
 	/*
 	 * Disable Nagle, i.e. set TCP_NODELAY to 1.
 	 */
 	bzero(&opt, sizeof(opt));
 	opt.sopt_dir = SOPT_SET;
 	opt.sopt_level = IPPROTO_TCP;
 	opt.sopt_name = TCP_NODELAY;
 	opt.sopt_val = &one;
 	opt.sopt_valsize = sizeof(one);
 	error = sosetopt(ic->ic_socket, &opt);
 	if (error != 0) {
 		ICL_WARN("setting TCP_NODELAY failed with error %d", error);
 		icl_soft_conn_close(ic);
 		return (error);
 	}
 
 	/*
 	 * Register socket upcall, to get notified about incoming PDUs
 	 * and free space to send outgoing ones.
 	 */
 	SOCKBUF_LOCK(&ic->ic_socket->so_snd);
 	soupcall_set(ic->ic_socket, SO_SND, icl_soupcall_send, ic);
 	SOCKBUF_UNLOCK(&ic->ic_socket->so_snd);
 	SOCKBUF_LOCK(&ic->ic_socket->so_rcv);
 	soupcall_set(ic->ic_socket, SO_RCV, icl_soupcall_receive, ic);
 	SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv);
 
 	/*
 	 * Start threads.  Both "running" flags are set before either
 	 * thread is created.
 	 */
 	ICL_CONN_LOCK(ic);
 	ic->ic_send_running = ic->ic_receive_running = true;
 	ICL_CONN_UNLOCK(ic);
 	error = kthread_add(icl_send_thread, ic, NULL, NULL, 0, 0, "%stx",
 	    ic->ic_name);
 	if (error != 0) {
 		ICL_WARN("kthread_add(9) failed with error %d", error);
 		/* Neither thread exists; clear both flags before closing. */
 		ICL_CONN_LOCK(ic);
 		ic->ic_send_running = ic->ic_receive_running = false;
 		cv_signal(&ic->ic_send_cv);
 		ICL_CONN_UNLOCK(ic);
 		icl_soft_conn_close(ic);
 		return (error);
 	}
 	error = kthread_add(icl_receive_thread, ic, NULL, NULL, 0, 0, "%srx",
 	    ic->ic_name);
 	if (error != 0) {
 		ICL_WARN("kthread_add(9) failed with error %d", error);
 		/*
 		 * The send thread was started, so only the receive flag
 		 * is cleared here.
 		 */
 		ICL_CONN_LOCK(ic);
 		ic->ic_receive_running = false;
 		cv_signal(&ic->ic_send_cv);
 		ICL_CONN_UNLOCK(ic);
 		icl_soft_conn_close(ic);
 		return (error);
 	}
 
 	return (0);
 }
 
 /*
  * Take over the socket behind userland file descriptor "fd" and start
  * the connection on it.  Must be called without the connection lock.
  *
  * Returns the error from fget(), EINVAL if the descriptor is not a
  * SOCK_STREAM socket, EBUSY if the connection already has a socket, or
  * otherwise whatever icl_conn_start() returns.
  */
 int
 icl_soft_conn_handoff(struct icl_conn *ic, int fd)
 {
 	struct file *fp;
 	struct socket *so;
 	cap_rights_t rights;
 	int error;
 
 	ICL_CONN_LOCK_ASSERT_NOT(ic);
 
 #ifdef ICL_KERNEL_PROXY
 	/*
 	 * We're transitioning to Full Feature phase, and we don't
 	 * really care.
 	 */
 	if (fd == 0) {
 		ICL_CONN_LOCK(ic);
 		if (ic->ic_socket == NULL) {
 			ICL_CONN_UNLOCK(ic);
 			ICL_WARN("proxy handoff without connect"); 
 			return (EINVAL);
 		}
 		ICL_CONN_UNLOCK(ic);
 		return (0);
 	}
 #endif
 
 	/*
 	 * Steal the socket from userland.
 	 */
 	error = fget(curthread, fd,
-	    cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp);
+	    cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp);
 	if (error != 0)
 		return (error);
 	if (fp->f_type != DTYPE_SOCKET) {
 		fdrop(fp, curthread);
 		return (EINVAL);
 	}
 	so = fp->f_data;
 	if (so->so_type != SOCK_STREAM) {
 		fdrop(fp, curthread);
 		return (EINVAL);
 	}
 
 	ICL_CONN_LOCK(ic);
 
 	if (ic->ic_socket != NULL) {
 		ICL_CONN_UNLOCK(ic);
 		fdrop(fp, curthread);
 		return (EBUSY);
 	}
 
 	/*
 	 * Move the socket into the connection and leave the file with
 	 * badfileops and no data, so the descriptor no longer refers to it.
 	 */
 	ic->ic_socket = fp->f_data;
 	fp->f_ops = &badfileops;
 	fp->f_data = NULL;
 	fdrop(fp, curthread);
 	ICL_CONN_UNLOCK(ic);
 
 	error = icl_conn_start(ic);
 
 	return (error);
 }
 
 /*
  * Shut a connection down: mark it as disconnecting, wait until neither
  * the send nor the receive thread is flagged as running, detach the
  * upcalls and close the socket, free any partially received PDU, and
  * complete every PDU left on the send queue with ENOTCONN.
  *
  * Returns early, after the threads have stopped, if the socket is
  * already gone.
  */
 void
 icl_soft_conn_close(struct icl_conn *ic)
 {
 	struct icl_pdu *pdu;
 	struct socket *so;
 
 	ICL_CONN_LOCK(ic);
 
 	/*
 	 * Wake up the threads, so they can properly terminate.
 	 */
 	ic->ic_disconnecting = true;
 	while (ic->ic_receive_running || ic->ic_send_running) {
 		cv_signal(&ic->ic_receive_cv);
 		cv_signal(&ic->ic_send_cv);
 		/*
 		 * NOTE(review): this sleeps on ic_send_cv for both threads;
 		 * presumably each thread signals it on exit -- confirm.
 		 */
 		cv_wait(&ic->ic_send_cv, ic->ic_lock);
 	}
 
 	/* Some other thread could close the connection same time. */
 	so = ic->ic_socket;
 	if (so == NULL) {
 		ICL_CONN_UNLOCK(ic);
 		return;
 	}
 	ic->ic_socket = NULL;
 
 	/*
 	 * Deregister socket upcalls.  The connection lock is dropped
 	 * around the socket buffer locking and soclose().
 	 */
 	ICL_CONN_UNLOCK(ic);
 	SOCKBUF_LOCK(&so->so_snd);
 	if (so->so_snd.sb_upcall != NULL)
 		soupcall_clear(so, SO_SND);
 	SOCKBUF_UNLOCK(&so->so_snd);
 	SOCKBUF_LOCK(&so->so_rcv);
 	if (so->so_rcv.sb_upcall != NULL)
 		soupcall_clear(so, SO_RCV);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 	soclose(so);
 	ICL_CONN_LOCK(ic);
 
 	if (ic->ic_receive_pdu != NULL) {
 		//ICL_DEBUG("freeing partially received PDU");
 		icl_soft_conn_pdu_free(ic, ic->ic_receive_pdu);
 		ic->ic_receive_pdu = NULL;
 	}
 
 	/*
 	 * Remove any outstanding PDUs from the send queue.
 	 */
 	while (!STAILQ_EMPTY(&ic->ic_to_send)) {
 		pdu = STAILQ_FIRST(&ic->ic_to_send);
 		STAILQ_REMOVE_HEAD(&ic->ic_to_send, ip_next);
 		icl_soft_pdu_done(pdu, ENOTCONN);
 	}
 
 	KASSERT(STAILQ_EMPTY(&ic->ic_to_send),
 	    ("destroying session with non-empty send queue"));
 	ICL_CONN_UNLOCK(ic);
 }
 
 /*
  * Per-task setup hook.  This implementation does nothing: no argument is
  * touched and 0 is always returned.
  */
 int
 icl_soft_conn_task_setup(struct icl_conn *ic, struct icl_pdu *ip,
     struct ccb_scsiio *csio, uint32_t *task_tagp, void **prvp)
 {
 
 	return (0);
 }
 
 /*
  * Per-task completion hook.  Intentionally empty in this implementation.
  */
 void
 icl_soft_conn_task_done(struct icl_conn *ic, void *prv)
 {
 }
 
 /*
  * Per-transfer setup hook.  This implementation does nothing: no argument
  * is touched and 0 is always returned.
  */
 int
 icl_soft_conn_transfer_setup(struct icl_conn *ic, union ctl_io *io,
     uint32_t *transfer_tag, void **prvp)
 {
 
 	return (0);
 }
 
 /*
  * Per-transfer completion hook.  Intentionally empty in this
  * implementation.
  */
 void
 icl_soft_conn_transfer_done(struct icl_conn *ic, void *prv)
 {
 }
 
 /*
  * Fill in the limits advertised by the software driver.  Always
  * returns 0.
  */
 static int
 icl_soft_limits(struct icl_drv_limits *idl)
 {
 
 	/* Data segments: 128 KiB in either direction. */
 	idl->idl_max_send_data_segment_length = 128 * 1024;
 	idl->idl_max_recv_data_segment_length = 128 * 1024;
 
 	/* Bursts: 64 KiB first burst, 256 KiB maximum burst. */
 	idl->idl_first_burst_length = 64 * 1024;
 	idl->idl_max_burst_length = 256 * 1024;
 
 	return (0);
 }
 
 #ifdef ICL_KERNEL_PROXY
 /*
  * Kernel-proxy connect entry point: forwards every argument unchanged to
  * icl_soft_proxy_connect() and returns its result.
  */
 int
 icl_soft_conn_connect(struct icl_conn *ic, int domain, int socktype,
     int protocol, struct sockaddr *from_sa, struct sockaddr *to_sa)
 {
 
 	return (icl_soft_proxy_connect(ic, domain, socktype, protocol,
 	    from_sa, to_sa));
 }
 
 int
 icl_soft_handoff_sock(struct icl_conn *ic, struct socket *so)
 {
 	int error;
 
 	ICL_CONN_LOCK_ASSERT_NOT(ic);
 
 	if (so->so_type != SOCK_STREAM)
 		return (EINVAL);
 
 	ICL_CONN_LOCK(ic);
 	if (ic->ic_socket != NULL) {
 		ICL_CONN_UNLOCK(ic);
 		return (EBUSY);
 	}
 	ic->ic_socket = so;
 	ICL_CONN_UNLOCK(ic);
 
 	error = icl_conn_start(ic);
 
 	return (error);
 }
 #endif /* ICL_KERNEL_PROXY */
 
 /*
  * Module load: create the UMA zone for software PDUs, reset the
  * connection counter and register this driver with ICL under the name
  * "none".  Returns the result of icl_register(); a non-zero result also
  * trips the KASSERT.
  */
 static int
 icl_soft_load(void)
 {
 	int error;
 
 	icl_soft_pdu_zone = uma_zcreate("icl_soft_pdu",
 	    sizeof(struct icl_soft_pdu), NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, 0);
 	refcount_init(&icl_ncons, 0);
 
 	/*
 	 * The reason we call this "none" is that to the user,
 	 * it's known as "offload driver"; "offload driver: soft"
 	 * doesn't make much sense.
 	 */
 	error = icl_register("none", false, 0,
 	    icl_soft_limits, icl_soft_new_conn);
 	KASSERT(error == 0, ("failed to register"));
 
 #if defined(ICL_KERNEL_PROXY) && 0
 	/*
 	 * Debugging aid for kernel proxy functionality.
 	 */
 	error = icl_register("proxytest", true, 0,
 	    icl_soft_limits, icl_soft_new_conn);
 	KASSERT(error == 0, ("failed to register"));
 #endif
 
 	return (error);
 }
 
 /*
  * Module unload: refuse with EBUSY while any connection still holds a
  * reference on icl_ncons; otherwise unregister the driver and destroy the
  * PDU zone.  Returns 0 on success.
  */
 static int
 icl_soft_unload(void)
 {
 
 	if (icl_ncons != 0)
 		return (EBUSY);
 
 	icl_unregister("none", false);
 #if defined(ICL_KERNEL_PROXY) && 0
 	icl_unregister("proxytest", true);
 #endif
 
 	uma_zdestroy(icl_soft_pdu_zone);
 
 	return (0);
 }
 
 /*
  * Module event handler.  MOD_LOAD and MOD_UNLOAD are dispatched to
  * icl_soft_load() and icl_soft_unload(); every other event is rejected
  * with EINVAL.
  */
 static int
 icl_soft_modevent(module_t mod, int what, void *arg)
 {
 
 	if (what == MOD_LOAD)
 		return (icl_soft_load());
 	if (what == MOD_UNLOAD)
 		return (icl_soft_unload());
 
 	return (EINVAL);
 }
 
 /*
  * Module glue: "icl_soft" is driven by icl_soft_modevent(), declared at
  * SI_SUB_DRIVERS / SI_ORDER_MIDDLE, depends on version 1 of "icl" and is
  * itself version 1.
  */
 moduledata_t icl_soft_data = {
 	"icl_soft",
 	icl_soft_modevent,
 	0
 };
 
 DECLARE_MODULE(icl_soft, icl_soft_data, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
 MODULE_DEPEND(icl_soft, icl, 1, 1, 1);
 MODULE_VERSION(icl_soft, 1);
diff --git a/sys/dev/iscsi_initiator/iscsi.c b/sys/dev/iscsi_initiator/iscsi.c
index 19c45f87d132..ce190c5c892a 100644
--- a/sys/dev/iscsi_initiator/iscsi.c
+++ b/sys/dev/iscsi_initiator/iscsi.c
@@ -1,880 +1,880 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2005-2011 Daniel Braniss <danny@cs.huji.ac.il>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 /*
  | $Id: iscsi.c 752 2009-08-20 11:23:28Z danny $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_iscsi_initiator.h"
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #ifdef DO_EVENTHANDLER
 #include <sys/eventhandler.h>
 #endif
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/conf.h>
 #include <sys/bus.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/ctype.h>
 #include <sys/errno.h>
 #include <sys/sysctl.h>
 #include <sys/file.h>
 #include <sys/uio.h>
 #include <sys/socketvar.h>
 #include <sys/socket.h>
 #include <sys/protosw.h>
 #include <sys/proc.h>
 #include <sys/ioccom.h>
 #include <sys/queue.h>
 #include <sys/kthread.h>
 #include <sys/mbuf.h>
 #include <sys/syslog.h>
 #include <vm/uma.h>
 #include <sys/sx.h>
 
 #include <dev/iscsi_initiator/iscsi.h>
 #include <dev/iscsi_initiator/iscsivar.h>
 /* Driver version; exported as a sysctl string and printed at load time. */
 static char *iscsi_driver_version = "2.3.1";
 
 /* The one driver softc; allocated in iscsi_start(), freed in iscsi_stop(). */
 static struct isc_softc *isc;
 
 MALLOC_DEFINE(M_ISCSI, "iSCSI", "iSCSI driver");
 MALLOC_DEFINE(M_ISCSIBUF, "iSCbuf", "iSCSI buffers");
 static MALLOC_DEFINE(M_TMP, "iSCtmp", "iSCSI tmp");
 
 #ifdef ISCSI_INITIATOR_DEBUG
 int iscsi_debug = ISCSI_INITIATOR_DEBUG;
 SYSCTL_INT(_debug, OID_AUTO, iscsi_initiator, CTLFLAG_RW, &iscsi_debug, 0,
 	"iSCSI driver debug flag");
 
 struct mtx iscsi_dbg_mtx;
 #endif
 
 /*
  * Boot-time tunables (CTLFLAG_RDTUN).  max_sessions also serves as the
  * unit number of the control device created in iscsi_start().
  */
 static int max_sessions = MAX_SESSIONS;
 SYSCTL_INT(_net, OID_AUTO, iscsi_initiator_max_sessions, CTLFLAG_RDTUN,
     &max_sessions, 0, "Max sessions allowed");
 static int max_pdus = MAX_PDUS;
 SYSCTL_INT(_net, OID_AUTO, iscsi_initiator_max_pdus, CTLFLAG_RDTUN,
     &max_pdus, 0, "Max PDU pool");
 
 /*
  * Six bytes plus a terminating NUL; exported read-write as the "isid"
  * sysctl string in iscsi_start().
  */
 static char isid[6+1] = {
      0x80,
      'D',
      'I',
      'B',
      '0',
      '0',
      0
 };
 
 /* Forward declarations for the ioctl helpers defined below. */
 static int	i_create_session(struct cdev *dev, int *ndev);
 
 static int	i_ping(struct cdev *dev);
 static int	i_send(struct cdev *dev, caddr_t arg, struct thread *td);
 static int	i_recv(struct cdev *dev, caddr_t arg, struct thread *td);
 static int	i_setsoc(isc_session_t *sp, int fd, struct thread *td);
 static int	i_fullfeature(struct cdev *dev, int flag);
 
 static d_open_t iscsi_open;
 static d_close_t iscsi_close;
 static d_ioctl_t iscsi_ioctl;
 #ifdef ISCSI_INITIATOR_DEBUG
 static d_read_t iscsi_read;
 #endif
 
 /*
  * Character device switch shared by the control device and the
  * per-session devices.  The read entry point is only wired up when
  * ISCSI_INITIATOR_DEBUG is defined.
  */
 static struct cdevsw iscsi_cdevsw = {
      .d_version = D_VERSION,
      .d_open	= iscsi_open,
      .d_close	= iscsi_close,
      .d_ioctl	= iscsi_ioctl,
 #ifdef ISCSI_INITIATOR_DEBUG
      .d_read	= iscsi_read,
 #endif
      .d_name	= "iSCSI",
 };
 
 /*
  | open: succeeds for any unit up to and including max_sessions,
  | anything above that (should not happen) gets ENODEV.
  */
 static int
 iscsi_open(struct cdev *dev, int flags, int otype, struct thread *td)
 {
      debug_called(8);
      debug(7, "dev=%d", dev2unit(dev));
 
      return (dev2unit(dev) <= max_sessions) ? 0 : ENODEV;
 }
 
 /*
  | close: nothing to do for the control device (unit == max_sessions);
  | for a session device the session, if any, is stopped via ism_stop().
  | always returns 0.
  */
 static int
 iscsi_close(struct cdev *dev, int flag, int otyp, struct thread *td)
 {
      isc_session_t	*sp;
 
      debug_called(8);
      debug(3, "session=%d flag=%x", dev2unit(dev), flag);
 
      if(dev2unit(dev) == max_sessions)
 	  return 0;
 
      sp = dev->si_drv2;
      if(sp == NULL) {
 	  debug(2, "done");
 	  return 0;
      }
 
      sdebug(3, "sp->flags=%x", sp->flags );
      /*
       | still in full feature phase: either something went really bad,
       | or this is part of a 'shutdown' -- there is no way to tell a
       | shutdown from 'iscontrol' dying. so wait (up to 60*hz ticks)
       | before stopping, to give buffers a chance to be flushed.
       */
      if((sp->flags & ISC_FFPHASE) != 0) {
 	  tsleep(sp, PRIBIO, "isc-cls", 60*hz);
      }
      ism_stop(sp);
 
      debug(2, "done");
      return 0;
 }
 
 /*
  | ioctl dispatcher.
  | on the control device (unit == max_sessions) only ISCSISETSES is
  | handled; on a session device the request is dispatched to the
  | matching i_*() helper.
  | returns ENXIO when the softc/session pointer is missing, ENOIOCTL for
  | an unknown session command, otherwise the helper's result.
  */
 static int
 iscsi_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int mode, struct thread *td)
 {
      struct isc_softc	*sc;
      isc_session_t	*sp;
      isc_opt_t		*opt;
      int		error;
 
      debug_called(8);
 
      error = 0;
      if(dev2unit(dev) == max_sessions) {
 	  /*
 	   | non Session commands
 	   */
 	  sc = dev->si_drv1;
 	  if(sc == NULL)
 	       return ENXIO;
 
 	  switch(cmd) {
 	  case ISCSISETSES:
 	       error = i_create_session(dev, (int *)arg);
 	       if(error == 0)
 		    break;
 	       /*
 	        | NOTE(review): on failure this falls through into
 	        | 'default', so the error from i_create_session() is
 	        | replaced by ENXIO -- confirm this is intended.
 	        */
 
 	  default:
 	       error = ENXIO;
 	  }
 	  return error;
      }
      /*
       | session commands
       */
      sp = dev->si_drv2;
      if(sp == NULL)
 	  return ENXIO;
 
      sdebug(6, "dev=%d cmd=%d", dev2unit(dev), (int)(cmd & 0xff));
 
      switch(cmd) {
      case ISCSISETSOC:
 	  error = i_setsoc(sp, *(u_int *)arg, td);
 	  break;
 
      case ISCSISETOPT:
 	  opt = (isc_opt_t *)arg;
 	  error = i_setopt(sp, opt);
 	  break;
 
      case ISCSISEND:
 	  error = i_send(dev, arg, td);
 	  break;
 
      case ISCSIRECV:
 	  error = i_recv(dev, arg, td);
 	  break;
 
      case ISCSIPING:
 	  error = i_ping(dev);
 	  break;
 
      case ISCSISTART:
 	  // entering full feature phase requires a socket
 	  error = sp->soc == NULL? ENOTCONN: i_fullfeature(dev, 1);
 	  if(error == 0) {
 	       // remember the calling process and export its pid
 	       sp->proc = td->td_proc;
 	       SYSCTL_ADD_INT(&sp->clist, SYSCTL_CHILDREN(sp->oid),
 			       OID_AUTO, "pid", CTLFLAG_RD,
 			       &sp->proc->p_pid, sizeof(pid_t), "control process id");
 	  }
 	  break;
 
      case ISCSIRESTART:
 	  error = sp->soc == NULL? ENOTCONN: i_fullfeature(dev, 2);
 	  break;
 
      case ISCSISTOP:
 	  error = i_fullfeature(dev, 0);
 	  break;
 	  
      case ISCSISIGNAL: {
 	  int sig = *(int *)arg;
 
 	  // only 0.._SIG_MAXSIG is accepted
 	  if(sig < 0 || sig > _SIG_MAXSIG)
 	       error = EINVAL;
 	  else
 		sp->signal = sig;
 	  break;
      }
 
      case ISCSIGETCAM: {
 	  iscsi_cam_t *cp = (iscsi_cam_t *)arg;
 
 	  error = ic_getCamVals(sp, cp);
 	  break;
      }
 
      default:
 	  error = ENOIOCTL;
      }
 
      return error;
 }
 
 /*
  | debug-only read handler (the body is compiled only with
  | ISCSI_INITIATOR_DEBUG, otherwise it just returns 0).
  | on the control device it lists the sessions and the pdu counters;
  | on a session device it dumps the hld/rsp/csnd/wsnd/isnd queues,
  | the statistics and the socket state.
  | output stops, with 0, as soon as the caller's buffer is full.
  | returns ENXIO on a session device with no session attached.
  | all formatting is bounded by sizeof(buf).
  | NOTE(review): the queues are walked without taking any lock here and
  | the uiomove() results are ignored -- acceptable for a debug aid?
  */
 static int
 iscsi_read(struct cdev *dev, struct uio *uio, int ioflag)
 {
 #ifdef  ISCSI_INITIATOR_DEBUG
      struct isc_softc	*sc;
      isc_session_t	*sp;
      pduq_t 		*pq;
      char		buf[1024];
 
      sc = dev->si_drv1;
      sp = dev->si_drv2;
      if(dev2unit(dev) == max_sessions) {
 	  int	i = 0;
 
 	  snprintf(buf, sizeof(buf), "/----- Session ------/\n");
 	  uiomove(buf, strlen(buf), uio);
 
 	  TAILQ_FOREACH(sp, &sc->isc_sess, sp_link) {
 	       if(uio->uio_resid == 0)
 		    return 0;
 	       snprintf(buf, sizeof(buf), "%03d] '%s' '%s'\n", i++, sp->opt.targetAddress, sp->opt.targetName);
 	       uiomove(buf, strlen(buf), uio);
 	  }
 	  snprintf(buf, sizeof(buf), "free npdu_alloc=%d, npdu_max=%d\n", sc->npdu_alloc, sc->npdu_max);
 	  uiomove(buf, strlen(buf), uio);
      }
      else {
 	  int	i = 0;
 	  struct socket	*so;
 
 	  // same check as iscsi_ioctl: no session behind this device
 	  if(sp == NULL)
 	       return ENXIO;
 	  so = sp->soc;
 #define pukeit(i, pq) do {\
 	       snprintf(buf, sizeof(buf), "%03d] %06x %02x %06x %06x %jd\n",\
 		       i, ntohl(pq->pdu.ipdu.bhs.CmdSN),\
 		       pq->pdu.ipdu.bhs.opcode, ntohl(pq->pdu.ipdu.bhs.itt),\
 		       ntohl(pq->pdu.ipdu.bhs.ExpStSN),\
 		       (intmax_t)pq->ts.sec);\
 	       } while(0)
 
 	  snprintf(buf, sizeof(buf), "%d/%d /---- hld -----/\n", sp->stats.nhld, sp->stats.max_hld);
 	  uiomove(buf, strlen(buf), uio);
 	  TAILQ_FOREACH(pq, &sp->hld, pq_link) {
 	       if(uio->uio_resid == 0)
 		    return 0;
 	       pukeit(i, pq); i++;
 	       uiomove(buf, strlen(buf), uio);
 	  }
 	  snprintf(buf, sizeof(buf), "%d/%d /---- rsp -----/\n", sp->stats.nrsp, sp->stats.max_rsp);
 	  uiomove(buf, strlen(buf), uio);
 	  i = 0;
 	  TAILQ_FOREACH(pq, &sp->rsp, pq_link) {
 	       if(uio->uio_resid == 0)
 		    return 0;
 	       pukeit(i, pq); i++;
 	       uiomove(buf, strlen(buf), uio);
 	  }
 	  snprintf(buf, sizeof(buf), "%d/%d /---- csnd -----/\n", sp->stats.ncsnd, sp->stats.max_csnd);
 	  i = 0;
 	  uiomove(buf, strlen(buf), uio);
 	  TAILQ_FOREACH(pq, &sp->csnd, pq_link) {
 	       if(uio->uio_resid == 0)
 		    return 0;
 	       pukeit(i, pq); i++;
 	       uiomove(buf, strlen(buf), uio);
 	  }
 	  snprintf(buf, sizeof(buf), "%d/%d /---- wsnd -----/\n", sp->stats.nwsnd, sp->stats.max_wsnd);
 	  i = 0;
 	  uiomove(buf, strlen(buf), uio);
 	  TAILQ_FOREACH(pq, &sp->wsnd, pq_link) {
 	       if(uio->uio_resid == 0)
 		    return 0;
 	       pukeit(i, pq); i++;
 	       uiomove(buf, strlen(buf), uio);
 	  }
 	  snprintf(buf, sizeof(buf), "%d/%d /---- isnd -----/\n", sp->stats.nisnd, sp->stats.max_isnd);
 	  i = 0;
 	  uiomove(buf, strlen(buf), uio);
 	  TAILQ_FOREACH(pq, &sp->isnd, pq_link) {
 	       if(uio->uio_resid == 0)
 		    return 0;
 	       pukeit(i, pq); i++;
 	       uiomove(buf, strlen(buf), uio);
 	  }
 
 	  snprintf(buf, sizeof(buf), "/---- Stats ---/\n");
 	  uiomove(buf, strlen(buf), uio);
 
 	  snprintf(buf, sizeof(buf), "recv=%d sent=%d\n", sp->stats.nrecv, sp->stats.nsent);
 	  uiomove(buf, strlen(buf), uio);
 
 	  snprintf(buf, sizeof(buf), "flags=%x pdus: alloc=%d max=%d\n", 
 		  sp->flags, sc->npdu_alloc, sc->npdu_max);
 	  uiomove(buf, strlen(buf), uio);
 
 	  snprintf(buf, sizeof(buf), "cws=%d last cmd=%x exp=%x max=%x stat=%x itt=%x\n",
 		  sp->cws, sp->sn.cmd, sp->sn.expCmd, sp->sn.maxCmd, sp->sn.stat, sp->sn.itt);
 	  uiomove(buf, strlen(buf), uio);
 
 	  // sp->soc stays NULL until a socket has been attached
 	  if(so != NULL)
 	       snprintf(buf, sizeof(buf), "/---- socket -----/\nso_count=%d so_state=%x\n", so->so_count, so->so_state);
 	  else
 	       snprintf(buf, sizeof(buf), "/---- socket -----/\nnot connected\n");
 	  uiomove(buf, strlen(buf), uio);
      }
 #endif
      return 0;
 }
 
 /*
  | ISCSIPING handler: a no-op that always returns 0.
  */
 static int
 i_ping(struct cdev *dev)
 {
      return 0;
 }
 /*
  | low level I/O
  */
 /*
  | attach the socket behind descriptor 'fd' to the session.
  | a receiver already running on a previous socket is stopped first;
  | on success the file and socket are recorded in the session and the
  | receiver is started.
  | returns 0 or the error from getsock_cap().
  */
 static int
 i_setsoc(isc_session_t *sp, int fd, struct thread *td)
 {
      cap_rights_t rights;
      int error = 0;
 
      if(sp->soc != NULL)
 	  isc_stop_receiver(sp);
 
-     error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_SOCK_CLIENT),
-	     &sp->fp, NULL, NULL);
+     error = getsock_cap(td, fd, cap_rights_init_one(&rights, CAP_SOCK_CLIENT),
+         &sp->fp, NULL, NULL);
      if(error)
 	  return error;
 
      sp->soc = sp->fp->f_data;
      sp->td = td;
      isc_start_receiver(sp);
 
      return error;
 }
 
 /*
  | ISCSISEND handler: copy the pdu described by 'arg' into a freshly
  | allocated pduq_t, pull its AHS and data segment in from user space
  | and hand it to isc_qout().
  | returns ENOTCONN without a socket, EAGAIN when a pdu or buffer cannot
  | be allocated (both use M_NOWAIT), or the error from i_prepPDU(),
  | copyin() or isc_qout(). on any error the pdu is released again.
  */
 static int
 i_send(struct cdev *dev, caddr_t arg, struct thread *td)
 {
      isc_session_t	*sp = dev->si_drv2;
      caddr_t		bp;
      pduq_t		*pq;
      pdu_t		*pp;
      int		n, error;
 
      debug_called(8);
 
      if(sp->soc == NULL)
 	  return ENOTCONN;
 
      if((pq = pdu_alloc(sp->isc, M_NOWAIT)) == NULL)
 	  return EAGAIN;
      pp = &pq->pdu;
      pq->pdu = *(pdu_t *)arg;
      if((error = i_prepPDU(sp, pq)) != 0)
 	  goto out;
 
      bp = NULL;
      /*
       | NOTE(review): sizeof yields an unsigned type, so this difference is
       | unsigned and '> 0' only tests for non-zero; presumably i_prepPDU()
       | guarantees pq->len >= sizeof(union ipdu_u) -- confirm.
       */
      if((pq->len - sizeof(union ipdu_u)) > 0) {
 	  pq->buf = bp = malloc(pq->len - sizeof(union ipdu_u), M_ISCSIBUF, M_NOWAIT);
 	  if(pq->buf == NULL) {
 	       error = EAGAIN;
 	       goto out;
 	  }
      }
      else
 	  pq->buf = NULL; // just in case?
 
      sdebug(2, "len=%d ahs_len=%d ds_len=%d buf=%zu@%p",
 	    pq->len, pp->ahs_len, pp->ds_len, pq->len - sizeof(union ipdu_u), bp);
 
      // AHS first, then the data segment, back to back in the same buffer
      if(pp->ahs_len) {
 	  // XXX: never tested, looks suspicious
 	  n = pp->ahs_len;
 	  error = copyin(pp->ahs_addr, bp, n);
 	  if(error != 0) {
 	       sdebug(3, "copyin ahs: error=%d", error);
 	       goto out;
 	  }
 	  pp->ahs_addr = (ahs_t *)bp;
 	  bp += n;
      }
      if(pp->ds_len) {
 	  n = pp->ds_len;
 	  error = copyin(pp->ds_addr, bp, n);
 	  if(error != 0) {
 	       sdebug(3, "copyin ds: error=%d", error);
 	       goto out;
 	  }
 	  pp->ds_addr = bp;
 	  bp += n;
 	  // zero-fill up to the next multiple of 4 bytes
 	  while(n & 03) {
 	       n++;
 	       *bp++ = 0;
 	  }
      }
 
      error = isc_qout(sp, pq);
      if(error == 0)
 	  wakeup(&sp->flags); // XXX: to 'push' proc_out ...
 out:
      // presumably pdu_free() also releases pq->buf -- TODO confirm
      if(error)
 	  pdu_free(sp->isc, pq);
 
      return error;
 }
 
 /*
  | ISCSIRECV handler: wait for a pdu on the session's rsp queue, copy
  | its header into the caller's pdu_t ('arg') and its AHS / data segment
  | out to the user buffers named there, then free the pdu.
  | the wait is a series of msleep()s with a timeout of hz*10 each, bounded
  | by the 'cnt' countdown.
  | returns EIO without a session, ENOTCONN without a socket or when
  | nothing arrived in time, otherwise the copyout() result.
  */
 static int
 i_recv(struct cdev *dev, caddr_t arg, struct thread *td)
 {
      isc_session_t	*sp = dev->si_drv2;
      pduq_t		*pq;
      pdu_t		*pp, *up;
      caddr_t		bp;
      int		error, mustfree, cnt;
      size_t		need, have, n;
 
      debug_called(8);
 
      if(sp == NULL)
 	  return EIO;
 
      if(sp->soc == NULL)
 	  return ENOTCONN;
      cnt = 6;     // XXX: maybe the user can request a time out?
      mtx_lock(&sp->rsp_mtx);
      while((pq = TAILQ_FIRST(&sp->rsp)) == NULL) {
 	  msleep(&sp->rsp, &sp->rsp_mtx, PRIBIO, "isc_rsp", hz*10);
 	  if(cnt-- == 0) break; // XXX: for now, needs work
      }
      if(pq != NULL) {
 	  sp->stats.nrsp--;
 	  TAILQ_REMOVE(&sp->rsp, pq, pq_link);
      }
      mtx_unlock(&sp->rsp_mtx);
 
      sdebug(6, "cnt=%d", cnt);
 
      if(pq == NULL) {
 	  error = ENOTCONN;
 	  sdebug(3, "error=%d sp->flags=%x ", error, sp->flags);
 	  return error;
      }
      up = (pdu_t *)arg;
      pp = &pq->pdu;
      up->ipdu = pp->ipdu;
      n = 0;
      up->ds_len = 0;
      up->ahs_len = 0;
      error = 0;
 
      if(pq->mp) {
 	  u_int	len;
 
 	  // Grr...
 	  len = 0;
 	  if(pp->ahs_len) {
 	       len += pp->ahs_len;
 	  }
 	  if(pp->ds_len) {
 	       len += pp->ds_len;
 	  }
 
 	  /*
 	   | if the payload does not fit in the first mbuf, gather it into
 	   | a temporary flat buffer; otherwise read straight from the mbuf.
 	   */
 	  mustfree = 0;
 	  if(len > pq->mp->m_len) {
 	       mustfree++;
 	       bp = malloc(len, M_TMP, M_WAITOK);
 	       sdebug(4, "need mbufcopy: %d", len);
 	       i_mbufcopy(pq->mp, bp, len);
 	  } 
 	  else
 	       bp = mtod(pq->mp, caddr_t);
 
 	  // each copy is truncated to the size the caller supplied
 	  if(pp->ahs_len) {
 	       need = pp->ahs_len;
 	       n = MIN(up->ahs_size, need);
 	       error = copyout(bp, (caddr_t)up->ahs_addr, n);
 	       up->ahs_len = n;
 	       bp += need;
 	  }
 	  if(!error && pp->ds_len) {
 	       need = pp->ds_len;
 	       // no separate data buffer: use what is left of the AHS buffer
 	       if((have = up->ds_size) == 0) {
 		    have = up->ahs_size - n;
 		    up->ds_addr = (caddr_t)up->ahs_addr + n;
 	       }
 	       n = MIN(have, need);
 	       error = copyout(bp, (caddr_t)up->ds_addr, n);
 	       up->ds_len = n;
 	  }
 
 	  /*
 	   | NOTE(review): when both the flat copy and an AHS are present,
 	   | bp was advanced above, so this frees a pointer past the start
 	   | of the allocation -- confirm.
 	   */
 	  if(mustfree)
 	       free(bp, M_TMP);
      }
 
      sdebug(6, "len=%d ahs_len=%d ds_len=%d", pq->len, pp->ahs_len, pp->ds_len);
 
      pdu_free(sp->isc, pq);
 
      return error;
 }
 
 /*
  | enter or leave the full feature phase:
  |	flag 0 (stop)	 clears ISC_FFPHASE
  |	flag 1 (start)	 sets ISC_FFPHASE, returns the result of ic_init()
  |	flag 2 (restart) sets ISC_FFPHASE and calls ism_restart()
  | any other flag is ignored. returns 0 unless ic_init() fails.
  */
 static int
 i_fullfeature(struct cdev *dev, int flag)
 {
      isc_session_t	*sp = dev->si_drv2;
      int		error = 0;
 
      sdebug(2, "flag=%d", flag);
 
      if(flag == 0) {
 	  sp->flags &= ~ISC_FFPHASE;
      }
      else if(flag == 1) {
 	  sp->flags |= ISC_FFPHASE;
 	  error = ic_init(sp);
      }
      else if(flag == 2) {
 	  sp->flags |= ISC_FFPHASE;
 	  ism_restart(sp);
      }
 
      return error;
 }
 
 /*
  | ISCSISETSES handler: allocate a session, reserve a unit number for
  | it, link it into the softc's session list, create its /dev/iscsi<n>
  | node, fill in the default options and start it with ism_start().
  | the unit number is returned through 'ndev'.
  | returns EPERM when no unit is available, otherwise the result of
  | ism_start().
  | NOTE(review): nothing done here is undone if ism_start() fails.
  */
 static int
 i_create_session(struct cdev *dev, int *ndev)
 { 
      struct isc_softc	*sc = dev->si_drv1;
      isc_session_t	*sp;
      int		error, n;
 
      debug_called(8);
 
      sp = malloc(sizeof(isc_session_t), M_ISCSI, M_WAITOK | M_ZERO);
      // NOTE(review): with M_WAITOK this check is presumably dead code
      if(sp == NULL)
 	  return ENOMEM;
 
      sx_xlock(&sc->unit_sx);
      if((n = alloc_unr(sc->unit)) < 0) {
 	  sx_unlock(&sc->unit_sx);
 	  free(sp, M_ISCSI);
 	  xdebug("too many sessions!");
 	  return EPERM;
      }
      sx_unlock(&sc->unit_sx);
 
      mtx_lock(&sc->isc_mtx);
      TAILQ_INSERT_TAIL(&sc->isc_sess, sp, sp_link);
      isc->nsess++;
      mtx_unlock(&sc->isc_mtx);
 
      sp->dev = make_dev(&iscsi_cdevsw, n, UID_ROOT, GID_WHEEL, 0600, "iscsi%d", n);
      *ndev = sp->sid = n;
      sp->isc = sc;
      sp->dev->si_drv1 = sc;
      sp->dev->si_drv2 = sp;
 
      // defaults, in effect until changed through ISCSISETOPT
      sp->opt.maxRecvDataSegmentLength = 8192;
      sp->opt.maxXmitDataSegmentLength = 8192;
      sp->opt.maxBurstLength = 65536;	// 64k
      sp->opt.maxluns = ISCSI_MAX_LUNS;
 
      error = ism_start(sp);
 
      return error;
 }
 
 #ifdef notused
 /*
  | debugging aid (compiled only when 'notused' is defined): print every
  | pdu on the hld queue and the number of pdus on the hld queue, on the
  | rsp queue and on the three send queues (csnd + wsnd + isnd) combined.
  */
 static void
 iscsi_counters(isc_session_t *sp)
 {
      int	h, r, s;
      pduq_t	*pq;
 
 #define _puke(i, pq) do {\
 	       debug(2, "%03d] %06x %02x %x %ld %jd %x\n",\
 		       i, ntohl( pq->pdu.ipdu.bhs.CmdSN), \
 		       pq->pdu.ipdu.bhs.opcode, ntohl(pq->pdu.ipdu.bhs.itt),\
 		       (long)pq->ts.sec, pq->ts.frac, pq->flags);\
 	       } while(0)
 
      h = r = s = 0; 
      TAILQ_FOREACH(pq, &sp->hld, pq_link) {
 	  _puke(h, pq);
 	  h++;
      }
      TAILQ_FOREACH(pq, &sp->rsp, pq_link) r++;
      TAILQ_FOREACH(pq, &sp->csnd, pq_link) s++;
      TAILQ_FOREACH(pq, &sp->wsnd, pq_link) s++;
      TAILQ_FOREACH(pq, &sp->isnd, pq_link) s++;
      debug(2, "hld=%d rsp=%d snd=%d", h, r, s);
 }
 #endif
 
 /*
  | shutdown hook; 'v' is the driver softc.
  | with DO_EVENTHANDLER it deregisters the shutdown_pre_sync handler;
  | apart from that it only logs the flags of every session. no session
  | is stopped here.
  */
 static void
 iscsi_shutdown(void *v)
 {
      struct isc_softc	*sc = v;
      isc_session_t	*sp;
      int	n;
 
      debug_called(8);
      if(sc == NULL) {
 	  xdebug("sc is NULL!");
 	  return;
      }
 #ifdef DO_EVENTHANDLER
      if(sc->eh == NULL)
 	  debug(2, "sc->eh is NULL");
      else {
 	  EVENTHANDLER_DEREGISTER(shutdown_pre_sync, sc->eh);
 	  debug(2, "done n=%d", sc->nsess);
      }
 #endif
      n = 0;
      TAILQ_FOREACH(sp, &sc->isc_sess, sp_link) {
 	  debug(2, "%2d] sp->flags=0x%08x", n, sp->flags);
 	  n++;
      }
      debug(2, "done");
 }
 
 /*
  | destroy the softc's pdu zone, if there is one, and clear the pointer
  | so that a second call does nothing.
  */
 static void
 free_pdus(struct isc_softc *sc)
 {
      debug_called(8);
 
      if(sc->pdu_zone == NULL)
 	  return;
 
      uma_zdestroy(sc->pdu_zone);
      sc->pdu_zone = NULL;
 }
 
 /*
  | module load: allocate and initialize the driver softc (session list,
  | pdu zone, unit allocator, sysctl tree) and create the /dev/iscsi
  | control device with unit number max_sessions.
  | returns 0, or EEXIST when the control device cannot be created.
  | NOTE(review): on that failure nothing allocated above is released.
  */
 static int
 iscsi_start(void)
 {
      debug_called(8);
 
      isc =  malloc(sizeof(struct isc_softc), M_ISCSI, M_ZERO|M_WAITOK);
      mtx_init(&isc->isc_mtx, "iscsi-isc", NULL, MTX_DEF);
 
      TAILQ_INIT(&isc->isc_sess);
      /*
       | now init the free pdu list
       */
      isc->pdu_zone = uma_zcreate("pdu", sizeof(pduq_t),
 				 NULL, NULL, NULL, NULL,
 				 0, 0);
      uma_zone_set_max(isc->pdu_zone, max_pdus);
      // session units are 0 .. max_sessions-1
      isc->unit = new_unrhdr(0, max_sessions-1, NULL);
      sx_init(&isc->unit_sx, "iscsi sx");
 
 #ifdef DO_EVENTHANDLER
      // the handler argument is the softc; there is no 'sc' in this function
      if((isc->eh = EVENTHANDLER_REGISTER(shutdown_pre_sync, iscsi_shutdown,
 					isc, SHUTDOWN_PRI_DEFAULT-1)) == NULL)
 	  xdebug("shutdown event registration failed\n");
 #endif
      /*
       | sysctl stuff
       */
      sysctl_ctx_init(&isc->clist);
      isc->oid = SYSCTL_ADD_NODE(&isc->clist,
 			       SYSCTL_STATIC_CHILDREN(_net),
 			       OID_AUTO,
 			       "iscsi_initiator",
 			       CTLFLAG_RD | CTLFLAG_MPSAFE,
 			       0,
 			       "iSCSI Subsystem");
 
      SYSCTL_ADD_STRING(&isc->clist,
 		       SYSCTL_CHILDREN(isc->oid),
 		       OID_AUTO,
 		       "driver_version",
 		       CTLFLAG_RD,
 		       iscsi_driver_version,
 		       0,
 		       "iscsi driver version");
 
      SYSCTL_ADD_STRING(&isc->clist,
 		       SYSCTL_CHILDREN(isc->oid),
 		       OID_AUTO,
 		       "isid",
 		       CTLFLAG_RW,
 		       isid,
 		       6+1,
 		       "initiator part of the Session Identifier");
 
      SYSCTL_ADD_INT(&isc->clist,
 		    SYSCTL_CHILDREN(isc->oid),
 		    OID_AUTO,
 		    "sessions",
 		    CTLFLAG_RD,
 		    &isc->nsess,
 		    sizeof(isc->nsess),
 		    "number of active session");
 
 #ifdef ISCSI_INITIATOR_DEBUG
      mtx_init(&iscsi_dbg_mtx, "iscsi_dbg", NULL, MTX_DEF);
 #endif
 
      isc->dev = make_dev_credf(MAKEDEV_CHECKNAME, &iscsi_cdevsw, max_sessions,
 			       NULL, UID_ROOT, GID_WHEEL, 0600, "iscsi");
      if (isc->dev == NULL) {
 	  xdebug("iscsi_initiator: make_dev_credf failed");
 	  return (EEXIST);
      }
      isc->dev->si_drv1 = isc;
 
      printf("iscsi: version %s\n", iscsi_driver_version);
      return (0);
 }
 
 /*
  | Notes:
  |	unload SHOULD fail if there is activity
  |	activity: there is/are active session/s
  */
 /*
  | module unload: stop every remaining session, then release what
  | iscsi_start() set up (locks, pdu zone, control device, sysctl context)
  | and finally the softc itself.
  */
 static void
 iscsi_stop(void)
 {
      isc_session_t	*sp, *sp_tmp;
 
      debug_called(8);
 
      /*
       | go through all the sessions
       | Note: close should have done this ...
       */
      TAILQ_FOREACH_SAFE(sp, &isc->isc_sess, sp_link, sp_tmp) {
 	  //XXX: check for activity ...
 	  ism_stop(sp);
      }
      mtx_destroy(&isc->isc_mtx);
      sx_destroy(&isc->unit_sx);
 
      free_pdus(isc);
 
      if(isc->dev)
 	  destroy_dev(isc->dev);
 
      if(sysctl_ctx_free(&isc->clist))
 	  xdebug("sysctl_ctx_free failed");
 
      iscsi_shutdown(isc); // XXX: check EVENTHANDLER_ ...
 
 #ifdef ISCSI_INITIATOR_DEBUG
      mtx_destroy(&iscsi_dbg_mtx);
 #endif
 
      free(isc, M_ISCSI);
 }
 
 /*
  | module event handler.
  |	MOD_LOAD	returns the result of iscsi_start()
  |	MOD_QUIESCE	refuses with EBUSY while sessions exist
  |	MOD_UNLOAD	calls iscsi_stop()
  | everything else, MOD_SHUTDOWN included, is accepted with 0.
  */
 static int
 iscsi_modevent(module_t mod, int what, void *arg)
 {
      int error = 0;
 
      debug_called(8);
 
      switch(what) {
      case MOD_LOAD:
 	  error = iscsi_start();
 	  break;
 
      case MOD_QUIESCE:
 	  if(isc->nsess) {
 	       xdebug("iscsi module busy(nsess=%d), cannot unload", isc->nsess);
 	       log(LOG_ERR, "iscsi module busy, cannot unload");
 	       // report a real errno, not the session count
 	       error = EBUSY;
 	  }
 	  break;
 
      case MOD_SHUTDOWN:
 	  break;
 
      case MOD_UNLOAD:
 	  iscsi_stop();
 	  break;
 
      default:
 	  break;
      }
      return (error);
 }
 
 /* module description: "iscsi_initiator", driven by iscsi_modevent(). */
 moduledata_t iscsi_mod = {
          "iscsi_initiator",
          (modeventhand_t) iscsi_modevent,
          0
 };
 
 #ifdef ISCSI_ROOT
 /*
  | placeholder root-configuration hook (ISCSI_ROOT only): the nfs-derived
  | code is disabled with '#if 0', so all this does is print a marker.
  */
 static void
 iscsi_rootconf(void)
 {
 #if 0
 	nfs_setup_diskless();
 	if (nfs_diskless_valid)
 		rootdevnames[0] = "nfs:";
 #endif
 	printf("** iscsi_rootconf **\n");
 }
 
 SYSINIT(cpu_rootconf1, SI_SUB_ROOT_CONF, SI_ORDER_FIRST, iscsi_rootconf, NULL)
 #endif
 
 DECLARE_MODULE(iscsi_initiator, iscsi_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
 MODULE_DEPEND(iscsi_initiator, cam, 1, 1, 1);
diff --git a/sys/dev/mfi/mfi_linux.c b/sys/dev/mfi/mfi_linux.c
index 1452b80c280f..892b4f7a9fba 100644
--- a/sys/dev/mfi/mfi_linux.c
+++ b/sys/dev/mfi/mfi_linux.c
@@ -1,109 +1,109 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2006 IronPort Systems
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/file.h>
 #include <sys/proc.h>
 #include <machine/bus.h>
 
 #if defined(__amd64__) /* Assume amd64 wants 32 bit Linux */
 #include <machine/../linux32/linux.h>
 #include <machine/../linux32/linux32_proto.h>
 #else
 #include <machine/../linux/linux.h>
 #include <machine/../linux/linux_proto.h>
 #endif
 #include <compat/linux/linux_ioctl.h>
 #include <compat/linux/linux_util.h>
 
 #include <dev/mfi/mfireg.h>
 #include <dev/mfi/mfi_ioctl.h>
 
 /* There are multiple ioctl number ranges that need to be handled */
 #define MFI_LINUX_IOCTL_MIN  0x4d00
 #define MFI_LINUX_IOCTL_MAX  0x4d04
 
 static linux_ioctl_function_t mfi_linux_ioctl;
 /*
  * Linux ioctl handler record: mfi_linux_ioctl together with the
  * MFI_LINUX_IOCTL_MIN..MFI_LINUX_IOCTL_MAX command range.
  */
 static struct linux_ioctl_handler mfi_linux_handler = {mfi_linux_ioctl,
 						       MFI_LINUX_IOCTL_MIN,
 						       MFI_LINUX_IOCTL_MAX};
 
 /* Register the ioctl handler at SI_SUB_KLD and unregister it on teardown. */
 SYSINIT  (mfi_register,   SI_SUB_KLD, SI_ORDER_MIDDLE,
 	  linux_ioctl_register_handler, &mfi_linux_handler);
 SYSUNINIT(mfi_unregister, SI_SUB_KLD, SI_ORDER_MIDDLE,
 	  linux_ioctl_unregister_handler, &mfi_linux_handler);
 
 /*
  * Linux device handler record for the "mfi" driver.  NOTE(review): the
  * meaning of the individual fields (names, -1, 0, 1) comes from
  * struct linux_device_handler, which is not visible here -- confirm there.
  */
 static struct linux_device_handler mfi_device_handler =
 	{ "mfi", "megaraid_sas", "mfi0", "megaraid_sas_ioctl_node", -1, 0, 1};
 
 /* Register the device handler at SI_SUB_KLD and unregister it on teardown. */
 SYSINIT  (mfi_register2,   SI_SUB_KLD, SI_ORDER_MIDDLE,
 	  linux_device_register_handler, &mfi_device_handler);
 SYSUNINIT(mfi_unregister2, SI_SUB_KLD, SI_ORDER_MIDDLE,
 	  linux_device_unregister_handler, &mfi_device_handler);
 
 /*
  * Module event handler: accepts every event and does nothing.  The actual
  * registration work is done by the SYSINIT/SYSUNINIT entries in this file.
  */
 static int
 mfi_linux_modevent(module_t mod, int cmd, void *data)
 {
 	return (0);
 }
 
 DEV_MODULE(mfi_linux, mfi_linux_modevent, NULL);
 MODULE_DEPEND(mfi, linux, 1, 1, 1);
 
 /*
  * Linux ioctl entry point for mfi.
  *
  * Rewrites MFI_LINUX_CMD and MFI_LINUX_SET_AEN to their "_2" counterparts
  * (any other command number is passed through unchanged), looks up the file
  * behind args->fd with the CAP_IOCTL right and forwards the request to that
  * file's ioctl method.  Returns the fget() error, or the result of
  * fo_ioctl().
  */
 static int
 mfi_linux_ioctl(struct thread *p, struct linux_ioctl_args *args)
 {
 	cap_rights_t rights;
 	struct file *fp;
 	int error;
 	u_long cmd = args->cmd;
 
 	switch (cmd) {
 	case MFI_LINUX_CMD:
 		cmd = MFI_LINUX_CMD_2;
 		break;
 	case MFI_LINUX_SET_AEN:
 		cmd = MFI_LINUX_SET_AEN_2;
 		break;
 	}
 
-	error = fget(p, args->fd, cap_rights_init(&rights, CAP_IOCTL), &fp);
+	error = fget(p, args->fd, cap_rights_init_one(&rights, CAP_IOCTL), &fp);
 	if (error != 0)
 		return (error);
 	error = fo_ioctl(fp, cmd, (caddr_t)args->arg, p->td_ucred, p);
 	/* drop the reference taken by fget() */
 	fdrop(fp, p);
 	return (error);
 }
diff --git a/sys/dev/mrsas/mrsas_linux.c b/sys/dev/mrsas/mrsas_linux.c
index 1794f977a21d..11ce3c3f8d54 100644
--- a/sys/dev/mrsas/mrsas_linux.c
+++ b/sys/dev/mrsas/mrsas_linux.c
@@ -1,141 +1,141 @@
 /*
  * Copyright (c) 2015, AVAGO Tech. All rights reserved. Author: Kashyap Desai,
  * Copyright (c) 2014, LSI Corp. All rights reserved. Author: Kashyap Desai,
  * Sibananda Sahu Support: freebsdraid@avagotech.com
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met:
  *
  * 1. Redistributions of source code must retain the above copyright notice,
  * this list of conditions and the following disclaimer. 2. Redistributions
  * in binary form must reproduce the above copyright notice, this list of
  * conditions and the following disclaimer in the documentation and/or other
  * materials provided with the distribution. 3. Neither the name of the
  * <ORGANIZATION> nor the names of its contributors may be used to endorse or
  * promote products derived from this software without specific prior written
  * permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  * The views and conclusions contained in the software and documentation are
  * those of the authors and should not be interpreted as representing
  * official policies,either expressed or implied, of the FreeBSD Project.
  *
  * Send feedback to: <megaraidfbsd@avagotech.com> Mail to: AVAGO TECHNOLOGIES, 1621
  * Barber Lane, Milpitas, CA 95035 ATTN: MegaRaid FreeBSD
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 
 #if (__FreeBSD_version >= 1001511)
 #include <sys/capsicum.h>
 #elif (__FreeBSD_version > 900000)
 #include <sys/capability.h>
 #endif
 
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/file.h>
 #include <sys/proc.h>
 #include <machine/bus.h>
 
 #if defined(__amd64__)			/* Assume amd64 wants 32 bit Linux */
 #include <machine/../linux32/linux.h>
 #include <machine/../linux32/linux32_proto.h>
 #else
 #include <machine/../linux/linux.h>
 #include <machine/../linux/linux_proto.h>
 #endif
 #include <compat/linux/linux_ioctl.h>
 #include <compat/linux/linux_util.h>
 
 #include <dev/mrsas/mrsas.h>
 #undef COMPAT_FREEBSD32
 #include <dev/mrsas/mrsas_ioctl.h>
 
 /* There are multiple ioctl number ranges that need to be handled */
 #define	MRSAS_LINUX_IOCTL_MIN  0x4d00
 #define	MRSAS_LINUX_IOCTL_MAX  0x4d01
 
 static linux_ioctl_function_t mrsas_linux_ioctl;
 /*
  * Linux ioctl handler record: mrsas_linux_ioctl together with the
  * MRSAS_LINUX_IOCTL_MIN..MRSAS_LINUX_IOCTL_MAX command range.
  */
 static struct linux_ioctl_handler mrsas_linux_handler = {mrsas_linux_ioctl,
 	MRSAS_LINUX_IOCTL_MIN,
 MRSAS_LINUX_IOCTL_MAX};
 
 /* Register the ioctl handler at SI_SUB_KLD and unregister it on teardown. */
 SYSINIT(mrsas_register, SI_SUB_KLD, SI_ORDER_MIDDLE,
     linux_ioctl_register_handler, &mrsas_linux_handler);
 SYSUNINIT(mrsas_unregister, SI_SUB_KLD, SI_ORDER_MIDDLE,
     linux_ioctl_unregister_handler, &mrsas_linux_handler);
 
 /*
  * Linux device handler record for the "mrsas" driver.  NOTE(review): the
  * meaning of the individual fields (names, -1, 0, 1) comes from
  * struct linux_device_handler, which is not visible here -- confirm there.
  */
 static struct linux_device_handler mrsas_device_handler =
 {"mrsas", "megaraid_sas", "mrsas0", "megaraid_sas_ioctl_node", -1, 0, 1};
 
 /* Register the device handler at SI_SUB_KLD and unregister it on teardown. */
 SYSINIT(mrsas_register2, SI_SUB_KLD, SI_ORDER_MIDDLE,
     linux_device_register_handler, &mrsas_device_handler);
 SYSUNINIT(mrsas_unregister2, SI_SUB_KLD, SI_ORDER_MIDDLE,
     linux_device_unregister_handler, &mrsas_device_handler);
 
 /*
  * Module event handler: ignores all of its arguments and reports success
  * for every event.
  */
 static int
 mrsas_linux_modevent(module_t mod __unused, int cmd __unused, void *data __unused)
 {
 	return (0);
 }
 
 /*
  * mrsas_linux_ioctl:	linux emulator IOCtl commands entry point.
  *
  * This function is the entry point for IOCtls from linux binaries.
  * Only MRSAS_LINUX_CMD32 is accepted; any other command fails with ENOTSUP.
  * For the accepted command the file behind args->fd is looked up and the
  * request is forwarded, unchanged, to that file's ioctl method.
  *
  * Returns ENOTSUP, the fget() error, or the result of fo_ioctl().
  */
 static int
 mrsas_linux_ioctl(struct thread *p, struct linux_ioctl_args *args)
 {
 #if (__FreeBSD_version >= 1000000)
 	cap_rights_t rights;
 
 #endif
 	struct file *fp;
 	int error;
 	u_long cmd = args->cmd;
 
 	if (cmd != MRSAS_LINUX_CMD32) {
 		error = ENOTSUP;
 		goto END;
 	}
 	/* fget() takes different arguments depending on the kernel version */
 #if (__FreeBSD_version >= 1000000)
-	error = fget(p, args->fd, cap_rights_init(&rights, CAP_IOCTL), &fp);
+	error = fget(p, args->fd, cap_rights_init_one(&rights, CAP_IOCTL), &fp);
 #elif (__FreeBSD_version <= 900000)
 	error = fget(p, args->fd, &fp);
 #else					/* For FreeBSD version greater than
 					 * 9.0.0 but less than 10.0.0 */
 	error = fget(p, args->fd, CAP_IOCTL, &fp);
 #endif
 	if (error != 0)
 		goto END;
 
 	error = fo_ioctl(fp, cmd, (caddr_t)args->arg, p->td_ucred, p);
 	/* drop the reference taken by fget() */
 	fdrop(fp, p);
 END:
 	return (error);
 }
 
 DEV_MODULE(mrsas_linux, mrsas_linux_modevent, NULL);
 MODULE_DEPEND(mrsas, linux, 1, 1, 1);
diff --git a/sys/dev/tdfx/tdfx_linux.c b/sys/dev/tdfx/tdfx_linux.c
index e20763b37647..922ff5478a42 100644
--- a/sys/dev/tdfx/tdfx_linux.c
+++ b/sys/dev/tdfx/tdfx_linux.c
@@ -1,92 +1,92 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2006 The FreeBSD Project
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/file.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 
 #include <dev/tdfx/tdfx_linux.h>
 
 LINUX_IOCTL_SET(tdfx, LINUX_IOCTL_TDFX_MIN, LINUX_IOCTL_TDFX_MAX);
 
 /*
  * Linux emulation IOCTL for /dev/tdfx
  */
 static int
 linux_ioctl_tdfx(struct thread *td, struct linux_ioctl_args* args)
 {
    cap_rights_t rights;
    int error = 0;
    u_long cmd = args->cmd & 0xffff;
 
    /* The structure passed to ioctl has two shorts, one int
       and one void*. */
    char d_pio[2*sizeof(short) + sizeof(int) + sizeof(void*)];
 
    struct file *fp;
 
-   error = fget(td, args->fd, cap_rights_init(&rights, CAP_IOCTL), &fp);
+   error = fget(td, args->fd, cap_rights_init_one(&rights, CAP_IOCTL), &fp);
    if (error != 0)
 	   return (error);
    /* We simply copy the data and send it right to ioctl */
    copyin((caddr_t)args->arg, &d_pio, sizeof(d_pio));
    error = fo_ioctl(fp, cmd, (caddr_t)&d_pio, td->td_ucred, td);
    fdrop(fp, td);
    return error;
 }
 
 /*
  * Module event handler: load and unload succeed without doing anything,
  * every other event is rejected with EOPNOTSUPP.
  */
 static int
 tdfx_linux_modevent(struct module *mod __unused, int what, void *arg __unused)
 {
 
 	if (what == MOD_LOAD || what == MOD_UNLOAD)
 		return (0);
 	return (EOPNOTSUPP);
 }
 
 /*
  * Module description handed to DECLARE_MODULE(): the module name, its event
  * handler and a zero third member.
  */
 static moduledata_t tdfx_linux_mod = {
 	"tdfx_linux",
 	tdfx_linux_modevent,
 	0
 };
 
 /* As in SYSCALL_MODULE */
 DECLARE_MODULE(tdfx_linux, tdfx_linux_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
 MODULE_VERSION(tdfx_linux, 1);
 MODULE_DEPEND(tdfx_linux, tdfx, 1, 1, 1);
 MODULE_DEPEND(tdfx_linux, linux, 1, 1, 1);
diff --git a/sys/fs/fdescfs/fdesc_vnops.c b/sys/fs/fdescfs/fdesc_vnops.c
index ba13aaf1ce85..1271b50e6e94 100644
--- a/sys/fs/fdescfs/fdesc_vnops.c
+++ b/sys/fs/fdescfs/fdesc_vnops.c
@@ -1,660 +1,660 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software donated to Berkeley by
  * Jan-Simon Pendry.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)fdesc_vnops.c	8.9 (Berkeley) 1/21/94
  *
  * $FreeBSD$
  */
 
 /*
  * /dev/fd Filesystem
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/conf.h>
 #include <sys/dirent.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>	/* boottime */
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/file.h>	/* Must come after sys/malloc.h */
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/stat.h>
 #include <sys/syscallsubr.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 
 #include <fs/fdescfs/fdesc.h>
 
 #define	NFDCACHE 4
 #define FD_NHASH(ix) \
 	(&fdhashtbl[(ix) & fdhash])
 static LIST_HEAD(fdhashhead, fdescnode) *fdhashtbl;
 static u_long fdhash;
 
 struct mtx fdesc_hashmtx;
 
 static vop_getattr_t	fdesc_getattr;
 static vop_lookup_t	fdesc_lookup;
 static vop_open_t	fdesc_open;
 static vop_pathconf_t	fdesc_pathconf;
 static vop_readdir_t	fdesc_readdir;
 static vop_readlink_t	fdesc_readlink;
 static vop_reclaim_t	fdesc_reclaim;
 static vop_setattr_t	fdesc_setattr;
 
 /*
  * Vnode operations vector for fdescfs.  Operations not listed here fall
  * back to default_vnodeops; access checks are a no-op (VOP_NULL).
  */
 static struct vop_vector fdesc_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		VOP_NULL,
 	.vop_getattr =		fdesc_getattr,
 	.vop_lookup =		fdesc_lookup,
 	.vop_open =		fdesc_open,
 	.vop_pathconf =		fdesc_pathconf,
 	.vop_readdir =		fdesc_readdir,
 	.vop_readlink =		fdesc_readlink,
 	.vop_reclaim =		fdesc_reclaim,
 	.vop_setattr =		fdesc_setattr,
 };
 VFS_VOP_VECTOR_REGISTER(fdesc_vnodeops);
 
 static void fdesc_insmntque_dtr(struct vnode *, void *);
 static void fdesc_remove_entry(struct fdescnode *);
 
 /*
  * Initialise cache headers: set up the mutex protecting the node hash and
  * allocate the hash table itself.  hashinit() also fills in fdhash, which
  * FD_NHASH() uses as the bucket mask.  Always returns 0.
  */
 int
 fdesc_init(struct vfsconf *vfsp)
 {
 
 	mtx_init(&fdesc_hashmtx, "fdescfs_hash", NULL, MTX_DEF);
 	fdhashtbl = hashinit(NFDCACHE, M_CACHE, &fdhash);
 	return (0);
 }
 
 /*
  * Uninit ready for unload: release the hash table and the mutex created by
  * fdesc_init().  Always returns 0.
  */
 int
 fdesc_uninit(struct vfsconf *vfsp)
 {
 
 	hashdestroy(fdhashtbl, M_CACHE, fdhash);
 	mtx_destroy(&fdesc_hashmtx);
 	return (0);
 }
 
 /*
  * If allocating vnode fails, call this.  Passed to insmntque1() as its
  * destructor by fdesc_allocvp(); it dooms the vnode with vgone() and then
  * releases it with vput().  'arg' is unused.
  */
 static void
 fdesc_insmntque_dtr(struct vnode *vp, void *arg)
 {
 
 	vgone(vp);
 	vput(vp);
 }
 
 /*
  * Remove an entry from the hash if it exists.
  */
 static void
 fdesc_remove_entry(struct fdescnode *fd)
 {
 	struct fdhashhead *fc;
 	struct fdescnode *fd2;
 
 	fc = FD_NHASH(fd->fd_ix);
 	mtx_lock(&fdesc_hashmtx);
 	LIST_FOREACH(fd2, fc, fd_hash) {
 		if (fd == fd2) {
 			LIST_REMOVE(fd, fd_hash);
 			break;
 		}
 	}
 	mtx_unlock(&fdesc_hashmtx);
 }
 
 /*
  * Find or create the fdescfs vnode with index 'ix' on mount 'mp'.
  *
  * The hash is searched first; on a hit the existing vnode is returned
  * exclusively locked.  Otherwise a new vnode/fdescnode pair is created,
  * and the hash is searched again before inserting in case another thread
  * created the same node in the meantime.
  *
  * Returns 0 with the locked vnode in *vpp on success.  Returns -1 when the
  * mount has no private data or a forced unmount is in progress, or the
  * error from getnewvnode(), insmntque1() or vget().  Note that *vpp is not
  * written on the first -1 return or when getnewvnode() fails.
  */
 int
 fdesc_allocvp(fdntype ftype, unsigned fd_fd, int ix, struct mount *mp,
     struct vnode **vpp)
 {
 	struct fdescmount *fmp;
 	struct fdhashhead *fc;
 	struct fdescnode *fd, *fd2;
 	struct vnode *vp, *vp2;
 	struct thread *td;
 	int error;
 
 	/* NOTE(review): td is assigned but not used below -- confirm and drop */
 	td = curthread;
 	fc = FD_NHASH(ix);
 loop:
 	mtx_lock(&fdesc_hashmtx);
 	/*
 	 * If a forced unmount is progressing, we need to drop it. The flags are
 	 * protected by the hashmtx.
 	 */
 	fmp = mp->mnt_data;
 	if (fmp == NULL || fmp->flags & FMNT_UNMOUNTF) {
 		mtx_unlock(&fdesc_hashmtx);
 		return (-1);
 	}
 
 	LIST_FOREACH(fd, fc, fd_hash) {
 		if (fd->fd_ix == ix && fd->fd_vnode->v_mount == mp) {
 			/* Get reference to vnode in case it's being free'd */
 			vp = fd->fd_vnode;
 			/*
 			 * Take the vnode interlock before dropping the hash
 			 * mutex; vget() is told about it via LK_INTERLOCK.
 			 */
 			VI_LOCK(vp);
 			mtx_unlock(&fdesc_hashmtx);
 			/* vget() failed: start the lookup over */
 			if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK))
 				goto loop;
 			*vpp = vp;
 			return (0);
 		}
 	}
 	mtx_unlock(&fdesc_hashmtx);
 
 	/* Not cached: build a new node with the hash mutex released. */
 	fd = malloc(sizeof(struct fdescnode), M_TEMP, M_WAITOK);
 
 	error = getnewvnode("fdescfs", mp, &fdesc_vnodeops, &vp);
 	if (error) {
 		free(fd, M_TEMP);
 		return (error);
 	}
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	vp->v_data = fd;
 	fd->fd_vnode = vp;
 	fd->fd_type = ftype;
 	fd->fd_fd = fd_fd;
 	fd->fd_ix = ix;
 	if (ftype == Fdesc && fmp->flags & FMNT_LINRDLNKF)
 		vp->v_vflag |= VV_READLINK;
 	/*
 	 * On failure insmntque1() is given fdesc_insmntque_dtr to dispose of
 	 * the vnode, so only *vpp is cleared here.
 	 */
 	error = insmntque1(vp, mp, fdesc_insmntque_dtr, NULL);
 	if (error != 0) {
 		*vpp = NULLVP;
 		return (error);
 	}
 
 	/* Make sure that someone didn't beat us when inserting the vnode. */
 	mtx_lock(&fdesc_hashmtx);
 	/*
 	 * If a forced unmount is progressing, we need to drop it. The flags are
 	 * protected by the hashmtx.
 	 */
 	fmp = mp->mnt_data;
 	if (fmp == NULL || fmp->flags & FMNT_UNMOUNTF) {
 		mtx_unlock(&fdesc_hashmtx);
 		vgone(vp);
 		vput(vp);
 		*vpp = NULLVP;
 		return (-1);
 	}
 
 	LIST_FOREACH(fd2, fc, fd_hash) {
 		if (fd2->fd_ix == ix && fd2->fd_vnode->v_mount == mp) {
 			/* Get reference to vnode in case it's being free'd */
 			vp2 = fd2->fd_vnode;
 			VI_LOCK(vp2);
 			mtx_unlock(&fdesc_hashmtx);
 			error = vget(vp2, LK_EXCLUSIVE | LK_INTERLOCK);
 			/* Someone beat us, dec use count and wait for reclaim */
 			vgone(vp);
 			vput(vp);
 			/* If we didn't get it, return no vnode. */
 			if (error)
 				vp2 = NULLVP;
 			*vpp = vp2;
 			return (error);
 		}
 	}
 
 	/* If we came here, we can insert it safely. */
 	LIST_INSERT_HEAD(fc, fd, fd_hash);
 	mtx_unlock(&fdesc_hashmtx);
 	*vpp = vp;
 	return (0);
 }
 
 /*
  * Argument bundle passed from fdesc_lookup() through vn_vget_ino_gen() to
  * fdesc_get_ino_alloc().
  */
 struct fdesc_get_ino_args {
 	fdntype ftype;		/* node type to allocate */
 	unsigned fd_fd;		/* file descriptor number the node stands for */
 	int ix;			/* node index used for hashing */
 	struct file *fp;	/* file reference, dropped by the callback */
 	struct thread *td;	/* thread used when dropping fp */
 };
 
 /*
  * Allocation callback for vn_vget_ino_gen(): obtains the vnode described
  * by the fdesc_get_ino_args in 'arg', then releases the file reference the
  * caller stored there.  The result of fdesc_allocvp() is returned as is;
  * 'lkflags' is not consulted.
  */
 static int
 fdesc_get_ino_alloc(struct mount *mp, void *arg, int lkflags,
     struct vnode **rvp)
 {
 	struct fdesc_get_ino_args *req = arg;
 	int rv;
 
 	rv = fdesc_allocvp(req->ftype, req->fd_fd, req->ix, mp, rvp);
 	fdrop(req->fp, req->td);
 	return (rv);
 }
 
 /*
  * Look up one path component in an fdescfs directory.
  *
  * ap->a_dvp is the directory being searched, ap->a_cnp names the component
  * and ap->a_vpp receives the result.  "." yields the directory itself; any
  * other name must be a decimal file descriptor number (no leading zeros)
  * that is open in the calling thread.
  *
  * Returns 0 with *vpp set, or an error with *vpp set to NULL: EROFS for
  * DELETE/RENAME of the last component, ENOTDIR if dvp is not the root
  * node, ENOENT for a malformed name, otherwise the error from fget() or
  * from vnode allocation.
  */
 static int
 fdesc_lookup(struct vop_lookup_args *ap)
 {
 	struct vnode **vpp = ap->a_vpp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	char *pname = cnp->cn_nameptr;
 	struct thread *td = cnp->cn_thread;
 	struct file *fp;
 	struct fdesc_get_ino_args arg;
 	int nlen = cnp->cn_namelen;
 	u_int fd, fd1;
 	int error;
 	struct vnode *fvp;
 
 	if ((cnp->cn_flags & ISLASTCN) &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
 		error = EROFS;
 		goto bad;
 	}
 
 	if (cnp->cn_namelen == 1 && *pname == '.') {
 		*vpp = dvp;
 		VREF(dvp);
 		return (0);
 	}
 
 	if (VTOFDESC(dvp)->fd_type != Froot) {
 		error = ENOTDIR;
 		goto bad;
 	}
 
 	fd = 0;
 	/* the only time a leading 0 is acceptable is if it's "0" */
 	if (*pname == '0' && nlen != 1) {
 		error = ENOENT;
 		goto bad;
 	}
 	/* Parse the name as an unsigned decimal number. */
 	while (nlen--) {
 		if (*pname < '0' || *pname > '9') {
 			error = ENOENT;
 			goto bad;
 		}
 		fd1 = 10 * fd + *pname++ - '0';
 		/*
 		 * Reject values that wrapped around.  NOTE(review): a wrapped
 		 * 10 * fd can still be larger than fd, so this test may not
 		 * catch every overflow -- confirm.
 		 */
 		if (fd1 < fd) {
 			error = ENOENT;
 			goto bad;
 		}
 		fd = fd1;
 	}
 
 	/*
 	 * No rights to check since 'fp' isn't actually used.
 	 */
 	if ((error = fget(td, fd, &cap_no_rights, &fp)) != 0)
 		goto bad;
 
 	/* Check if we're looking up ourselves. */
 	if (VTOFDESC(dvp)->fd_ix == FD_DESC + fd) {
 		/*
 		 * In case we're holding the last reference to the file, the dvp
 		 * will be re-acquired.
 		 */
 		vhold(dvp);
 		VOP_UNLOCK(dvp);
 		fdrop(fp, td);
 
 		/* Re-acquire the lock afterwards. */
 		vn_lock(dvp, LK_RETRY | LK_EXCLUSIVE);
 		vdrop(dvp);
 		fvp = dvp;
 		/* the vnode may have been reclaimed while it was unlocked */
 		if (VN_IS_DOOMED(dvp))
 			error = ENOENT;
 	} else {
 		/*
 		 * Unlock our root node (dvp) when doing this, since we might
 		 * deadlock since the vnode might be locked by another thread
 		 * and the root vnode lock will be obtained afterwards (in case
 		 * we're looking up the fd of the root vnode), which will be the
 		 * opposite lock order. Vhold the root vnode first so we don't
 		 * lose it.
 		 */
 		arg.ftype = Fdesc;
 		arg.fd_fd = fd;
 		arg.ix = FD_DESC + fd;
 		arg.fp = fp;
 		arg.td = td;
 		/* fdesc_get_ino_alloc() drops the reference held in arg.fp */
 		error = vn_vget_ino_gen(dvp, fdesc_get_ino_alloc, &arg,
 		    LK_EXCLUSIVE, &fvp);
 	}
 
 	if (error)
 		goto bad;
 	*vpp = fvp;
 	return (0);
 
 bad:
 	*vpp = NULL;
 	return (error);
 }
 
 static int
 fdesc_open(struct vop_open_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 
 	if (VTOFDESC(vp)->fd_type == Froot)
 		return (0);
 
 	/*
 	 * XXX Kludge: set td->td_proc->p_dupfd to contain the value of the file
 	 * descriptor being sought for duplication. The error return ensures
 	 * that the vnode for this device will be released by vn_open. Open
 	 * will detect this special error and take the actions in dupfdopen.
 	 * Other callers of vn_open or VOP_OPEN will simply report the
 	 * error.
 	 */
 	ap->a_td->td_dupfd = VTOFDESC(vp)->fd_fd;	/* XXX */
 	return (ENODEV);
 }
 
 /*
  * pathconf for fdescfs.  _PC_NAME_MAX and _PC_LINK_MAX are answered
  * locally.  Everything else goes to vop_stdpathconf() for the root
  * directory, or to kern_fpathconf() on the underlying descriptor for a
  * descriptor node.
  */
 static int
 fdesc_pathconf(struct vop_pathconf_args *ap)
 {
 	struct vnode *node = ap->a_vp;
 	int rv;
 
 	if (ap->a_name == _PC_NAME_MAX) {
 		*ap->a_retval = NAME_MAX;
 		return (0);
 	}
 	if (ap->a_name == _PC_LINK_MAX) {
 		*ap->a_retval = (VTOFDESC(node)->fd_type == Froot) ? 2 : 1;
 		return (0);
 	}
 
 	if (VTOFDESC(node)->fd_type == Froot)
 		return (vop_stdpathconf(ap));
 
 	/*
 	 * Query the real descriptor with our vnode unlocked; the extra
 	 * reference keeps the vnode around in the meantime.
 	 */
 	vref(node);
 	VOP_UNLOCK(node);
 	rv = kern_fpathconf(curthread, VTOFDESC(node)->fd_fd, ap->a_name,
 	    ap->a_retval);
 	vn_lock(node, LK_SHARED | LK_RETRY);
 	vunref(node);
 	return (rv);
 }
 
 static int
 fdesc_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	struct timeval boottime;
 
 	getboottime(&boottime);
 	vap->va_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH;
 	vap->va_fileid = VTOFDESC(vp)->fd_ix;
 	vap->va_uid = 0;
 	vap->va_gid = 0;
 	vap->va_blocksize = DEV_BSIZE;
 	vap->va_atime.tv_sec = boottime.tv_sec;
 	vap->va_atime.tv_nsec = 0;
 	vap->va_mtime = vap->va_atime;
 	vap->va_ctime = vap->va_mtime;
 	vap->va_gen = 0;
 	vap->va_flags = 0;
 	vap->va_bytes = 0;
 	vap->va_filerev = 0;
 
 	switch (VTOFDESC(vp)->fd_type) {
 	case Froot:
 		vap->va_type = VDIR;
 		vap->va_nlink = 2;
 		vap->va_size = DEV_BSIZE;
 		vap->va_rdev = NODEV;
 		break;
 
 	case Fdesc:
 		vap->va_type = (vp->v_vflag & VV_READLINK) == 0 ? VCHR : VLNK;
 		vap->va_nlink = 1;
 		vap->va_size = 0;
 		vap->va_rdev = makedev(0, vap->va_fileid);
 		break;
 
 	default:
 		panic("fdesc_getattr");
 		break;
 	}
 
 	vp->v_type = vap->va_type;
 	return (0);
 }
 
 /*
  * Set attributes through an fdescfs node by forwarding the request to the
  * vnode behind the corresponding file descriptor.
  *
  * Returns EACCES for the root node.  If the descriptor is not backed by a
  * vnode the change is silently dropped (0), except for a flags change,
  * which yields EOPNOTSUPP.  Otherwise returns the error from getvnode(),
  * vn_start_write() or VOP_SETATTR().
  */
 static int
 fdesc_setattr(struct vop_setattr_args *ap)
 {
 	struct vattr *vap = ap->a_vap;
 	struct vnode *vp;
 	struct mount *mp;
 	struct file *fp;
 	struct thread *td = curthread;
 	cap_rights_t rights;
 	unsigned fd;
 	int error;
 
 	/*
 	 * Can't mess with the root vnode
 	 */
 	if (VTOFDESC(ap->a_vp)->fd_type == Froot)
 		return (EACCES);
 
 	fd = VTOFDESC(ap->a_vp)->fd_fd;
 
 	/*
 	 * Allow setattr where there is an underlying vnode.
 	 */
 	error = getvnode(td, fd,
-	    cap_rights_init(&rights, CAP_EXTATTR_SET), &fp);
+	    cap_rights_init_one(&rights, CAP_EXTATTR_SET), &fp);
 	if (error) {
 		/*
 		 * getvnode() returns EINVAL if the file descriptor is not
 		 * backed by a vnode.  Silently drop all changes except
 		 * chflags(2) in this case.
 		 */
 		if (error == EINVAL) {
 			if (vap->va_flags != VNOVAL)
 				error = EOPNOTSUPP;
 			else
 				error = 0;
 		}
 		return (error);
 	}
 	vp = fp->f_vnode;
 	/* Apply the change to the underlying vnode under its own lock. */
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) == 0) {
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		error = VOP_SETATTR(vp, ap->a_vap, ap->a_cred);
 		VOP_UNLOCK(vp);
 		vn_finished_write(mp);
 	}
 	/* drop the reference taken by getvnode() */
 	fdrop(fp, td);
 	return (error);
 }
 
 /* Fixed size of every directory record produced by fdesc_readdir(). */
 #define UIO_MX _GENERIC_DIRLEN(10) /* number of symbols in INT_MAX printout */
 
 /*
  * Read the fdescfs root directory.
  *
  * Entries are fixed-size (UIO_MX) records: ".", "..", then one record per
  * open slot in the calling process's descriptor table.  The offset must be
  * a non-negative multiple of UIO_MX that fits in an int, and the buffer
  * must hold at least one record, otherwise EINVAL is returned.  Panics if
  * called on a non-root node.  No cookies are produced (*a_ncookies is set
  * to 0 when requested).
  *
  * Returns 0 or the error from uiomove(); uio_offset is updated either way.
  */
 static int
 fdesc_readdir(struct vop_readdir_args *ap)
 {
 	struct fdescmount *fmp;
 	struct uio *uio = ap->a_uio;
 	struct filedesc *fdp;
 	struct dirent d;
 	struct dirent *dp = &d;
 	int error, i, off, fcnt;
 
 	if (VTOFDESC(ap->a_vp)->fd_type != Froot)
 		panic("fdesc_readdir: not dir");
 
 	fmp = VFSTOFDESC(ap->a_vp->v_mount);
 	if (ap->a_ncookies != NULL)
 		*ap->a_ncookies = 0;
 
 	off = (int)uio->uio_offset;
 	if (off != uio->uio_offset || off < 0 || (u_int)off % UIO_MX != 0 ||
 	    uio->uio_resid < UIO_MX)
 		return (EINVAL);
 	/* i is the record index; fcnt is the matching descriptor number */
 	i = (u_int)off / UIO_MX;
 	fdp = uio->uio_td->td_proc->p_fd;
 	error = 0;
 
 	fcnt = i - 2;		/* The first two nodes are `.' and `..' */
 
 	FILEDESC_SLOCK(fdp);
 	while (i < fdp->fd_nfiles + 2 && uio->uio_resid >= UIO_MX) {
 		bzero((caddr_t)dp, UIO_MX);
 		switch (i) {
 		case 0:	/* `.' */
 		case 1: /* `..' */
 			dp->d_fileno = i + FD_ROOT;
 			/* copies one or two dots depending on i */
 			dp->d_namlen = i + 1;
 			dp->d_reclen = UIO_MX;
 			bcopy("..", dp->d_name, dp->d_namlen);
 			dp->d_type = DT_DIR;
 			dirent_terminate(dp);
 			break;
 		default:
 			/* unused slot: d_namlen stays 0 and nothing is emitted */
 			if (fdp->fd_ofiles[fcnt].fde_file == NULL)
 				break;
 			dp->d_namlen = sprintf(dp->d_name, "%d", fcnt);
 			dp->d_reclen = UIO_MX;
 			dp->d_type = (fmp->flags & FMNT_LINRDLNKF) == 0 ?
 			    DT_CHR : DT_LNK;
 			/*
 			 * NOTE(review): i == fcnt + 2 here, so this is
 			 * FD_DESC + fcnt + 2, while fdesc_lookup() uses
 			 * FD_DESC + fd as the node index -- confirm which
 			 * value is intended.
 			 */
 			dp->d_fileno = i + FD_DESC;
 			dirent_terminate(dp);
 			break;
 		}
 		/* NOTE: d_off is the offset of the *next* entry. */
 		dp->d_off = UIO_MX * (i + 1);
 		if (dp->d_namlen != 0) {
 			/*
 			 * And ship to userland
 			 */
 			/* the descriptor table lock is not held across uiomove() */
 			FILEDESC_SUNLOCK(fdp);
 			error = uiomove(dp, UIO_MX, uio);
 			if (error)
 				goto done;
 			FILEDESC_SLOCK(fdp);
 		}
 		i++;
 		fcnt++;
 	}
 	FILEDESC_SUNLOCK(fdp);
 
 done:
 	uio->uio_offset = i * UIO_MX;
 	return (error);
 }
 
 /*
  * Reclaim an fdescfs vnode: unhash its fdescnode, free it and clear the
  * vnode's private data pointer.  Always returns 0.
  */
 static int
 fdesc_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *node = ap->a_vp;
 
 	fdesc_remove_entry(VTOFDESC(node));
 	free(node->v_data, M_TEMP);
 	node->v_data = NULL;
 	return (0);
 }
 
 static int
 fdesc_readlink(struct vop_readlink_args *va)
 {
 	struct vnode *vp, *vn;
 	struct thread *td;
 	struct uio *uio;
 	struct file *fp;
 	char *freepath, *fullpath;
 	size_t pathlen;
 	int lockflags, fd_fd;
 	int error;
 
 	freepath = NULL;
 	vn = va->a_vp;
 	if (VTOFDESC(vn)->fd_type != Fdesc)
 		panic("fdesc_readlink: not fdescfs link");
 	fd_fd = ((struct fdescnode *)vn->v_data)->fd_fd;
 	lockflags = VOP_ISLOCKED(vn);
 	VOP_UNLOCK(vn);
 
 	td = curthread;
 	error = fget_cap(td, fd_fd, &cap_no_rights, &fp, NULL);
 	if (error != 0)
 		goto out;
 
 	switch (fp->f_type) {
 	case DTYPE_VNODE:
 		vp = fp->f_vnode;
 		error = vn_fullpath(vp, &fullpath, &freepath);
 		break;
 	default:
 		fullpath = "anon_inode:[unknown]";
 		break;
 	}
 	if (error == 0) {
 		uio = va->a_uio;
 		pathlen = strlen(fullpath);
 		error = uiomove(fullpath, pathlen, uio);
 	}
 	if (freepath != NULL)
 		free(freepath, M_TEMP);
 	fdrop(fp, td);
 
 out:
 	vn_lock(vn, lockflags | LK_RETRY);
 	return (error);
 }
diff --git a/sys/fs/nfsclient/nfs_clport.c b/sys/fs/nfsclient/nfs_clport.c
index 730877b6eb6f..9c0ac5c4364e 100644
--- a/sys/fs/nfsclient/nfs_clport.c
+++ b/sys/fs/nfsclient/nfs_clport.c
@@ -1,1420 +1,1420 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/capsicum.h>
 
 /*
  * generally, I don't like #includes inside .h files, but it seems to
  * be the easiest way to handle the port.
  */
 #include <sys/fail.h>
 #include <sys/hash.h>
 #include <sys/sysctl.h>
 #include <fs/nfs/nfsport.h>
 #include <netinet/in_fib.h>
 #include <netinet/if_ether.h>
 #include <netinet6/ip6_var.h>
 #include <net/if_types.h>
 #include <net/route/nhop.h>
 
 #include <fs/nfsclient/nfs_kdtrace.h>
 
 #ifdef KDTRACE_HOOKS
 dtrace_nfsclient_attrcache_flush_probe_func_t
 		dtrace_nfscl_attrcache_flush_done_probe;
 uint32_t	nfscl_attrcache_flush_done_id;
 
 dtrace_nfsclient_attrcache_get_hit_probe_func_t
 		dtrace_nfscl_attrcache_get_hit_probe;
 uint32_t	nfscl_attrcache_get_hit_id;
 
 dtrace_nfsclient_attrcache_get_miss_probe_func_t
 		dtrace_nfscl_attrcache_get_miss_probe;
 uint32_t	nfscl_attrcache_get_miss_id;
 
 dtrace_nfsclient_attrcache_load_probe_func_t
 		dtrace_nfscl_attrcache_load_done_probe;
 uint32_t	nfscl_attrcache_load_done_id;
 #endif /* !KDTRACE_HOOKS */
 
 extern u_int32_t newnfs_true, newnfs_false, newnfs_xdrneg1;
 extern struct vop_vector newnfs_vnodeops;
 extern struct vop_vector newnfs_fifoops;
 extern uma_zone_t newnfsnode_zone;
 extern struct buf_ops buf_ops_newnfs;
 extern uma_zone_t ncl_pbuf_zone;
 extern short nfsv4_cbport;
 extern int nfscl_enablecallb;
 extern int nfs_numnfscbd;
 extern int nfscl_inited;
 struct mtx ncl_iod_mutex;
 NFSDLOCKMUTEX;
 extern struct mtx nfsrv_dslock_mtx;
 
 extern void (*ncl_call_invalcaches)(struct vnode *);
 
 SYSCTL_DECL(_vfs_nfs);
 static int ncl_fileid_maxwarnings = 10;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, fileid_maxwarnings, CTLFLAG_RWTUN,
     &ncl_fileid_maxwarnings, 0,
     "Limit fileid corruption warnings; 0 is off; -1 is unlimited");
 static volatile int ncl_fileid_nwarnings;
 
 static void nfscl_warn_fileid(struct nfsmount *, struct nfsvattr *,
     struct nfsvattr *);
 
 /*
  * Comparison function for vfs_hash functions.
  */
 int
 newnfs_vncmpf(struct vnode *vp, void *arg)
 {
 	struct nfsfh *nfhp = (struct nfsfh *)arg;
 	struct nfsnode *np = VTONFS(vp);
 
 	if (np->n_fhp->nfh_len != nfhp->nfh_len ||
 	    NFSBCMP(np->n_fhp->nfh_fh, nfhp->nfh_fh, nfhp->nfh_len))
 		return (1);
 	return (0);
 }
 
 /*
  * Look up a vnode/nfsnode by file handle, creating a new one if none is
  * found in the vfs hash.
  * Callers must check for mount points!!
  * On success (return value 0), a pointer to the nfsnode is stored in *npp;
  * on failure a non-zero errno is returned and *npp is NULL.
  * This variant takes a "struct nfsfh *" (nfhp) and uses that structure up,
  * either by hanging it off the nfsnode or FREEing it.
  *
  * mntp    - mount point the file handle belongs to
  * dvp     - directory vnode; its file handle is recorded for NFSv4 nodes
  * nfhp    - file handle to look up (consumed, see above)
  * cnp     - component name; recorded for NFSv4 nodes
  * td      - thread passed through to the vfs_hash functions
  * npp     - where the resulting nfsnode pointer is returned
  * stuff   - not referenced by this function
  * lkflags - lock flags passed through to the vfs_hash functions
  */
 int
 nfscl_nget(struct mount *mntp, struct vnode *dvp, struct nfsfh *nfhp,
     struct componentname *cnp, struct thread *td, struct nfsnode **npp,
     void *stuff, int lkflags)
 {
 	struct nfsnode *np, *dnp;
 	struct vnode *vp, *nvp;
 	struct nfsv4node *newd, *oldd;
 	int error;
 	u_int hash;
 	struct nfsmount *nmp;
 
 	nmp = VFSTONFS(mntp);
 	dnp = VTONFS(dvp);
 	*npp = NULL;
 
 	/* The vfs hash is keyed by an FNV-1 hash of the file handle bytes. */
 	hash = fnv_32_buf(nfhp->nfh_fh, nfhp->nfh_len, FNV1_32_INIT);
 
 	error = vfs_hash_get(mntp, hash, lkflags,
 	    td, &nvp, newnfs_vncmpf, nfhp);
 	if (error == 0 && nvp != NULL) {
 		/*
 		 * I believe there is a slight chance that vgonel() could
 		 * get called on this vnode between when NFSVOPLOCK() drops
 		 * the VI_LOCK() and vget() acquires it again, so that it
 		 * hasn't yet had v_usecount incremented. If this were to
 		 * happen, the VIRF_DOOMED flag would be set, so check for
 		 * that here. Since we now have the v_usecount incremented,
 		 * we should be ok until we vrele() it, if the VIRF_DOOMED
 		 * flag isn't set now.
 		 */
 		VI_LOCK(nvp);
 		if (VN_IS_DOOMED(nvp)) {
 			VI_UNLOCK(nvp);
 			vrele(nvp);
 			error = ENOENT;
 		} else {
 			VI_UNLOCK(nvp);
 		}
 	}
 	if (error) {
 		free(nfhp, M_NFSFH);
 		return (error);
 	}
 	if (nvp != NULL) {
 		/* Found an existing node in the hash. */
 		np = VTONFS(nvp);
 		/*
 		 * For NFSv4, check to see if it is the same name and
 		 * replace the name, if it is different.
 		 */
 		oldd = newd = NULL;
 		if ((nmp->nm_flag & NFSMNT_NFSV4) && np->n_v4 != NULL &&
 		    nvp->v_type == VREG &&
 		    (np->n_v4->n4_namelen != cnp->cn_namelen ||
 		     NFSBCMP(cnp->cn_nameptr, NFS4NODENAME(np->n_v4),
 		     cnp->cn_namelen) ||
 		     dnp->n_fhp->nfh_len != np->n_v4->n4_fhlen ||
 		     NFSBCMP(dnp->n_fhp->nfh_fh, np->n_v4->n4_data,
 		     dnp->n_fhp->nfh_len))) {
 		    /*
 		     * The replacement is allocated before the node lock is
 		     * taken, and the same comparison is then repeated with
 		     * the node locked before n_v4 is swapped.
 		     */
 		    newd = malloc(
 			sizeof (struct nfsv4node) + dnp->n_fhp->nfh_len +
 			+ cnp->cn_namelen - 1, M_NFSV4NODE, M_WAITOK);
 		    NFSLOCKNODE(np);
 		    if (newd != NULL && np->n_v4 != NULL && nvp->v_type == VREG
 			&& (np->n_v4->n4_namelen != cnp->cn_namelen ||
 			 NFSBCMP(cnp->cn_nameptr, NFS4NODENAME(np->n_v4),
 			 cnp->cn_namelen) ||
 			 dnp->n_fhp->nfh_len != np->n_v4->n4_fhlen ||
 			 NFSBCMP(dnp->n_fhp->nfh_fh, np->n_v4->n4_data,
 			 dnp->n_fhp->nfh_len))) {
 			oldd = np->n_v4;
 			np->n_v4 = newd;
 			newd = NULL;
 			np->n_v4->n4_fhlen = dnp->n_fhp->nfh_len;
 			np->n_v4->n4_namelen = cnp->cn_namelen;
 			NFSBCOPY(dnp->n_fhp->nfh_fh, np->n_v4->n4_data,
 			    dnp->n_fhp->nfh_len);
 			NFSBCOPY(cnp->cn_nameptr, NFS4NODENAME(np->n_v4),
 			    cnp->cn_namelen);
 		    }
 		    NFSUNLOCKNODE(np);
 		}
 		/* Free whichever of the old/new name structures is unused. */
 		if (newd != NULL)
 			free(newd, M_NFSV4NODE);
 		if (oldd != NULL)
 			free(oldd, M_NFSV4NODE);
 		*npp = np;
 		/* The existing node keeps its own file handle copy. */
 		free(nfhp, M_NFSFH);
 		return (0);
 	}
 	/* Not in the hash: build a new nfsnode and vnode. */
 	np = uma_zalloc(newnfsnode_zone, M_WAITOK | M_ZERO);
 
 	error = getnewvnode(nfs_vnode_tag, mntp, &newnfs_vnodeops, &nvp);
 	if (error) {
 		uma_zfree(newnfsnode_zone, np);
 		free(nfhp, M_NFSFH);
 		return (error);
 	}
 	vp = nvp;
 	KASSERT(vp->v_bufobj.bo_bsize != 0, ("nfscl_nget: bo_bsize == 0"));
 	vp->v_bufobj.bo_ops = &buf_ops_newnfs;
 	vp->v_data = np;
 	np->n_vnode = vp;
 	/* 
 	 * Initialize the mutex even if the vnode is going to be a loser.
 	 * This simplifies the logic in reclaim, which can then unconditionally
 	 * destroy the mutex (in the case of the loser, or if hash_insert
 	 * happened to return an error no special casing is needed).
 	 */
 	mtx_init(&np->n_mtx, "NEWNFSnode lock", NULL, MTX_DEF | MTX_DUPOK);
 	lockinit(&np->n_excl, PVFS, "nfsupg", VLKTIMEOUT, LK_NOSHARE |
 	    LK_CANRECURSE);
 
 	/* 
 	 * Are we getting the root? If so, make sure the vnode flags
 	 * are correct 
 	 */
 	if ((nfhp->nfh_len == nmp->nm_fhsize) &&
 	    !bcmp(nfhp->nfh_fh, nmp->nm_fh, nfhp->nfh_len)) {
 		if (vp->v_type == VNON)
 			vp->v_type = VDIR;
 		vp->v_vflag |= VV_ROOT;
 	}
 
 	vp->v_vflag |= VV_VMSIZEVNLOCK;
 
 	/* From here on the file handle is referenced by the new nfsnode. */
 	np->n_fhp = nfhp;
 	/*
 	 * For NFSv4, we have to attach the directory file handle and
 	 * file name, so that Open Ops can be done later.
 	 */
 	if (nmp->nm_flag & NFSMNT_NFSV4) {
 		np->n_v4 = malloc(sizeof (struct nfsv4node)
 		    + dnp->n_fhp->nfh_len + cnp->cn_namelen - 1, M_NFSV4NODE,
 		    M_WAITOK);
 		np->n_v4->n4_fhlen = dnp->n_fhp->nfh_len;
 		np->n_v4->n4_namelen = cnp->cn_namelen;
 		NFSBCOPY(dnp->n_fhp->nfh_fh, np->n_v4->n4_data,
 		    dnp->n_fhp->nfh_len);
 		NFSBCOPY(cnp->cn_nameptr, NFS4NODENAME(np->n_v4),
 		    cnp->cn_namelen);
 	} else {
 		np->n_v4 = NULL;
 	}
 
 	/*
 	 * NFS supports recursive and shared locking.
 	 */
 	lockmgr(vp->v_vnlock, LK_EXCLUSIVE | LK_NOWITNESS, NULL);
 	VN_LOCK_AREC(vp);
 	VN_LOCK_ASHARE(vp);
 	error = insmntque(vp, mntp);
 	if (error != 0) {
 		/* Undo everything that was set up for the new nfsnode. */
 		*npp = NULL;
 		mtx_destroy(&np->n_mtx);
 		lockdestroy(&np->n_excl);
 		free(nfhp, M_NFSFH);
 		if (np->n_v4 != NULL)
 			free(np->n_v4, M_NFSV4NODE);
 		uma_zfree(newnfsnode_zone, np);
 		return (error);
 	}
 	error = vfs_hash_insert(vp, hash, lkflags, 
 	    td, &nvp, newnfs_vncmpf, nfhp);
 	/*
 	 * NOTE(review): nothing is freed here on error; presumably the
 	 * vnode has been disposed of by vfs_hash_insert() and reclaim
 	 * releases np and its file handle -- confirm.
 	 */
 	if (error)
 		return (error);
 	if (nvp != NULL) {
 		/* Another thread inserted the same file handle first. */
 		*npp = VTONFS(nvp);
 		/* vfs_hash_insert() vput()'s the losing vnode */
 		return (0);
 	}
 	*npp = np;
 
 	return (0);
 }
 
 /*
  * Another variant of nfs_nget(). This one is only used by reopen. It
  * takes almost the same args as nfs_nget(), but only succeeds if an entry
  * exists in the cache. (Since files should already be "open" with a
  * vnode ref cnt on the node when reopen calls this, it should always
  * succeed.)
  * Also, don't get a vnode lock, since it may already be locked by some
  * other process that is handling it. This is ok, since all other threads
  * on the client are blocked by the nfsc_lock being exclusively held by the
  * caller of this function.
  *
  * Returns 0 with *npp set on success.  Otherwise *npp is NULL and the
  * return value is EINTR for a forced dismount, ENOENT when the busy vnode
  * could not be referenced or is doomed, EINVAL when the lookup succeeded
  * but found no vnode, or any other error returned by vfs_hash_get().
  */
 int
 nfscl_ngetreopen(struct mount *mntp, u_int8_t *fhp, int fhsize,
     struct thread *td, struct nfsnode **npp)
 {
 	struct vnode *nvp;
 	u_int hash;
 	struct nfsfh *nfhp;
 	int error;
 
 	*npp = NULL;
 	/* For forced dismounts, just return error. */
 	if (NFSCL_FORCEDISM(mntp))
 		return (EINTR);
 	/*
 	 * Build a temporary struct nfsfh, since that is what the comparison
 	 * function newnfs_vncmpf() expects as its argument.
 	 */
 	nfhp = malloc(sizeof (struct nfsfh) + fhsize,
 	    M_NFSFH, M_WAITOK);
 	bcopy(fhp, &nfhp->nfh_fh[0], fhsize);
 	nfhp->nfh_len = fhsize;
 
 	hash = fnv_32_buf(fhp, fhsize, FNV1_32_INIT);
 
 	/*
 	 * First, try to get the vnode locked, but don't block for the lock.
 	 */
 	error = vfs_hash_get(mntp, hash, (LK_EXCLUSIVE | LK_NOWAIT), td, &nvp,
 	    newnfs_vncmpf, nfhp);
 	if (error == 0 && nvp != NULL) {
 		/* Got it locked; the caller wants it unlocked. */
 		NFSVOPUNLOCK(nvp);
 	} else if (error == EBUSY) {
 		/*
 		 * It is safe so long as a vflush() with
 		 * FORCECLOSE has not been done. Since the Renew thread is
 		 * stopped and the MNTK_UNMOUNTF flag is set before doing
 		 * a vflush() with FORCECLOSE, we should be ok here.
 		 */
 		if (NFSCL_FORCEDISM(mntp))
 			error = EINTR;
 		else {
 			/* Take a reference without acquiring the vnode lock. */
 			vfs_hash_ref(mntp, hash, td, &nvp, newnfs_vncmpf, nfhp);
 			if (nvp == NULL) {
 				error = ENOENT;
 			} else if (VN_IS_DOOMED(nvp)) {
 				error = ENOENT;
 				vrele(nvp);
 			} else {
 				error = 0;
 			}
 		}
 	}
 	/* The temporary file handle copy was only needed for the lookup. */
 	free(nfhp, M_NFSFH);
 	if (error)
 		return (error);
 	if (nvp != NULL) {
 		*npp = VTONFS(nvp);
 		return (0);
 	}
 	return (EINVAL);
 }
 
 /*
  * Log that the server returned attributes for a different fileid than the
  * one cached for the node (oldnap is the cached set, newnap the new one).
  * When ncl_fileid_maxwarnings is non-negative it caps the number of
  * warnings; the warning that reaches the cap is followed by a notice that
  * no more will be logged.
  */
 static void
 nfscl_warn_fileid(struct nfsmount *nmp, struct nfsvattr *oldnap,
     struct nfsvattr *newnap)
 {
 	int lastwarn = 0;
 
 	if (ncl_fileid_maxwarnings >= 0) {
 		/* Cap already reached: stay quiet. */
 		if (ncl_fileid_nwarnings >= ncl_fileid_maxwarnings)
 			return;
 		if (++ncl_fileid_nwarnings >= ncl_fileid_maxwarnings)
 			lastwarn = 1;
 	}
 
 	printf("newnfs: server '%s' error: fileid changed. "
 	    "fsid %jx:%jx: expected fileid %#jx, got %#jx. "
 	    "(BROKEN NFS SERVER OR MIDDLEWARE)\n",
 	    nmp->nm_com.nmcom_hostname,
 	    (uintmax_t)nmp->nm_fsid[0],
 	    (uintmax_t)nmp->nm_fsid[1],
 	    (uintmax_t)oldnap->na_fileid,
 	    (uintmax_t)newnap->na_fileid);
 
 	if (lastwarn != 0) {
 		printf("newnfs: Logged %d times about fileid corruption; "
 		    "going quiet to avoid spamming logs excessively. (Limit "
 		    "is: %d).\n", ncl_fileid_nwarnings,
 		    ncl_fileid_maxwarnings);
 	}
 }
 
 /*
  * Load the attribute cache (that lives in the nfsnode entry) with
  * the attributes in *nap and, iff nvaper is not NULL, copy the
  * resulting cached attributes to *nvaper (a struct vattr).
  * Similar to nfs_loadattrcache(), except the attributes are passed in
  * instead of being parsed out of the mbuf list.
  *
  * vpp        - pointer to the vnode whose cache is loaded; only *vpp is read
  * nap        - the new attributes
  * nvaper     - optional struct vattr to receive a copy of the result
  * stuff      - not referenced by this function
  * writeattr  - if non-zero, only the filerev, size, mtime, ctime, fsid and
  *              mode fields are taken from *nap; otherwise all of *nap is
  *              copied after the fileid sanity check
  * dontshrink - if non-zero, a smaller size from the server does not shrink
  *              a regular file; the cache is marked stale instead
  *
  * Returns 0 on success, or EIDRM if the new attributes are for a different
  * fileid than the cached one (or the failpoint forces that error).
  * The nfsnode is locked here and unlocked via ncl_pager_setsize().
  */
 int
 nfscl_loadattrcache(struct vnode **vpp, struct nfsvattr *nap, void *nvaper,
     void *stuff, int writeattr, int dontshrink)
 {
 	struct vnode *vp = *vpp;
 	struct vattr *vap, *nvap = &nap->na_vattr, *vaper = nvaper;
 	struct nfsnode *np;
 	struct nfsmount *nmp;
 	struct timespec mtime_save;
 	int error, force_fid_err;
 
 	error = 0;
 
 	/*
 	 * If v_type == VNON it is a new node, so fill in the v_type,
 	 * n_mtime fields. Check to see if it represents a special 
 	 * device, and if so, check for a possible alias. Once the
 	 * correct vnode has been obtained, fill in the rest of the
 	 * information.
 	 */
 	np = VTONFS(vp);
 	NFSLOCKNODE(np);
 	if (vp->v_type != nvap->va_type) {
 		vp->v_type = nvap->va_type;
 		if (vp->v_type == VFIFO)
 			vp->v_op = &newnfs_fifoops;
 		np->n_mtime = nvap->va_mtime;
 	}
 	nmp = VFSTONFS(vp->v_mount);
 	vap = &np->n_vattr.na_vattr;
 	/* Remember the cached mtime so a backwards step can be detected. */
 	mtime_save = vap->va_mtime;
 	if (writeattr) {
 		np->n_vattr.na_filerev = nap->na_filerev;
 		np->n_vattr.na_size = nap->na_size;
 		np->n_vattr.na_mtime = nap->na_mtime;
 		np->n_vattr.na_ctime = nap->na_ctime;
 		np->n_vattr.na_fsid = nap->na_fsid;
 		np->n_vattr.na_mode = nap->na_mode;
 	} else {
 		force_fid_err = 0;
 		KFAIL_POINT_ERROR(DEBUG_FP, nfscl_force_fileid_warning,
 		    force_fid_err);
 		/*
 		 * BROKEN NFS SERVER OR MIDDLEWARE
 		 *
 		 * Certain NFS servers (certain old proprietary filers ca.
 		 * 2006) or broken middleboxes (e.g. WAN accelerator products)
 		 * will respond to GETATTR requests with results for a
 		 * different fileid.
 		 *
 		 * The WAN accelerator we've observed not only serves stale
 		 * cache results for a given file, it also occasionally serves
 		 * results for wholly different files.  This causes surprising
 		 * problems; for example the cached size attribute of a file
 		 * may truncate down and then back up, resulting in zero
 		 * regions in file contents read by applications.  We observed
 		 * this reliably with Clang and .c files during parallel build.
 		 * A pcap revealed packet fragmentation and GETATTR RPC
 		 * responses with wholly wrong fileids.
 		 */
 		if ((np->n_vattr.na_fileid != 0 &&
 		     np->n_vattr.na_fileid != nap->na_fileid) ||
 		    force_fid_err) {
 			nfscl_warn_fileid(nmp, &np->n_vattr, nap);
 			error = EIDRM;
 			goto out;
 		}
 		NFSBCOPY((caddr_t)nap, (caddr_t)&np->n_vattr,
 		    sizeof (struct nfsvattr));
 	}
 
 	/*
 	 * For NFSv4, if the node's fsid is not equal to the mount point's
 	 * fsid, return the low order 32bits of the node's fsid. This
 	 * allows getcwd(3) to work. There is a chance that the fsid might
 	 * be the same as a local fs, but since this is in an NFS mount
 	 * point, I don't think that will cause any problems?
 	 */
 	if (NFSHASNFSV4(nmp) && NFSHASHASSETFSID(nmp) &&
 	    (nmp->nm_fsid[0] != np->n_vattr.na_filesid[0] ||
 	     nmp->nm_fsid[1] != np->n_vattr.na_filesid[1])) {
 		/*
 		 * va_fsid needs to be set to some value derived from
 		 * np->n_vattr.na_filesid that is not equal
 		 * vp->v_mount->mnt_stat.f_fsid[0], so that it changes
 		 * from the value used for the top level server volume
 		 * in the mounted subtree.
 		 */
 		vn_fsid(vp, vap);
 		if ((uint32_t)vap->va_fsid == np->n_vattr.na_filesid[0])
 			vap->va_fsid = hash32_buf(
 			    np->n_vattr.na_filesid, 2 * sizeof(uint64_t), 0);
 	} else
 		vn_fsid(vp, vap);
 	/* Stamp the cache as fresh; the checks below may zero it again. */
 	np->n_attrstamp = time_second;
 	if (vap->va_size != np->n_size) {
 		if (vap->va_type == VREG) {
 			if (dontshrink && vap->va_size < np->n_size) {
 				/*
 				 * We've been told not to shrink the file;
 				 * zero np->n_attrstamp to indicate that
 				 * the attributes are stale.
 				 */
 				vap->va_size = np->n_size;
 				np->n_attrstamp = 0;
 				KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 			} else if (np->n_flag & NMODIFIED) {
 				/*
 				 * We've modified the file: Use the larger
 				 * of our size, and the server's size.
 				 */
 				if (vap->va_size < np->n_size) {
 					vap->va_size = np->n_size;
 				} else {
 					np->n_size = vap->va_size;
 					np->n_flag |= NSIZECHANGED;
 				}
 			} else {
 				np->n_size = vap->va_size;
 				np->n_flag |= NSIZECHANGED;
 			}
 		} else {
 			np->n_size = vap->va_size;
 		}
 	}
 	/*
 	 * The following checks are added to prevent a race between (say)
 	 * a READDIR+ and a WRITE. 
 	 * READDIR+, WRITE requests sent out.
 	 * READDIR+ resp, WRITE resp received on client.
 	 * However, the WRITE resp was handled before the READDIR+ resp
 	 * causing the post op attrs from the write to be loaded first
 	 * and the attrs from the READDIR+ to be loaded later. If this 
 	 * happens, we have stale attrs loaded into the attrcache.
 	 * We detect this by checking for the mtime moving backwards, and
 	 * invalidate the attrcache when this happens.
 	 */
 	if (timespeccmp(&mtime_save, &vap->va_mtime, >)) {
 		/* The mtime went backwards. */
 		np->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 	}
 	if (vaper != NULL) {
 		NFSBCOPY((caddr_t)vap, (caddr_t)vaper, sizeof(*vap));
 		/*
 		 * When NCHG is set, report the node's own n_atim/n_mtim
 		 * instead of the cached values, for whichever of NACC/NUPD
 		 * is set.
 		 */
 		if (np->n_flag & NCHG) {
 			if (np->n_flag & NACC)
 				vaper->va_atime = np->n_atim;
 			if (np->n_flag & NUPD)
 				vaper->va_mtime = np->n_mtim;
 		}
 	}
 
 out:
 #ifdef KDTRACE_HOOKS
 	if (np->n_attrstamp != 0)
 		KDTRACE_NFS_ATTRCACHE_LOAD_DONE(vp, vap, error);
 #endif
 	/* With a NULL second argument this also unlocks the nfsnode. */
 	(void)ncl_pager_setsize(vp, NULL);
 	return (error);
 }
 
 /*
  * Propagate the nfsnode's cached file size to the VM object of the vnode.
  *
  * The nfsnode must be locked on entry.  If the vnode has a VM object whose
  * recorded size differs from the cached size, the pager needs to be told.
  * That is only possible when the vnode is exclusively locked and the
  * thread does not have TDP2_SBPAGES set; otherwise NVNSETSZSKIP is set on
  * the node to record that the notification was skipped.
  *
  * If nsizep is non-NULL, the node stays locked, the size is returned in
  * *nsizep and the return value tells the caller whether it should call
  * vnode_pager_setsize() itself.  If nsizep is NULL, the node is unlocked
  * unconditionally, vnode_pager_setsize() is called here when needed and
  * possible, and false is always returned.
  */
 bool
 ncl_pager_setsize(struct vnode *vp, u_quad_t *nsizep)
 {
 	struct nfsnode *np;
 	vm_object_t object;
 	u_quad_t nsize;
 	bool setnsize;
 
 	np = VTONFS(vp);
 	NFSASSERTNODE(np);
 
 	nsize = np->n_vattr.na_vattr.va_size;
 	object = vp->v_object;
 	setnsize = false;
 
 	if (object != NULL && nsize != object->un_pager.vnp.vnp_size) {
 		setnsize = VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
 		    (curthread->td_pflags2 & TDP2_SBPAGES) == 0;
 		if (!setnsize)
 			np->n_flag |= NVNSETSZSKIP;
 	}
 
 	/* Delayed mode: the caller does the pager call, node stays locked. */
 	if (nsizep != NULL) {
 		*nsizep = nsize;
 		return (setnsize);
 	}
 
 	NFSUNLOCKNODE(np);
 	if (setnsize)
 		vnode_pager_setsize(vp, nsize);
 	return (false);
 }
 
 /*
  * Fill in the client id name. For these bytes:
  * 1 - they must be unique
  * 2 - they should be persistent across client reboots
  * 1 is more critical than 2
  *
  * The idlen bytes at cp are filled with the 64bit mount point identifier
  * (clval) if it fits, then the uuid string if it is non-empty and fits in
  * what is left, and finally random bytes for any remaining space.
  */
 void
 nfscl_fillclid(u_int64_t clval, char *uuid, u_int8_t *cp, u_int16_t idlen)
 {
 	int uuidlen;
 
 	/* Lead with the mount point identifier when there is room for it. */
 	if (idlen >= sizeof(clval)) {
 		NFSBCOPY((caddr_t)&clval, cp, sizeof(clval));
 		cp += sizeof(clval);
 		idlen -= sizeof(clval);
 	}
 
 	/* Append the uuid, without its terminating NUL, if it is set. */
 	uuidlen = strlen(uuid);
 	if (uuidlen > 0 && uuidlen <= idlen) {
 		NFSBCOPY(uuid, cp, uuidlen);
 		cp += uuidlen;
 		idlen -= uuidlen;
 	}
 
 	/* Pad with random junk; normally only needed when no uuid is set. */
 	for (; idlen > 0; idlen--)
 		*cp++ = (u_int8_t)(arc4random() % 256);
 }
 
 /*
  * Fill in a lock owner name at cp.
  *
  * id == NULL: all zero bytes (the single open_owner).
  * F_POSIX:    id is a struct proc *; the name is the pid followed by the
  *             seconds and microseconds of the process start time, each
  *             stored as a 32bit value in host byte order.
  * F_FLOCK:    the name is the value of the id pointer itself, zero padded
  *             to NFSV4CL_LOCKNAMELEN.
  * Otherwise a message is printed and the name is all zero bytes.
  */
 void
 nfscl_filllockowner(void *id, u_int8_t *cp, int flags)
 {
 	u_int32_t words[3];
 	struct proc *p;
 
 	if (id == NULL) {
 		/* Return the single open_owner of all 0 bytes. */
 		bzero(cp, NFSV4CL_LOCKNAMELEN);
 		return;
 	}
 
 	if ((flags & F_POSIX) != 0) {
 		p = (struct proc *)id;
 		words[0] = p->p_pid;
 		words[1] = p->p_stats->p_start.tv_sec;
 		words[2] = p->p_stats->p_start.tv_usec;
 		bcopy(words, cp, sizeof(words));
 		return;
 	}
 
 	if ((flags & F_FLOCK) != 0) {
 		bcopy(&id, cp, sizeof(id));
 		bzero(&cp[sizeof(id)], NFSV4CL_LOCKNAMELEN - sizeof(id));
 		return;
 	}
 
 	printf("nfscl_filllockowner: not F_POSIX or F_FLOCK\n");
 	bzero(cp, NFSV4CL_LOCKNAMELEN);
 }
 
 /*
  * Find the parent process for the thread passed in as an argument.
  * Returns NULL if td is NULL, if td belongs to pid 0 or if the process has
  * no parent; otherwise returns the first thread of the parent process.
  * (Can be any of the threads, since it is only used for td->td_proc.)
  */
 NFSPROC_T *
 nfscl_getparent(struct thread *td)
 {
 	struct proc *parent;
 
 	if (td == NULL || td->td_proc->p_pid == 0)
 		return (NULL);
 	parent = td->td_proc->p_pptr;
 	if (parent == NULL)
 		return (NULL);
 	return (TAILQ_FIRST(&parent->p_threads));
 }
 
 /*
  * Entry point of the renew kernel process: run nfscl_renewthread() for the
  * client passed in as "arg", using the first thread of the renew process,
  * and exit the process once it returns.
  */
 static void
 start_nfscl(void *arg)
 {
 	struct nfsclclient *clp = arg;
 	struct thread *renewtd;
 
 	renewtd = TAILQ_FIRST(&clp->nfsc_renewthread->p_threads);
 	nfscl_renewthread(clp, renewtd);
 	kproc_exit(0);
 }
 
 /*
  * Create the "nfscl" kernel process running start_nfscl() for this client.
  * The new process is recorded in clp->nfsc_renewthread; the return value of
  * kproc_create() is not checked.
  */
 void
 nfscl_start_renewthread(struct nfsclclient *clp)
 {
 
 	kproc_create(start_nfscl, clp, &clp->nfsc_renewthread, 0, 0, "nfscl");
 }
 
 /*
  * Handle wcc_data.
  * For NFSv4, it assumes that nfsv4_wccattr() was used to set up the getattr
  * as the first Op after PutFH.
  * (For NFSv4, the postop attributes are after the Op, so they can't be
  *  parsed here. A separate call to nfscl_postop_attr() is required.)
  *
  * nd       - reply being parsed
  * vp       - vnode the reply refers to
  * nap      - filled in with the post-op attributes (NFSv3 only)
  * flagp    - NFSv3 only: set non-zero iff post-op attributes were loaded
  * wccflagp - optional; set non-zero iff the pre-op mtime in the reply
  *            (seconds and nanoseconds) equals the node's n_mtime
  * stuff    - passed through to nfscl_postop_attr()
  *
  * Returns 0 or the error from parsing the reply.
  */
 int
 nfscl_wcc_data(struct nfsrv_descript *nd, struct vnode *vp,
     struct nfsvattr *nap, int *flagp, int *wccflagp, void *stuff)
 {
 	u_int32_t *tl;
 	struct nfsnode *np = VTONFS(vp);
 	struct nfsvattr nfsva;
 	int error = 0;
 
 	if (wccflagp != NULL)
 		*wccflagp = 0;
 	if (nd->nd_flag & ND_NFSV3) {
 		*flagp = 0;
 		/* A boolean says whether pre-op attributes follow. */
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 		if (*tl == newnfs_true) {
 			/*
 			 * Six words of pre-op attributes; words 2 and 3 are
 			 * compared against the seconds and nanoseconds of
 			 * the node's mtime.
 			 */
 			NFSM_DISSECT(tl, u_int32_t *, 6 * NFSX_UNSIGNED);
 			if (wccflagp != NULL) {
 				NFSLOCKNODE(np);
 				*wccflagp = (np->n_mtime.tv_sec ==
 				    fxdr_unsigned(u_int32_t, *(tl + 2)) &&
 				    np->n_mtime.tv_nsec ==
 				    fxdr_unsigned(u_int32_t, *(tl + 3)));
 				NFSUNLOCKNODE(np);
 			}
 		}
 		error = nfscl_postop_attr(nd, nap, flagp, stuff);
 		/* Without post-op attributes the comparison is of no use. */
 		if (wccflagp != NULL && *flagp == 0)
 			*wccflagp = 0;
 	} else if ((nd->nd_flag & (ND_NOMOREDATA | ND_NFSV4 | ND_V4WCCATTR))
 	    == (ND_NFSV4 | ND_V4WCCATTR)) {
 		error = nfsv4_loadattr(nd, NULL, &nfsva, NULL,
 		    NULL, 0, NULL, NULL, NULL, NULL, NULL, 0,
 		    NULL, NULL, NULL, NULL, NULL);
 		if (error)
 			return (error);
 		/*
 		 * Get rid of Op# and status for next op.
 		 */
 		NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 		if (*++tl)
 			nd->nd_flag |= ND_NOMOREDATA;
 		if (wccflagp != NULL &&
 		    nfsva.na_vattr.va_mtime.tv_sec != 0) {
 			/*
 			 * Compare seconds with seconds and nanoseconds with
 			 * nanoseconds, as is done for NFSv3 above.
 			 */
 			NFSLOCKNODE(np);
 			*wccflagp = (np->n_mtime.tv_sec ==
 			    nfsva.na_vattr.va_mtime.tv_sec &&
 			    np->n_mtime.tv_nsec ==
 			    nfsva.na_vattr.va_mtime.tv_nsec);
 			NFSUNLOCKNODE(np);
 		}
 	}
 nfsmout:
 	return (error);
 }
 
 /*
  * Get postop attributes.
  *
  * *retp is set to non-zero iff attributes were present in the reply and
  * were loaded into *nap.  Returns 0, or the error from parsing the reply
  * (in which case *retp is 0 if the failure was in nfsm_loadattr()).
  * "stuff" is not referenced.
  */
 int
 nfscl_postop_attr(struct nfsrv_descript *nd, struct nfsvattr *nap, int *retp,
     void *stuff)
 {
 	u_int32_t *tl;
 	int error = 0;
 
 	*retp = 0;
 	if ((nd->nd_flag & ND_NOMOREDATA) != 0)
 		return (0);
 
 	if ((nd->nd_flag & ND_NFSV3) != 0) {
 		/* NFSv3 carries an explicit "attributes follow" word. */
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 		*retp = fxdr_unsigned(int, *tl);
 	} else if (nd->nd_repstat == 0) {
 		if ((nd->nd_flag & ND_NFSV4) != 0) {
 			/*
 			 * For NFSv4, the postop attr are at the end, so
 			 * they are only looked for when nd_repstat == 0.
 			 * Skip the Op# and check the status of the getattr.
 			 */
 			NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 			if (*(tl + 1) != 0) {
 				/* not expected, since nd_repstat == 0 */
 				nd->nd_flag |= ND_NOMOREDATA;
 			} else {
 				*retp = 1;
 			}
 		} else {
 			/* For NFSv2, the attributes are here iff nd_repstat == 0 */
 			*retp = 1;
 		}
 	}
 
 	if (*retp != 0) {
 		error = nfsm_loadattr(nd, nap);
 		if (error != 0)
 			*retp = 0;
 	}
 nfsmout:
 	return (error);
 }
 
 /*
  * nfscl_request() - mostly a wrapper for newnfs_request().
  * Picks the NFS protocol version from the ND_NFSV4/ND_NFSV3 flags of the
  * request (NFSv2 if neither is set) and issues the RPC on the socket of
  * the mount that vp belongs to.  "stuff" is not referenced.
  */
 int
 nfscl_request(struct nfsrv_descript *nd, struct vnode *vp, NFSPROC_T *p,
     struct ucred *cred, void *stuff)
 {
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	int vers;
 
 	vers = (nd->nd_flag & ND_NFSV4) != 0 ? NFS_VER4 :
 	    (nd->nd_flag & ND_NFSV3) != 0 ? NFS_VER3 : NFS_VER2;
 
 	return (newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, vp, p, cred,
 	    NFS_PROG, vers, NULL, 1, NULL, NULL));
 }
 
 /*
  * Fill in this BSD's variant of statfs using nfsstatfs.
  * "statfs" points to the struct statfs to update; "sfp" holds the values
  * returned by the server.
  */
 void
 nfscl_loadsbinfo(struct nfsmount *nmp, struct nfsstatfs *sfp, void *statfs)
 {
 	struct statfs *sbp = statfs;
 
 	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_NFSV4)) != 0) {
 		/* Byte counts from the server are reported in NFS_FABLKSIZE blocks. */
 		sbp->f_bsize = NFS_FABLKSIZE;
 		sbp->f_blocks = sfp->sf_tbytes / NFS_FABLKSIZE;
 		sbp->f_bfree = sfp->sf_fbytes / NFS_FABLKSIZE;
 		/*
 		 * sf_abytes is uint64_t and f_bavail is int64_t, but after
 		 * the division by NFS_FABLKSIZE the value fits in 63bits,
 		 * so the assignment cannot produce a negative number.
 		 */
 		sbp->f_bavail = sfp->sf_abytes / NFS_FABLKSIZE;
 		sbp->f_files = sfp->sf_tfiles;
 		/* Since f_ffree is int64_t, clip it to 63bits. */
 		sbp->f_ffree = sfp->sf_ffiles > INT64_MAX ? INT64_MAX :
 		    sfp->sf_ffiles;
 		return;
 	}
 	if ((nmp->nm_flag & NFSMNT_NFSV4) != 0)
 		return;
 
 	/*
 	 * NFSv2.  The (int32_t) casts keep this compatible with the old NFS
 	 * client by propagating bit31 to the high order bits.  This may or
 	 * may not be correct for NFSv2, but as it is a legacy environment,
 	 * backwards compatibility is retained.
 	 */
 	sbp->f_bsize = (int32_t)sfp->sf_bsize;
 	sbp->f_blocks = (int32_t)sfp->sf_blocks;
 	sbp->f_bfree = (int32_t)sfp->sf_bfree;
 	sbp->f_bavail = (int32_t)sfp->sf_bavail;
 	sbp->f_files = 0;
 	sbp->f_ffree = 0;
 }
 
 /*
  * Use the fsinfo stuff to update the mount point.
  *
  * The write, read and readdir transfer sizes of the mount (nm_wsize,
  * nm_rsize, nm_readdirsize) are adjusted from the server's preferred and
  * maximum sizes in *fsp, nm_maxfilesize is lowered to the server's limit,
  * mnt_stat.f_iosize is recomputed and NFSSTA_GOTFSINFO is set in nm_state.
  */
 void
 nfscl_loadfsinfo(struct nfsmount *nmp, struct nfsfsinfo *fsp)
 {
 
 	/*
 	 * Write size: adopt the server's preferred size, rounded up to a
 	 * multiple of NFS_FABLKSIZE, if no size is set yet or the preferred
 	 * size is smaller than the current one.  (The masking assumes
 	 * NFS_FABLKSIZE is a power of 2.)
 	 */
 	if ((nmp->nm_wsize == 0 || fsp->fs_wtpref < nmp->nm_wsize) &&
 	    fsp->fs_wtpref >= NFS_FABLKSIZE)
 		nmp->nm_wsize = (fsp->fs_wtpref + NFS_FABLKSIZE - 1) &
 		    ~(NFS_FABLKSIZE - 1);
 	/*
 	 * Never exceed the server's maximum: round it down to a block
 	 * multiple, or use it as is if rounding down would give 0.
 	 */
 	if (fsp->fs_wtmax < nmp->nm_wsize && fsp->fs_wtmax > 0) {
 		nmp->nm_wsize = fsp->fs_wtmax & ~(NFS_FABLKSIZE - 1);
 		if (nmp->nm_wsize == 0)
 			nmp->nm_wsize = fsp->fs_wtmax;
 	}
 	/* Finally, enforce a floor of one block. */
 	if (nmp->nm_wsize < NFS_FABLKSIZE)
 		nmp->nm_wsize = NFS_FABLKSIZE;
 	/* Read size: same procedure as for the write size. */
 	if ((nmp->nm_rsize == 0 || fsp->fs_rtpref < nmp->nm_rsize) &&
 	    fsp->fs_rtpref >= NFS_FABLKSIZE)
 		nmp->nm_rsize = (fsp->fs_rtpref + NFS_FABLKSIZE - 1) &
 		    ~(NFS_FABLKSIZE - 1);
 	if (fsp->fs_rtmax < nmp->nm_rsize && fsp->fs_rtmax > 0) {
 		nmp->nm_rsize = fsp->fs_rtmax & ~(NFS_FABLKSIZE - 1);
 		if (nmp->nm_rsize == 0)
 			nmp->nm_rsize = fsp->fs_rtmax;
 	}
 	if (nmp->nm_rsize < NFS_FABLKSIZE)
 		nmp->nm_rsize = NFS_FABLKSIZE;
 	/*
 	 * Readdir size: same procedure in units of NFS_DIRBLKSIZ, using
 	 * the preferred readdir size and the maximum read size (fs_rtmax)
 	 * as the upper limit.
 	 */
 	if ((nmp->nm_readdirsize == 0 || fsp->fs_dtpref < nmp->nm_readdirsize)
 	    && fsp->fs_dtpref >= NFS_DIRBLKSIZ)
 		nmp->nm_readdirsize = (fsp->fs_dtpref + NFS_DIRBLKSIZ - 1) &
 		    ~(NFS_DIRBLKSIZ - 1);
 	if (fsp->fs_rtmax < nmp->nm_readdirsize && fsp->fs_rtmax > 0) {
 		nmp->nm_readdirsize = fsp->fs_rtmax & ~(NFS_DIRBLKSIZ - 1);
 		if (nmp->nm_readdirsize == 0)
 			nmp->nm_readdirsize = fsp->fs_rtmax;
 	}
 	if (nmp->nm_readdirsize < NFS_DIRBLKSIZ)
 		nmp->nm_readdirsize = NFS_DIRBLKSIZ;
 	/* Only ever lower the maximum file size, never raise it. */
 	if (fsp->fs_maxfilesize > 0 &&
 	    fsp->fs_maxfilesize < nmp->nm_maxfilesize)
 		nmp->nm_maxfilesize = fsp->fs_maxfilesize;
 	nmp->nm_mountp->mnt_stat.f_iosize = newnfs_iosize(nmp);
 	nmp->nm_state |= NFSSTA_GOTFSINFO;
 }
 
 /*
  * Look up the local source address that should be used to communicate
  * with the server of @nmp and store it in @paddr.
  *
  * On success, *isinet6p is set to 0 (an IPv4 address was stored at the
  * start of @paddr) or to 1 (an IPv6 address was stored in @paddr) and
  * @paddr, cast to u_int8_t *, is returned.
  *
  * Returns NULL if no source address could be selected, if the selected
  * address is a loopback address, or if the address family of the server
  * is not handled by this kernel configuration.
  */
 u_int8_t *
 nfscl_getmyip(struct nfsmount *nmp, struct in6_addr *paddr, int *isinet6p)
 {
 #if defined(INET6) || defined(INET)
 	int fibnum;
 
 	/* Use the routing table (FIB) of the calling process. */
 	fibnum = curthread->td_proc->p_fibnum;
 #endif
 #ifdef INET
 	if (nmp->nm_nam->sa_family == AF_INET) {
 		struct epoch_tracker et;
 		struct nhop_object *nh;
 		struct sockaddr_in *sin;
 		struct in_addr addr = {};
 
 		sin = (struct sockaddr_in *)nmp->nm_nam;
 		/*
 		 * The nexthop is only dereferenced inside the net epoch
 		 * section, so the address is copied out before leaving it.
 		 */
 		NET_EPOCH_ENTER(et);
 		CURVNET_SET(CRED_TO_VNET(nmp->nm_sockreq.nr_cred));
 		nh = fib4_lookup(fibnum, sin->sin_addr, 0, NHR_NONE, 0);
 		CURVNET_RESTORE();
 		if (nh != NULL)
 			addr = IA_SIN(ifatoia(nh->nh_ifa))->sin_addr;
 		NET_EPOCH_EXIT(et);
 		if (nh == NULL)
 			return (NULL);
 
 		if (IN_LOOPBACK(ntohl(addr.s_addr))) {
 			/* Ignore loopback addresses */
 			return (NULL);
 		}
 
 		*isinet6p = 0;
 		/* The IPv4 address occupies the start of the in6_addr buffer. */
 		*((struct in_addr *)paddr) = addr;
 
 		return (u_int8_t *)paddr;
 	}
 #endif
 #ifdef INET6
 	if (nmp->nm_nam->sa_family == AF_INET6) {
 		struct sockaddr_in6 *sin6;
 		int error;
 
 		sin6 = (struct sockaddr_in6 *)nmp->nm_nam;
 
 		CURVNET_SET(CRED_TO_VNET(nmp->nm_sockreq.nr_cred));
 		error = in6_selectsrc_addr(fibnum, &sin6->sin6_addr,
 		    sin6->sin6_scope_id, NULL, paddr, NULL);
 		CURVNET_RESTORE();
 		if (error != 0)
 			return (NULL);
 
 		if (IN6_IS_ADDR_LOOPBACK(paddr))
 			return (NULL);
 
 		/*
 		 * NOTE(review): presumably the scope is embedded in the
 		 * address returned in paddr -- confirm.
 		 */
 		*isinet6p = 1;
 
 		return (u_int8_t *)paddr;
 	}
 #endif
 	return (NULL);
 }
 
 /*
  * Copy the NFS uid and gids from the cred structure into *nfscr.
  * At most NFS_MAXGRPS + 1 groups are copied; nfsc_ngroups records how
  * many were.
  */
 void
 newnfs_copyincred(struct ucred *cr, struct nfscred *nfscr)
 {
 	int idx;
 
 	KASSERT(cr->cr_ngroups >= 0,
 	    ("newnfs_copyincred: negative cr_ngroups"));
 	nfscr->nfsc_uid = cr->cr_uid;
 	if (cr->cr_ngroups < NFS_MAXGRPS + 1)
 		nfscr->nfsc_ngroups = cr->cr_ngroups;
 	else
 		nfscr->nfsc_ngroups = NFS_MAXGRPS + 1;
 	idx = 0;
 	while (idx < nfscr->nfsc_ngroups) {
 		nfscr->nfsc_groups[idx] = cr->cr_groups[idx];
 		idx++;
 	}
 }
 
 /*
  * Do any client specific initialization.
  * Only the first call does anything: it sets nfscl_inited and creates the
  * "nfspbuf" pbuf zone; later calls return immediately.
  */
 void
 nfscl_init(void)
 {
 	static int inited = 0;
 
 	if (inited == 0) {
 		inited = 1;
 		nfscl_inited = 1;
 		ncl_pbuf_zone = pbuf_zsecond_create("nfspbuf", nswbuf / 2);
 	}
 }
 
 /*
  * Check each of the attributes to be set in *vap against the current
  * attributes in *nvap and disable (reset to VNOVAL) the ones that already
  * have the requested value.  Always returns 1.
  */
 int
 nfscl_checksattr(struct vattr *vap, struct nfsvattr *nvap)
 {
 
 	if (vap->va_mode != (mode_t)VNOVAL && vap->va_mode == nvap->na_mode)
 		vap->va_mode = (mode_t)VNOVAL;
 	if (vap->va_uid != (uid_t)VNOVAL && vap->va_uid == nvap->na_uid)
 		vap->va_uid = (uid_t)VNOVAL;
 	if (vap->va_gid != (gid_t)VNOVAL && vap->va_gid == nvap->na_gid)
 		vap->va_gid = (gid_t)VNOVAL;
 	if (vap->va_size != VNOVAL && vap->va_size == nvap->na_size)
 		vap->va_size = VNOVAL;
 
 	/*
 	 * We are normally called with only a partially initialized
 	 * VAP.  The NFSv3 spec allows a server to store the verifier in
 	 * the file attributes, and therefore requires a SETATTR RPC.
 	 * FreeBSD servers keep the verifier in atime, but that can't be
 	 * assumed for all servers, so make sure the SETATTR sets both
 	 * atime and mtime.  VA_UTIMES_NULL is set for this case so that
 	 * the server's time gets used, which works around a bug in some
 	 * Solaris servers where setting the time TOCLIENT makes the
 	 * Setattr RPC return NFS_OK without setting va_mode.
 	 */
 	if (vap->va_mtime.tv_sec == VNOVAL) {
 		vfs_timestamp(&vap->va_mtime);
 		vap->va_vaflags |= VA_UTIMES_NULL;
 	}
 	if (vap->va_atime.tv_sec == VNOVAL)
 		vap->va_atime = vap->va_mtime;
 
 	return (1);
 }
 
 /*
  * Translate an NFSv4 error into an errno.h value.
  * Only values in the range [10000, NFSERR_STALEWRITEVERF) are translated;
  * anything else is returned unchanged.
  * uid and gid are used solely for the NFSERR_BADOWNER message, which
  * should only come back from the Open, Create and Setattr Ops, so most
  * callers can simply pass 0 for both.
  */
 int
 nfscl_maperr(struct thread *td, int error, uid_t uid, gid_t gid)
 {
 	struct proc *p;
 	int mapped;
 
 	if (error < 10000 || error >= NFSERR_STALEWRITEVERF)
 		return (error);
 
 	/* tprintf() is given the thread's process, or NULL without one. */
 	p = (td != NULL) ? td->td_proc : NULL;
 
 	/* Everything that is not singled out below ends up as EIO. */
 	mapped = EIO;
 	switch (error) {
 	case NFSERR_BADOWNER:
 		tprintf(p, LOG_INFO,
 		    "No name and/or group mapping for uid,gid:(%d,%d)\n",
 		    uid, gid);
 		mapped = EPERM;
 		break;
 	case NFSERR_BADNAME:
 	case NFSERR_BADCHAR:
 		printf("nfsv4 char/name not handled by server\n");
 		mapped = ENOENT;
 		break;
 	case NFSERR_STALECLIENTID:
 	case NFSERR_STALESTATEID:
 	case NFSERR_EXPIRED:
 	case NFSERR_BADSTATEID:
 	case NFSERR_BADSESSION:
 		printf("nfsv4 recover err returned %d\n", error);
 		break;
 	case NFSERR_BADHANDLE:
 	case NFSERR_SERVERFAULT:
 	case NFSERR_BADTYPE:
 	case NFSERR_FHEXPIRED:
 	case NFSERR_RESOURCE:
 	case NFSERR_MOVED:
 	case NFSERR_NOFILEHANDLE:
 	case NFSERR_MINORVERMISMATCH:
 	case NFSERR_OLDSTATEID:
 	case NFSERR_BADSEQID:
 	case NFSERR_LEASEMOVED:
 	case NFSERR_RECLAIMBAD:
 	case NFSERR_BADXDR:
 	case NFSERR_OPILLEGAL:
 		printf("nfsv4 client/server protocol prob err=%d\n",
 		    error);
 		break;
 	default:
 		tprintf(p, LOG_INFO, "nfsv4 err=%d\n", error);
 		break;
 	}
 	return (mapped);
 }
 
 /*
  * Decide whether the process that an owner string refers to is gone.
  * The owner is read as three 4-byte words: pid, then the seconds and the
  * microseconds of the process start time.
  * Returns 1 when the process does not exist (or its start time differs)
  * and 0 otherwise.
  */
 int
 nfscl_procdoesntexist(u_int8_t *own)
 {
 	union {
 		u_int32_t	lval;
 		u_int8_t	cval[4];
 	} word;
 	struct proc *procp;
 	int idx, gone;
 
 	/* The single open_owner made of all 0 bytes always "exists". */
 	idx = 0;
 	while (idx < NFSV4CL_LOCKNAMELEN && own[idx] == 0)
 		idx++;
 	if (idx == NFSV4CL_LOCKNAMELEN)
 		return (0);
 
 	/* Bytes 0-3: the pid; look the process up (returned locked). */
 	for (idx = 0; idx < 4; idx++)
 		word.cval[idx] = own[idx];
 	procp = pfind_any_locked((pid_t)word.lval);
 	if (procp == NULL)
 		return (1);
 	if (procp->p_stats == NULL) {
 		/* No start time to compare against; treat it as existing. */
 		PROC_UNLOCK(procp);
 		return (0);
 	}
 
 	/* Bytes 4-7: start time seconds; bytes 8-11: microseconds. */
 	gone = 0;
 	for (idx = 0; idx < 4; idx++)
 		word.cval[idx] = own[4 + idx];
 	if (word.lval != procp->p_stats->p_start.tv_sec)
 		gone = 1;
 	else {
 		for (idx = 0; idx < 4; idx++)
 			word.cval[idx] = own[8 + idx];
 		if (word.lval != procp->p_stats->p_start.tv_usec)
 			gone = 1;
 	}
 	PROC_UNLOCK(procp);
 	return (gone);
 }
 
 /*
  * nfs pseudo system call for the client (MPSAFE).
  *
  * Dispatches on the first matching bit in uap->flag:
  *  NFSSVC_CBADDSOCK   - copy in a struct nfscbd_args and hand its socket
  *                       to nfscbd_addsock(); on the first success the
  *                       callback port is recorded and callbacks enabled.
  *  NFSSVC_NFSCBD      - copy in a struct nfsd_nfscbd_args and run
  *                       nfscbd_nfsd().
  *  NFSSVC_DUMPMNTOPTS - look up the named file, check it lives on an "nfs"
  *                       mount and copy the mount options text out to the
  *                       user buffer (length must be 256..1024 bytes).
  *  NFSSVC_FORCEDISM   - find the "nfs" mount with the given mounted-on
  *                       name, mark it for forced dismount and cancel its
  *                       in-progress RPCs.
  * Any other flag yields EINVAL.  Returns 0 or an errno value.
  */
 static int
 nfssvc_nfscl(struct thread *td, struct nfssvc_args *uap)
 {
 	struct file *fp;
 	struct nfscbd_args nfscbdarg;
 	struct nfsd_nfscbd_args nfscbdarg2;
 	struct nameidata nd;
 	struct nfscl_dumpmntopts dumpmntopts;
 	cap_rights_t rights;
 	char *buf;
 	int error;
 	struct mount *mp;
 	struct nfsmount *nmp;
 
 	if (uap->flag & NFSSVC_CBADDSOCK) {
 		error = copyin(uap->argp, (caddr_t)&nfscbdarg, sizeof(nfscbdarg));
 		if (error)
 			return (error);
 		/*
 		 * Since we don't know what rights might be required,
 		 * pretend that we need them all. It is better to be too
 		 * careful than too reckless.
 		 */
 		error = fget(td, nfscbdarg.sock,
-		    cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp);
+		    cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp);
 		if (error)
 			return (error);
 		/* Only sockets may be handed to the callback daemon. */
 		if (fp->f_type != DTYPE_SOCKET) {
 			fdrop(fp, td);
 			return (EPERM);
 		}
 		error = nfscbd_addsock(fp);
 		fdrop(fp, td);
 		if (!error && nfscl_enablecallb == 0) {
 			nfsv4_cbport = nfscbdarg.port;
 			nfscl_enablecallb = 1;
 		}
 	} else if (uap->flag & NFSSVC_NFSCBD) {
 		if (uap->argp == NULL)
 			return (EINVAL);
 		error = copyin(uap->argp, (caddr_t)&nfscbdarg2,
 		    sizeof(nfscbdarg2));
 		if (error)
 			return (error);
 		error = nfscbd_nfsd(td, &nfscbdarg2);
 	} else if (uap->flag & NFSSVC_DUMPMNTOPTS) {
 		error = copyin(uap->argp, &dumpmntopts, sizeof(dumpmntopts));
 		if (error == 0 && (dumpmntopts.ndmnt_blen < 256 ||
 		    dumpmntopts.ndmnt_blen > 1024))
 			error = EINVAL;
 		if (error == 0)
 			error = nfsrv_lookupfilename(&nd,
 			    dumpmntopts.ndmnt_fname, td);
 		if (error == 0 && strcmp(nd.ni_vp->v_mount->mnt_vfc->vfc_name,
 		    "nfs") != 0) {
 			vput(nd.ni_vp);
 			error = EINVAL;
 		}
 		if (error == 0) {
 			/*
 			 * The whole buffer is copied out below, so it must
 			 * be zeroed: any bytes that nfscl_retopts() leaves
 			 * untouched would otherwise expose stale kernel
 			 * heap contents to userland.
 			 */
 			buf = malloc(dumpmntopts.ndmnt_blen, M_TEMP,
 			    M_WAITOK | M_ZERO);
 			nfscl_retopts(VFSTONFS(nd.ni_vp->v_mount), buf,
 			    dumpmntopts.ndmnt_blen);
 			vput(nd.ni_vp);
 			error = copyout(buf, dumpmntopts.ndmnt_buf,
 			    dumpmntopts.ndmnt_blen);
 			free(buf, M_TEMP);
 		}
 	} else if (uap->flag & NFSSVC_FORCEDISM) {
 		buf = malloc(MNAMELEN + 1, M_TEMP, M_WAITOK);
 		error = copyinstr(uap->argp, buf, MNAMELEN + 1, NULL);
 		if (error == 0) {
 			nmp = NULL;
 			/* Search the mount list for the matching nfs mount. */
 			mtx_lock(&mountlist_mtx);
 			TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 				if (strcmp(mp->mnt_stat.f_mntonname, buf) ==
 				    0 && strcmp(mp->mnt_stat.f_fstypename,
 				    "nfs") == 0 && mp->mnt_data != NULL) {
 					nmp = VFSTONFS(mp);
 					NFSDDSLOCK();
 					/* Refuse when nfsv4_findmirror() knows it. */
 					if (nfsv4_findmirror(nmp) != NULL) {
 						NFSDDSUNLOCK();
 						error = ENXIO;
 						nmp = NULL;
 						break;
 					}
 					mtx_lock(&nmp->nm_mtx);
 					if ((nmp->nm_privflag &
 					    NFSMNTP_FORCEDISM) == 0) {
 						nmp->nm_privflag |=
 						   (NFSMNTP_FORCEDISM |
 						    NFSMNTP_CANCELRPCS);
 						mtx_unlock(&nmp->nm_mtx);
 					} else {
 						/* Already being force dismounted. */
 						mtx_unlock(&nmp->nm_mtx);
 						nmp = NULL;
 					}
 					NFSDDSUNLOCK();
 					break;
 				}
 			}
 			mtx_unlock(&mountlist_mtx);
 
 			if (nmp != NULL) {
 				/*
 				 * Call newnfs_nmcancelreqs() to cause
 				 * any RPCs in progress on the mount point to
 				 * fail.
 				 * This will cause any process waiting for an
 				 * RPC to complete while holding a vnode lock
 				 * on the mounted-on vnode (such as "df" or
 				 * a non-forced "umount") to fail.
 				 * This will unlock the mounted-on vnode so
 				 * a forced dismount can succeed.
 				 * Then clear NFSMNTP_CANCELRPCS and wakeup(),
 				 * so that nfs_unmount() can complete.
 				 */
 				newnfs_nmcancelreqs(nmp);
 				mtx_lock(&nmp->nm_mtx);
 				nmp->nm_privflag &= ~NFSMNTP_CANCELRPCS;
 				wakeup(nmp);
 				mtx_unlock(&nmp->nm_mtx);
 			} else if (error == 0)
 				error = EINVAL;
 		}
 		free(buf, M_TEMP);
 	} else {
 		error = EINVAL;
 	}
 	return (error);
 }
 
 extern int (*nfsd_call_nfscl)(struct thread *, struct nfssvc_args *);
 
 /*
  * Module event handler for the nfscl module.
  *
  * MOD_LOAD:   performs the one-time setup (port init, ncl_iod_mutex,
  *             nfscl_init(), callback init) and installs the
  *             ncl_call_invalcaches and nfsd_call_nfscl hooks.  A repeated
  *             load is a no-op that returns 0.
  * MOD_UNLOAD: returns EBUSY while nfs_numnfscbd is non-zero; otherwise
  *             falls through to the default case, because the teardown code
  *             is compiled out (#if 0).
  * default:    returns EOPNOTSUPP.
  */
 static int
 nfscl_modevent(module_t mod, int type, void *data)
 {
 	int error = 0;
 	static int loaded = 0;	/* set once MOD_LOAD has completed */
 
 	switch (type) {
 	case MOD_LOAD:
 		if (loaded)
 			return (0);
 		newnfs_portinit();
 		mtx_init(&ncl_iod_mutex, "ncl_iod_mutex", NULL, MTX_DEF);
 		nfscl_init();
 		NFSD_LOCK();
 		nfsrvd_cbinit(0);
 		NFSD_UNLOCK();
 		/* Publish the client entry points through the hook pointers. */
 		ncl_call_invalcaches = ncl_invalcaches;
 		nfsd_call_nfscl = nfssvc_nfscl;
 		loaded = 1;
 		break;
 
 	case MOD_UNLOAD:
 		if (nfs_numnfscbd != 0) {
 			error = EBUSY;
 			break;
 		}
 
 		/*
 		 * XXX: Unloading of nfscl module is unsupported.
 		 */
 #if 0
 		ncl_call_invalcaches = NULL;
 		nfsd_call_nfscl = NULL;
 		uma_zdestroy(ncl_pbuf_zone);
 		/* and get rid of the mutexes */
 		mtx_destroy(&ncl_iod_mutex);
 		loaded = 0;
 		break;
 #else
 		/* FALLTHROUGH */
 #endif
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 	return error;
 }
 /*
  * Module glue: register "nfscl" with nfscl_modevent() as its event handler
  * (no private data), declared at SI_SUB_VFS / SI_ORDER_FIRST.
  */
 static moduledata_t nfscl_mod = {
 	"nfscl",
 	nfscl_modevent,
 	NULL,
 };
 DECLARE_MODULE(nfscl, nfscl_mod, SI_SUB_VFS, SI_ORDER_FIRST);
 
 /* So that loader and kldload(2) can find us, wherever we are.. */
 MODULE_VERSION(nfscl, 1);
 /* nfscl depends on version 1 of nfscommon, krpc and nfssvc. */
 MODULE_DEPEND(nfscl, nfscommon, 1, 1, 1);
 MODULE_DEPEND(nfscl, krpc, 1, 1, 1);
 MODULE_DEPEND(nfscl, nfssvc, 1, 1, 1);
diff --git a/sys/fs/nfsserver/nfs_nfsdport.c b/sys/fs/nfsserver/nfs_nfsdport.c
index 8336a0b8fab2..b781503a6815 100644
--- a/sys/fs/nfsserver/nfs_nfsdport.c
+++ b/sys/fs/nfsserver/nfs_nfsdport.c
@@ -1,6723 +1,6723 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/capsicum.h>
 #include <sys/extattr.h>
 
 /*
  * Functions that perform the vfs operations required by the routines in
  * nfsd_serv.c. It is hoped that this change will make the server more
  * portable.
  */
 
 #include <fs/nfs/nfsport.h>
 #include <security/mac/mac_framework.h>
 #include <sys/filio.h>
 #include <sys/hash.h>
 #include <sys/sysctl.h>
 #include <nlm/nlm_prot.h>
 #include <nlm/nlm.h>
 
 FEATURE(nfsd, "NFSv4 server");
 
 extern u_int32_t newnfs_true, newnfs_false, newnfs_xdrneg1;
 extern int nfsrv_useacl;
 extern int newnfs_numnfsd;
 extern struct mount nfsv4root_mnt;
 extern struct nfsrv_stablefirst nfsrv_stablefirst;
 extern void (*nfsd_call_servertimer)(void);
 extern SVCPOOL	*nfsrvd_pool;
 extern struct nfsv4lock nfsd_suspend_lock;
 extern struct nfsclienthashhead *nfsclienthash;
 extern struct nfslockhashhead *nfslockhash;
 extern struct nfssessionhash *nfssessionhash;
 extern int nfsrv_sessionhashsize;
 extern struct nfsstatsv1 nfsstatsv1;
 extern struct nfslayouthash *nfslayouthash;
 extern int nfsrv_layouthashsize;
 extern struct mtx nfsrv_dslock_mtx;
 extern int nfs_pnfsiothreads;
 extern struct nfsdontlisthead nfsrv_dontlisthead;
 extern volatile int nfsrv_dontlistlen;
 extern volatile int nfsrv_devidcnt;
 extern int nfsrv_maxpnfsmirror;
 struct vfsoptlist nfsv4root_opt, nfsv4root_newopt;
 NFSDLOCKMUTEX;
 NFSSTATESPINLOCK;
 struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
 struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE];
 struct mtx nfsrc_udpmtx;
 struct mtx nfs_v4root_mutex;
 struct mtx nfsrv_dontlistlock_mtx;
 struct mtx nfsrv_recalllock_mtx;
 struct nfsrvfh nfs_rootfh, nfs_pubfh;
 int nfs_pubfhset = 0, nfs_rootfhset = 0;
 struct proc *nfsd_master_proc = NULL;
 int nfsd_debuglevel = 0;
 static pid_t nfsd_master_pid = (pid_t)-1;
 static char nfsd_master_comm[MAXCOMLEN + 1];
 static struct timeval nfsd_master_start;
 static uint32_t nfsv4_sysid = 0;
 static fhandle_t zerofh;
 
 static int nfssvc_srvcall(struct thread *, struct nfssvc_args *,
     struct ucred *);
 
 int nfsrv_enable_crossmntpt = 1;
 static int nfs_commit_blks;
 static int nfs_commit_miss;
 extern int nfsrv_issuedelegs;
 extern int nfsrv_dolocallocks;
 extern int nfsd_enable_stringtouid;
 extern struct nfsdevicehead nfsrv_devidhead;
 
 static int nfsrv_createiovec(int, struct mbuf **, struct mbuf **,
     struct iovec **);
 static int nfsrv_createiovec_extpgs(int, int, struct mbuf **,
     struct mbuf **, struct iovec **);
 static int nfsrv_createiovecw(int, struct mbuf *, char *, struct iovec **,
     int *);
 static void nfsrv_pnfscreate(struct vnode *, struct vattr *, struct ucred *,
     NFSPROC_T *);
 static void nfsrv_pnfsremovesetup(struct vnode *, NFSPROC_T *, struct vnode **,
     int *, char *, fhandle_t *);
 static void nfsrv_pnfsremove(struct vnode **, int, char *, fhandle_t *,
     NFSPROC_T *);
 static int nfsrv_proxyds(struct vnode *, off_t, int, struct ucred *,
     struct thread *, int, struct mbuf **, char *, struct mbuf **,
     struct nfsvattr *, struct acl *, off_t *, int, bool *);
 static int nfsrv_setextattr(struct vnode *, struct nfsvattr *, NFSPROC_T *);
 static int nfsrv_readdsrpc(fhandle_t *, off_t, int, struct ucred *,
     NFSPROC_T *, struct nfsmount *, struct mbuf **, struct mbuf **);
 static int nfsrv_writedsrpc(fhandle_t *, off_t, int, struct ucred *,
     NFSPROC_T *, struct vnode *, struct nfsmount **, int, struct mbuf **,
     char *, int *);
 static int nfsrv_allocatedsrpc(fhandle_t *, off_t, off_t, struct ucred *,
     NFSPROC_T *, struct vnode *, struct nfsmount **, int, int *);
 static int nfsrv_setacldsrpc(fhandle_t *, struct ucred *, NFSPROC_T *,
     struct vnode *, struct nfsmount **, int, struct acl *, int *);
 static int nfsrv_setattrdsrpc(fhandle_t *, struct ucred *, NFSPROC_T *,
     struct vnode *, struct nfsmount **, int, struct nfsvattr *, int *);
 static int nfsrv_getattrdsrpc(fhandle_t *, struct ucred *, NFSPROC_T *,
     struct vnode *, struct nfsmount *, struct nfsvattr *);
 static int nfsrv_seekdsrpc(fhandle_t *, off_t *, int, bool *, struct ucred *,
     NFSPROC_T *, struct nfsmount *);
 static int nfsrv_putfhname(fhandle_t *, char *);
 static int nfsrv_pnfslookupds(struct vnode *, struct vnode *,
     struct pnfsdsfile *, struct vnode **, NFSPROC_T *);
 static void nfsrv_pnfssetfh(struct vnode *, struct pnfsdsfile *, char *, char *,
     struct vnode *, NFSPROC_T *);
 static int nfsrv_dsremove(struct vnode *, char *, struct ucred *, NFSPROC_T *);
 static int nfsrv_dssetacl(struct vnode *, struct acl *, struct ucred *,
     NFSPROC_T *);
 static int nfsrv_pnfsstatfs(struct statfs *, struct mount *);
 
 int nfs_pnfsio(task_fn_t *, void *);
 
 SYSCTL_NODE(_vfs, OID_AUTO, nfsd, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "NFS server");
 SYSCTL_INT(_vfs_nfsd, OID_AUTO, mirrormnt, CTLFLAG_RW,
     &nfsrv_enable_crossmntpt, 0, "Enable nfsd to cross mount points");
 SYSCTL_INT(_vfs_nfsd, OID_AUTO, commit_blks, CTLFLAG_RW, &nfs_commit_blks,
     0, "");
 SYSCTL_INT(_vfs_nfsd, OID_AUTO, commit_miss, CTLFLAG_RW, &nfs_commit_miss,
     0, "");
 SYSCTL_INT(_vfs_nfsd, OID_AUTO, issue_delegations, CTLFLAG_RW,
     &nfsrv_issuedelegs, 0, "Enable nfsd to issue delegations");
 SYSCTL_INT(_vfs_nfsd, OID_AUTO, enable_locallocks, CTLFLAG_RW,
     &nfsrv_dolocallocks, 0, "Enable nfsd to acquire local locks on files");
 SYSCTL_INT(_vfs_nfsd, OID_AUTO, debuglevel, CTLFLAG_RW, &nfsd_debuglevel,
     0, "Debug level for NFS server");
 SYSCTL_INT(_vfs_nfsd, OID_AUTO, enable_stringtouid, CTLFLAG_RW,
     &nfsd_enable_stringtouid, 0, "Enable nfsd to accept numeric owner_names");
 static int nfsrv_pnfsgetdsattr = 1;
 SYSCTL_INT(_vfs_nfsd, OID_AUTO, pnfsgetdsattr, CTLFLAG_RW,
     &nfsrv_pnfsgetdsattr, 0, "When set getattr gets DS attributes via RPC");
 
 /*
  * nfsrv_dsdirsize can only be increased and only when the nfsd threads are
  * not running.
  * The dsN subdirectories for the increased values must have been created
  * on all DS servers before this increase is done.
  */
 u_int	nfsrv_dsdirsize = 20;
 static int
 sysctl_dsdirsize(SYSCTL_HANDLER_ARGS)
 {
 	int error, newdsdirsize;
 
 	newdsdirsize = nfsrv_dsdirsize;
 	error = sysctl_handle_int(oidp, &newdsdirsize, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	if (newdsdirsize <= nfsrv_dsdirsize || newdsdirsize > 10000 ||
 	    newnfs_numnfsd != 0)
 		return (EINVAL);
 	nfsrv_dsdirsize = newdsdirsize;
 	return (0);
 }
 SYSCTL_PROC(_vfs_nfsd, OID_AUTO, dsdirsize,
     CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(nfsrv_dsdirsize),
     sysctl_dsdirsize, "IU", "Number of dsN subdirs on the DS servers");
 
 #define	MAX_REORDERED_RPC	16
 #define	NUM_HEURISTIC		1031
 #define	NHUSE_INIT		64
 #define	NHUSE_INC		16
 #define	NHUSE_MAX		2048
 
 /*
  * Table of per-vnode sequential-access hints, NUM_HEURISTIC entries,
  * consulted and updated by nfsrv_sequential_heuristic().
  */
 static struct nfsheur {
 	struct vnode *nh_vp;	/* vp to match (unreferenced pointer) */
 	off_t nh_nextoff;	/* next offset for sequential detection */
 	int nh_use;		/* use count for selection */
 	int nh_seqcount;	/* heuristic */
 } nfsheur[NUM_HEURISTIC];
 
 /*
  * Heuristic to detect sequential operation.
  *
  * Finds (or recycles) the nfsheur[] entry for vp and updates its
  * nh_seqcount from the offset and size of the I/O described by uio.
  * Returns a pointer to that entry.  nh_nextoff is initialized here only
  * when an entry is (re)assigned; it is not otherwise updated by this
  * function.
  */
 static struct nfsheur *
 nfsrv_sequential_heuristic(struct uio *uio, struct vnode *vp)
 {
 	struct nfsheur *nh;
 	int hi, try;
 
 	/* Locate best candidate. */
 	try = 32;
 	/* Starting slot is derived from the vnode's address. */
 	hi = ((int)(vm_offset_t)vp / sizeof(struct vnode)) % NUM_HEURISTIC;
 	nh = &nfsheur[hi];
 	/*
 	 * Probe up to 32 consecutive slots (wrapping).  Stop on an exact
 	 * match; otherwise age each probed slot and remember the one with
 	 * the lowest use count as the replacement victim.
 	 */
 	while (try--) {
 		if (nfsheur[hi].nh_vp == vp) {
 			nh = &nfsheur[hi];
 			break;
 		}
 		if (nfsheur[hi].nh_use > 0)
 			--nfsheur[hi].nh_use;
 		hi = (hi + 1) % NUM_HEURISTIC;
 		if (nfsheur[hi].nh_use < nh->nh_use)
 			nh = &nfsheur[hi];
 	}
 
 	/* Initialize hint if this is a new file. */
 	if (nh->nh_vp != vp) {
 		nh->nh_vp = vp;
 		nh->nh_nextoff = uio->uio_offset;
 		nh->nh_use = NHUSE_INIT;
 		/* A file first seen at offset 0 starts with a higher count. */
 		if (uio->uio_offset == 0)
 			nh->nh_seqcount = 4;
 		else
 			nh->nh_seqcount = 1;
 	}
 
 	/* Calculate heuristic. */
 	if ((uio->uio_offset == 0 && nh->nh_seqcount > 0) ||
 	    uio->uio_offset == nh->nh_nextoff) {
 		/* See comments in vfs_vnops.c:sequential_heuristic(). */
 		nh->nh_seqcount += howmany(uio->uio_resid, 16384);
 		if (nh->nh_seqcount > IO_SEQMAX)
 			nh->nh_seqcount = IO_SEQMAX;
 	} else if (qabs(uio->uio_offset - nh->nh_nextoff) <= MAX_REORDERED_RPC *
 	    imax(vp->v_mount->mnt_stat.f_iosize, uio->uio_resid)) {
 		/* Probably a reordered RPC, leave seqcount alone. */
 	} else if (nh->nh_seqcount > 1) {
 		/* Non-sequential access: decay the count by half. */
 		nh->nh_seqcount /= 2;
 	} else {
 		nh->nh_seqcount = 0;
 	}
 	/* Credit the entry for this use, capped at NHUSE_MAX. */
 	nh->nh_use += NHUSE_INC;
 	if (nh->nh_use > NHUSE_MAX)
 		nh->nh_use = NHUSE_MAX;
 	return (nh);
 }
 
 /*
  * Get attributes into nfsvattr structure.
  *
  * vp         - vnode to query.
  * nvap       - filled in by VOP_GETATTR(); on a pNFS server the atime,
  *              mtime, change (filerev), size and bytes fields are then
  *              replaced by the values obtained via nfsrv_proxyds().
  * nd         - request descriptor; supplies the credential and nd_flag.
  * p          - calling thread, passed to nfsrv_proxyds().
  * vpislocked - when 0 the vnode is share-locked here unless this thread
  *              already holds it exclusively, and unlocked again afterwards.
  * attrbitp   - requested NFSv4 attribute bits, or NULL.
  *
  * Returns the result of VOP_GETATTR(); a failure of nfsrv_proxyds() is
  * not reported to the caller.
  */
 int
 nfsvno_getattr(struct vnode *vp, struct nfsvattr *nvap,
     struct nfsrv_descript *nd, struct thread *p, int vpislocked,
     nfsattrbit_t *attrbitp)
 {
 	int error, gotattr, lockedit = 0;
 	struct nfsvattr na;
 
 	if (vpislocked == 0) {
 		/*
 		 * When vpislocked == 0, the vnode is either exclusively
 		 * locked by this thread or not locked by this thread.
 		 * As such, shared lock it, if not exclusively locked.
 		 */
 		if (NFSVOPISLOCKED(vp) != LK_EXCLUSIVE) {
 			lockedit = 1;
 			NFSVOPLOCK(vp, LK_SHARED | LK_RETRY);
 		}
 	}
 
 	/*
 	 * Acquire the Change, Size, TimeAccess, TimeModify and SpaceUsed
 	 * attributes, as required.
 	 * This needs to be done for regular files if:
 	 * - non-NFSv4 RPCs or
 	 * - when attrbitp == NULL or
 	 * - an NFSv4 RPC with any of the above attributes in attrbitp.
 	 * A return of 0 for nfsrv_proxyds() indicates that it has acquired
 	 * these attributes.  nfsrv_proxyds() will return an error if the
 	 * server is not a pNFS one.
 	 */
 	gotattr = 0;
 	if (vp->v_type == VREG && nfsrv_devidcnt > 0 && (attrbitp == NULL ||
 	    (nd->nd_flag & ND_NFSV4) == 0 ||
 	    NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_CHANGE) ||
 	    NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_SIZE) ||
 	    NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_TIMEACCESS) ||
 	    NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_TIMEMODIFY) ||
 	    NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_SPACEUSED))) {
 		error = nfsrv_proxyds(vp, 0, 0, nd->nd_cred, p,
 		    NFSPROC_GETATTR, NULL, NULL, NULL, &na, NULL, NULL, 0,
 		    NULL);
 		if (error == 0)
 			gotattr = 1;
 	}
 
 	error = VOP_GETATTR(vp, &nvap->na_vattr, nd->nd_cred);
 	if (lockedit != 0)
 		NFSVOPUNLOCK(vp);
 
 	/*
 	 * If we got the Change, Size and Modify Time from the DS,
 	 * replace them.
 	 */
 	if (gotattr != 0) {
 		nvap->na_atime = na.na_atime;
 		nvap->na_mtime = na.na_mtime;
 		nvap->na_filerev = na.na_filerev;
 		nvap->na_size = na.na_size;
 		nvap->na_bytes = na.na_bytes;
 	}
 	/*
 	 * "na" is only filled in when nfsrv_proxyds() succeeded, so do not
 	 * read it otherwise; report 0 as the change value in that case
 	 * instead of an indeterminate stack value.
 	 */
 	NFSD_DEBUG(4, "nfsvno_getattr: gotattr=%d err=%d chg=%ju\n", gotattr,
 	    error, gotattr != 0 ? (uintmax_t)na.na_filerev : (uintmax_t)0);
 
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Build the file handle for a vnode: the handle is cleared, stamped with
  * the fsid of the vnode's mount and then completed by VOP_VPTOFH().
  * Returns the VOP_VPTOFH() result.
  */
 int
 nfsvno_getfh(struct vnode *vp, fhandle_t *fhp, struct thread *p)
 {
 	int ret;
 
 	/* Start from an all-zero handle. */
 	NFSBZERO((caddr_t)fhp, sizeof(*fhp));
 	fhp->fh_fsid = vp->v_mount->mnt_stat.f_fsid;
 	ret = VOP_VPTOFH(vp, &fhp->fh_fid);
 
 	NFSEXITCODE(ret);
 	return (ret);
 }
 
 /*
  * Perform access checking for vnodes obtained from file handles that would
  * refer to files already opened by a Unix client. You cannot just use
  * vn_writechk() and VOP_ACCESSX() for two reasons.
  * 1 - You must check for exported rdonly as well as MNT_RDONLY for the write
  *     case.
  * 2 - The owner is to be given access irrespective of mode bits for some
  *     operations, so that processes that chmod after opening a file don't
  *     break.
  *
  * vp             - vnode to check.
  * accmode        - requested access bits.
  * cred           - credential to check against.
  * exp            - export information (consulted for read-only exports).
  * p              - calling thread.
  * override       - NFSACCCHK_NOOVERRIDE, or a mask of NFSACCCHK_ALLOWROOT /
  *                  NFSACCCHK_ALLOWOWNER that can turn EPERM/EACCES into
  *                  success.
  * vpislocked     - when 0 the vnode is share-locked here for the duration
  *                  of the check and unlocked before returning.
  * supportedtypep - optional; NFSACCESS_DELETE is cleared in it for
  *                  non-directories when only the VEXPLICIT_DENY retry of
  *                  a delete check succeeds.
  *
  * Returns 0 if access is allowed, otherwise an errno value.
  */
 int
 nfsvno_accchk(struct vnode *vp, accmode_t accmode, struct ucred *cred,
     struct nfsexstuff *exp, struct thread *p, int override, int vpislocked,
     u_int32_t *supportedtypep)
 {
 	struct vattr vattr;
 	int error = 0, getret = 0;
 
 	if (vpislocked == 0) {
 		/* Failing to get the lock is reported as EPERM. */
 		if (NFSVOPLOCK(vp, LK_SHARED) != 0) {
 			error = EPERM;
 			goto out;
 		}
 	}
 	if (accmode & VWRITE) {
 		/* Just vn_writechk() changed to check rdonly */
 		/*
 		 * Disallow write attempts on read-only file systems;
 		 * unless the file is a socket or a block or character
 		 * device resident on the file system.
 		 */
 		if (NFSVNO_EXRDONLY(exp) ||
 		    (vp->v_mount->mnt_flag & MNT_RDONLY)) {
 			switch (vp->v_type) {
 			case VREG:
 			case VDIR:
 			case VLNK:
 				error = EROFS;
 				/* Falls into default, which only breaks. */
 			default:
 				break;
 			}
 		}
 		/*
 		 * If there's shared text associated with
 		 * the inode, try to free it up once.  If
 		 * we fail, we can't allow writing.
 		 */
 		if (VOP_IS_TEXT(vp) && error == 0)
 			error = ETXTBSY;
 	}
 	if (error != 0) {
 		if (vpislocked == 0)
 			NFSVOPUNLOCK(vp);
 		goto out;
 	}
 
 	/*
 	 * Should the override still be applied when ACLs are enabled?
 	 */
 	error = VOP_ACCESSX(vp, accmode, cred, p);
 	if (error != 0 && (accmode & (VDELETE | VDELETE_CHILD))) {
 		/*
 		 * Try again with VEXPLICIT_DENY, to see if the test for
 		 * deletion is supported.
 		 */
 		error = VOP_ACCESSX(vp, accmode | VEXPLICIT_DENY, cred, p);
 		if (error == 0) {
 			if (vp->v_type == VDIR) {
 				/* For directories, retry as a plain write check. */
 				accmode &= ~(VDELETE | VDELETE_CHILD);
 				accmode |= VWRITE;
 				error = VOP_ACCESSX(vp, accmode, cred, p);
 			} else if (supportedtypep != NULL) {
 				*supportedtypep &= ~NFSACCESS_DELETE;
 			}
 		}
 	}
 
 	/*
 	 * Allow certain operations for the owner (reads and writes
 	 * on files that are already open).
 	 */
 	if (override != NFSACCCHK_NOOVERRIDE &&
 	    (error == EPERM || error == EACCES)) {
 		if (cred->cr_uid == 0 && (override & NFSACCCHK_ALLOWROOT))
 			error = 0;
 		else if (override & NFSACCCHK_ALLOWOWNER) {
 			/* Owner override applies only if the uid matches. */
 			getret = VOP_GETATTR(vp, &vattr, cred);
 			if (getret == 0 && cred->cr_uid == vattr.va_uid)
 				error = 0;
 		}
 	}
 	if (vpislocked == 0)
 		NFSVOPUNLOCK(vp);
 
 out:
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Set attribute(s) vnop.
  *
  * Applies nvap->na_vattr to vp with VOP_SETATTR() and then, when any of
  * uid, gid, size, mode, atime or mtime is being set, forwards the change
  * via nfsrv_proxyds(NFSPROC_SETATTR).  An ENOENT from nfsrv_proxyds() is
  * treated as success.  exp is not used in this function.
  * Returns 0 or an errno value.
  */
 int
 nfsvno_setattr(struct vnode *vp, struct nfsvattr *nvap, struct ucred *cred,
     struct thread *p, struct nfsexstuff *exp)
 {
 	u_quad_t savsize = 0;
 	int error, savedit;
 	time_t savbtime;
 
 	/*
 	 * If this is an exported file system and a pNFS service is running,
 	 * don't VOP_SETATTR() of size for the MDS file system.
 	 */
 	/*
 	 * savedit: 0 = size left in place, 1 = size withheld but other
 	 * attributes remain to be set locally, 2 = size was the only
 	 * attribute, so the local VOP_SETATTR() is skipped entirely.
 	 */
 	savedit = 0;
 	error = 0;
 	if (vp->v_type == VREG && (vp->v_mount->mnt_flag & MNT_EXPORTED) != 0 &&
 	    nfsrv_devidcnt != 0 && nvap->na_vattr.va_size != VNOVAL &&
 	    nvap->na_vattr.va_size > 0) {
 		savsize = nvap->na_vattr.va_size;
 		nvap->na_vattr.va_size = VNOVAL;
 		if (nvap->na_vattr.va_uid != (uid_t)VNOVAL ||
 		    nvap->na_vattr.va_gid != (gid_t)VNOVAL ||
 		    nvap->na_vattr.va_mode != (mode_t)VNOVAL ||
 		    nvap->na_vattr.va_atime.tv_sec != VNOVAL ||
 		    nvap->na_vattr.va_mtime.tv_sec != VNOVAL)
 			savedit = 1;
 		else
 			savedit = 2;
 	}
 	if (savedit != 2)
 		error = VOP_SETATTR(vp, &nvap->na_vattr, cred);
 	/* Put the withheld size back so it is forwarded below. */
 	if (savedit != 0)
 		nvap->na_vattr.va_size = savsize;
 	if (error == 0 && (nvap->na_vattr.va_uid != (uid_t)VNOVAL ||
 	    nvap->na_vattr.va_gid != (gid_t)VNOVAL ||
 	    nvap->na_vattr.va_size != VNOVAL ||
 	    nvap->na_vattr.va_mode != (mode_t)VNOVAL ||
 	    nvap->na_vattr.va_atime.tv_sec != VNOVAL ||
 	    nvap->na_vattr.va_mtime.tv_sec != VNOVAL)) {
 		/* Never modify birthtime on a DS file. */
 		savbtime = nvap->na_vattr.va_birthtime.tv_sec;
 		nvap->na_vattr.va_birthtime.tv_sec = VNOVAL;
 		/* For a pNFS server, set the attributes on the DS file. */
 		error = nfsrv_proxyds(vp, 0, 0, cred, p, NFSPROC_SETATTR,
 		    NULL, NULL, NULL, nvap, NULL, NULL, 0, NULL);
 		/* Restore the caller's birthtime after the proxy call. */
 		nvap->na_vattr.va_birthtime.tv_sec = savbtime;
 		if (error == ENOENT)
 			error = 0;
 	}
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Set up nameidata for a lookup() call and do it.
  *
  * nd       - request descriptor (ND_PUBLOOKUP / ND_NFSV4 flags are used).
  * ndp      - nameidata whose ni_cnd already holds the path buffer.
  * dp       - starting directory vnode.
  * islocked - non-zero when dp is locked on entry.
  * exp      - export information (read-only and exported checks).
  * p        - calling thread, stored in cn_thread.
  * retdirp  - set to NULL first; once dp has passed the directory check it
  *            is set to dp, with a reference taken via VREF().
  *
  * If dp is not a directory it is released (vput() when locked, vrele()
  * otherwise), the path buffer is freed and ENOTDIR is returned.
  * On any other error the path buffer is released and ni_vp, ni_dvp and
  * ni_startdir are cleared.  Symbolic links are only followed for
  * ND_PUBLOOKUP requests; otherwise encountering one yields EINVAL.
  * Returns 0 or an error value.
  */
 int
 nfsvno_namei(struct nfsrv_descript *nd, struct nameidata *ndp,
     struct vnode *dp, int islocked, struct nfsexstuff *exp, struct thread *p,
     struct vnode **retdirp)
 {
 	struct componentname *cnp = &ndp->ni_cnd;
 	int i;
 	struct iovec aiov;
 	struct uio auio;
 	/* Remember whether the caller asked for a locked leaf. */
 	int lockleaf = (cnp->cn_flags & LOCKLEAF) != 0, linklen;
 	int error = 0;
 	char *cp;
 
 	*retdirp = NULL;
 	cnp->cn_nameptr = cnp->cn_pnbuf;
 	ndp->ni_lcf = 0;
 	/*
 	 * Extract and set starting directory.
 	 */
 	if (dp->v_type != VDIR) {
 		if (islocked)
 			vput(dp);
 		else
 			vrele(dp);
 		nfsvno_relpathbuf(ndp);
 		error = ENOTDIR;
 		goto out1;
 	}
 	if (islocked)
 		NFSVOPUNLOCK(dp);
 	VREF(dp);
 	*retdirp = dp;
 	if (NFSVNO_EXRDONLY(exp))
 		cnp->cn_flags |= RDONLY;
 	ndp->ni_segflg = UIO_SYSSPACE;
 
 	if (nd->nd_flag & ND_PUBLOOKUP) {
 		ndp->ni_loopcnt = 0;
 		/* An absolute path restarts the lookup at rootvnode. */
 		if (cnp->cn_pnbuf[0] == '/') {
 			vrele(dp);
 			/*
 			 * Check for degenerate pathnames here, since lookup()
 			 * panics on them.
 			 */
 			for (i = 1; i < ndp->ni_pathlen; i++)
 				if (cnp->cn_pnbuf[i] != '/')
 					break;
 			if (i == ndp->ni_pathlen) {
 				error = NFSERR_ACCES;
 				goto out;
 			}
 			dp = rootvnode;
 			VREF(dp);
 		}
 	} else if ((nfsrv_enable_crossmntpt == 0 && NFSVNO_EXPORTED(exp)) ||
 	    (nd->nd_flag & ND_NFSV4) == 0) {
 		/*
 		 * Only cross mount points for NFSv4 when doing a
 		 * mount while traversing the file system above
 		 * the mount point, unless nfsrv_enable_crossmntpt is set.
 		 */
 		cnp->cn_flags |= NOCROSSMOUNT;
 	}
 
 	/*
 	 * Initialize for scan, set ni_startdir and bump ref on dp again
 	 * because lookup() will dereference ni_startdir.
 	 */
 
 	cnp->cn_thread = p;
 	ndp->ni_startdir = dp;
 	ndp->ni_rootdir = rootvnode;
 	ndp->ni_topdir = NULL;
 
 	/*
 	 * LOCKLEAF is forced on for the duration of the loop and removed
 	 * again afterwards when the caller did not request it.
 	 */
 	if (!lockleaf)
 		cnp->cn_flags |= LOCKLEAF;
 	for (;;) {
 		cnp->cn_nameptr = cnp->cn_pnbuf;
 		/*
 		 * Call lookup() to do the real work.  If an error occurs,
 		 * ndp->ni_vp and ni_dvp are left uninitialized or NULL and
 		 * we do not have to dereference anything before returning.
 		 * In either case ni_startdir will be dereferenced and NULLed
 		 * out.
 		 */
 		error = lookup(ndp);
 		if (error)
 			break;
 
 		/*
 		 * Check for encountering a symbolic link.  Trivial
 		 * termination occurs if no symlink encountered.
 		 */
 		if ((cnp->cn_flags & ISSYMLINK) == 0) {
 			if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0)
 				nfsvno_relpathbuf(ndp);
 			if (ndp->ni_vp && !lockleaf)
 				NFSVOPUNLOCK(ndp->ni_vp);
 			break;
 		}
 
 		/*
 		 * Validate symlink
 		 */
 		if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1)
 			NFSVOPUNLOCK(ndp->ni_dvp);
 		if (!(nd->nd_flag & ND_PUBLOOKUP)) {
 			error = EINVAL;
 			goto badlink2;
 		}
 
 		if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
 			error = ELOOP;
 			goto badlink2;
 		}
 		/*
 		 * With path components remaining, read the link into a
 		 * fresh buffer; otherwise read it over the current one.
 		 */
 		if (ndp->ni_pathlen > 1)
 			cp = uma_zalloc(namei_zone, M_WAITOK);
 		else
 			cp = cnp->cn_pnbuf;
 		aiov.iov_base = cp;
 		aiov.iov_len = MAXPATHLEN;
 		auio.uio_iov = &aiov;
 		auio.uio_iovcnt = 1;
 		auio.uio_offset = 0;
 		auio.uio_rw = UIO_READ;
 		auio.uio_segflg = UIO_SYSSPACE;
 		auio.uio_td = NULL;
 		auio.uio_resid = MAXPATHLEN;
 		error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
 		if (error) {
 		/* badlink1 also frees the temporary link buffer, if any. */
 		badlink1:
 			if (ndp->ni_pathlen > 1)
 				uma_zfree(namei_zone, cp);
 		/* badlink2 drops the parent and the symlink vnode. */
 		badlink2:
 			vrele(ndp->ni_dvp);
 			vput(ndp->ni_vp);
 			break;
 		}
 		linklen = MAXPATHLEN - auio.uio_resid;
 		if (linklen == 0) {
 			error = ENOENT;
 			goto badlink1;
 		}
 		if (linklen + ndp->ni_pathlen >= MAXPATHLEN) {
 			error = ENAMETOOLONG;
 			goto badlink1;
 		}
 
 		/*
 		 * Adjust or replace path
 		 */
 		if (ndp->ni_pathlen > 1) {
 			/* Append the unparsed remainder after the link text. */
 			NFSBCOPY(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
 			uma_zfree(namei_zone, cnp->cn_pnbuf);
 			cnp->cn_pnbuf = cp;
 		} else
 			cnp->cn_pnbuf[linklen] = '\0';
 		ndp->ni_pathlen += linklen;
 
 		/*
 		 * Cleanup refs for next loop and check if root directory
 		 * should replace current directory.  Normally ni_dvp
 		 * becomes the new base directory and is cleaned up when
 		 * we loop.  Explicitly null pointers after invalidation
 		 * to clarify operation.
 		 */
 		vput(ndp->ni_vp);
 		ndp->ni_vp = NULL;
 
 		if (cnp->cn_pnbuf[0] == '/') {
 			vrele(ndp->ni_dvp);
 			ndp->ni_dvp = ndp->ni_rootdir;
 			VREF(ndp->ni_dvp);
 		}
 		ndp->ni_startdir = ndp->ni_dvp;
 		ndp->ni_dvp = NULL;
 	}
 	if (!lockleaf)
 		cnp->cn_flags &= ~LOCKLEAF;
 
 out:
 	if (error) {
 		nfsvno_relpathbuf(ndp);
 		ndp->ni_vp = NULL;
 		ndp->ni_dvp = NULL;
 		ndp->ni_startdir = NULL;
 	} else if ((ndp->ni_cnd.cn_flags & (WANTPARENT|LOCKPARENT)) == 0) {
 		/* The caller did not ask for the parent directory. */
 		ndp->ni_dvp = NULL;
 	}
 
 out1:
 	NFSEXITCODE2(error, nd);
 	return (error);
 }
 
 /*
  * Allocate a pathname buffer from namei_zone for ndp and hand it back
  * through bufpp.  The componentname is flagged NOMACCHECK | HASBUF.
  * When hashpp is supplied, the pointer it refers to is set to NULL.
  */
 void
 nfsvno_setpathbuf(struct nameidata *ndp, char **bufpp, u_long **hashpp)
 {
 	char *pathbuf;
 
 	ndp->ni_cnd.cn_flags |= (NOMACCHECK | HASBUF);
 	pathbuf = uma_zalloc(namei_zone, M_WAITOK);
 	ndp->ni_cnd.cn_pnbuf = pathbuf;
 	*bufpp = pathbuf;
 	if (hashpp != NULL)
 		*hashpp = NULL;
 }
 
 /*
  * Release the above path buffer, if not released by nfsvno_namei().
  */
 void
 nfsvno_relpathbuf(struct nameidata *ndp)
 {
 
 	if ((ndp->ni_cnd.cn_flags & HASBUF) == 0)
 		panic("nfsrelpath");
 	uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
 	ndp->ni_cnd.cn_flags &= ~HASBUF;
 }
 
 /*
  * Readlink vnode op into an mbuf list.
  *
  * vp        - symbolic link vnode.
  * cred      - credential for VOP_READLINK().
  * maxextsiz - when > 0 the buffer chain is built with
  *             nfsrv_createiovec_extpgs(), otherwise nfsrv_createiovec().
  * p         - calling thread (not used in this function).
  * mpp       - on success, set to the first mbuf of the chain (NULL for an
  *             empty link).
  * mpendp    - on success, set to the last mbuf of the chain.
  * lenp      - set to the link length, or 0 on error.
  *
  * Returns 0 or the VOP_READLINK() error; on error the chain is freed and
  * *mpp / *mpendp are left untouched.
  */
 int
 nfsvno_readlink(struct vnode *vp, struct ucred *cred, int maxextsiz,
     struct thread *p, struct mbuf **mpp, struct mbuf **mpendp, int *lenp)
 {
 	struct iovec *iv;
 	struct uio io, *uiop = &io;
 	struct mbuf *mp, *mp3;
 	int len, tlen, error = 0;
 
 	/* Always provide room for a maximum-length path. */
 	len = NFS_MAXPATHLEN;
 	if (maxextsiz > 0)
 		uiop->uio_iovcnt = nfsrv_createiovec_extpgs(len, maxextsiz,
 		    &mp3, &mp, &iv);
 	else
 		uiop->uio_iovcnt = nfsrv_createiovec(len, &mp3, &mp, &iv);
 	uiop->uio_iov = iv;
 	uiop->uio_offset = 0;
 	uiop->uio_resid = len;
 	uiop->uio_rw = UIO_READ;
 	uiop->uio_segflg = UIO_SYSSPACE;
 	uiop->uio_td = NULL;
 	error = VOP_READLINK(vp, uiop, cred);
 	/* The iovec was only needed for the read itself. */
 	free(iv, M_TEMP);
 	if (error) {
 		m_freem(mp3);
 		*lenp = 0;
 		goto out;
 	}
 	if (uiop->uio_resid > 0) {
 		/* Short read: shrink the chain to the rounded-up length. */
 		len -= uiop->uio_resid;
 		tlen = NFSM_RNDUP(len);
 		if (tlen == 0) {
 			m_freem(mp3);
 			mp3 = mp = NULL;
 		} else if (tlen != NFS_MAXPATHLEN || tlen != len)
 			mp = nfsrv_adj(mp3, NFS_MAXPATHLEN - tlen,
 			    tlen - len);
 	}
 	*lenp = len;
 	*mpp = mp3;
 	*mpendp = mp;
 
 out:
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Build a chain of cluster mbufs with room for "len" bytes, together with
  * a malloc'd (M_TEMP) iovec array describing the free space in each mbuf,
  * for use by Read or Getextattr.
  * *mpp and *mpendp receive the first and last mbufs of the chain, *ivp
  * the iovec array, and the number of iovec entries filled in is returned.
  * Each mbuf's m_len is set to the number of bytes its iovec entry covers.
  */
 static int
 nfsrv_createiovec(int len, struct mbuf **mpp, struct mbuf **mpendp,
     struct iovec **ivp)
 {
 	struct mbuf *head, *tail, *mb;
 	struct iovec *iov;
 	int chunk, niov, remain;
 
 	/*
 	 * First pass: allocate the mbuf clusters and count them, so that
 	 * the size of the iovec array is known.
 	 */
 	head = NULL;
 	tail = NULL;
 	niov = 0;
 	for (remain = len; remain > 0; remain -= chunk) {
 		NFSMGET(mb);
 		MCLGET(mb, M_WAITOK);
 		mb->m_len = 0;
 		chunk = min(M_TRAILINGSPACE(mb), remain);
 		niov++;
 		if (head == NULL)
 			head = mb;
 		else
 			tail->m_next = mb;
 		tail = mb;
 	}
 
 	/*
 	 * Second pass: point one iovec entry at the space in each mbuf.
 	 */
 	iov = malloc(niov * sizeof(struct iovec), M_TEMP, M_WAITOK);
 	*ivp = iov;
 	niov = 0;
 	remain = len;
 	for (mb = head; remain > 0; mb = mb->m_next) {
 		if (mb == NULL)
 			panic("nfsrv_createiovec iov");
 		chunk = min(M_TRAILINGSPACE(mb), remain);
 		if (chunk > 0) {
 			iov[niov].iov_base = mtod(mb, caddr_t) + mb->m_len;
 			iov[niov].iov_len = chunk;
 			mb->m_len += chunk;
 			remain -= chunk;
 			niov++;
 		}
 	}
 	*mpp = head;
 	*mpendp = tail;
 	return (niov);
 }
 
 /*
  * Same job as nfsrv_createiovec(), but the chain is made of ext_pgs
  * mbufs, each allocated for at most "maxextsiz" bytes.
  * One iovec entry is generated per page, addressed through the direct
  * map.  *mpp and *mpendp receive the first and last mbufs, *ivp the
  * malloc'd (M_TEMP) iovec array, and the number of iovec entries filled
  * in is returned.
  */
 static int
 nfsrv_createiovec_extpgs(int len, int maxextsiz, struct mbuf **mpp,
     struct mbuf **mpendp, struct iovec **ivp)
 {
 	struct mbuf *head, *tail, *mb;
 	struct iovec *iov;
 	int chunk, niov, pg, remain;
 
 	/*
 	 * First pass: allocate the ext_pgs mbufs and add up their page
 	 * counts, so that the size of the iovec array is known.
 	 */
 	head = NULL;
 	tail = NULL;
 	niov = 0;
 	for (remain = len; remain > 0; remain -= chunk) {
 		chunk = min(remain, maxextsiz);
 		mb = mb_alloc_ext_plus_pages(chunk, M_WAITOK);
 		niov += mb->m_epg_npgs;
 		if (head == NULL)
 			head = mb;
 		else
 			tail->m_next = mb;
 		tail = mb;
 	}
 
 	/*
 	 * Second pass: one iovec entry per page, moving on to the next
 	 * mbuf once all pages of the current one have been used.
 	 */
 	iov = malloc(niov * sizeof(struct iovec), M_TEMP, M_WAITOK);
 	*ivp = iov;
 	mb = head;
 	remain = len;
 	niov = 0;
 	pg = 0;
 	while (remain > 0) {
 		if (mb == NULL)
 			panic("nfsvno_createiovec_extpgs iov");
 		chunk = min(PAGE_SIZE, remain);
 		if (chunk > 0) {
 			iov[niov].iov_base =
 			    (void *)PHYS_TO_DMAP(mb->m_epg_pa[pg]);
 			iov[niov].iov_len = chunk;
 			mb->m_len += chunk;
 			/* Record how much of the final page is in use. */
 			if (pg + 1 == mb->m_epg_npgs)
 				mb->m_epg_last_len = chunk;
 			remain -= chunk;
 			niov++;
 			pg++;
 		}
 		if (remain > 0 && pg == mb->m_epg_npgs) {
 			mb = mb->m_next;
 			if (mb == NULL)
 				panic("nfsvno_createiovec_extpgs iov");
 			pg = 0;
 		}
 	}
 	*mpp = head;
 	*mpendp = tail;
 	return (niov);
 }
 
 /*
  * Read "cnt" bytes at offset "off" from vp into a newly built mbuf list.
  *
  * The read is first offered to nfsrv_proxyds(); only when that returns
  * ENOENT (no DS file to read) is VOP_READ() done on vp itself.
  * On success, *mpp and *mpendp are the first and last mbufs of the data
  * (both NULL when nothing was read).  If VOP_READ() fails, the mbufs are
  * freed and *mpp is set to NULL.
  */
 int
 nfsvno_read(struct vnode *vp, off_t off, int cnt, struct ucred *cred,
     int maxextsiz, struct thread *p, struct mbuf **mpp,
     struct mbuf **mpendp)
 {
 	struct uio io, *uiop = &io;
 	struct iovec *iov;
 	struct mbuf *head, *tail;
 	struct nfsheur *heur;
 	int error, ioflag, len, tlen;
 
 	/* Let the pNFS code try a DS file first. */
 	error = nfsrv_proxyds(vp, off, cnt, cred, p, NFSPROC_READDS, mpp,
 	    NULL, mpendp, NULL, NULL, NULL, 0, NULL);
 	if (error != ENOENT)
 		return (error);
 
 	/* Build the receive chain, sized to the XDR rounded length. */
 	len = NFSM_RNDUP(cnt);
 	if (maxextsiz > 0)
 		uiop->uio_iovcnt = nfsrv_createiovec_extpgs(len, maxextsiz,
 		    &head, &tail, &iov);
 	else
 		uiop->uio_iovcnt = nfsrv_createiovec(len, &head, &tail, &iov);
 	uiop->uio_iov = iov;
 	uiop->uio_offset = off;
 	uiop->uio_resid = len;
 	uiop->uio_rw = UIO_READ;
 	uiop->uio_segflg = UIO_SYSSPACE;
 	uiop->uio_td = NULL;
 	heur = nfsrv_sequential_heuristic(uiop, vp);
 	ioflag = heur->nh_seqcount << IO_SEQSHIFT;
 	/* XXX KDM make this more systematic? */
 	nfsstatsv1.srvbytes[NFSV4OP_READ] += uiop->uio_resid;
 	error = VOP_READ(vp, uiop, IO_NODELOCKED | ioflag, cred);
 	free(iov, M_TEMP);
 	if (error) {
 		m_freem(head);
 		*mpp = NULL;
 		goto out;
 	}
 	heur->nh_nextoff = uiop->uio_offset;
 
 	/* Clip cnt to what was actually read, then fix up the chain. */
 	tlen = len - uiop->uio_resid;
 	if (tlen < cnt)
 		cnt = tlen;
 	tlen = NFSM_RNDUP(cnt);
 	if (tlen == 0) {
 		m_freem(head);
 		head = NULL;
 		tail = NULL;
 	} else if (len != tlen || tlen != cnt)
 		tail = nfsrv_adj(head, len - tlen, tlen - cnt);
 	*mpp = head;
 	*mpendp = tail;
 
 out:
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Build an iovec array describing "retlen" bytes of write data that are
  * already sitting in the mbuf chain "m".  "cp" is where the data begins
  * inside the first mbuf.
  * On success 0 is returned, with the malloc'd (M_TEMP) array in *ivpp and
  * its entry count in *iovcntp.  EBADRPC is returned, with nothing
  * allocated, if the chain holds fewer than retlen bytes.
  */
 static int
 nfsrv_createiovecw(int retlen, struct mbuf *m, char *cp, struct iovec **ivpp,
     int *iovcntp)
 {
 	struct mbuf *cur;
 	struct iovec *iov;
 	int avail, niov, remain;
 
 	/*
 	 * Pass 1: count the mbufs that contribute data, so that the
 	 * array can be sized, and check that the chain is long enough.
 	 */
 	niov = 0;
 	remain = retlen;
 	cur = m;
 	avail = mtod(cur, caddr_t) + cur->m_len - cp;
 	while (remain > 0) {
 		if (avail > 0) {
 			remain -= avail;
 			niov++;
 		}
 		cur = cur->m_next;
 		if (cur != NULL)
 			avail = cur->m_len;
 		else if (remain > 0)
 			return (EBADRPC);
 	}
 
 	/* Pass 2: allocate the array and fill it in. */
 	iov = malloc(niov * sizeof(struct iovec), M_TEMP, M_WAITOK);
 	*ivpp = iov;
 	*iovcntp = niov;
 	cur = m;
 	avail = mtod(cur, caddr_t) + cur->m_len - cp;
 	remain = retlen;
 	while (remain > 0) {
 		if (cur == NULL)
 			panic("nfsrv_createiovecw");
 		if (avail > 0) {
 			/* The last entry may use only part of its mbuf. */
 			avail = min(avail, remain);
 			iov->iov_base = cp;
 			iov->iov_len = avail;
 			iov++;
 			remain -= avail;
 		}
 		cur = cur->m_next;
 		if (cur != NULL) {
 			avail = cur->m_len;
 			cp = mtod(cur, caddr_t);
 		}
 	}
 	return (0);
 }
 
 /*
  * Write "retlen" bytes from the mbuf list "mp" (data starting at "cp") to
  * vp at offset "off".
  *
  * The write is first offered to nfsrv_proxyds(); unless that returns
  * ENOENT (no DS file to write), its result is returned and *stable is
  * set to NFSWRITE_FILESYNC.  Otherwise VOP_WRITE() is done on vp, with
  * IO_SYNC added unless *stable is NFSWRITE_UNSTABLE.
  */
 int
 nfsvno_write(struct vnode *vp, off_t off, int retlen, int *stable,
     struct mbuf *mp, char *cp, struct ucred *cred, struct thread *p)
 {
 	struct uio wio, *uiop = &wio;
 	struct iovec *iov;
 	struct nfsheur *heur;
 	int error, ioflags, iovcnt;
 
 	/* Let the pNFS code try a DS file first. */
 	error = nfsrv_proxyds(vp, off, retlen, cred, p, NFSPROC_WRITEDS,
 	    &mp, cp, NULL, NULL, NULL, NULL, 0, NULL);
 	if (error != ENOENT) {
 		*stable = NFSWRITE_FILESYNC;
 		return (error);
 	}
 
 	ioflags = IO_NODELOCKED;
 	if (*stable != NFSWRITE_UNSTABLE)
 		ioflags |= IO_SYNC;
 	error = nfsrv_createiovecw(retlen, mp, cp, &iov, &iovcnt);
 	if (error != 0)
 		return (error);
 
 	uiop->uio_iov = iov;
 	uiop->uio_iovcnt = iovcnt;
 	uiop->uio_resid = retlen;
 	uiop->uio_rw = UIO_WRITE;
 	uiop->uio_segflg = UIO_SYSSPACE;
 	NFSUIOPROC(uiop, p);
 	uiop->uio_offset = off;
 	heur = nfsrv_sequential_heuristic(uiop, vp);
 	ioflags |= heur->nh_seqcount << IO_SEQSHIFT;
 	/* XXX KDM make this more systematic? */
 	nfsstatsv1.srvbytes[NFSV4OP_WRITE] += uiop->uio_resid;
 	error = VOP_WRITE(vp, uiop, ioflags, cred);
 	/* Only a successful write advances the sequential heuristic. */
 	if (error == 0)
 		heur->nh_nextoff = uiop->uio_offset;
 	free(iov, M_TEMP);
 
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Common code for creating a regular file (plus special files for V2).
  *
  * Works on the result of a lookup already stored in ndp:
  * - If nd->nd_repstat is 0 and the name does not exist (ni_vp == NULL),
  *   the object is created with VOP_CREATE() (VREG/VSOCK) or VOP_MKNOD()
  *   (VCHR/VBLK/VFIFO); any other type fails with ENXIO.
  * - Otherwise (an error is already pending and/or the name exists) the
  *   lookup is cleaned up and, if there is no error and na_size is set,
  *   the existing file is truncated after a VWRITE access check.
  * Every path drops ni_startdir and ni_dvp and releases the path buffer.
  * When *exclusive_flagp is set, the two words of cverf are stored in the
  * new file's atime and the flag is cleared.
  * The new or existing vnode is returned via *vpp, except on the paths
  * that "goto out", which leave *vpp untouched.
  * Returns 0 or an error number.
  */
 int
 nfsvno_createsub(struct nfsrv_descript *nd, struct nameidata *ndp,
     struct vnode **vpp, struct nfsvattr *nvap, int *exclusive_flagp,
     int32_t *cverf, NFSDEV_T rdev, struct nfsexstuff *exp)
 {
 	u_quad_t tempsize;
 	int error;
 	struct thread *p = curthread;
 
 	/* Start from whatever status the RPC already carries. */
 	error = nd->nd_repstat;
 	if (!error && ndp->ni_vp == NULL) {
 		if (nvap->na_type == VREG || nvap->na_type == VSOCK) {
 			vrele(ndp->ni_startdir);
 			error = VOP_CREATE(ndp->ni_dvp,
 			    &ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr);
 			/* For a pNFS server, create the data file on a DS. */
 			if (error == 0 && nvap->na_type == VREG) {
 				/*
 				 * Create a data file on a DS for a pNFS server.
 				 * This function just returns if not
 				 * running a pNFS DS or the creation fails.
 				 */
 				nfsrv_pnfscreate(ndp->ni_vp, &nvap->na_vattr,
 				    nd->nd_cred, p);
 			}
 			vput(ndp->ni_dvp);
 			nfsvno_relpathbuf(ndp);
 			if (!error) {
 				if (*exclusive_flagp) {
 					/*
 					 * Exclusive create: stash the create
 					 * verifier in the atime fields.
 					 */
 					*exclusive_flagp = 0;
 					NFSVNO_ATTRINIT(nvap);
 					nvap->na_atime.tv_sec = cverf[0];
 					nvap->na_atime.tv_nsec = cverf[1];
 					error = VOP_SETATTR(ndp->ni_vp,
 					    &nvap->na_vattr, nd->nd_cred);
 					if (error != 0) {
 						/*
 						 * The verifier could not be
 						 * stored, so drop the new
 						 * vnode and report NOTSUPP.
 						 */
 						vput(ndp->ni_vp);
 						ndp->ni_vp = NULL;
 						error = NFSERR_NOTSUPP;
 					}
 				}
 			}
 		/*
 		 * NFS V2 Only. nfsrvd_mknod() does this for V3.
 		 * (This implies, just get out on an error.)
 		 */
 		} else if (nvap->na_type == VCHR || nvap->na_type == VBLK ||
 			nvap->na_type == VFIFO) {
 			/* A VCHR with an rdev of 0xffffffff is made a FIFO. */
 			if (nvap->na_type == VCHR && rdev == 0xffffffff)
 				nvap->na_type = VFIFO;
                         if (nvap->na_type != VFIFO &&
 			    (error = priv_check_cred(nd->nd_cred, PRIV_VFS_MKNOD_DEV))) {
 				/* Not allowed to make device nodes. */
 				vrele(ndp->ni_startdir);
 				nfsvno_relpathbuf(ndp);
 				vput(ndp->ni_dvp);
 				goto out;
 			}
 			nvap->na_rdev = rdev;
 			error = VOP_MKNOD(ndp->ni_dvp, &ndp->ni_vp,
 			    &ndp->ni_cnd, &nvap->na_vattr);
 			vput(ndp->ni_dvp);
 			nfsvno_relpathbuf(ndp);
 			vrele(ndp->ni_startdir);
 			if (error)
 				goto out;
 		} else {
 			/* Any other file type cannot be created here. */
 			vrele(ndp->ni_startdir);
 			nfsvno_relpathbuf(ndp);
 			vput(ndp->ni_dvp);
 			error = ENXIO;
 			goto out;
 		}
 		*vpp = ndp->ni_vp;
 	} else {
 		/*
 		 * Handle cases where error is already set and/or
 		 * the file exists.
 		 * 1 - clean up the lookup
 		 * 2 - iff !error and na_size set, truncate it
 		 */
 		vrele(ndp->ni_startdir);
 		nfsvno_relpathbuf(ndp);
 		*vpp = ndp->ni_vp;
 		/* When dvp and vp are the same vnode, only drop the ref. */
 		if (ndp->ni_dvp == *vpp)
 			vrele(ndp->ni_dvp);
 		else
 			vput(ndp->ni_dvp);
 		if (!error && nvap->na_size != VNOVAL) {
 			error = nfsvno_accchk(*vpp, VWRITE,
 			    nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE,
 			    NFSACCCHK_VPISLOCKED, NULL);
 			if (!error) {
 				/* Reset the attributes, keeping only the size. */
 				tempsize = nvap->na_size;
 				NFSVNO_ATTRINIT(nvap);
 				nvap->na_size = tempsize;
 				error = VOP_SETATTR(*vpp,
 				    &nvap->na_vattr, nd->nd_cred);
 			}
 		}
 		/*
 		 * NOTE(review): this vput() assumes ni_vp is non-NULL
 		 * whenever this branch ends with an error -- confirm
 		 * against the callers.
 		 */
 		if (error)
 			vput(*vpp);
 	}
 
 out:
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Do a mknod vnode op.
  */
 int
 nfsvno_mknod(struct nameidata *ndp, struct nfsvattr *nvap, struct ucred *cred,
     struct thread *p)
 {
 	int error = 0;
 	enum vtype vtyp;
 
 	vtyp = nvap->na_type;
 	/*
 	 * Iff doesn't exist, create it.
 	 */
 	if (ndp->ni_vp) {
 		vrele(ndp->ni_startdir);
 		nfsvno_relpathbuf(ndp);
 		vput(ndp->ni_dvp);
 		vrele(ndp->ni_vp);
 		error = EEXIST;
 		goto out;
 	}
 	if (vtyp != VCHR && vtyp != VBLK && vtyp != VSOCK && vtyp != VFIFO) {
 		vrele(ndp->ni_startdir);
 		nfsvno_relpathbuf(ndp);
 		vput(ndp->ni_dvp);
 		error = NFSERR_BADTYPE;
 		goto out;
 	}
 	if (vtyp == VSOCK) {
 		vrele(ndp->ni_startdir);
 		error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
 		    &ndp->ni_cnd, &nvap->na_vattr);
 		vput(ndp->ni_dvp);
 		nfsvno_relpathbuf(ndp);
 	} else {
 		if (nvap->na_type != VFIFO &&
 		    (error = priv_check_cred(cred, PRIV_VFS_MKNOD_DEV))) {
 			vrele(ndp->ni_startdir);
 			nfsvno_relpathbuf(ndp);
 			vput(ndp->ni_dvp);
 			goto out;
 		}
 		error = VOP_MKNOD(ndp->ni_dvp, &ndp->ni_vp,
 		    &ndp->ni_cnd, &nvap->na_vattr);
 		vput(ndp->ni_dvp);
 		nfsvno_relpathbuf(ndp);
 		vrele(ndp->ni_startdir);
 		/*
 		 * Since VOP_MKNOD returns the ni_vp, I can't
 		 * see any reason to do the lookup.
 		 */
 	}
 
 out:
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Mkdir vnode op: create the directory named by the lookup result in
  * ndp.  Fails with EEXIST if the name already exists.
  * Both paths drop ni_dvp and release the path buffer.
  */
 int
 nfsvno_mkdir(struct nameidata *ndp, struct nfsvattr *nvap, uid_t saved_uid,
     struct ucred *cred, struct thread *p, struct nfsexstuff *exp)
 {
 	int error;
 
 	if (ndp->ni_vp == NULL) {
 		error = VOP_MKDIR(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd,
 		    &nvap->na_vattr);
 		vput(ndp->ni_dvp);
 		nfsvno_relpathbuf(ndp);
 	} else {
 		/* When dvp and vp are the same vnode, only drop the ref. */
 		if (ndp->ni_dvp == ndp->ni_vp)
 			vrele(ndp->ni_dvp);
 		else
 			vput(ndp->ni_dvp);
 		vrele(ndp->ni_vp);
 		nfsvno_relpathbuf(ndp);
 		error = EEXIST;
 	}
 
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Symlink vnode op: create a symbolic link with target "pathcp" at the
  * name described by the lookup result in ndp.
  * Fails with EEXIST if the name already exists.  Both paths drop
  * ni_startdir and ni_dvp and release the path buffer.
  * When not_v2 is zero and the link was created, the new vnode is
  * vput() before returning.
  */
 int
 nfsvno_symlink(struct nameidata *ndp, struct nfsvattr *nvap, char *pathcp,
     int pathlen, int not_v2, uid_t saved_uid, struct ucred *cred, struct thread *p,
     struct nfsexstuff *exp)
 {
 	int error;
 
 	if (ndp->ni_vp != NULL) {
 		vrele(ndp->ni_startdir);
 		nfsvno_relpathbuf(ndp);
 		/* When dvp and vp are the same vnode, only drop the ref. */
 		if (ndp->ni_dvp == ndp->ni_vp)
 			vrele(ndp->ni_dvp);
 		else
 			vput(ndp->ni_dvp);
 		vrele(ndp->ni_vp);
 		error = EEXIST;
 	} else {
 		error = VOP_SYMLINK(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd,
 		    &nvap->na_vattr, pathcp);
 		vput(ndp->ni_dvp);
 		vrele(ndp->ni_startdir);
 		nfsvno_relpathbuf(ndp);
 		/*
 		 * VOP_SYMLINK() returns the ni_vp, so there is no need to
 		 * look the new link up again.  Just vput it for v2.
 		 */
 		if (error == 0 && not_v2 == 0)
 			vput(ndp->ni_vp);
 	}
 
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Parse symbolic link arguments.
  * This function has an ugly side effect. It will malloc() an area for
  * the symlink and set iov_base to point to it, only if it succeeds.
  * So, if it returns with uiop->uio_iov->iov_base != NULL, that must
  * be FREE'd later.
  */
 int
 nfsvno_getsymlink(struct nfsrv_descript *nd, struct nfsvattr *nvap,
     struct thread *p, char **pathcpp, int *lenp)
 {
 	u_int32_t *tl;
 	char *pathcp = NULL;
 	int error = 0, len;
 	struct nfsv2_sattr *sp;
 
 	*pathcpp = NULL;
 	*lenp = 0;
 	if ((nd->nd_flag & ND_NFSV3) &&
 	    (error = nfsrv_sattr(nd, NULL, nvap, NULL, NULL, p)))
 		goto nfsmout;
 	NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 	len = fxdr_unsigned(int, *tl);
 	if (len > NFS_MAXPATHLEN || len <= 0) {
 		error = EBADRPC;
 		goto nfsmout;
 	}
 	pathcp = malloc(len + 1, M_TEMP, M_WAITOK);
 	error = nfsrv_mtostr(nd, pathcp, len);
 	if (error)
 		goto nfsmout;
 	if (nd->nd_flag & ND_NFSV2) {
 		NFSM_DISSECT(sp, struct nfsv2_sattr *, NFSX_V2SATTR);
 		nvap->na_mode = fxdr_unsigned(u_int16_t, sp->sa_mode);
 	}
 	*pathcpp = pathcp;
 	*lenp = len;
 	NFSEXITCODE2(0, nd);
 	return (0);
 nfsmout:
 	if (pathcp)
 		free(pathcp, M_TEMP);
 	NFSEXITCODE2(error, nd);
 	return (error);
 }
 
 /*
  * Remove the non-directory object found by the lookup in ndp.
  * A directory is refused with NFSERR_ISDIR and, for NFSv4,
  * nfsrv_checkremove() must agree first.  After a successful
  * VOP_REMOVE(), the pNFS DS file(s) set up by nfsrv_pnfsremovesetup()
  * are removed as well.
  * ni_dvp and ni_vp are always released, as is the path buffer when
  * SAVENAME is set.
  */
 int
 nfsvno_removesub(struct nameidata *ndp, int is_v4, struct ucred *cred,
     struct thread *p, struct nfsexstuff *exp)
 {
 	struct vnode *dsdvp[NFSDEV_MAXMIRRORS], *vp;
 	char fname[PNFS_FILENAME_LEN + 1];
 	fhandle_t fh;
 	int error, mirrorcnt;
 
 	error = 0;
 	dsdvp[0] = NULL;
 	vp = ndp->ni_vp;
 	if (vp->v_type == VDIR)
 		error = NFSERR_ISDIR;
 	else if (is_v4)
 		error = nfsrv_checkremove(vp, 1, NULL, (nfsquad_t)((u_quad_t)0),
 		    p);
 	if (error == 0) {
 		nfsrv_pnfsremovesetup(vp, p, dsdvp, &mirrorcnt, fname, &fh);
 		error = VOP_REMOVE(ndp->ni_dvp, vp, &ndp->ni_cnd);
 		if (error == 0 && dsdvp[0] != NULL)
 			nfsrv_pnfsremove(dsdvp, mirrorcnt, fname, &fh, p);
 	}
 
 	/* When dvp and vp are the same vnode, only drop the ref. */
 	if (ndp->ni_dvp == vp)
 		vrele(ndp->ni_dvp);
 	else
 		vput(ndp->ni_dvp);
 	vput(vp);
 	if ((ndp->ni_cnd.cn_flags & SAVENAME) != 0)
 		nfsvno_relpathbuf(ndp);
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Remove a directory.
  */
 int
 nfsvno_rmdirsub(struct nameidata *ndp, int is_v4, struct ucred *cred,
     struct thread *p, struct nfsexstuff *exp)
 {
 	struct vnode *vp;
 	int error = 0;
 
 	vp = ndp->ni_vp;
 	if (vp->v_type != VDIR) {
 		error = ENOTDIR;
 		goto out;
 	}
 	/*
 	 * No rmdir "." please.
 	 */
 	if (ndp->ni_dvp == vp) {
 		error = EINVAL;
 		goto out;
 	}
 	/*
 	 * The root of a mounted filesystem cannot be deleted.
 	 */
 	if (vp->v_vflag & VV_ROOT)
 		error = EBUSY;
 out:
 	if (!error)
 		error = VOP_RMDIR(ndp->ni_dvp, vp, &ndp->ni_cnd);
 	if (ndp->ni_dvp == vp)
 		vrele(ndp->ni_dvp);
 	else
 		vput(ndp->ni_dvp);
 	vput(vp);
 	if ((ndp->ni_cnd.cn_flags & SAVENAME) != 0)
 		nfsvno_relpathbuf(ndp);
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Rename vnode op.
  *
  * fromndp and tondp hold the lookup results for the source and the
  * destination.  If ndstat is non-zero, only the "from" side is cleaned
  * up and ndstat is returned.  Otherwise a series of sanity checks is
  * made and, if they all pass, VOP_RENAME() is called; when a check
  * fails, the vnodes are released here instead.
  * ndflag selects protocol specific behaviour: the error numbers used
  * for ND_NFSV2 and the nfsrv_checkremove() calls for ND_NFSV4.
  * Renaming a file onto itself is treated as success without calling
  * VOP_RENAME().
  * ni_startdir is dropped and the path buffer released for fromndp on
  * every path, and for tondp whenever ndstat is zero.
  * Returns 0 or an error number.
  */
 int
 nfsvno_rename(struct nameidata *fromndp, struct nameidata *tondp,
     u_int32_t ndstat, u_int32_t ndflag, struct ucred *cred, struct thread *p)
 {
 	struct vnode *fvp, *tvp, *tdvp, *dsdvp[NFSDEV_MAXMIRRORS];
 	int error = 0, mirrorcnt;
 	char fname[PNFS_FILENAME_LEN + 1];
 	fhandle_t fh;
 
 	/* A NULL dsdvp[0] means there is no DS file to remove later. */
 	dsdvp[0] = NULL;
 	fvp = fromndp->ni_vp;
 	if (ndstat) {
 		/* Already failed: only the "from" side needs cleaning up. */
 		vrele(fromndp->ni_dvp);
 		vrele(fvp);
 		error = ndstat;
 		goto out1;
 	}
 	tdvp = tondp->ni_dvp;
 	tvp = tondp->ni_vp;
 	if (tvp != NULL) {
 		/* The target exists: both ends must be of the same kind. */
 		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
 			error = (ndflag & ND_NFSV2) ? EISDIR : EEXIST;
 			goto out;
 		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
 			error = (ndflag & ND_NFSV2) ? ENOTDIR : EEXIST;
 			goto out;
 		}
 		/* A directory that is mounted on cannot be replaced. */
 		if (tvp->v_type == VDIR && tvp->v_mountedhere) {
 			error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV;
 			goto out;
 		}
 
 		/*
 		 * A rename to '.' or '..' results in a prematurely
 		 * unlocked vnode on FreeBSD5, so I'm just going to fail that
 		 * here.
 		 */
 		if ((tondp->ni_cnd.cn_namelen == 1 &&
 		     tondp->ni_cnd.cn_nameptr[0] == '.') ||
 		    (tondp->ni_cnd.cn_namelen == 2 &&
 		     tondp->ni_cnd.cn_nameptr[0] == '.' &&
 		     tondp->ni_cnd.cn_nameptr[1] == '.')) {
 			error = EINVAL;
 			goto out;
 		}
 	}
 	/* A directory that is mounted on cannot be moved. */
 	if (fvp->v_type == VDIR && fvp->v_mountedhere) {
 		error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV;
 		goto out;
 	}
 	/* Source and target directory must be on the same mount. */
 	if (fvp->v_mount != tdvp->v_mount) {
 		error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV;
 		goto out;
 	}
 	/* The source cannot be the target directory itself. */
 	if (fvp == tdvp) {
 		error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EINVAL;
 		goto out;
 	}
 	if (fvp == tvp) {
 		/*
 		 * If source and destination are the same, there is nothing to
 		 * do. Set error to -1 to indicate this.
 		 */
 		error = -1;
 		goto out;
 	}
 	if (ndflag & ND_NFSV4) {
 		/* fvp is locked only for the duration of the check. */
 		if (NFSVOPLOCK(fvp, LK_EXCLUSIVE) == 0) {
 			error = nfsrv_checkremove(fvp, 0, NULL,
 			    (nfsquad_t)((u_quad_t)0), p);
 			NFSVOPUNLOCK(fvp);
 		} else
 			error = EPERM;
 		if (tvp && !error)
 			error = nfsrv_checkremove(tvp, 1, NULL,
 			    (nfsquad_t)((u_quad_t)0), p);
 	} else {
 		/*
 		 * For NFSv2 and NFSv3, try to get rid of the delegation, so
 		 * that the NFSv4 client won't be confused by the rename.
 		 * Since nfsd_recalldelegation() can only be called on an
 		 * unlocked vnode at this point and fvp is the file that will
 		 * still exist after the rename, just do fvp.
 		 */
 		nfsd_recalldelegation(fvp, p);
 	}
 	if (error == 0 && tvp != NULL) {
 		/* Set up removal of the DS file(s) for the replaced target. */
 		nfsrv_pnfsremovesetup(tvp, p, dsdvp, &mirrorcnt, fname, &fh);
 		NFSD_DEBUG(4, "nfsvno_rename: pnfsremovesetup"
 		    " dsdvp=%p\n", dsdvp[0]);
 	}
 out:
 	if (!error) {
 		/* No vnode is released here after the VOP_RENAME() call. */
 		error = VOP_RENAME(fromndp->ni_dvp, fromndp->ni_vp,
 		    &fromndp->ni_cnd, tondp->ni_dvp, tondp->ni_vp,
 		    &tondp->ni_cnd);
 	} else {
 		/* VOP_RENAME() was not called, so release the vnodes here. */
 		if (tdvp == tvp)
 			vrele(tdvp);
 		else
 			vput(tdvp);
 		if (tvp)
 			vput(tvp);
 		vrele(fromndp->ni_dvp);
 		vrele(fvp);
 		/* -1 only means "nothing to do", which is success. */
 		if (error == -1)
 			error = 0;
 	}
 
 	/*
 	 * If dsdvp[0] != NULL, it was set up by nfsrv_pnfsremovesetup() and
 	 * if the rename succeeded, the DS file for the tvp needs to be
 	 * removed.
 	 */
 	if (error == 0 && dsdvp[0] != NULL) {
 		nfsrv_pnfsremove(dsdvp, mirrorcnt, fname, &fh, p);
 		NFSD_DEBUG(4, "nfsvno_rename: pnfsremove\n");
 	}
 
 	vrele(tondp->ni_startdir);
 	nfsvno_relpathbuf(tondp);
 out1:
 	/* Reached on every path, including the ndstat early exit. */
 	vrele(fromndp->ni_startdir);
 	nfsvno_relpathbuf(fromndp);
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Link vnode op: make the name described by the lookup result in ndp a
  * new link to vp.
  * Fails with EEXIST if the name already exists and with EXDEV if vp and
  * the directory are on different mounts.  vp is locked exclusively
  * around VOP_LINK(); if it turns out to be doomed, EPERM is returned.
  * ni_dvp (and ni_vp, if any) are released and so is the path buffer.
  */
 int
 nfsvno_link(struct nameidata *ndp, struct vnode *vp, struct ucred *cred,
     struct thread *p, struct nfsexstuff *exp)
 {
 	int error = 0;
 
 	if (ndp->ni_vp != NULL)
 		error = EEXIST;
 	else if (vp->v_mount != ndp->ni_dvp->v_mount)
 		error = EXDEV;
 
 	if (error != 0) {
 		/* No link is attempted: just undo the lookup. */
 		if (ndp->ni_dvp == ndp->ni_vp)
 			vrele(ndp->ni_dvp);
 		else
 			vput(ndp->ni_dvp);
 		if (ndp->ni_vp != NULL)
 			vrele(ndp->ni_vp);
 	} else {
 		NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
 		if (VN_IS_DOOMED(vp))
 			error = EPERM;
 		else
 			error = VOP_LINK(ndp->ni_dvp, vp, &ndp->ni_cnd);
 		/* When dvp and vp are the same vnode, only drop the ref. */
 		if (ndp->ni_dvp == vp)
 			vrele(ndp->ni_dvp);
 		else
 			vput(ndp->ni_dvp);
 		NFSVOPUNLOCK(vp);
 	}
 	nfsvno_relpathbuf(ndp);
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Do the fsync() appropriate for the commit.
  *
  * vp is the file being committed and off/cnt give the byte range asked
  * for.  Either the whole file is flushed with VOP_FSYNC(), or only the
  * delayed-write buffers that fall inside the range are written.
  * td is handed to VOP_FSYNC(); cred is not used.
  * Returns the VOP_FSYNC() result for a whole file flush and 0 for a
  * ranged flush.
  */
 int
 nfsvno_fsync(struct vnode *vp, u_int64_t off, int cnt, struct ucred *cred,
     struct thread *td)
 {
 	int error = 0;
 
 	/*
 	 * RFC 1813 3.3.21: if count is 0, a flush from offset to the end of
 	 * file is done.  At this time VOP_FSYNC does not accept offset and
 	 * byte count parameters so call VOP_FSYNC the whole file for now.
 	 * The same is true for NFSv4: RFC 3530 Sec. 14.2.3.
 	 * File systems that do not use the buffer cache (as indicated
 	 * by MNTK_USES_BCACHE not being set) must use VOP_FSYNC().
 	 */
 	if (cnt == 0 || cnt > MAX_COMMIT_COUNT ||
 	    (vp->v_mount->mnt_kern_flag & MNTK_USES_BCACHE) == 0) {
 		/*
 		 * Give up and do the whole thing
 		 */
 		if (vp->v_object && vm_object_mightbedirty(vp->v_object)) {
 			/* Clean the dirty pages of the VM object first. */
 			VM_OBJECT_WLOCK(vp->v_object);
 			vm_object_page_clean(vp->v_object, 0, 0, OBJPC_SYNC);
 			VM_OBJECT_WUNLOCK(vp->v_object);
 		}
 		error = VOP_FSYNC(vp, MNT_WAIT, td);
 	} else {
 		/*
 		 * Locate and synchronously write any buffers that fall
 		 * into the requested range.  Note:  we are assuming that
 		 * f_iosize is a power of 2.
 		 */
 		int iosize = vp->v_mount->mnt_stat.f_iosize;
 		int iomask = iosize - 1;
 		struct bufobj *bo;
 		daddr_t lblkno;
 
 		/*
 		 * Align to iosize boundary, super-align to page boundary.
 		 */
 		/* cnt grows by whatever off is rounded down by. */
 		if (off & iomask) {
 			cnt += off & iomask;
 			off &= ~(u_quad_t)iomask;
 		}
 		if (off & PAGE_MASK) {
 			cnt += off & PAGE_MASK;
 			off &= ~(u_quad_t)PAGE_MASK;
 		}
 		/* Logical block number of the first buffer to look at. */
 		lblkno = off / iosize;
 
 		if (vp->v_object && vm_object_mightbedirty(vp->v_object)) {
 			VM_OBJECT_WLOCK(vp->v_object);
 			vm_object_page_clean(vp->v_object, off, off + cnt,
 			    OBJPC_SYNC);
 			VM_OBJECT_WUNLOCK(vp->v_object);
 		}
 
 		bo = &vp->v_bufobj;
 		BO_LOCK(bo);
 		/* One iteration per iosize block; bo is locked at loop top. */
 		while (cnt > 0) {
 			struct buf *bp;
 
 			/*
 			 * If we have a buffer and it is marked B_DELWRI we
 			 * have to lock and write it.  Otherwise the prior
 			 * write is assumed to have already been committed.
 			 *
 			 * gbincore() can return invalid buffers now so we
 			 * have to check that bit as well (though B_DELWRI
 			 * should not be set if B_INVAL is set there could be
 			 * a race here since we haven't locked the buffer).
 			 */
 			if ((bp = gbincore(&vp->v_bufobj, lblkno)) != NULL) {
 				/*
 				 * ENOLCK: the buffer lock was not obtained;
 				 * retake the bufobj lock and look the same
 				 * block up again.
 				 */
 				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
 				    LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) {
 					BO_LOCK(bo);
 					continue; /* retry */
 				}
 			    	if ((bp->b_flags & (B_DELWRI|B_INVAL)) ==
 				    B_DELWRI) {
 					/* Delayed write: write it out now. */
 					bremfree(bp);
 					bp->b_flags &= ~B_ASYNC;
 					bwrite(bp);
 					++nfs_commit_miss;
 				} else
 					BUF_UNLOCK(bp);
 				/* Retake the bufobj lock for the next block. */
 				BO_LOCK(bo);
 			}
 			++nfs_commit_blks;
 			/* Less than a full block left: this was the last. */
 			if (cnt < iosize)
 				break;
 			cnt -= iosize;
 			++lblkno;
 		}
 		BO_UNLOCK(bo);
 	}
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Statfs vnode op: fill in *sf for the file system vp lives on.
  * When pNFS DSs are configured (nfsrv_devidcnt > 0) and their numbers
  * can be obtained, the block counts and block size come from the DSs
  * instead of from VFS_STATFS().
  * Negative f_bavail and f_ffree values are clamped to 0.
  */
 int
 nfsvno_statfs(struct vnode *vp, struct statfs *sf)
 {
 	struct statfs *dsstat = NULL;
 	int error;
 
 	if (nfsrv_devidcnt > 0) {
 		/* For a pNFS service, get the DS numbers. */
 		dsstat = malloc(sizeof(*dsstat), M_TEMP, M_WAITOK | M_ZERO);
 		if (nfsrv_pnfsstatfs(dsstat, vp->v_mount) != 0) {
 			/* No DS numbers: fall back on the local ones. */
 			free(dsstat, M_TEMP);
 			dsstat = NULL;
 		}
 	}
 
 	error = VFS_STATFS(vp->v_mount, sf);
 	if (error != 0)
 		goto done;
 
 	if (dsstat != NULL) {
 		sf->f_blocks = dsstat->f_blocks;
 		sf->f_bavail = dsstat->f_bavail;
 		sf->f_bfree = dsstat->f_bfree;
 		sf->f_bsize = dsstat->f_bsize;
 	}
 	/*
 	 * NFS puts these values on the wire as unsigned, so a negative
 	 * value cannot be represented and would show up as a huge
 	 * positive one on clients like Solaris10.  Use 0 instead.
 	 */
 	if (sf->f_bavail < 0)
 		sf->f_bavail = 0;
 	if (sf->f_ffree < 0)
 		sf->f_ffree = 0;
 
 done:
 	free(dsstat, M_TEMP);
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Do the vnode op stuff for Open. Similar to nfsvno_createsub(), but
  * must handle nfsrv_opencheck() calls after any other access checks.
  *
  * All status is passed in and out through nd->nd_repstat; the function
  * itself returns nothing.
  * - If the name does not exist (ni_vp == NULL), nfsrv_opencheck() is
  *   called first and, if there is still no error, the file is created
  *   with VOP_CREATE().
  * - If the name exists, the lookup is cleaned up and, for a regular
  *   file, nfsrv_opencheck() is called; when a size is set in nvap the
  *   file is also truncated after a VWRITE access check.
  * - If nd_repstat is already set, only the lookup is cleaned up.
  * *vpp gets the resulting vnode, or NULL when nd_repstat was non-zero
  * before any create/open work was done.
  */
 void
 nfsvno_open(struct nfsrv_descript *nd, struct nameidata *ndp,
     nfsquad_t clientid, nfsv4stateid_t *stateidp, struct nfsstate *stp,
     int *exclusive_flagp, struct nfsvattr *nvap, int32_t *cverf, int create,
     NFSACL_T *aclp, nfsattrbit_t *attrbitp, struct ucred *cred,
     struct nfsexstuff *exp, struct vnode **vpp)
 {
 	struct vnode *vp = NULL;
 	u_quad_t tempsize;
 	struct nfsexstuff nes;
 	struct thread *p = curthread;
 
 	/* No vnode to check yet, so the open check is done without one. */
 	if (ndp->ni_vp == NULL)
 		nd->nd_repstat = nfsrv_opencheck(clientid,
 		    stateidp, stp, NULL, nd, p, nd->nd_repstat);
 	if (!nd->nd_repstat) {
 		if (ndp->ni_vp == NULL) {
 			/* The file does not exist: create it. */
 			vrele(ndp->ni_startdir);
 			nd->nd_repstat = VOP_CREATE(ndp->ni_dvp,
 			    &ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr);
 			/* For a pNFS server, create the data file on a DS. */
 			if (nd->nd_repstat == 0) {
 				/*
 				 * Create a data file on a DS for a pNFS server.
 				 * This function just returns if not
 				 * running a pNFS DS or the creation fails.
 				 */
 				nfsrv_pnfscreate(ndp->ni_vp, &nvap->na_vattr,
 				    cred, p);
 			}
 			vput(ndp->ni_dvp);
 			nfsvno_relpathbuf(ndp);
 			if (!nd->nd_repstat) {
 				if (*exclusive_flagp) {
 					/*
 					 * Exclusive create: stash the create
 					 * verifier in the atime fields.
 					 */
 					*exclusive_flagp = 0;
 					NFSVNO_ATTRINIT(nvap);
 					nvap->na_atime.tv_sec = cverf[0];
 					nvap->na_atime.tv_nsec = cverf[1];
 					nd->nd_repstat = VOP_SETATTR(ndp->ni_vp,
 					    &nvap->na_vattr, cred);
 					if (nd->nd_repstat != 0) {
 						/*
 						 * The verifier could not be
 						 * stored, so drop the new
 						 * vnode and report NOTSUPP.
 						 */
 						vput(ndp->ni_vp);
 						ndp->ni_vp = NULL;
 						nd->nd_repstat = NFSERR_NOTSUPP;
 					} else
 						NFSSETBIT_ATTRBIT(attrbitp,
 						    NFSATTRBIT_TIMEACCESS);
 				} else {
 					nfsrv_fixattr(nd, ndp->ni_vp, nvap,
 					    aclp, p, attrbitp, exp);
 				}
 			}
 			vp = ndp->ni_vp;
 		} else {
 			/* The file exists: clean up the lookup, then check. */
 			if (ndp->ni_startdir)
 				vrele(ndp->ni_startdir);
 			nfsvno_relpathbuf(ndp);
 			vp = ndp->ni_vp;
 			/* ni_dvp is only released for the create case. */
 			if (create == NFSV4OPEN_CREATE) {
 				if (ndp->ni_dvp == vp)
 					vrele(ndp->ni_dvp);
 				else
 					vput(ndp->ni_dvp);
 			}
 			if (NFSVNO_ISSETSIZE(nvap) && vp->v_type == VREG) {
 				/*
 				 * Build a local export structure for the
 				 * access check, marked read-only when the
 				 * lookup was flagged RDONLY.
 				 */
 				if (ndp->ni_cnd.cn_flags & RDONLY)
 					NFSVNO_SETEXRDONLY(&nes);
 				else
 					NFSVNO_EXINIT(&nes);
 				nd->nd_repstat = nfsvno_accchk(vp, 
 				    VWRITE, cred, &nes, p,
 				    NFSACCCHK_NOOVERRIDE,
 				    NFSACCCHK_VPISLOCKED, NULL);
 				/* The access check result is passed along. */
 				nd->nd_repstat = nfsrv_opencheck(clientid,
 				    stateidp, stp, vp, nd, p, nd->nd_repstat);
 				if (!nd->nd_repstat) {
 					/* Reset the attributes, keeping only the size. */
 					tempsize = nvap->na_size;
 					NFSVNO_ATTRINIT(nvap);
 					nvap->na_size = tempsize;
 					nd->nd_repstat = VOP_SETATTR(vp,
 					    &nvap->na_vattr, cred);
 				}
 			} else if (vp->v_type == VREG) {
 				nd->nd_repstat = nfsrv_opencheck(clientid,
 				    stateidp, stp, vp, nd, p, nd->nd_repstat);
 			}
 		}
 	} else {
 		/* Already failed: undo the lookup; vp stays NULL. */
 		if (ndp->ni_cnd.cn_flags & HASBUF)
 			nfsvno_relpathbuf(ndp);
 		if (ndp->ni_startdir && create == NFSV4OPEN_CREATE) {
 			vrele(ndp->ni_startdir);
 			if (ndp->ni_dvp == ndp->ni_vp)
 				vrele(ndp->ni_dvp);
 			else
 				vput(ndp->ni_dvp);
 			if (ndp->ni_vp)
 				vput(ndp->ni_vp);
 		}
 	}
 	*vpp = vp;
 
 	NFSEXITCODE2(0, nd);
 }
 
 /*
  * Updates the file rev and sets the mtime and ctime
  * to the current clock time, returning the va_filerev and va_Xtime
  * values.
  * Return ESTALE to indicate the vnode is VIRF_DOOMED.
  */
 int
 nfsvno_updfilerev(struct vnode *vp, struct nfsvattr *nvap,
     struct nfsrv_descript *nd, struct thread *p)
 {
 	struct vattr va;
 
 	VATTR_NULL(&va);
 	vfs_timestamp(&va.va_mtime);
 	if (NFSVOPISLOCKED(vp) != LK_EXCLUSIVE) {
 		NFSVOPLOCK(vp, LK_UPGRADE | LK_RETRY);
 		if (VN_IS_DOOMED(vp))
 			return (ESTALE);
 	}
 	(void) VOP_SETATTR(vp, &va, nd->nd_cred);
 	(void) nfsvno_getattr(vp, nvap, nd, p, 1, NULL);
 	return (0);
 }
 
 /*
  * Glue routine to nfsv4_fillattr().
  * For a pNFS service (nfsrv_devidcnt > 0) that is asked for any of the
  * space attributes, the DS statfs numbers are fetched first and passed
  * along; otherwise, or if they cannot be obtained, NULL is passed.
  * The value returned is whatever nfsv4_fillattr() returns.
  */
 int
 nfsvno_fillattr(struct nfsrv_descript *nd, struct mount *mp, struct vnode *vp,
     struct nfsvattr *nvap, fhandle_t *fhp, int rderror, nfsattrbit_t *attrbitp,
     struct ucred *cred, struct thread *p, int isdgram, int reterr,
     int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno)
 {
 	struct statfs *dsstat = NULL;
 	int ret, wantspace;
 
 	if (nfsrv_devidcnt > 0) {
 		wantspace =
 		    NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_SPACEAVAIL) ||
 		    NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_SPACEFREE) ||
 		    NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_SPACETOTAL);
 		if (wantspace) {
 			dsstat = malloc(sizeof(*dsstat), M_TEMP,
 			    M_WAITOK | M_ZERO);
 			if (nfsrv_pnfsstatfs(dsstat, mp) != 0) {
 				free(dsstat, M_TEMP);
 				dsstat = NULL;
 			}
 		}
 	}
 	ret = nfsv4_fillattr(nd, mp, vp, NULL, &nvap->na_vattr, fhp, rderror,
 	    attrbitp, cred, p, isdgram, reterr, supports_nfsv4acls, at_root,
 	    mounted_on_fileno, dsstat);
 	free(dsstat, M_TEMP);
 	NFSEXITCODE2(0, nd);
 	return (ret);
 }
 
 /* Since the Readdir vnode ops vary, put the entire functions in here. */
 /*
  * nfs readdir service
  * - mallocs what it thinks is enough to read
  *	count rounded up to a multiple of DIRBLKSIZ <= NFS_MAXREADDIR
  * - calls VOP_READDIR()
  * - loops around building the reply
  *	if the output generated exceeds count break out of loop
  *	The NFSM_CLGET macro is used here so that the reply will be packed
  *	tightly in mbuf clusters.
  * - it trims out records with d_fileno == 0
  *	this doesn't matter for Unix clients, but they might confuse clients
  *	for other os'.
  * - it trims out records with d_type == DT_WHT
  *	these cannot be seen through NFS (unless we extend the protocol)
  *     The alternate call nfsrvd_readdirplus() does lookups as well.
  * PS: The NFS protocol spec. does not clarify what the "count" byte
  *	argument is a count of.. just name strings and file id's or the
  *	entire reply rpc or ...
  *	I tried just file name and id sizes and it confused the Sun client,
  *	so I am using the full rpc size now. The "paranoia.." comment refers
  *	to including the status longwords that are not a part of the dir.
  *	"entry" structures, but are in the rpc.
  */
 int
 nfsrvd_readdir(struct nfsrv_descript *nd, int isdgram,
     struct vnode *vp, struct nfsexstuff *exp)
 {
 	struct dirent *dp;
 	u_int32_t *tl;
 	int dirlen;
 	char *cpos, *cend, *rbuf;
 	struct nfsvattr at;
 	int nlen, error = 0, getret = 1;
 	int siz, cnt, fullsiz, eofflag, ncookies;
 	u_int64_t off, toff, verf __unused;
 	u_long *cookies = NULL, *cookiep;
 	struct uio io;
 	struct iovec iv;
 	int is_ufs;
 	struct thread *p = curthread;
 
 	if (nd->nd_repstat) {
 		nfsrv_postopattr(nd, getret, &at);
 		goto out;
 	}
 
 	/*
 	 * NFSv2 carries a 32-bit directory offset; the other versions carry
 	 * a 64-bit offset followed by a 64-bit cookie verifier.  Either way
 	 * tl is left pointing at the "count" word.
 	 */
 	if (nd->nd_flag & ND_NFSV2) {
 		NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 		off = fxdr_unsigned(u_quad_t, *tl++);
 	} else {
 		NFSM_DISSECT(tl, u_int32_t *, 5 * NFSX_UNSIGNED);
 		off = fxdr_hyper(tl);
 		tl += 2;
 		verf = fxdr_hyper(tl);
 		tl += 2;
 	}
 	toff = off;
 
 	/*
 	 * Clamp an out-of-range count to the server maximum and round the
 	 * size handed to VOP_READDIR() up to a multiple of DIRBLKSIZ.
 	 */
 	cnt = fxdr_unsigned(int, *tl);
 	if (cnt > NFS_SRVMAXDATA(nd) || cnt < 0)
 		cnt = NFS_SRVMAXDATA(nd);
 	siz = ((cnt + DIRBLKSIZ - 1) & ~(DIRBLKSIZ - 1));
 	fullsiz = siz;
 	if (nd->nd_flag & ND_NFSV3) {
 		nd->nd_repstat = getret = nfsvno_getattr(vp, &at, nd, p, 1,
 		    NULL);
 #if 0
 		/*
 		 * va_filerev is not sufficient as a cookie verifier,
 		 * since it is not supposed to change when entries are
 		 * removed/added unless that offset cookies returned to
 		 * the client are no longer valid.
 		 */
 		if (!nd->nd_repstat && toff && verf != at.na_filerev)
 			nd->nd_repstat = NFSERR_BAD_COOKIE;
 #endif
 	}
 	if (!nd->nd_repstat && vp->v_type != VDIR)
 		nd->nd_repstat = NFSERR_NOTDIR;
 	if (nd->nd_repstat == 0 && cnt == 0) {
 		if (nd->nd_flag & ND_NFSV2)
 			/* NFSv2 does not have NFSERR_TOOSMALL */
 			nd->nd_repstat = EPERM;
 		else
 			nd->nd_repstat = NFSERR_TOOSMALL;
 	}
 	if (!nd->nd_repstat)
 		nd->nd_repstat = nfsvno_accchk(vp, VEXEC,
 		    nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE,
 		    NFSACCCHK_VPISLOCKED, NULL);
 	if (nd->nd_repstat) {
 		vput(vp);
 		if (nd->nd_flag & ND_NFSV3)
 			nfsrv_postopattr(nd, getret, &at);
 		goto out;
 	}
 	is_ufs = strcmp(vp->v_mount->mnt_vfc->vfc_name, "ufs") == 0;
 	rbuf = malloc(siz, M_TEMP, M_WAITOK);
 again:
 	/* Each pass starts with a fresh cookie array from VOP_READDIR(). */
 	eofflag = 0;
 	if (cookies) {
 		free(cookies, M_TEMP);
 		cookies = NULL;
 	}
 
 	iv.iov_base = rbuf;
 	iv.iov_len = siz;
 	io.uio_iov = &iv;
 	io.uio_iovcnt = 1;
 	io.uio_offset = (off_t)off;
 	io.uio_resid = siz;
 	io.uio_segflg = UIO_SYSSPACE;
 	io.uio_rw = UIO_READ;
 	io.uio_td = NULL;
 	nd->nd_repstat = VOP_READDIR(vp, &io, nd->nd_cred, &eofflag, &ncookies,
 	    &cookies);
 	off = (u_int64_t)io.uio_offset;
 	/* siz becomes the number of bytes actually placed in rbuf. */
 	if (io.uio_resid)
 		siz -= io.uio_resid;
 
 	if (!cookies && !nd->nd_repstat)
 		nd->nd_repstat = NFSERR_PERM;
 	if (nd->nd_flag & ND_NFSV3) {
 		getret = nfsvno_getattr(vp, &at, nd, p, 1, NULL);
 		if (!nd->nd_repstat)
 			nd->nd_repstat = getret;
 	}
 
 	/*
 	 * Handles the failed cases. nd->nd_repstat == 0 past here.
 	 */
 	if (nd->nd_repstat) {
 		vput(vp);
 		free(rbuf, M_TEMP);
 		if (cookies)
 			free(cookies, M_TEMP);
 		if (nd->nd_flag & ND_NFSV3)
 			nfsrv_postopattr(nd, getret, &at);
 		goto out;
 	}
 	/*
 	 * If nothing read, return eof
 	 * rpc reply
 	 */
 	if (siz == 0) {
 		vput(vp);
 		if (nd->nd_flag & ND_NFSV2) {
 			NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 		} else {
 			nfsrv_postopattr(nd, getret, &at);
 			NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
 			txdr_hyper(at.na_filerev, tl);
 			tl += 2;
 		}
 		*tl++ = newnfs_false;
 		*tl = newnfs_true;
 		free(rbuf, M_TEMP);
 		free(cookies, M_TEMP);
 		goto out;
 	}
 
 	/*
 	 * Check for degenerate cases of nothing useful read.
 	 * If so go try again
 	 */
 	cpos = rbuf;
 	cend = rbuf + siz;
 	dp = (struct dirent *)cpos;
 	cookiep = cookies;
 
 	/*
 	 * For some reason FreeBSD's ufs_readdir() chooses to back the
 	 * directory offset up to a block boundary, so it is necessary to
 	 * skip over the records that precede the requested offset. This
 	 * requires the assumption that file offset cookies monotonically
 	 * increase.
 	 */
 	while (cpos < cend && ncookies > 0 &&
 	    (dp->d_fileno == 0 || dp->d_type == DT_WHT ||
 	     (is_ufs == 1 && ((u_quad_t)(*cookiep)) <= toff))) {
 		cpos += dp->d_reclen;
 		dp = (struct dirent *)cpos;
 		cookiep++;
 		ncookies--;
 	}
 	if (cpos >= cend || ncookies == 0) {
 		/* Nothing usable in this chunk; read the next one. */
 		siz = fullsiz;
 		toff = off;
 		goto again;
 	}
 	vput(vp);
 
 	/*
 	 * If cnt > MCLBYTES and the reply will not be saved, use
 	 * ext_pgs mbufs for TLS.
 	 * For NFSv4.0, we do not know for sure if the reply will
 	 * be saved, so do not use ext_pgs mbufs for NFSv4.0.
 	 */
 	if (cnt > MCLBYTES && siz > MCLBYTES &&
 	    (nd->nd_flag & (ND_TLS | ND_EXTPG | ND_SAVEREPLY)) == ND_TLS &&
 	    (nd->nd_flag & (ND_NFSV4 | ND_NFSV41)) != ND_NFSV4)
 		nd->nd_flag |= ND_EXTPG;
 
 	/*
 	 * dirlen is the size of the reply, including all XDR and must
 	 * not exceed cnt. For NFSv2, RFC1094 didn't clearly indicate
 	 * if the XDR should be included in "count", but to be safe, we do.
 	 * (Include the two booleans at the end of the reply in dirlen now.)
 	 */
 	if (nd->nd_flag & ND_NFSV3) {
 		nfsrv_postopattr(nd, getret, &at);
 		NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 		txdr_hyper(at.na_filerev, tl);
 		dirlen = NFSX_V3POSTOPATTR + NFSX_VERF + 2 * NFSX_UNSIGNED;
 	} else {
 		dirlen = 2 * NFSX_UNSIGNED;
 	}
 
 	/* Loop through the records and build reply */
 	while (cpos < cend && ncookies > 0) {
 		nlen = dp->d_namlen;
 		if (dp->d_fileno != 0 && dp->d_type != DT_WHT &&
 			nlen <= NFS_MAXNAMLEN) {
 			if (nd->nd_flag & ND_NFSV3)
 				dirlen += (6*NFSX_UNSIGNED + NFSM_RNDUP(nlen));
 			else
 				dirlen += (4*NFSX_UNSIGNED + NFSM_RNDUP(nlen));
 			if (dirlen > cnt) {
 				eofflag = 0;
 				break;
 			}
 
 			/*
 			 * Build the directory record xdr from
 			 * the dirent entry.
 			 * The NFSv3 fileid and cookie are 64 bits on the
 			 * wire, so encode the full values instead of
 			 * zero-filling the high word.  NFSv2 only has
 			 * 32-bit fields.
 			 */
 			if (nd->nd_flag & ND_NFSV3) {
 				NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
 				*tl++ = newnfs_true;
 				txdr_hyper((u_int64_t)dp->d_fileno, tl);
 			} else {
 				NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 				*tl++ = newnfs_true;
 				*tl = txdr_unsigned(dp->d_fileno);
 			}
 			(void) nfsm_strtom(nd, dp->d_name, nlen);
 			if (nd->nd_flag & ND_NFSV3) {
 				NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 				txdr_hyper((u_int64_t)*cookiep, tl);
 			} else {
 				NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 				*tl = txdr_unsigned(*cookiep);
 			}
 		}
 		cpos += dp->d_reclen;
 		dp = (struct dirent *)cpos;
 		cookiep++;
 		ncookies--;
 	}
 	/* Unconsumed records mean the client has not seen everything yet. */
 	if (cpos < cend)
 		eofflag = 0;
 	NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 	*tl++ = newnfs_false;
 	if (eofflag)
 		*tl = newnfs_true;
 	else
 		*tl = newnfs_false;
 	free(rbuf, M_TEMP);
 	free(cookies, M_TEMP);
 
 out:
 	NFSEXITCODE2(0, nd);
 	return (0);
 nfsmout:
 	vput(vp);
 	NFSEXITCODE2(error, nd);
 	return (error);
 }
 
 /*
  * Readdirplus for V3 and Readdir for V4.
  */
 int
 nfsrvd_readdirplus(struct nfsrv_descript *nd, int isdgram,
     struct vnode *vp, struct nfsexstuff *exp)
 {
 	struct dirent *dp;
 	u_int32_t *tl;
 	int dirlen;
 	char *cpos, *cend, *rbuf;
 	struct vnode *nvp;
 	fhandle_t nfh;
 	struct nfsvattr nva, at, *nvap = &nva;
 	struct mbuf *mb0, *mb1;
 	struct nfsreferral *refp;
 	int nlen, r, error = 0, getret = 1, usevget = 1;
 	int siz, cnt, fullsiz, eofflag, ncookies, entrycnt;
 	caddr_t bpos0, bpos1;
 	u_int64_t off, toff, verf;
 	u_long *cookies = NULL, *cookiep;
 	nfsattrbit_t attrbits, rderrbits, savbits;
 	struct uio io;
 	struct iovec iv;
 	struct componentname cn;
 	int at_root, is_ufs, is_zfs, needs_unbusy, supports_nfsv4acls;
 	struct mount *mp, *new_mp;
 	uint64_t mounted_on_fileno;
 	struct thread *p = curthread;
 	int bextpg0, bextpg1, bextpgsiz0, bextpgsiz1;
 
 	if (nd->nd_repstat) {
 		nfsrv_postopattr(nd, getret, &at);
 		goto out;
 	}
 	NFSM_DISSECT(tl, u_int32_t *, 6 * NFSX_UNSIGNED);
 	off = fxdr_hyper(tl);
 	toff = off;
 	tl += 2;
 	verf = fxdr_hyper(tl);
 	tl += 2;
 	siz = fxdr_unsigned(int, *tl++);
 	cnt = fxdr_unsigned(int, *tl);
 
 	/*
 	 * Use the server's maximum data transfer size as the upper bound
 	 * on reply datalen.
 	 */
 	if (cnt > NFS_SRVMAXDATA(nd) || cnt < 0)
 		cnt = NFS_SRVMAXDATA(nd);
 
 	/*
 	 * siz is a "hint" of how much directory information (name, fileid,
 	 * cookie) should be in the reply. At least one client "hints" 0,
 	 * so I set it to cnt for that case. I also round it up to the
 	 * next multiple of DIRBLKSIZ.
 	 * Since the size of a Readdirplus directory entry reply will always
 	 * be greater than a directory entry returned by VOP_READDIR(), it
 	 * does not make sense to read more than NFS_SRVMAXDATA() via
 	 * VOP_READDIR().
 	 */
 	if (siz <= 0)
 		siz = cnt;
 	else if (siz > NFS_SRVMAXDATA(nd))
 		siz = NFS_SRVMAXDATA(nd);
 	siz = ((siz + DIRBLKSIZ - 1) & ~(DIRBLKSIZ - 1));
 
 	if (nd->nd_flag & ND_NFSV4) {
 		error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL);
 		if (error)
 			goto nfsmout;
 		NFSSET_ATTRBIT(&savbits, &attrbits);
 		NFSCLRNOTFILLABLE_ATTRBIT(&attrbits, nd);
 		NFSZERO_ATTRBIT(&rderrbits);
 		NFSSETBIT_ATTRBIT(&rderrbits, NFSATTRBIT_RDATTRERROR);
 	} else {
 		NFSZERO_ATTRBIT(&attrbits);
 	}
 	fullsiz = siz;
 	nd->nd_repstat = getret = nfsvno_getattr(vp, &at, nd, p, 1, NULL);
 #if 0
 	if (!nd->nd_repstat) {
 	    if (off && verf != at.na_filerev) {
 		/*
 		 * va_filerev is not sufficient as a cookie verifier,
 		 * since it is not supposed to change when entries are
 		 * removed/added unless that offset cookies returned to
 		 * the client are no longer valid.
 		 */
 		if (nd->nd_flag & ND_NFSV4) {
 			nd->nd_repstat = NFSERR_NOTSAME;
 		} else {
 			nd->nd_repstat = NFSERR_BAD_COOKIE;
 		}
 	    }
 	}
 #endif
 	if (!nd->nd_repstat && vp->v_type != VDIR)
 		nd->nd_repstat = NFSERR_NOTDIR;
 	if (!nd->nd_repstat && cnt == 0)
 		nd->nd_repstat = NFSERR_TOOSMALL;
 	if (!nd->nd_repstat)
 		nd->nd_repstat = nfsvno_accchk(vp, VEXEC,
 		    nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE,
 		    NFSACCCHK_VPISLOCKED, NULL);
 	if (nd->nd_repstat) {
 		vput(vp);
 		if (nd->nd_flag & ND_NFSV3)
 			nfsrv_postopattr(nd, getret, &at);
 		goto out;
 	}
 	is_ufs = strcmp(vp->v_mount->mnt_vfc->vfc_name, "ufs") == 0;
 	is_zfs = strcmp(vp->v_mount->mnt_vfc->vfc_name, "zfs") == 0;
 
 	rbuf = malloc(siz, M_TEMP, M_WAITOK);
 again:
 	eofflag = 0;
 	if (cookies) {
 		free(cookies, M_TEMP);
 		cookies = NULL;
 	}
 
 	iv.iov_base = rbuf;
 	iv.iov_len = siz;
 	io.uio_iov = &iv;
 	io.uio_iovcnt = 1;
 	io.uio_offset = (off_t)off;
 	io.uio_resid = siz;
 	io.uio_segflg = UIO_SYSSPACE;
 	io.uio_rw = UIO_READ;
 	io.uio_td = NULL;
 	nd->nd_repstat = VOP_READDIR(vp, &io, nd->nd_cred, &eofflag, &ncookies,
 	    &cookies);
 	off = (u_int64_t)io.uio_offset;
 	if (io.uio_resid)
 		siz -= io.uio_resid;
 
 	getret = nfsvno_getattr(vp, &at, nd, p, 1, NULL);
 
 	if (!cookies && !nd->nd_repstat)
 		nd->nd_repstat = NFSERR_PERM;
 	if (!nd->nd_repstat)
 		nd->nd_repstat = getret;
 	if (nd->nd_repstat) {
 		vput(vp);
 		if (cookies)
 			free(cookies, M_TEMP);
 		free(rbuf, M_TEMP);
 		if (nd->nd_flag & ND_NFSV3)
 			nfsrv_postopattr(nd, getret, &at);
 		goto out;
 	}
 	/*
 	 * If nothing read, return eof
 	 * rpc reply
 	 */
 	if (siz == 0) {
 		vput(vp);
 		if (nd->nd_flag & ND_NFSV3)
 			nfsrv_postopattr(nd, getret, &at);
 		NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
 		txdr_hyper(at.na_filerev, tl);
 		tl += 2;
 		*tl++ = newnfs_false;
 		*tl = newnfs_true;
 		free(cookies, M_TEMP);
 		free(rbuf, M_TEMP);
 		goto out;
 	}
 
 	/*
 	 * Check for degenerate cases of nothing useful read.
 	 * If so go try again
 	 */
 	cpos = rbuf;
 	cend = rbuf + siz;
 	dp = (struct dirent *)cpos;
 	cookiep = cookies;
 
 	/*
 	 * For some reason FreeBSD's ufs_readdir() chooses to back the
 	 * directory offset up to a block boundary, so it is necessary to
 	 * skip over the records that precede the requested offset. This
 	 * requires the assumption that file offset cookies monotonically
 	 * increase.
 	 */
 	while (cpos < cend && ncookies > 0 &&
 	  (dp->d_fileno == 0 || dp->d_type == DT_WHT ||
 	   (is_ufs == 1 && ((u_quad_t)(*cookiep)) <= toff) ||
 	   ((nd->nd_flag & ND_NFSV4) &&
 	    ((dp->d_namlen == 1 && dp->d_name[0] == '.') ||
 	     (dp->d_namlen==2 && dp->d_name[0]=='.' && dp->d_name[1]=='.'))))) {
 		cpos += dp->d_reclen;
 		dp = (struct dirent *)cpos;
 		cookiep++;
 		ncookies--;
 	}
 	if (cpos >= cend || ncookies == 0) {
 		siz = fullsiz;
 		toff = off;
 		goto again;
 	}
 
 	/*
 	 * Busy the file system so that the mount point won't go away
 	 * and, as such, VFS_VGET() can be used safely.
 	 */
 	mp = vp->v_mount;
 	vfs_ref(mp);
 	NFSVOPUNLOCK(vp);
 	nd->nd_repstat = vfs_busy(mp, 0);
 	vfs_rel(mp);
 	if (nd->nd_repstat != 0) {
 		vrele(vp);
 		free(cookies, M_TEMP);
 		free(rbuf, M_TEMP);
 		if (nd->nd_flag & ND_NFSV3)
 			nfsrv_postopattr(nd, getret, &at);
 		goto out;
 	}
 
 	/*
 	 * Check to see if entries in this directory can be safely acquired
 	 * via VFS_VGET() or if a switch to VOP_LOOKUP() is required.
 	 * ZFS snapshot directories need VOP_LOOKUP(), so that any
 	 * automount of the snapshot directory that is required will
 	 * be done.
 	 * This needs to be done here for NFSv4, since NFSv4 never does
 	 * a VFS_VGET() for "." or "..".
 	 */
 	if (is_zfs == 1) {
 		r = VFS_VGET(mp, at.na_fileid, LK_SHARED, &nvp);
 		if (r == EOPNOTSUPP) {
 			usevget = 0;
 			cn.cn_nameiop = LOOKUP;
 			cn.cn_lkflags = LK_SHARED | LK_RETRY;
 			cn.cn_cred = nd->nd_cred;
 			cn.cn_thread = p;
 		} else if (r == 0)
 			vput(nvp);
 	}
 
 	/*
 	 * If the reply is likely to exceed MCLBYTES and the reply will
 	 * not be saved, use ext_pgs mbufs for TLS.
 	 * It is difficult to predict how large each entry will be and
 	 * how many entries have been read, so just assume the directory
 	 * entries grow by a factor of 4 when attributes are included.
 	 * For NFSv4.0, we do not know for sure if the reply will
 	 * be saved, so do not use ext_pgs mbufs for NFSv4.0.
 	 */
 	if (cnt > MCLBYTES && siz > MCLBYTES / 4 &&
 	    (nd->nd_flag & (ND_TLS | ND_EXTPG | ND_SAVEREPLY)) == ND_TLS &&
 	    (nd->nd_flag & (ND_NFSV4 | ND_NFSV41)) != ND_NFSV4)
 		nd->nd_flag |= ND_EXTPG;
 
 	/*
 	 * Save this position, in case there is an error before one entry
 	 * is created.
 	 */
 	mb0 = nd->nd_mb;
 	bpos0 = nd->nd_bpos;
 	bextpg0 = nd->nd_bextpg;
 	bextpgsiz0 = nd->nd_bextpgsiz;
 
 	/*
 	 * Fill in the first part of the reply.
 	 * dirlen is the reply length in bytes and cannot exceed cnt.
 	 * (Include the two booleans at the end of the reply in dirlen now,
 	 *  so we recognize when we have exceeded cnt.)
 	 */
 	if (nd->nd_flag & ND_NFSV3) {
 		dirlen = NFSX_V3POSTOPATTR + NFSX_VERF + 2 * NFSX_UNSIGNED;
 		nfsrv_postopattr(nd, getret, &at);
 	} else {
 		dirlen = NFSX_VERF + 2 * NFSX_UNSIGNED;
 	}
 	NFSM_BUILD(tl, u_int32_t *, NFSX_VERF);
 	txdr_hyper(at.na_filerev, tl);
 
 	/*
 	 * Save this position, in case there is an empty reply needed.
 	 */
 	mb1 = nd->nd_mb;
 	bpos1 = nd->nd_bpos;
 	bextpg1 = nd->nd_bextpg;
 	bextpgsiz1 = nd->nd_bextpgsiz;
 
 	/* Loop through the records and build reply */
 	entrycnt = 0;
 	while (cpos < cend && ncookies > 0 && dirlen < cnt) {
 		nlen = dp->d_namlen;
 		if (dp->d_fileno != 0 && dp->d_type != DT_WHT &&
 		    nlen <= NFS_MAXNAMLEN &&
 		    ((nd->nd_flag & ND_NFSV3) || nlen > 2 ||
 		     (nlen==2 && (dp->d_name[0]!='.' || dp->d_name[1]!='.'))
 		      || (nlen == 1 && dp->d_name[0] != '.'))) {
 			/*
 			 * Save the current position in the reply, in case
 			 * this entry exceeds cnt.
 			 */
 			mb1 = nd->nd_mb;
 			bpos1 = nd->nd_bpos;
 			bextpg1 = nd->nd_bextpg;
 			bextpgsiz1 = nd->nd_bextpgsiz;
 
 			/*
 			 * For readdir_and_lookup get the vnode using
 			 * the file number.
 			 */
 			nvp = NULL;
 			refp = NULL;
 			r = 0;
 			at_root = 0;
 			needs_unbusy = 0;
 			new_mp = mp;
 			mounted_on_fileno = (uint64_t)dp->d_fileno;
 			if ((nd->nd_flag & ND_NFSV3) ||
 			    NFSNONZERO_ATTRBIT(&savbits)) {
 				if (nd->nd_flag & ND_NFSV4)
 					refp = nfsv4root_getreferral(NULL,
 					    vp, dp->d_fileno);
 				if (refp == NULL) {
 					if (usevget)
 						r = VFS_VGET(mp, dp->d_fileno,
 						    LK_SHARED, &nvp);
 					else
 						r = EOPNOTSUPP;
 					if (r == EOPNOTSUPP) {
 						if (usevget) {
 							usevget = 0;
 							cn.cn_nameiop = LOOKUP;
 							cn.cn_lkflags =
 							    LK_SHARED |
 							    LK_RETRY;
 							cn.cn_cred =
 							    nd->nd_cred;
 							cn.cn_thread = p;
 						}
 						cn.cn_nameptr = dp->d_name;
 						cn.cn_namelen = nlen;
 						cn.cn_flags = ISLASTCN |
 						    NOFOLLOW | LOCKLEAF;
 						if (nlen == 2 &&
 						    dp->d_name[0] == '.' &&
 						    dp->d_name[1] == '.')
 							cn.cn_flags |=
 							    ISDOTDOT;
 						if (NFSVOPLOCK(vp, LK_SHARED)
 						    != 0) {
 							nd->nd_repstat = EPERM;
 							break;
 						}
 						if ((vp->v_vflag & VV_ROOT) != 0
 						    && (cn.cn_flags & ISDOTDOT)
 						    != 0) {
 							vref(vp);
 							nvp = vp;
 							r = 0;
 						} else {
 							r = VOP_LOOKUP(vp, &nvp,
 							    &cn);
 							if (vp != nvp)
 								NFSVOPUNLOCK(vp);
 						}
 					}
 
 					/*
 					 * For NFSv4, check to see if nvp is
 					 * a mount point and get the mount
 					 * point vnode, as required.
 					 */
 					if (r == 0 &&
 					    nfsrv_enable_crossmntpt != 0 &&
 					    (nd->nd_flag & ND_NFSV4) != 0 &&
 					    nvp->v_type == VDIR &&
 					    nvp->v_mountedhere != NULL) {
 						new_mp = nvp->v_mountedhere;
 						r = vfs_busy(new_mp, 0);
 						vput(nvp);
 						nvp = NULL;
 						if (r == 0) {
 							r = VFS_ROOT(new_mp,
 							    LK_SHARED, &nvp);
 							needs_unbusy = 1;
 							if (r == 0)
 								at_root = 1;
 						}
 					}
 				}
 
 				/*
 				 * If we failed to look up the entry, then it
 				 * has become invalid, most likely removed.
 				 */
 				if (r != 0) {
 					if (needs_unbusy)
 						vfs_unbusy(new_mp);
 					goto invalid;
 				}
 				KASSERT(refp != NULL || nvp != NULL,
 				    ("%s: undetected lookup error", __func__));
 
 				if (refp == NULL &&
 				    ((nd->nd_flag & ND_NFSV3) ||
 				     NFSNONZERO_ATTRBIT(&attrbits))) {
 					r = nfsvno_getfh(nvp, &nfh, p);
 					if (!r)
 					    r = nfsvno_getattr(nvp, nvap, nd, p,
 						1, &attrbits);
 					if (r == 0 && is_zfs == 1 &&
 					    nfsrv_enable_crossmntpt != 0 &&
 					    (nd->nd_flag & ND_NFSV4) != 0 &&
 					    nvp->v_type == VDIR &&
 					    vp->v_mount != nvp->v_mount) {
 					    /*
 					     * For a ZFS snapshot, there is a
 					     * pseudo mount that does not set
 					     * v_mountedhere, so it needs to
 					     * be detected via a different
 					     * mount structure.
 					     */
 					    at_root = 1;
 					    if (new_mp == mp)
 						new_mp = nvp->v_mount;
 					}
 				}
 
 				/*
 				 * If we failed to get attributes of the entry,
 				 * then just skip it for NFSv3 (the traditional
 				 * behavior in the old NFS server).
 				 * For NFSv4 the behavior is controlled by
 				 * RDATTRERROR: we either ignore the error or
 				 * fail the request.
 				 * Note that RDATTRERROR is never set for NFSv3.
 				 */
 				if (r != 0) {
 					if (!NFSISSET_ATTRBIT(&attrbits,
 					    NFSATTRBIT_RDATTRERROR)) {
 						vput(nvp);
 						if (needs_unbusy != 0)
 							vfs_unbusy(new_mp);
 						if ((nd->nd_flag & ND_NFSV3))
 							goto invalid;
 						nd->nd_repstat = r;
 						break;
 					}
 				}
 			}
 
 			/*
 			 * Build the directory record xdr
 			 */
 			if (nd->nd_flag & ND_NFSV3) {
 				NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
 				*tl++ = newnfs_true;
 				*tl++ = 0;
 				*tl = txdr_unsigned(dp->d_fileno);
 				dirlen += nfsm_strtom(nd, dp->d_name, nlen);
 				NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 				*tl++ = 0;
 				*tl = txdr_unsigned(*cookiep);
 				nfsrv_postopattr(nd, 0, nvap);
 				dirlen += nfsm_fhtom(nd,(u_int8_t *)&nfh,0,1);
 				dirlen += (5*NFSX_UNSIGNED+NFSX_V3POSTOPATTR);
 				if (nvp != NULL)
 					vput(nvp);
 			} else {
 				NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
 				*tl++ = newnfs_true;
 				*tl++ = 0;
 				*tl = txdr_unsigned(*cookiep);
 				dirlen += nfsm_strtom(nd, dp->d_name, nlen);
 				if (nvp != NULL) {
 					supports_nfsv4acls =
 					    nfs_supportsnfsv4acls(nvp);
 					NFSVOPUNLOCK(nvp);
 				} else
 					supports_nfsv4acls = 0;
 				if (refp != NULL) {
 					dirlen += nfsrv_putreferralattr(nd,
 					    &savbits, refp, 0,
 					    &nd->nd_repstat);
 					if (nd->nd_repstat) {
 						if (nvp != NULL)
 							vrele(nvp);
 						if (needs_unbusy != 0)
 							vfs_unbusy(new_mp);
 						break;
 					}
 				} else if (r) {
 					dirlen += nfsvno_fillattr(nd, new_mp,
 					    nvp, nvap, &nfh, r, &rderrbits,
 					    nd->nd_cred, p, isdgram, 0,
 					    supports_nfsv4acls, at_root,
 					    mounted_on_fileno);
 				} else {
 					dirlen += nfsvno_fillattr(nd, new_mp,
 					    nvp, nvap, &nfh, r, &attrbits,
 					    nd->nd_cred, p, isdgram, 0,
 					    supports_nfsv4acls, at_root,
 					    mounted_on_fileno);
 				}
 				if (nvp != NULL)
 					vrele(nvp);
 				dirlen += (3 * NFSX_UNSIGNED);
 			}
 			if (needs_unbusy != 0)
 				vfs_unbusy(new_mp);
 			if (dirlen <= cnt)
 				entrycnt++;
 		}
 invalid:
 		cpos += dp->d_reclen;
 		dp = (struct dirent *)cpos;
 		cookiep++;
 		ncookies--;
 	}
 	vrele(vp);
 	vfs_unbusy(mp);
 
 	/*
 	 * If dirlen > cnt, we must strip off the last entry. If that
 	 * results in an empty reply, report NFSERR_TOOSMALL.
 	 */
 	if (dirlen > cnt || nd->nd_repstat) {
 		if (!nd->nd_repstat && entrycnt == 0)
 			nd->nd_repstat = NFSERR_TOOSMALL;
 		if (nd->nd_repstat) {
 			nfsm_trimtrailing(nd, mb0, bpos0, bextpg0, bextpgsiz0);
 			if (nd->nd_flag & ND_NFSV3)
 				nfsrv_postopattr(nd, getret, &at);
 		} else
 			nfsm_trimtrailing(nd, mb1, bpos1, bextpg1, bextpgsiz1);
 		eofflag = 0;
 	} else if (cpos < cend)
 		eofflag = 0;
 	if (!nd->nd_repstat) {
 		NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
 		*tl++ = newnfs_false;
 		if (eofflag)
 			*tl = newnfs_true;
 		else
 			*tl = newnfs_false;
 	}
 	free(cookies, M_TEMP);
 	free(rbuf, M_TEMP);
 
 out:
 	NFSEXITCODE2(0, nd);
 	return (0);
 nfsmout:
 	vput(vp);
 	NFSEXITCODE2(error, nd);
 	return (error);
 }
 
 /*
  * Decode the settable attributes from the request mbuf list into *nvap.
  * The NFSv2 and NFSv3 encodings are parsed here; NFSv4 requests are
  * handed to nfsv4_sattr().  Only the attributes that the client
  * supplied are written into *nvap.
  * Returns the value of error (0 when nothing failed).
  */
 int
 nfsrv_sattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nvap,
     nfsattrbit_t *attrbitp, NFSACL_T *aclp, struct thread *p)
 {
 	struct nfsv2_sattr *v2sa;
 	u_int32_t *wp;
 	int error = 0, atime_from_client = 0;
 	int vers, how;
 
 	vers = (int)(nd->nd_flag & (ND_NFSV2 | ND_NFSV3 | ND_NFSV4));
 	if (vers == ND_NFSV4) {
 		error = nfsv4_sattr(nd, vp, nvap, attrbitp, aclp, p);
 	} else if (vers == ND_NFSV2) {
 		/* NFSv2 sends one fixed-size structure. */
 		NFSM_DISSECT(v2sa, struct nfsv2_sattr *, NFSX_V2SATTR);
 		/*
 		 * Some old clients didn't fill in the high order 16bits.
 		 * --> check the low order 2 bytes for 0xffff
 		 */
 		if ((fxdr_unsigned(int, v2sa->sa_mode) & 0xffff) != 0xffff)
 			nvap->na_mode = nfstov_mode(v2sa->sa_mode);
 		/* Fields equal to newnfs_xdrneg1 are left untouched. */
 		if (v2sa->sa_uid != newnfs_xdrneg1)
 			nvap->na_uid = fxdr_unsigned(uid_t, v2sa->sa_uid);
 		if (v2sa->sa_gid != newnfs_xdrneg1)
 			nvap->na_gid = fxdr_unsigned(gid_t, v2sa->sa_gid);
 		if (v2sa->sa_size != newnfs_xdrneg1)
 			nvap->na_size = fxdr_unsigned(u_quad_t, v2sa->sa_size);
 		if (v2sa->sa_atime.nfsv2_sec != newnfs_xdrneg1) {
 #ifdef notyet
 			fxdr_nfsv2time(&v2sa->sa_atime, &nvap->na_atime);
 #else
 			nvap->na_atime.tv_sec =
 			    fxdr_unsigned(u_int32_t, v2sa->sa_atime.nfsv2_sec);
 			nvap->na_atime.tv_nsec = 0;
 #endif
 		}
 		if (v2sa->sa_mtime.nfsv2_sec != newnfs_xdrneg1)
 			fxdr_nfsv2time(&v2sa->sa_mtime, &nvap->na_mtime);
 	} else if (vers == ND_NFSV3) {
 		/*
 		 * For NFSv3, mode, uid, gid and size are each preceded by
 		 * a boolean that says whether the value follows.
 		 */
 		NFSM_DISSECT(wp, u_int32_t *, NFSX_UNSIGNED);
 		if (*wp == newnfs_true) {
 			NFSM_DISSECT(wp, u_int32_t *, NFSX_UNSIGNED);
 			nvap->na_mode = nfstov_mode(*wp);
 		}
 		NFSM_DISSECT(wp, u_int32_t *, NFSX_UNSIGNED);
 		if (*wp == newnfs_true) {
 			NFSM_DISSECT(wp, u_int32_t *, NFSX_UNSIGNED);
 			nvap->na_uid = fxdr_unsigned(uid_t, *wp);
 		}
 		NFSM_DISSECT(wp, u_int32_t *, NFSX_UNSIGNED);
 		if (*wp == newnfs_true) {
 			NFSM_DISSECT(wp, u_int32_t *, NFSX_UNSIGNED);
 			nvap->na_gid = fxdr_unsigned(gid_t, *wp);
 		}
 		NFSM_DISSECT(wp, u_int32_t *, NFSX_UNSIGNED);
 		if (*wp == newnfs_true) {
 			NFSM_DISSECT(wp, u_int32_t *, 2 * NFSX_UNSIGNED);
 			nvap->na_size = fxdr_hyper(wp);
 		}
 
 		/* Access time: client supplied, server time, or unchanged. */
 		NFSM_DISSECT(wp, u_int32_t *, NFSX_UNSIGNED);
 		how = fxdr_unsigned(int, *wp);
 		if (how == NFSV3SATTRTIME_TOCLIENT) {
 			NFSM_DISSECT(wp, u_int32_t *, 2 * NFSX_UNSIGNED);
 			fxdr_nfsv3time(wp, &nvap->na_atime);
 			atime_from_client = 1;
 		} else if (how == NFSV3SATTRTIME_TOSERVER) {
 			vfs_timestamp(&nvap->na_atime);
 			nvap->na_vaflags |= VA_UTIMES_NULL;
 		}
 
 		/*
 		 * Modify time: same three choices.  VA_UTIMES_NULL is
 		 * cleared when the client supplies the time and is not
 		 * set if the access time came from the client.
 		 */
 		NFSM_DISSECT(wp, u_int32_t *, NFSX_UNSIGNED);
 		how = fxdr_unsigned(int, *wp);
 		if (how == NFSV3SATTRTIME_TOCLIENT) {
 			NFSM_DISSECT(wp, u_int32_t *, 2 * NFSX_UNSIGNED);
 			fxdr_nfsv3time(wp, &nvap->na_mtime);
 			nvap->na_vaflags &= ~VA_UTIMES_NULL;
 		} else if (how == NFSV3SATTRTIME_TOSERVER) {
 			vfs_timestamp(&nvap->na_mtime);
 			if (atime_from_client == 0)
 				nvap->na_vaflags |= VA_UTIMES_NULL;
 		}
 	}
 nfsmout:
 	NFSEXITCODE2(error, nd);
 	return (error);
 }
 
 /*
  * Handle the settable attributes for V4.
  * Parses the attribute bitmap and the attribute values that follow it,
  * storing the supported ones in *nvap (and the ACL in aclp).  Attributes
  * that cannot be set are consumed from the request and reported via
  * nd->nd_repstat.
  * vp may be NULL (an Open/Create operation, per the MODESETMASKED case).
  * Returns 0, NFSERR_BADXDR when the attribute lengths are inconsistent,
  * NFSERR_ISDIR/NFSERR_INVAL for a size change on a non-regular file, or
  * an error from the helper that failed to parse the request.
  */
 int
 nfsv4_sattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nvap,
     nfsattrbit_t *attrbitp, NFSACL_T *aclp, struct thread *p)
 {
 	u_int32_t *tl;
 	int attrsum = 0;
 	int i, j;
 	int error, attrsize, bitpos, aclsize, aceerr, retnotsup = 0;
 	int moderet, toclient = 0;
 	u_char *cp, namestr[NFSV4_SMALLSTR + 1];
 	uid_t uid;
 	gid_t gid;
 	u_short mode, mask;		/* Same type as va_mode. */
 	struct vattr va;
 
 	error = nfsrv_getattrbits(nd, attrbitp, NULL, &retnotsup);
 	if (error)
 		goto nfsmout;
 	NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 	attrsize = fxdr_unsigned(int, *tl);
 
 	/*
 	 * Loop around getting the setable attributes. If an unsupported
 	 * one is found, set nd_repstat == NFSERR_ATTRNOTSUPP and return.
 	 */
 	if (retnotsup) {
 		nd->nd_repstat = NFSERR_ATTRNOTSUPP;
 		bitpos = NFSATTRBIT_MAX;
 	} else {
 		bitpos = 0;
 	}
 	moderet = 0;
 	for (; bitpos < NFSATTRBIT_MAX; bitpos++) {
 	    /* attrsum counts the bytes parsed so far. */
 	    if (attrsum > attrsize) {
 		error = NFSERR_BADXDR;
 		goto nfsmout;
 	    }
 	    if (NFSISSET_ATTRBIT(attrbitp, bitpos))
 		switch (bitpos) {
 		case NFSATTRBIT_SIZE:
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
 			if (vp != NULL && vp->v_type != VREG) {
 				error = (vp->v_type == VDIR) ? NFSERR_ISDIR :
 				    NFSERR_INVAL;
 				goto nfsmout;
 			}
 			nvap->na_size = fxdr_hyper(tl);
 			attrsum += NFSX_HYPER;
 			break;
 		case NFSATTRBIT_ACL:
 			error = nfsrv_dissectacl(nd, aclp, &aceerr, &aclsize,
 			    p);
 			if (error)
 				goto nfsmout;
 			if (aceerr && !nd->nd_repstat)
 				nd->nd_repstat = aceerr;
 			attrsum += aclsize;
 			break;
 		case NFSATTRBIT_ARCHIVE:
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 			if (!nd->nd_repstat)
 				nd->nd_repstat = NFSERR_ATTRNOTSUPP;
 			attrsum += NFSX_UNSIGNED;
 			break;
 		case NFSATTRBIT_HIDDEN:
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 			if (!nd->nd_repstat)
 				nd->nd_repstat = NFSERR_ATTRNOTSUPP;
 			attrsum += NFSX_UNSIGNED;
 			break;
 		case NFSATTRBIT_MIMETYPE:
 			/* Skip over the string; the attribute is not set. */
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 			i = fxdr_unsigned(int, *tl);
 			error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
 			if (error)
 				goto nfsmout;
 			if (!nd->nd_repstat)
 				nd->nd_repstat = NFSERR_ATTRNOTSUPP;
 			attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(i));
 			break;
 		case NFSATTRBIT_MODE:
 			moderet = NFSERR_INVAL;	/* Can't do MODESETMASKED. */
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 			nvap->na_mode = nfstov_mode(*tl);
 			attrsum += NFSX_UNSIGNED;
 			break;
 		case NFSATTRBIT_OWNER:
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 			j = fxdr_unsigned(int, *tl);
 			/*
 			 * NOTE(review): only a negative length is rejected
 			 * here; consider an upper bound on j before the
 			 * malloc(j + 1) below.
 			 */
 			if (j < 0) {
 				error = NFSERR_BADXDR;
 				goto nfsmout;
 			}
 			/* Use the on-stack buffer for short names. */
 			if (j > NFSV4_SMALLSTR)
 				cp = malloc(j + 1, M_NFSSTRING, M_WAITOK);
 			else
 				cp = namestr;
 			error = nfsrv_mtostr(nd, cp, j);
 			if (error) {
 				if (j > NFSV4_SMALLSTR)
 					free(cp, M_NFSSTRING);
 				goto nfsmout;
 			}
 			if (!nd->nd_repstat) {
 				nd->nd_repstat = nfsv4_strtouid(nd, cp, j,
 				    &uid);
 				if (!nd->nd_repstat)
 					nvap->na_uid = uid;
 			}
 			if (j > NFSV4_SMALLSTR)
 				free(cp, M_NFSSTRING);
 			attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(j));
 			break;
 		case NFSATTRBIT_OWNERGROUP:
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 			j = fxdr_unsigned(int, *tl);
 			/*
 			 * NOTE(review): only a negative length is rejected
 			 * here; consider an upper bound on j before the
 			 * malloc(j + 1) below.
 			 */
 			if (j < 0) {
 				error = NFSERR_BADXDR;
 				goto nfsmout;
 			}
 			/* Use the on-stack buffer for short names. */
 			if (j > NFSV4_SMALLSTR)
 				cp = malloc(j + 1, M_NFSSTRING, M_WAITOK);
 			else
 				cp = namestr;
 			error = nfsrv_mtostr(nd, cp, j);
 			if (error) {
 				if (j > NFSV4_SMALLSTR)
 					free(cp, M_NFSSTRING);
 				goto nfsmout;
 			}
 			if (!nd->nd_repstat) {
 				nd->nd_repstat = nfsv4_strtogid(nd, cp, j,
 				    &gid);
 				if (!nd->nd_repstat)
 					nvap->na_gid = gid;
 			}
 			if (j > NFSV4_SMALLSTR)
 				free(cp, M_NFSSTRING);
 			attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(j));
 			break;
 		case NFSATTRBIT_SYSTEM:
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 			if (!nd->nd_repstat)
 				nd->nd_repstat = NFSERR_ATTRNOTSUPP;
 			attrsum += NFSX_UNSIGNED;
 			break;
 		case NFSATTRBIT_TIMEACCESSSET:
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 			attrsum += NFSX_UNSIGNED;
 			if (fxdr_unsigned(int, *tl)==NFSV4SATTRTIME_TOCLIENT) {
 			    NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
 			    fxdr_nfsv4time(tl, &nvap->na_atime);
 			    toclient = 1;
 			    attrsum += NFSX_V4TIME;
 			} else {
 			    vfs_timestamp(&nvap->na_atime);
 			    nvap->na_vaflags |= VA_UTIMES_NULL;
 			}
 			break;
 		case NFSATTRBIT_TIMEBACKUP:
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
 			if (!nd->nd_repstat)
 				nd->nd_repstat = NFSERR_ATTRNOTSUPP;
 			attrsum += NFSX_V4TIME;
 			break;
 		case NFSATTRBIT_TIMECREATE:
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
 			fxdr_nfsv4time(tl, &nvap->na_btime);
 			attrsum += NFSX_V4TIME;
 			break;
 		case NFSATTRBIT_TIMEMODIFYSET:
 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 			attrsum += NFSX_UNSIGNED;
 			if (fxdr_unsigned(int, *tl)==NFSV4SATTRTIME_TOCLIENT) {
 			    NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
 			    fxdr_nfsv4time(tl, &nvap->na_mtime);
 			    nvap->na_vaflags &= ~VA_UTIMES_NULL;
 			    attrsum += NFSX_V4TIME;
 			} else {
 			    vfs_timestamp(&nvap->na_mtime);
 			    if (!toclient)
 				nvap->na_vaflags |= VA_UTIMES_NULL;
 			}
 			break;
 		case NFSATTRBIT_MODESETMASKED:
 			NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 			mode = fxdr_unsigned(u_short, *tl++);
 			mask = fxdr_unsigned(u_short, *tl);
 			/*
 			 * vp == NULL implies an Open/Create operation.
 			 * This attribute can only be used for Setattr and
 			 * only for NFSv4.1 or higher.
 			 * If moderet != 0, a mode attribute has also been
 			 * specified and this attribute cannot be done in the
 			 * same Setattr operation.
 			 */
 			if ((nd->nd_flag & ND_NFSV41) == 0)
 				nd->nd_repstat = NFSERR_ATTRNOTSUPP;
 			else if ((mode & ~07777) != 0 || (mask & ~07777) != 0 ||
 			    vp == NULL)
 				nd->nd_repstat = NFSERR_INVAL;
 			else if (moderet == 0) {
 				moderet = VOP_GETATTR(vp, &va, nd->nd_cred);
 				/*
 				 * Merge with the current mode only when
 				 * VOP_GETATTR() actually filled in va, so
 				 * that va.va_mode is never read
 				 * uninitialized on the rejected paths above.
 				 */
 				if (moderet == 0)
 					nvap->na_mode = (mode & mask) |
 					    (va.va_mode & ~mask);
 			}
 			if (moderet != 0)
 				nd->nd_repstat = moderet;
 			attrsum += 2 * NFSX_UNSIGNED;
 			break;
 		default:
 			nd->nd_repstat = NFSERR_ATTRNOTSUPP;
 			/*
 			 * set bitpos so we drop out of the loop.
 			 */
 			bitpos = NFSATTRBIT_MAX;
 			break;
 		}
 	}
 
 	/*
 	 * some clients pad the attrlist, so we need to skip over the
 	 * padding.
 	 */
 	if (attrsum > attrsize) {
 		error = NFSERR_BADXDR;
 	} else {
 		attrsize = NFSM_RNDUP(attrsize);
 		if (attrsum < attrsize)
 			error = nfsm_advance(nd, attrsize - attrsum, -1);
 	}
 nfsmout:
 	NFSEXITCODE2(error, nd);
 	return (error);
 }
 
 /*
  * Check/setup export credentials.
  * Rejects the request when its security flavor or NFS version is not
  * acceptable for the export, then replaces or adjusts nd->nd_cred as
  * the export flags require.
  * Returns 0 on success, or the NFS error to send back to the client.
  */
 int
 nfsd_excred(struct nfsrv_descript *nd, struct nfsexstuff *exp,
     struct ucred *credanon)
 {
 	int error = 0, anonymize;
 
 	/* An ND_GSS request has MNT_EXPORTANON cleared for this export. */
 	if ((nd->nd_flag & ND_GSS) != 0)
 		exp->nes_exflag &= ~MNT_EXPORTANON;
 
 	/*
 	 * Check to see if the operation is allowed for this security flavor.
 	 * RFC2623 suggests that the NFSv3 Fsinfo RPC be allowed to
 	 * AUTH_NONE or AUTH_SYS for file systems requiring RPCSEC_GSS.
 	 * Also, allow Secinfo, so that it can acquire the correct flavor(s).
 	 */
 	if (nfsvno_testexp(nd, exp)) {
 		if (nd->nd_procnum != NFSV4OP_SECINFO &&
 		    nd->nd_procnum != NFSPROC_FSINFO) {
 			if ((nd->nd_flag & ND_NFSV4) == 0)
 				error = (NFSERR_AUTHERR | AUTH_TOOWEAK);
 			else
 				error = NFSERR_WRONGSEC;
 			goto out;
 		}
 	}
 
 	/* A V4-only export refuses every other protocol version. */
 	if (NFSVNO_EXV4ONLY(exp) && (nd->nd_flag & ND_NFSV4) == 0) {
 		error = NFSERR_PROGNOTV4;
 		goto out;
 	}
 
 	/* No credential mapping is done unless NFSVNO_EXPORTED() is set. */
 	if (!NFSVNO_EXPORTED(exp))
 		goto out;
 
 	/*
 	 * Now, map the user credentials.
 	 * (Note that ND_AUTHNONE will only be set for an NFSv3
 	 *  Fsinfo RPC. If set for anything else, this code might need
 	 *  to change.)
 	 */
 	anonymize = ((nd->nd_flag & ND_GSS) == 0 &&
 	    nd->nd_cred->cr_uid == 0) ||
 	    NFSVNO_EXPORTANON(exp) ||
 	    (nd->nd_flag & ND_AUTHNONE) != 0;
 	if (anonymize) {
 		/* Take uid, gid and group list from the anonymous cred. */
 		nd->nd_cred->cr_uid = credanon->cr_uid;
 		nd->nd_cred->cr_gid = credanon->cr_gid;
 		crsetgroups(nd->nd_cred, credanon->cr_ngroups,
 		    credanon->cr_groups);
 	} else if ((nd->nd_flag & ND_GSS) == 0) {
 		/*
 		 * If using AUTH_SYS, call nfsrv_getgrpscred() to see
 		 * if there is a replacement credential with a group
 		 * list set up by "nfsuserd -manage-gids".
 		 * If there is no replacement, nfsrv_getgrpscred()
 		 * simply returns its argument.
 		 */
 		nd->nd_cred = nfsrv_getgrpscred(nd->nd_cred);
 	}
 
 out:
 	NFSEXITCODE2(error, nd);
 	return (error);
 }
 
 /*
  * Look up the export settings for client address "nam" on mount "mp" and
  * store them in "exp" (and the anonymous credential in *credp).
  *
  * A VFS_CHECKEXP() failure is forgiven when an NFSv4 root file handle has
  * been set: the export is then reported as "no flags, no flavors".
  * A flavor count outside 1..MAXSECFLAVORS yields EACCES.
  */
 int
 nfsvno_checkexp(struct mount *mp, struct sockaddr *nam, struct nfsexstuff *exp,
     struct ucred **credp)
 {
 	int error;
 
 	error = VFS_CHECKEXP(mp, nam, &exp->nes_exflag, credp,
 	    &exp->nes_numsecflavor, exp->nes_secflavors);
 	if (error != 0) {
 		if (nfs_rootfhset != 0) {
 			exp->nes_exflag = 0;
 			exp->nes_numsecflavor = 0;
 			error = 0;
 		}
 		goto out;
 	}
 
 	if (exp->nes_numsecflavor < 1 ||
 	    exp->nes_numsecflavor > MAXSECFLAVORS) {
 		printf("nfsvno_checkexp: numsecflavors out of range\n");
 		exp->nes_numsecflavor = 0;
 		error = EACCES;
 	}
 
 out:
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Translate a file handle into a vnode and, when a client address is
  * supplied, fetch the export settings that apply to that client.
  *
  * Any VFS_FHTOVP() failure is reported as ESTALE.  Whenever an error is
  * returned after the vnode was obtained, the vnode has been vput().
  */
 int
 nfsvno_fhtovp(struct mount *mp, fhandle_t *fhp, struct sockaddr *nam,
     int lktype, struct vnode **vpp, struct nfsexstuff *exp,
     struct ucred **credp)
 {
 	int error;
 
 	*credp = NULL;
 	exp->nes_numsecflavor = 0;
 
 	error = VFS_FHTOVP(mp, &fhp->fh_fid, lktype, vpp);
 	if (error != 0) {
 		/* Make sure the server replies ESTALE to the client. */
 		error = ESTALE;
 		goto out;
 	}
 
 	/* Without a client address there are no exports to check. */
 	if (nam == NULL)
 		goto out;
 
 	error = VFS_CHECKEXP(mp, nam, &exp->nes_exflag, credp,
 	    &exp->nes_numsecflavor, exp->nes_secflavors);
 	if (error != 0) {
 		if (nfs_rootfhset != 0) {
 			/* Report "not exported" instead of failing. */
 			exp->nes_exflag = 0;
 			exp->nes_numsecflavor = 0;
 			error = 0;
 		} else {
 			vput(*vpp);
 		}
 		goto out;
 	}
 
 	if (exp->nes_numsecflavor < 1 ||
 	    exp->nes_numsecflavor > MAXSECFLAVORS) {
 		printf("nfsvno_fhtovp: numsecflavors out of range\n");
 		exp->nes_numsecflavor = 0;
 		error = EACCES;
 		vput(*vpp);
 	}
 
 out:
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * nfsd_fhtovp() - convert a fh to a vnode ptr
  *	- look up the fsid in the mount list; nd_repstat is set to ESTALE
  *	  if it is not found
  *	- if startwrite is set, call vn_start_write() on the mount point
  *	  and upgrade an LK_SHARED request to LK_EXCLUSIVE when the mount
  *	  does not allow shared writes
  *	- get vp and export rights by calling nfsvno_fhtovp()
  *	- enforce the export's TLS requirements (and, when compiled with
  *	  NFS_REQRSVPORT, the reserved port check for non-NFSv4 requests)
  *	- check/map the credentials via nfsd_excred()
  *	- if mpp != NULL, return the mount point so that it can
  *	  be used for vn_finished_write() by the caller
  *
  * The outcome is reported in nd->nd_repstat; the function itself returns
  * nothing.  On any failure *vpp is NULL and the vnode, if one was obtained,
  * has been vput().
  */
 void
 nfsd_fhtovp(struct nfsrv_descript *nd, struct nfsrvfh *nfp, int lktype,
     struct vnode **vpp, struct nfsexstuff *exp,
     struct mount **mpp, int startwrite)
 {
 	struct mount *mp, *mpw;
 	struct ucred *credanon;
 	fhandle_t *fhp;
 	int error;
 
 	/* Start from a clean slate so that every exit path is well defined. */
 	if (mpp != NULL)
 		*mpp = NULL;
 	*vpp = NULL;
 	fhp = (fhandle_t *)nfp->nfsrvfh_data;
 	mp = vfs_busyfs(&fhp->fh_fsid);
 	if (mp == NULL) {
 		nd->nd_repstat = ESTALE;
 		goto out;
 	}
 
 	if (startwrite) {
 		mpw = mp;
 		error = vn_start_write(NULL, &mpw, V_WAIT);
 		if (error != 0) {
 			mpw = NULL;
 			vfs_unbusy(mp);
 			nd->nd_repstat = ESTALE;
 			goto out;
 		}
 		/* A writer needs the exclusive lock unless shared writes work. */
 		if (lktype == LK_SHARED && !(MNT_SHARED_WRITES(mp)))
 			lktype = LK_EXCLUSIVE;
 	} else
 		mpw = NULL;
 
 	nd->nd_repstat = nfsvno_fhtovp(mp, fhp, nd->nd_nam, lktype, vpp, exp,
 	    &credanon);
 	vfs_unbusy(mp);
 
 	/*
 	 * For NFSv4 without a pseudo root fs, unexported file handles
 	 * can be returned, so that Lookup works everywhere.
 	 * Every other protocol version gets EACCES for them.
 	 */
 	if (!nd->nd_repstat && exp->nes_exflag == 0 &&
 	    !(nd->nd_flag & ND_NFSV4)) {
 		vput(*vpp);
 		*vpp = NULL;
 		nd->nd_repstat = EACCES;
 	}
 
 	/*
 	 * If TLS is required by the export, check the flags in nd_flag.
 	 */
 	if (nd->nd_repstat == 0 && ((NFSVNO_EXTLS(exp) &&
 	    (nd->nd_flag & ND_TLS) == 0) ||
 	     (NFSVNO_EXTLSCERT(exp) &&
 	      (nd->nd_flag & ND_TLSCERT) == 0) ||
 	     (NFSVNO_EXTLSCERTUSER(exp) &&
 	      (nd->nd_flag & ND_TLSCERTUSER) == 0))) {
 		/* *vpp is cleared further down once nd_repstat is seen. */
 		vput(*vpp);
 		nd->nd_repstat = NFSERR_ACCES;
 	}
 
 	/*
 	 * Personally, I've never seen any point in requiring a
 	 * reserved port#, since only in the rare case where the
 	 * clients are all boxes with secure system privileges,
 	 * does it provide any enhanced security, but... some people
 	 * believe it to be useful and keep putting this code back in.
 	 * (There is also some "security checker" out there that
 	 *  complains if the nfs server doesn't enforce this.)
 	 * However, note the following:
 	 * RFC3530 (NFSv4) specifies that a reserved port# not be
 	 *	required.
 	 * RFC2623 recommends that, if a reserved port# is checked for,
 	 *	that there be a way to turn that off--> ifdef'd.
 	 */
 #ifdef NFS_REQRSVPORT
 	if (!nd->nd_repstat) {
 		struct sockaddr_in *saddr;
 		struct sockaddr_in6 *saddr6;
 
 		saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
 		saddr6 = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in6 *);
 		if (!(nd->nd_flag & ND_NFSV4) &&
 		    ((saddr->sin_family == AF_INET &&
 		      ntohs(saddr->sin_port) >= IPPORT_RESERVED) ||
 		     (saddr6->sin6_family == AF_INET6 &&
 		      ntohs(saddr6->sin6_port) >= IPPORT_RESERVED))) {
 			vput(*vpp);
 			nd->nd_repstat = (NFSERR_AUTHERR | AUTH_TOOWEAK);
 		}
 	}
 #endif	/* NFS_REQRSVPORT */
 
 	/*
 	 * Check/setup credentials.
 	 * The uid is saved in nd_saveduid before nfsd_excred() gets a
 	 * chance to map it.
 	 */
 	if (!nd->nd_repstat) {
 		nd->nd_saveduid = nd->nd_cred->cr_uid;
 		nd->nd_repstat = nfsd_excred(nd, exp, credanon);
 		if (nd->nd_repstat)
 			vput(*vpp);
 	}
 	if (credanon != NULL)
 		crfree(credanon);
 	if (nd->nd_repstat) {
 		/* Failure: undo vn_start_write() and hide the stale vnode. */
 		vn_finished_write(mpw);
 		*vpp = NULL;
 	} else if (mpp != NULL) {
 		*mpp = mpw;
 	}
 	/*
 	 * NOTE(review): on success with startwrite set and mpp == NULL,
 	 * mpw is not handed back and vn_finished_write() is not called
 	 * here - confirm that callers always pass mpp with startwrite.
 	 */
 
 out:
 	NFSEXITCODE2(0, nd);
 }
 
 /*
  * Map a descriptor number of thread "p" to its open file.
  * On success the file pointer is stored in *fpp and 0 is returned; EBADF
  * is returned when "fd" is out of range or the slot is empty.
  * The "vpp" argument is accepted but never written.
  */
 static int
 fp_getfvp(struct thread *p, int fd, struct file **fpp, struct vnode **vpp)
 {
 	struct filedesc *fdtab;
 	struct file *filep;
 	int error;
 
 	fdtab = p->td_proc->p_fd;
 	filep = NULL;
 	if (fd >= 0 && fd < fdtab->fd_nfiles)
 		filep = fdtab->fd_ofiles[fd].fde_file;
 
 	if (filep != NULL) {
 		*fpp = filep;
 		error = 0;
 	} else {
 		error = EBADF;
 	}
 
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Called from nfssvc() to update the exports list. Just call
  * vfs_export(). This has to be done, since the v4 root fake fs isn't
  * in the mount list.
  */
 int
 nfsrv_v4rootexport(void *argp, struct ucred *cred, struct thread *p)
 {
 	struct nfsex_args *nfsexargp = (struct nfsex_args *)argp;
 	int error = 0;
 	struct nameidata nd;
 	fhandle_t fh;
 
 	error = vfs_export(&nfsv4root_mnt, &nfsexargp->export);
 	if ((nfsexargp->export.ex_flags & MNT_DELEXPORT) != 0)
 		nfs_rootfhset = 0;
 	else if (error == 0) {
 		if (nfsexargp->fspec == NULL) {
 			error = EPERM;
 			goto out;
 		}
 		/*
 		 * If fspec != NULL, this is the v4root path.
 		 */
 		NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE,
 		    nfsexargp->fspec, p);
 		if ((error = namei(&nd)) != 0)
 			goto out;
 		error = nfsvno_getfh(nd.ni_vp, &fh, p);
 		vrele(nd.ni_vp);
 		if (!error) {
 			nfs_rootfh.nfsrvfh_len = NFSX_MYFH;
 			NFSBCOPY((caddr_t)&fh,
 			    nfs_rootfh.nfsrvfh_data,
 			    sizeof (fhandle_t));
 			nfs_rootfhset = 1;
 		}
 	}
 
 out:
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Report whether the system is near its limit for memory allocation via
  * malloc() or mget().
  * XXX For now this is only a stub: when nfsrv_testmalloclimit is non-zero
  * it returns True once per 1000 calls (and logs a message on every 100th
  * such occasion); otherwise it always returns False.
  */
 int nfsrv_testmalloclimit = 0;
 int
 nfsrv_mallocmget_limit(void)
 {
 	static int printmesg = 0;
 	static int testval = 1;
 	int tick;
 
 	/* The call counter only advances while the test knob is set. */
 	if (nfsrv_testmalloclimit == 0)
 		return (0);
 
 	tick = testval++;
 	if ((tick % 1000) != 0)
 		return (0);
 
 	if ((printmesg++ % 100) == 0)
 		printf("nfsd: malloc/mget near limit\n");
 	return (1);
 }
 
 /*
  * BSD specific, one-time initialization of the fake NFSv4 root mount
  * point (nfsv4root_mnt).  Calls after the first one return immediately.
  */
 void
 nfsd_mntinit(void)
 {
 	static int setup_done = 0;
 
 	if (setup_done != 0)
 		return;
 	setup_done = 1;
 
 	/* Mount flags and export state. */
 	nfsv4root_mnt.mnt_flag = (MNT_RDONLY | MNT_EXPORTED);
 	nfsv4root_mnt.mnt_export = NULL;
 
 	/* Empty vnode lists. */
 	TAILQ_INIT(&nfsv4root_mnt.mnt_nvnodelist);
 	nfsv4root_mnt.mnt_nvnodelistsize = 0;
 	TAILQ_INIT(&nfsv4root_mnt.mnt_lazyvnodelist);
 	nfsv4root_mnt.mnt_lazyvnodelistsize = 0;
 
 	/* Empty mount option lists. */
 	TAILQ_INIT(&nfsv4root_opt);
 	nfsv4root_mnt.mnt_opt = &nfsv4root_opt;
 	TAILQ_INIT(&nfsv4root_newopt);
 	nfsv4root_mnt.mnt_optnew = &nfsv4root_newopt;
 }
 
 /*
  * Get a vnode for a file handle, without checking exports, etc.
  * The vnode is requested with LK_EXCLUSIVE; NULL is returned when the
  * file system cannot be found or the handle cannot be translated.
  */
 struct vnode *
 nfsvno_getvp(fhandle_t *fhp)
 {
 	struct mount *fsmp;
 	struct vnode *found;
 	int error;
 
 	fsmp = vfs_busyfs(&fhp->fh_fsid);
 	if (fsmp == NULL)
 		return (NULL);
 
 	error = VFS_FHTOVP(fsmp, &fhp->fh_fid, LK_EXCLUSIVE, &found);
 	vfs_unbusy(fsmp);
 
 	return (error != 0 ? NULL : found);
 }
 
 /*
  * Do a local VOP_ADVLOCK() for the byte range [first, end) on "vp".
  * "ftype" is the lock type; F_UNLCK releases, anything else is applied
  * with F_SETLK.  An "end" of NFS64BITSSET means "to end of file".
  * Nothing is done (and 0 is returned) when nfsrv_dolocallocks is 0.
  */
 int
 nfsvno_advlock(struct vnode *vp, int ftype, u_int64_t first,
     u_int64_t end, struct thread *td)
 {
 	struct flock lck;
 	int error, op;
 
 	error = 0;
 	if (nfsrv_dolocallocks != 0) {
 		ASSERT_VOP_UNLOCKED(vp, "nfsvno_advlock: vp locked");
 
 		lck.l_whence = SEEK_SET;
 		lck.l_type = ftype;
 		lck.l_start = (off_t)first;
 		/* A zero length covers everything from l_start onwards. */
 		lck.l_len = (end == NFS64BITSSET) ? 0 : (off_t)(end - first);
 
 		/*
 		 * For FreeBSD8, the l_pid and l_sysid must be set to the
 		 * same values for all calls, so that all locks will be held
 		 * by the nfsd server. (The nfsd server handles conflicts
 		 * between the various clients.)
 		 * An NFSv4 lockowner is a ClientID plus an array of up to
 		 * 1024 bytes, so it can't be put in l_sysid.
 		 */
 		if (nfsv4_sysid == 0)
 			nfsv4_sysid = nlm_acquire_next_sysid();
 		lck.l_pid = (pid_t)0;
 		lck.l_sysid = (int)nfsv4_sysid;
 
 		op = (ftype == F_UNLCK) ? F_UNLCK : F_SETLK;
 		error = VOP_ADVLOCK(vp, (caddr_t)td->td_proc, op, &lck,
 		    (F_POSIX | F_REMOTE));
 	}
 
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Check the nfsv4 root exports for the client that sent "nd" and record
  * the result as ND_EXxx flags in nd->nd_flag: one flag per accepted
  * security flavor plus the TLS requirements.
  * Returns NFSERR_PROGUNAVAIL when the client has no access to the root.
  */
 int
 nfsvno_v4rootexport(struct nfsrv_descript *nd)
 {
 	struct ucred *anoncred;
 	uint64_t exflags;
 	int secflavors[MAXSECFLAVORS];
 	int error, flavor, idx, numsecflavor;
 
 	error = vfs_stdcheckexp(&nfsv4root_mnt, nd->nd_nam, &exflags,
 	    &anoncred, &numsecflavor, secflavors);
 	if (error != 0) {
 		error = NFSERR_PROGUNAVAIL;
 		goto out;
 	}
 
 	/* The anonymous credential is not needed here. */
 	if (anoncred != NULL)
 		crfree(anoncred);
 
 	/* Translate each exported flavor into its ND_EXxx flag. */
 	for (idx = 0; idx < numsecflavor; idx++) {
 		flavor = secflavors[idx];
 		if (flavor == AUTH_SYS)
 			nd->nd_flag |= ND_EXAUTHSYS;
 		else if (flavor == RPCSEC_GSS_KRB5)
 			nd->nd_flag |= ND_EXGSS;
 		else if (flavor == RPCSEC_GSS_KRB5I)
 			nd->nd_flag |= ND_EXGSSINTEGRITY;
 		else if (flavor == RPCSEC_GSS_KRB5P)
 			nd->nd_flag |= ND_EXGSSPRIVACY;
 	}
 
 	/* And set ND_EXxx flags for TLS; the cert bits only count with TLS. */
 	if ((exflags & MNT_EXTLS) == 0)
 		goto out;
 	nd->nd_flag |= ND_EXTLS;
 	if ((exflags & MNT_EXTLSCERT) != 0)
 		nd->nd_flag |= ND_EXTLSCERT;
 	if ((exflags & MNT_EXTLSCERTUSER) != 0)
 		nd->nd_flag |= ND_EXTLSCERTUSER;
 
 out:
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Nfs server pseudo system call for the nfsd's.
  *
  * Dispatches on uap->flag:
  *	NFSSVC_NFSDADDSOCK - copy in a struct nfsd_addsock_args and hand
  *	    the referenced socket to nfsrvd_addsock()
  *	NFSSVC_NFSDNFSD - copy in the (old or new style) nfsd arguments,
  *	    bring the optional pNFS strings into kernel memory and call
  *	    nfsrvd_nfsd()
  *	NFSSVC_PNFSDS - pNFS data server maintenance (delete a DS, or copy
  *	    a mirror)
  *	anything else - forwarded to nfssvc_srvcall()
  *
  * Returns 0 or an errno value.
  */
 /*
  * MPSAFE
  */
 static int
 nfssvc_nfsd(struct thread *td, struct nfssvc_args *uap)
 {
 	struct file *fp;
 	struct nfsd_addsock_args sockarg;
 	struct nfsd_nfsd_args nfsdarg;
 	struct nfsd_nfsd_oargs onfsdarg;
 	struct nfsd_pnfsd_args pnfsdarg;
 	struct vnode *vp, *nvp, *curdvp;
 	struct pnfsdsfile *pf;
 	struct nfsdevice *ds, *fds;
 	cap_rights_t rights;
 	int buflen, error, ret;
 	char *buf, *cp, *cp2, *cp3;
 	char fname[PNFS_FILENAME_LEN + 1];
 
 	if (uap->flag & NFSSVC_NFSDADDSOCK) {
 		error = copyin(uap->argp, (caddr_t)&sockarg, sizeof (sockarg));
 		if (error)
 			goto out;
 		/*
 		 * Look the descriptor up with the CAP_SOCK_SERVER rights
 		 * set, since we don't know exactly which socket rights
 		 * will be required later. It is better to be too
 		 * careful than too reckless.
 		 */
 		error = fget(td, sockarg.sock,
-		    cap_rights_init(&rights, CAP_SOCK_SERVER), &fp);
+		    cap_rights_init_one(&rights, CAP_SOCK_SERVER), &fp);
 		if (error != 0)
 			goto out;
 		/* Only sockets can be handed to the server. */
 		if (fp->f_type != DTYPE_SOCKET) {
 			fdrop(fp, td);
 			error = EPERM;
 			goto out;
 		}
 		error = nfsrvd_addsock(fp);
 		fdrop(fp, td);
 	} else if (uap->flag & NFSSVC_NFSDNFSD) {
 		if (uap->argp == NULL) {
 			error = EINVAL;
 			goto out;
 		}
 		if ((uap->flag & NFSSVC_NEWSTRUCT) == 0) {
 			/*
 			 * Old style arguments: convert them to the new
 			 * structure with all of the pNFS fields unset.
 			 */
 			error = copyin(uap->argp, &onfsdarg, sizeof(onfsdarg));
 			if (error == 0) {
 				nfsdarg.principal = onfsdarg.principal;
 				nfsdarg.minthreads = onfsdarg.minthreads;
 				nfsdarg.maxthreads = onfsdarg.maxthreads;
 				nfsdarg.version = 1;
 				nfsdarg.addr = NULL;
 				nfsdarg.addrlen = 0;
 				nfsdarg.dnshost = NULL;
 				nfsdarg.dnshostlen = 0;
 				nfsdarg.dspath = NULL;
 				nfsdarg.dspathlen = 0;
 				nfsdarg.mdspath = NULL;
 				nfsdarg.mdspathlen = 0;
 				nfsdarg.mirrorcnt = 1;
 			}
 		} else
 			error = copyin(uap->argp, &nfsdarg, sizeof(nfsdarg));
 		if (error)
 			goto out;
 		/*
 		 * The four pNFS strings are only used when all of them are
 		 * present with a length in the range 1..9999 and mirrorcnt
 		 * is sane; otherwise they are all reset below.
 		 */
 		if (nfsdarg.addrlen > 0 && nfsdarg.addrlen < 10000 &&
 		    nfsdarg.dnshostlen > 0 && nfsdarg.dnshostlen < 10000 &&
 		    nfsdarg.dspathlen > 0 && nfsdarg.dspathlen < 10000 &&
 		    nfsdarg.mdspathlen > 0 && nfsdarg.mdspathlen < 10000 &&
 		    nfsdarg.mirrorcnt >= 1 &&
 		    nfsdarg.mirrorcnt <= NFSDEV_MAXMIRRORS &&
 		    nfsdarg.addr != NULL && nfsdarg.dnshost != NULL &&
 		    nfsdarg.dspath != NULL && nfsdarg.mdspath != NULL) {
 			NFSD_DEBUG(1, "addrlen=%d dspathlen=%d dnslen=%d"
 			    " mdspathlen=%d mirrorcnt=%d\n", nfsdarg.addrlen,
 			    nfsdarg.dspathlen, nfsdarg.dnshostlen,
 			    nfsdarg.mdspathlen, nfsdarg.mirrorcnt);
 			/*
 			 * Replace each user space pointer in nfsdarg with a
 			 * nul terminated kernel copy.  On a copyin() failure
 			 * the copies made so far are freed again.
 			 */
 			cp = malloc(nfsdarg.addrlen + 1, M_TEMP, M_WAITOK);
 			error = copyin(nfsdarg.addr, cp, nfsdarg.addrlen);
 			if (error != 0) {
 				free(cp, M_TEMP);
 				goto out;
 			}
 			cp[nfsdarg.addrlen] = '\0';	/* Ensure nul term. */
 			nfsdarg.addr = cp;
 			cp = malloc(nfsdarg.dnshostlen + 1, M_TEMP, M_WAITOK);
 			error = copyin(nfsdarg.dnshost, cp, nfsdarg.dnshostlen);
 			if (error != 0) {
 				free(nfsdarg.addr, M_TEMP);
 				free(cp, M_TEMP);
 				goto out;
 			}
 			cp[nfsdarg.dnshostlen] = '\0';	/* Ensure nul term. */
 			nfsdarg.dnshost = cp;
 			cp = malloc(nfsdarg.dspathlen + 1, M_TEMP, M_WAITOK);
 			error = copyin(nfsdarg.dspath, cp, nfsdarg.dspathlen);
 			if (error != 0) {
 				free(nfsdarg.addr, M_TEMP);
 				free(nfsdarg.dnshost, M_TEMP);
 				free(cp, M_TEMP);
 				goto out;
 			}
 			cp[nfsdarg.dspathlen] = '\0';	/* Ensure nul term. */
 			nfsdarg.dspath = cp;
 			cp = malloc(nfsdarg.mdspathlen + 1, M_TEMP, M_WAITOK);
 			error = copyin(nfsdarg.mdspath, cp, nfsdarg.mdspathlen);
 			if (error != 0) {
 				free(nfsdarg.addr, M_TEMP);
 				free(nfsdarg.dnshost, M_TEMP);
 				free(nfsdarg.dspath, M_TEMP);
 				free(cp, M_TEMP);
 				goto out;
 			}
 			cp[nfsdarg.mdspathlen] = '\0';	/* Ensure nul term. */
 			nfsdarg.mdspath = cp;
 		} else {
 			nfsdarg.addr = NULL;
 			nfsdarg.addrlen = 0;
 			nfsdarg.dnshost = NULL;
 			nfsdarg.dnshostlen = 0;
 			nfsdarg.dspath = NULL;
 			nfsdarg.dspathlen = 0;
 			nfsdarg.mdspath = NULL;
 			nfsdarg.mdspathlen = 0;
 			nfsdarg.mirrorcnt = 1;
 		}
 		error = nfsrvd_nfsd(td, &nfsdarg);
 		/* These are either the kernel copies made above or NULL. */
 		free(nfsdarg.addr, M_TEMP);
 		free(nfsdarg.dnshost, M_TEMP);
 		free(nfsdarg.dspath, M_TEMP);
 		free(nfsdarg.mdspath, M_TEMP);
 	} else if (uap->flag & NFSSVC_PNFSDS) {
 		error = copyin(uap->argp, &pnfsdarg, sizeof(pnfsdarg));
 		if (error == 0 && (pnfsdarg.op == PNFSDOP_DELDSSERVER ||
 		    pnfsdarg.op == PNFSDOP_FORCEDELDS)) {
 			/* Remove the DS identified by the path in dspath. */
 			cp = malloc(PATH_MAX + 1, M_TEMP, M_WAITOK);
 			error = copyinstr(pnfsdarg.dspath, cp, PATH_MAX + 1,
 			    NULL);
 			if (error == 0)
 				error = nfsrv_deldsserver(pnfsdarg.op, cp, td);
 			free(cp, M_TEMP);
 		} else if (error == 0 && pnfsdarg.op == PNFSDOP_COPYMR) {
 			/*
 			 * Copy a mirror.  mdspath is always copied in;
 			 * dspath and curdspath are optional and are passed
 			 * on as NULL when not supplied.
 			 */
 			cp = malloc(PATH_MAX + 1, M_TEMP, M_WAITOK);
 			buflen = sizeof(*pf) * NFSDEV_MAXMIRRORS;
 			buf = malloc(buflen, M_TEMP, M_WAITOK);
 			error = copyinstr(pnfsdarg.mdspath, cp, PATH_MAX + 1,
 			    NULL);
 			NFSD_DEBUG(4, "pnfsdcopymr cp mdspath=%d\n", error);
 			if (error == 0 && pnfsdarg.dspath != NULL) {
 				cp2 = malloc(PATH_MAX + 1, M_TEMP, M_WAITOK);
 				error = copyinstr(pnfsdarg.dspath, cp2,
 				    PATH_MAX + 1, NULL);
 				NFSD_DEBUG(4, "pnfsdcopymr cp dspath=%d\n",
 				    error);
 			} else
 				cp2 = NULL;
 			if (error == 0 && pnfsdarg.curdspath != NULL) {
 				cp3 = malloc(PATH_MAX + 1, M_TEMP, M_WAITOK);
 				error = copyinstr(pnfsdarg.curdspath, cp3,
 				    PATH_MAX + 1, NULL);
 				NFSD_DEBUG(4, "pnfsdcopymr cp curdspath=%d\n",
 				    error);
 			} else
 				cp3 = NULL;
 			curdvp = NULL;
 			fds = NULL;
 			if (error == 0)
 				error = nfsrv_mdscopymr(cp, cp2, cp3, buf,
 				    &buflen, fname, td, &vp, &nvp, &pf, &ds,
 				    &fds);
 			NFSD_DEBUG(4, "nfsrv_mdscopymr=%d\n", error);
 			if (error == 0) {
 				/* Clamp a bogus directory index to 0. */
 				if (pf->dsf_dir >= nfsrv_dsdirsize) {
 					printf("copymr: dsdir out of range\n");
 					pf->dsf_dir = 0;
 				}
 				NFSD_DEBUG(4, "copymr: buflen=%d\n", buflen);
 				error = nfsrv_copymr(vp, nvp,
 				    ds->nfsdev_dsdir[pf->dsf_dir], ds, pf,
 				    (struct pnfsdsfile *)buf,
 				    buflen / sizeof(*pf), td->td_ucred, td);
 				vput(vp);
 				vput(nvp);
 				/*
 				 * After a successful copy, remove the file
 				 * named fname from the directory of fds, if
 				 * nfsrv_mdscopymr() returned one.  The result
 				 * of the removal is not checked.
 				 */
 				if (fds != NULL && error == 0) {
 					curdvp = fds->nfsdev_dsdir[pf->dsf_dir];
 					ret = vn_lock(curdvp, LK_EXCLUSIVE);
 					if (ret == 0) {
 						nfsrv_dsremove(curdvp, fname,
 						    td->td_ucred, td);
 						NFSVOPUNLOCK(curdvp);
 					}
 				}
 				NFSD_DEBUG(4, "nfsrv_copymr=%d\n", error);
 			}
 			free(cp, M_TEMP);
 			free(cp2, M_TEMP);
 			free(cp3, M_TEMP);
 			free(buf, M_TEMP);
 		}
 	} else {
 		error = nfssvc_srvcall(td, uap, td->td_ucred);
 	}
 
 out:
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Handle the nfssvc() requests that are not dealt with by nfssvc_nfsd()
  * itself.  Exactly one branch is taken, chosen by the first matching bit
  * (or bit combination) in uap->flag:
  *	NFSSVC_PUBLICFH - set the public file handle
  *	NFSSVC_V4ROOTEXPORT - set the NFSv4 root export, from either the
  *	    new (NFSSVC_NEWSTRUCT) or the old argument structure
  *	NFSSVC_NOPUBLICFH - forget the public file handle
  *	NFSSVC_STABLERESTART - register the stable restart file
  *	NFSSVC_ADMINREVOKE - administratively revoke a client
  *	NFSSVC_DUMPCLIENTS / NFSSVC_DUMPLOCKS - copy out state listings
  *	NFSSVC_BACKUPSTABLE - remember the calling process as master nfsd
  *	NFSSVC_SUSPENDNFSD / NFSSVC_RESUMENFSD - lock out / release the
  *	    nfsd threads
  *
  * Returns 0 or an errno value; EINVAL when no flag matches.
  */
 static int
 nfssvc_srvcall(struct thread *p, struct nfssvc_args *uap, struct ucred *cred)
 {
 	struct nfsex_args export;
 	struct nfsex_oldargs oexp;
 	struct file *fp = NULL;
 	int stablefd, i, len;
 	struct nfsd_clid adminrevoke;
 	struct nfsd_dumplist dumplist;
 	struct nfsd_dumpclients *dumpclients;
 	struct nfsd_dumplocklist dumplocklist;
 	struct nfsd_dumplocks *dumplocks;
 	struct nameidata nd;
 	vnode_t vp;
 	int error = EINVAL, igotlock;
 	struct proc *procp;
 	gid_t *grps;
 	/* Non-zero while this code holds nfsd_suspend_lock. */
 	static int suspend_nfsd = 0;
 
 	if (uap->flag & NFSSVC_PUBLICFH) {
 		NFSBZERO((caddr_t)&nfs_pubfh.nfsrvfh_data,
 		    sizeof (fhandle_t));
 		error = copyin(uap->argp,
 		    &nfs_pubfh.nfsrvfh_data, sizeof (fhandle_t));
 		if (!error)
 			nfs_pubfhset = 1;
 	} else if ((uap->flag & (NFSSVC_V4ROOTEXPORT | NFSSVC_NEWSTRUCT)) ==
 	    (NFSSVC_V4ROOTEXPORT | NFSSVC_NEWSTRUCT)) {
 		/* New style arguments: the group list lives in user space. */
 		error = copyin(uap->argp,(caddr_t)&export,
 		    sizeof (struct nfsex_args));
 		if (!error) {
 			grps = NULL;
 			if (export.export.ex_ngroups > NGROUPS_MAX ||
 			    export.export.ex_ngroups < 0)
 				error = EINVAL;
 			else if (export.export.ex_ngroups > 0) {
 				grps = malloc(export.export.ex_ngroups *
 				    sizeof(gid_t), M_TEMP, M_WAITOK);
 				error = copyin(export.export.ex_groups, grps,
 				    export.export.ex_ngroups * sizeof(gid_t));
 				export.export.ex_groups = grps;
 			} else
 				export.export.ex_groups = NULL;
 			if (!error)
 				error = nfsrv_v4rootexport(&export, cred, p);
 			free(grps, M_TEMP);
 		}
 	} else if ((uap->flag & (NFSSVC_V4ROOTEXPORT | NFSSVC_NEWSTRUCT)) ==
 	    NFSSVC_V4ROOTEXPORT) {
 		/*
 		 * Old style arguments: convert them field by field into
 		 * the new structure.  The group list is embedded in the
 		 * old structure, so it is copied from kernel memory.
 		 */
 		error = copyin(uap->argp,(caddr_t)&oexp,
 		    sizeof (struct nfsex_oldargs));
 		if (!error) {
 			memset(&export.export, 0, sizeof(export.export));
 			export.export.ex_flags = (uint64_t)oexp.export.ex_flags;
 			export.export.ex_root = oexp.export.ex_root;
 			export.export.ex_uid = oexp.export.ex_anon.cr_uid;
 			export.export.ex_ngroups =
 			    oexp.export.ex_anon.cr_ngroups;
 			export.export.ex_groups = NULL;
 			if (export.export.ex_ngroups > XU_NGROUPS ||
 			    export.export.ex_ngroups < 0)
 				error = EINVAL;
 			else if (export.export.ex_ngroups > 0) {
 				export.export.ex_groups = malloc(
 				    export.export.ex_ngroups * sizeof(gid_t),
 				    M_TEMP, M_WAITOK);
 				for (i = 0; i < export.export.ex_ngroups; i++)
 					export.export.ex_groups[i] =
 					    oexp.export.ex_anon.cr_groups[i];
 			}
 			export.export.ex_addr = oexp.export.ex_addr;
 			export.export.ex_addrlen = oexp.export.ex_addrlen;
 			export.export.ex_mask = oexp.export.ex_mask;
 			export.export.ex_masklen = oexp.export.ex_masklen;
 			export.export.ex_indexfile = oexp.export.ex_indexfile;
 			export.export.ex_numsecflavors =
 			    oexp.export.ex_numsecflavors;
 			if (export.export.ex_numsecflavors >= MAXSECFLAVORS ||
 			    export.export.ex_numsecflavors < 0)
 				error = EINVAL;
 			else {
 				for (i = 0; i < export.export.ex_numsecflavors;
 				    i++)
 					export.export.ex_secflavors[i] =
 					    oexp.export.ex_secflavors[i];
 			}
 			export.fspec = oexp.fspec;
 			if (error == 0)
 				error = nfsrv_v4rootexport(&export, cred, p);
 			free(export.export.ex_groups, M_TEMP);
 		}
 	} else if (uap->flag & NFSSVC_NOPUBLICFH) {
 		nfs_pubfhset = 0;
 		error = 0;
 	} else if (uap->flag & NFSSVC_STABLERESTART) {
 		/*
 		 * The argument is a descriptor for the stable restart file.
 		 * It must be open for both reading and writing and no nfsd
 		 * may be running yet.
 		 * NOTE(review): fp_getfvp() never stores anything in vp, and
 		 * vp is not used afterwards.
 		 */
 		error = copyin(uap->argp, (caddr_t)&stablefd,
 		    sizeof (int));
 		if (!error)
 			error = fp_getfvp(p, stablefd, &fp, &vp);
 		if (!error && (NFSFPFLAG(fp) & (FREAD | FWRITE)) != (FREAD | FWRITE))
 			error = EBADF;
 		if (!error && newnfs_numnfsd != 0)
 			error = EPERM;
 		if (!error) {
 			nfsrv_stablefirst.nsf_fp = fp;
 			nfsrv_setupstable(p);
 		}
 	} else if (uap->flag & NFSSVC_ADMINREVOKE) {
 		error = copyin(uap->argp, (caddr_t)&adminrevoke,
 		    sizeof (struct nfsd_clid));
 		if (!error)
 			error = nfsrv_adminrevoke(&adminrevoke, p);
 	} else if (uap->flag & NFSSVC_DUMPCLIENTS) {
 		error = copyin(uap->argp, (caddr_t)&dumplist,
 		    sizeof (struct nfsd_dumplist));
 		if (!error && (dumplist.ndl_size < 1 ||
 			dumplist.ndl_size > NFSRV_MAXDUMPLIST))
 			error = EPERM;
 		if (!error) {
 		    /* The buffer is zeroed before it is filled and copied out. */
 		    len = sizeof (struct nfsd_dumpclients) * dumplist.ndl_size;
 		    dumpclients = malloc(len, M_TEMP, M_WAITOK | M_ZERO);
 		    nfsrv_dumpclients(dumpclients, dumplist.ndl_size);
 		    error = copyout(dumpclients, dumplist.ndl_list, len);
 		    free(dumpclients, M_TEMP);
 		}
 	} else if (uap->flag & NFSSVC_DUMPLOCKS) {
 		error = copyin(uap->argp, (caddr_t)&dumplocklist,
 		    sizeof (struct nfsd_dumplocklist));
 		if (!error && (dumplocklist.ndllck_size < 1 ||
 			dumplocklist.ndllck_size > NFSRV_MAXDUMPLIST))
 			error = EPERM;
 		if (!error)
 			error = nfsrv_lookupfilename(&nd,
 				dumplocklist.ndllck_fname, p);
 		if (!error) {
 			len = sizeof (struct nfsd_dumplocks) *
 				dumplocklist.ndllck_size;
 			dumplocks = malloc(len, M_TEMP, M_WAITOK | M_ZERO);
 			nfsrv_dumplocks(nd.ni_vp, dumplocks,
 			    dumplocklist.ndllck_size, p);
 			vput(nd.ni_vp);
 			error = copyout(dumplocks, dumplocklist.ndllck_list,
 			    len);
 			free(dumplocks, M_TEMP);
 		}
 	} else if (uap->flag & NFSSVC_BACKUPSTABLE) {
 		/*
 		 * Record pid, command name, start time and proc pointer of
 		 * the caller, for later use by nfsrv_backupstable().
 		 * NOTE(review): error is not assigned in this branch, so the
 		 * initial EINVAL is returned - confirm that this is intended.
 		 */
 		procp = p->td_proc;
 		PROC_LOCK(procp);
 		nfsd_master_pid = procp->p_pid;
 		bcopy(procp->p_comm, nfsd_master_comm, MAXCOMLEN + 1);
 		nfsd_master_start = procp->p_stats->p_start;
 		nfsd_master_proc = procp;
 		PROC_UNLOCK(procp);
 	} else if ((uap->flag & NFSSVC_SUSPENDNFSD) != 0) {
 		NFSLOCKV4ROOTMUTEX();
 		if (suspend_nfsd == 0) {
 			/* Lock out all nfsd threads */
 			do {
 				igotlock = nfsv4_lock(&nfsd_suspend_lock, 1,
 				    NULL, NFSV4ROOTLOCKMUTEXPTR, NULL);
 			} while (igotlock == 0 && suspend_nfsd == 0);
 			suspend_nfsd = 1;
 		}
 		NFSUNLOCKV4ROOTMUTEX();
 		error = 0;
 	} else if ((uap->flag & NFSSVC_RESUMENFSD) != 0) {
 		NFSLOCKV4ROOTMUTEX();
 		/* Only release the lock if a suspend is in effect. */
 		if (suspend_nfsd != 0) {
 			nfsv4_unlock(&nfsd_suspend_lock, 0);
 			suspend_nfsd = 0;
 		}
 		NFSUNLOCKV4ROOTMUTEX();
 		error = 0;
 	}
 
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Check whether the security flavor used by request "nd" is one of the
  * flavors listed for the export.
  * Returns 0 if ok, 1 otherwise.
  */
 int
 nfsvno_testexp(struct nfsrv_descript *nd, struct nfsexstuff *exp)
 {
 	int flavor, idx, matched;
 
 	/*
 	 * This seems odd, but allow the case where the security flavor
 	 * list is empty. This happens when NFSv4 is traversing non-exported
 	 * file systems. Exported file systems should always have a non-empty
 	 * security flavor list.
 	 */
 	if (exp->nes_numsecflavor == 0)
 		return (0);
 
 	for (idx = 0; idx < exp->nes_numsecflavor; idx++) {
 		flavor = exp->nes_secflavors[idx];
 		/*
 		 * ND_GSS is set for everything but AUTH_SYS, so each
 		 * flavor is matched against its own, most specific flag.
 		 */
 		if (flavor == RPCSEC_GSS_KRB5P)
 			matched = (nd->nd_flag & ND_GSSPRIVACY) != 0;
 		else if (flavor == RPCSEC_GSS_KRB5I)
 			matched = (nd->nd_flag & ND_GSSINTEGRITY) != 0;
 		else if (flavor == RPCSEC_GSS_KRB5)
 			matched = (nd->nd_flag & ND_GSS) != 0;
 		else if (flavor == AUTH_SYS)
 			matched = (nd->nd_flag & ND_GSS) == 0;
 		else
 			matched = 0;
 		if (matched)
 			return (0);
 	}
 	return (1);
 }
 
 /*
  * Hash the fid part of a file handle with hash32_buf(), seed 0.
  */
 uint32_t
 nfsrv_hashfh(fhandle_t *fhp)
 {
 
 	return (hash32_buf(&fhp->fh_fid, sizeof(struct fid), 0));
 }
 
 /*
  * Hash the NFSX_V4SESSIONID bytes of a sessionid with hash32_buf(),
  * seed 0.
  */
 uint32_t
 nfsrv_hashsessionid(uint8_t *sessionid)
 {
 
 	return (hash32_buf(sessionid, NFSX_V4SESSIONID, 0));
 }
 
 /*
  * Signal (SIGUSR2) the userland master nfsd to backup the stable restart
  * file.  If the recorded master can no longer be identified, it is
  * forgotten instead.
  */
 void
 nfsrv_backupstable(void)
 {
 	struct proc *master;
 	int same;
 
 	if (nfsd_master_proc == NULL)
 		return;
 
 	master = pfind(nfsd_master_pid);
 
 	/*
 	 * Try to make sure it is the correct process: same proc pointer,
 	 * same start time and same command name as were recorded.
 	 */
 	same = master == nfsd_master_proc &&
 	    master->p_stats->p_start.tv_sec == nfsd_master_start.tv_sec &&
 	    master->p_stats->p_start.tv_usec == nfsd_master_start.tv_usec &&
 	    strcmp(master->p_comm, nfsd_master_comm) == 0;
 
 	if (same)
 		kern_psignal(master, SIGUSR2);
 	else
 		nfsd_master_proc = NULL;
 
 	if (master != NULL)
 		PROC_UNLOCK(master);
 }
 
 /*
  * Create a DS data file for nfsrv_pnfscreate(). Called for each mirror.
  * The arguments are in a structure, so that they can be passed through
  * taskqueue for a kernel process to execute this function.
  */
 struct nfsrvdscreate {
 	int			done;		/* set to 1 by start_dscreate() when finished */
 	int			inprog;		/* cleared before dispatch; the waiter sleeps
 						 * while inprog != 0 && done == 0 */
 	struct task		tsk;		/* its address is the tsleep() channel;
 						 * presumably the queued task - TODO confirm */
 	struct ucred		*tcred;		/* credentials passed to nfsrv_dscreate() */
 	struct vnode		*dvp;		/* directory vnode to create the file in */
 	NFSPROC_T		*p;		/* thread passed to nfsrv_dscreate() */
 	struct pnfsdsfile	*pf;		/* filled in by nfsrv_dscreate() */
 	int			err;		/* return value of nfsrv_dscreate() */
 	fhandle_t		fh;		/* file handle passed as "fhp" */
 	struct vattr		va;		/* attributes passed as "nvap" */
 	struct vattr		createva;	/* attributes passed as "vap" */
 };
 
 /*
  * Create one data file in the DS directory "dvp" and describe it in "pf".
  *
  *	dvp	- directory vnode; locked exclusively here around VOP_CREATE()
  *	vap	- attributes handed to VOP_CREATE()
  *	nvap	- attributes applied to the new file with VOP_SETATTR()
  *	fhp	- file handle the file name is generated from by
  *		  nfsrv_putfhname() when fnamep is NULL
  *	pf	- receives the DS file handle, the DS address and the file
  *		  name
  *	dsa	- if not NULL, receives attributes of the new file
  *	fnamep	- if not NULL, the file name to use
  *	tcred	- credentials used for all of the operations
  *	p	- the thread doing the work
  *	nvpp	- if not NULL, receives the new vnode on success; otherwise
  *		  the vnode is vput() before returning
  *
  * Returns 0 or an errno value.
  */
 int
 nfsrv_dscreate(struct vnode *dvp, struct vattr *vap, struct vattr *nvap,
     fhandle_t *fhp, struct pnfsdsfile *pf, struct pnfsdsattr *dsa,
     char *fnamep, struct ucred *tcred, NFSPROC_T *p, struct vnode **nvpp)
 {
 	struct vnode *nvp;
 	struct nameidata named;
 	struct vattr va;
 	char *bufp;
 	u_long *hashp;
 	struct nfsnode *np;
 	struct nfsmount *nmp;
 	int error;
 
 	/* Set up the componentname by hand; no path lookup is done. */
 	NFSNAMEICNDSET(&named.ni_cnd, tcred, CREATE,
 	    LOCKPARENT | LOCKLEAF | SAVESTART | NOCACHE);
 	nfsvno_setpathbuf(&named, &bufp, &hashp);
 	named.ni_cnd.cn_lkflags = LK_EXCLUSIVE;
 	named.ni_cnd.cn_thread = p;
 	named.ni_cnd.cn_nameptr = bufp;
 	if (fnamep != NULL) {
 		strlcpy(bufp, fnamep, PNFS_FILENAME_LEN + 1);
 		named.ni_cnd.cn_namelen = strlen(bufp);
 	} else
 		named.ni_cnd.cn_namelen = nfsrv_putfhname(fhp, bufp);
 	NFSD_DEBUG(4, "nfsrv_dscreate: dvp=%p fname=%s\n", dvp, bufp);
 
 	/* Create the data file in the DS mount. */
 	error = NFSVOPLOCK(dvp, LK_EXCLUSIVE);
 	if (error == 0) {
 		error = VOP_CREATE(dvp, &nvp, &named.ni_cnd, vap);
 		NFSVOPUNLOCK(dvp);
 		if (error == 0) {
 			/* Set the ownership of the file. */
 			error = VOP_SETATTR(nvp, nvap, tcred);
 			NFSD_DEBUG(4, "nfsrv_dscreate:"
 			    " setattr-uid=%d\n", error);
 			if (error != 0)
 				vput(nvp);
 		}
 		if (error != 0)
 			printf("pNFS: pnfscreate failed=%d\n", error);
 	} else
 		printf("pNFS: pnfscreate vnlock=%d\n", error);
 	if (error == 0) {
 		/*
 		 * Sanity check the new file: it must live on an "nfs" mount
 		 * whose server address fits in a sockaddr_in6 and must have
 		 * a file handle of NFSX_MYFH bytes.
 		 */
 		np = VTONFS(nvp);
 		nmp = VFSTONFS(nvp->v_mount);
 		if (strcmp(nvp->v_mount->mnt_vfc->vfc_name, "nfs")
 		    != 0 || nmp->nm_nam->sa_len > sizeof(
 		    struct sockaddr_in6) ||
 		    np->n_fhp->nfh_len != NFSX_MYFH) {
 			printf("Bad DS file: fstype=%s salen=%d"
 			    " fhlen=%d\n",
 			    nvp->v_mount->mnt_vfc->vfc_name,
 			    nmp->nm_nam->sa_len, np->n_fhp->nfh_len);
 			error = ENOENT;
 		}
 
 		/* Set extattrs for the DS on the MDS file. */
 		if (error == 0) {
 			if (dsa != NULL) {
 				error = VOP_GETATTR(nvp, &va, tcred);
 				if (error == 0) {
 					dsa->dsa_filerev = va.va_filerev;
 					dsa->dsa_size = va.va_size;
 					dsa->dsa_atime = va.va_atime;
 					dsa->dsa_mtime = va.va_mtime;
 					dsa->dsa_bytes = va.va_bytes;
 				}
 			}
 			if (error == 0) {
 				NFSBCOPY(np->n_fhp->nfh_fh, &pf->dsf_fh,
 				    NFSX_MYFH);
 				NFSBCOPY(nmp->nm_nam, &pf->dsf_sin,
 				    nmp->nm_nam->sa_len);
 				NFSBCOPY(named.ni_cnd.cn_nameptr,
 				    pf->dsf_filename,
 				    sizeof(pf->dsf_filename));
 			}
 		} else
 			/*
 			 * NOTE(review): this branch is reached when the
 			 * sanity check above failed, not when VOP_GETATTR()
 			 * failed - the message looks misplaced.
 			 */
 			printf("pNFS: pnfscreate can't get DS"
 			    " attr=%d\n", error);
 		/* Hand the vnode to the caller only on complete success. */
 		if (nvpp != NULL && error == 0)
 			*nvpp = nvp;
 		else
 			vput(nvp);
 	}
 	nfsvno_relpathbuf(&named);
 	return (error);
 }
 
 /*
  * Task entry point that executes nfsrv_dscreate() for the request
  * described by "arg" (a struct nfsrvdscreate).  The result is stored in
  * the request's err field before its done flag is set.
  */
 static void
 start_dscreate(void *arg, int pending)
 {
 	struct nfsrvdscreate *req = arg;
 
 	/* No attribute return, no explicit file name, no vnode return. */
 	req->err = nfsrv_dscreate(req->dvp, &req->createva, &req->va,
 	    &req->fh, req->pf, NULL, NULL, req->tcred, req->p, NULL);
 	req->done = 1;
 	NFSD_DEBUG(4, "start_dscreate: err=%d\n", req->err);
 }
 
 /*
  * Create a pNFS data file on the Data Server(s).
  */
 static void
 nfsrv_pnfscreate(struct vnode *vp, struct vattr *vap, struct ucred *cred,
     NFSPROC_T *p)
 {
 	struct nfsrvdscreate *dsc, *tdsc = NULL;
 	struct nfsdevice *ds, *tds, *fds;
 	struct mount *mp;
 	struct pnfsdsfile *pf, *tpf;
 	struct pnfsdsattr dsattr;
 	struct vattr va;
 	struct vnode *dvp[NFSDEV_MAXMIRRORS];
 	struct nfsmount *nmp;
 	fhandle_t fh;
 	uid_t vauid;
 	gid_t vagid;
 	u_short vamode;
 	struct ucred *tcred;
 	int dsdir[NFSDEV_MAXMIRRORS], error, i, mirrorcnt, ret;
 	int failpos, timo;
 
 	/* Get a DS server directory in a round-robin order. */
 	mirrorcnt = 1;
 	mp = vp->v_mount;
 	ds = fds = NULL;
 	NFSDDSLOCK();
 	/*
 	 * Search for the first entry that handles this MDS fs, but use the
 	 * first entry for all MDS fs's otherwise.
 	 */
 	TAILQ_FOREACH(tds, &nfsrv_devidhead, nfsdev_list) {
 		if (tds->nfsdev_nmp != NULL) {
 			if (tds->nfsdev_mdsisset == 0 && ds == NULL)
 				ds = tds;
 			else if (tds->nfsdev_mdsisset != 0 && fsidcmp(
 			    &mp->mnt_stat.f_fsid, &tds->nfsdev_mdsfsid) == 0) {
 				ds = fds = tds;
 				break;
 			}
 		}
 	}
 	if (ds == NULL) {
 		NFSDDSUNLOCK();
 		NFSD_DEBUG(4, "nfsrv_pnfscreate: no srv\n");
 		return;
 	}
 	i = dsdir[0] = ds->nfsdev_nextdir;
 	ds->nfsdev_nextdir = (ds->nfsdev_nextdir + 1) % nfsrv_dsdirsize;
 	dvp[0] = ds->nfsdev_dsdir[i];
 	tds = TAILQ_NEXT(ds, nfsdev_list);
 	if (nfsrv_maxpnfsmirror > 1 && tds != NULL) {
 		TAILQ_FOREACH_FROM(tds, &nfsrv_devidhead, nfsdev_list) {
 			if (tds->nfsdev_nmp != NULL &&
 			    ((tds->nfsdev_mdsisset == 0 && fds == NULL) ||
 			     (tds->nfsdev_mdsisset != 0 && fds != NULL &&
 			      fsidcmp(&mp->mnt_stat.f_fsid,
 			      &tds->nfsdev_mdsfsid) == 0))) {
 				dsdir[mirrorcnt] = i;
 				dvp[mirrorcnt] = tds->nfsdev_dsdir[i];
 				mirrorcnt++;
 				if (mirrorcnt >= nfsrv_maxpnfsmirror)
 					break;
 			}
 		}
 	}
 	/* Put at end of list to implement round-robin usage. */
 	TAILQ_REMOVE(&nfsrv_devidhead, ds, nfsdev_list);
 	TAILQ_INSERT_TAIL(&nfsrv_devidhead, ds, nfsdev_list);
 	NFSDDSUNLOCK();
 	dsc = NULL;
 	if (mirrorcnt > 1)
 		tdsc = dsc = malloc(sizeof(*dsc) * (mirrorcnt - 1), M_TEMP,
 		    M_WAITOK | M_ZERO);
 	tpf = pf = malloc(sizeof(*pf) * nfsrv_maxpnfsmirror, M_TEMP, M_WAITOK |
 	    M_ZERO);
 
 	error = nfsvno_getfh(vp, &fh, p);
 	if (error == 0)
 		error = VOP_GETATTR(vp, &va, cred);
 	if (error == 0) {
 		/* Set the attributes for "vp" to Setattr the DS vp. */
 		vauid = va.va_uid;
 		vagid = va.va_gid;
 		vamode = va.va_mode;
 		VATTR_NULL(&va);
 		va.va_uid = vauid;
 		va.va_gid = vagid;
 		va.va_mode = vamode;
 		va.va_size = 0;
 	} else
 		printf("pNFS: pnfscreate getfh+attr=%d\n", error);
 
 	NFSD_DEBUG(4, "nfsrv_pnfscreate: cruid=%d crgid=%d\n", cred->cr_uid,
 	    cred->cr_gid);
 	/* Make data file name based on FH. */
 	tcred = newnfs_getcred();
 
 	/*
 	 * Create the file on each DS mirror, using kernel process(es) for the
 	 * additional mirrors.
 	 */
 	failpos = -1;
 	for (i = 0; i < mirrorcnt - 1 && error == 0; i++, tpf++, tdsc++) {
 		tpf->dsf_dir = dsdir[i];
 		tdsc->tcred = tcred;
 		tdsc->p = p;
 		tdsc->pf = tpf;
 		tdsc->createva = *vap;
 		NFSBCOPY(&fh, &tdsc->fh, sizeof(fh));
 		tdsc->va = va;
 		tdsc->dvp = dvp[i];
 		tdsc->done = 0;
 		tdsc->inprog = 0;
 		tdsc->err = 0;
 		ret = EIO;
 		if (nfs_pnfsiothreads != 0) {
 			ret = nfs_pnfsio(start_dscreate, tdsc);
 			NFSD_DEBUG(4, "nfsrv_pnfscreate: nfs_pnfsio=%d\n", ret);
 		}
 		if (ret != 0) {
 			ret = nfsrv_dscreate(dvp[i], vap, &va, &fh, tpf, NULL,
 			    NULL, tcred, p, NULL);
 			if (ret != 0) {
 				KASSERT(error == 0, ("nfsrv_dscreate err=%d",
 				    error));
 				if (failpos == -1 && nfsds_failerr(ret))
 					failpos = i;
 				else
 					error = ret;
 			}
 		}
 	}
 	if (error == 0) {
 		tpf->dsf_dir = dsdir[mirrorcnt - 1];
 		error = nfsrv_dscreate(dvp[mirrorcnt - 1], vap, &va, &fh, tpf,
 		    &dsattr, NULL, tcred, p, NULL);
 		if (failpos == -1 && mirrorcnt > 1 && nfsds_failerr(error)) {
 			failpos = mirrorcnt - 1;
 			error = 0;
 		}
 	}
 	timo = hz / 50;		/* Wait for 20msec. */
 	if (timo < 1)
 		timo = 1;
 	/* Wait for kernel task(s) to complete. */
 	for (tdsc = dsc, i = 0; i < mirrorcnt - 1; i++, tdsc++) {
 		while (tdsc->inprog != 0 && tdsc->done == 0)
 			tsleep(&tdsc->tsk, PVFS, "srvdcr", timo);
 		if (tdsc->err != 0) {
 			if (failpos == -1 && nfsds_failerr(tdsc->err))
 				failpos = i;
 			else if (error == 0)
 				error = tdsc->err;
 		}
 	}
 
 	/*
 	 * If failpos has been set, that mirror has failed, so it needs
 	 * to be disabled.
 	 */
 	if (failpos >= 0) {
 		nmp = VFSTONFS(dvp[failpos]->v_mount);
 		NFSLOCKMNT(nmp);
 		if ((nmp->nm_privflag & (NFSMNTP_FORCEDISM |
 		     NFSMNTP_CANCELRPCS)) == 0) {
 			nmp->nm_privflag |= NFSMNTP_CANCELRPCS;
 			NFSUNLOCKMNT(nmp);
 			ds = nfsrv_deldsnmp(PNFSDOP_DELDSSERVER, nmp, p);
 			NFSD_DEBUG(4, "dscreatfail fail=%d ds=%p\n", failpos,
 			    ds);
 			if (ds != NULL)
 				nfsrv_killrpcs(nmp);
 			NFSLOCKMNT(nmp);
 			nmp->nm_privflag &= ~NFSMNTP_CANCELRPCS;
 			wakeup(nmp);
 		}
 		NFSUNLOCKMNT(nmp);
 	}
 
 	NFSFREECRED(tcred);
 	if (error == 0) {
 		ASSERT_VOP_ELOCKED(vp, "nfsrv_pnfscreate vp");
 
 		NFSD_DEBUG(4, "nfsrv_pnfscreate: mirrorcnt=%d maxmirror=%d\n",
 		    mirrorcnt, nfsrv_maxpnfsmirror);
 		/*
 		 * For all mirrors that couldn't be created, fill in the
 		 * *pf structure, but with an IP address == 0.0.0.0.
 		 */
 		tpf = pf + mirrorcnt;
 		for (i = mirrorcnt; i < nfsrv_maxpnfsmirror; i++, tpf++) {
 			*tpf = *pf;
 			tpf->dsf_sin.sin_family = AF_INET;
 			tpf->dsf_sin.sin_len = sizeof(struct sockaddr_in);
 			tpf->dsf_sin.sin_addr.s_addr = 0;
 			tpf->dsf_sin.sin_port = 0;
 		}
 
 		error = vn_extattr_set(vp, IO_NODELOCKED,
 		    EXTATTR_NAMESPACE_SYSTEM, "pnfsd.dsfile",
 		    sizeof(*pf) * nfsrv_maxpnfsmirror, (char *)pf, p);
 		if (error == 0)
 			error = vn_extattr_set(vp, IO_NODELOCKED,
 			    EXTATTR_NAMESPACE_SYSTEM, "pnfsd.dsattr",
 			    sizeof(dsattr), (char *)&dsattr, p);
 		if (error != 0)
 			printf("pNFS: pnfscreate setextattr=%d\n",
 			    error);
 	} else
 		printf("pNFS: pnfscreate=%d\n", error);
 	free(pf, M_TEMP);
 	free(dsc, M_TEMP);
 }
 
 /*
  * Collect what nfsrv_pnfsremove() needs before the Metadata file is
  * removed: the file handle of vp (*fhp), the DS directory vnode for each
  * mirror (dvpp[]), the number of mirrors (*mirrorcntp) and the name of the
  * DS data file (fname).
  * dvpp[0] is left NULL when there is nothing to remove: vp is not an
  * exported regular file, this is not a pNFS server, other hard links to
  * the file remain, or one of the steps below failed.
  * nfsrv_dsgetsockmnt() is called with a lock type of 0, so the directory
  * vnodes are handed back unlocked; nfsrv_dsremove() locks them itself.
  */
 static void
 nfsrv_pnfsremovesetup(struct vnode *vp, NFSPROC_T *p, struct vnode **dvpp,
     int *mirrorcntp, char *fname, fhandle_t *fhp)
 {
 	struct ucred *rootcred;
 	struct vattr attr;
 	char *xattrbuf;
 	int ret, xattrlen;
 
 	*dvpp = NULL;
 
 	/* Only exported regular files on a pNFS server have DS files. */
 	if (vp->v_type != VREG)
 		return;
 	if ((vp->v_mount->mnt_flag & MNT_EXPORTED) == 0)
 		return;
 	if (nfsrv_devidcnt == 0)
 		return;
 
 	/* The DS file only goes away along with the last hard link. */
 	rootcred = newnfs_getcred();
 	ret = VOP_GETATTR(vp, &attr, rootcred);
 	NFSFREECRED(rootcred);
 	if (ret != 0) {
 		printf("pNFS: nfsrv_pnfsremovesetup getattr=%d\n", ret);
 		return;
 	}
 	if (attr.va_nlink > 1)
 		return;
 
 	ret = nfsvno_getfh(vp, fhp, p);
 	if (ret != 0) {
 		printf("pNFS: nfsrv_pnfsremovesetup getfh=%d\n", ret);
 		return;
 	}
 
 	/* Fetch the DS directory vnode(s) and the DS file name. */
 	xattrlen = 1024;
 	xattrbuf = malloc(xattrlen, M_TEMP, M_WAITOK);
 	ret = nfsrv_dsgetsockmnt(vp, 0, xattrbuf, &xattrlen, mirrorcntp, p,
 	    dvpp, NULL, NULL, fname, NULL, NULL, NULL, NULL, NULL);
 	free(xattrbuf, M_TEMP);
 	if (ret != 0)
 		printf("pNFS: nfsrv_pnfsremovesetup getsockmnt=%d\n", ret);
 }
 
 /*
  * Remove a DS data file for nfsrv_pnfsremove(). Called for each mirror.
  * The arguments are in a structure, so that they can be passed through
  * taskqueue for a kernel process to execute this function.
  * One of these is filled in by nfsrv_pnfsremove() for every mirror except
  * the last and consumed by start_dsremove().
  */
 struct nfsrvdsremove {
 	/* Set to 1 by start_dsremove() once nfsrv_dsremove() has returned. */
 	int			done;
 	/*
 	 * Cleared by nfsrv_pnfsremove() before queueing; it only waits for
 	 * "done" while this is non-zero.  Presumably set by nfs_pnfsio()
 	 * when the task is queued -- confirm there.
 	 */
 	int			inprog;
 	/* Task entry; its address is also the tsleep() wait channel. */
 	struct task		tsk;
 	/* Credentials passed to nfsrv_dsremove(). */
 	struct ucred		*tcred;
 	/* DS directory vnode that holds the data file. */
 	struct vnode		*dvp;
 	/* Thread passed to nfsrv_dsremove(). */
 	NFSPROC_T		*p;
 	/* Return value of nfsrv_dsremove(), stored by start_dsremove(). */
 	int			err;
 	/* NUL-terminated name of the data file within dvp. */
 	char			fname[PNFS_FILENAME_LEN + 1];
 };
 
 /*
  * Remove the DS data file named "fname" from the DS directory "dvp",
  * using the credentials "tcred" on behalf of thread "p".
  * dvp is locked exclusively for the Lookup/Remove and is unlocked again
  * before returning.  Returns 0 or an error number; a failure that occurs
  * after the directory lock was acquired is also logged.
  */
 static int
 nfsrv_dsremove(struct vnode *dvp, char *fname, struct ucred *tcred,
     NFSPROC_T *p)
 {
 	struct nameidata ndata;
 	struct vnode *filevp;
 	u_long *hashp;
 	char *namebuf;
 	int ret;
 
 	ret = NFSVOPLOCK(dvp, LK_EXCLUSIVE);
 	if (ret != 0)
 		return (ret);
 
 	/* Build a DELETE lookup request for the single name component. */
 	ndata.ni_cnd.cn_flags = ISLASTCN | LOCKPARENT | LOCKLEAF | SAVENAME;
 	ndata.ni_cnd.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
 	ndata.ni_cnd.cn_nameiop = DELETE;
 	ndata.ni_cnd.cn_thread = p;
 	ndata.ni_cnd.cn_cred = tcred;
 	nfsvno_setpathbuf(&ndata, &namebuf, &hashp);
 	strlcpy(namebuf, fname, NAME_MAX);
 	ndata.ni_cnd.cn_namelen = strlen(fname);
 	ndata.ni_cnd.cn_nameptr = namebuf;
 	NFSD_DEBUG(4, "nfsrv_pnfsremove: filename=%s\n", namebuf);
 
 	ret = VOP_LOOKUP(dvp, &filevp, &ndata.ni_cnd);
 	NFSD_DEBUG(4, "nfsrv_pnfsremove: aft LOOKUP=%d\n", ret);
 	if (ret == 0) {
 		/* The file was found; remove it and release its vnode. */
 		ret = VOP_REMOVE(dvp, filevp, &ndata.ni_cnd);
 		vput(filevp);
 	}
 	NFSVOPUNLOCK(dvp);
 	nfsvno_relpathbuf(&ndata);
 	if (ret != 0)
 		printf("pNFS: nfsrv_pnfsremove failed=%d\n", ret);
 	return (ret);
 }
 
 /*
  * Start up the thread that will execute nfsrv_dsremove().
  */
 static void
 start_dsremove(void *arg, int pending)
 {
 	struct nfsrvdsremove *dsrm;
 
 	dsrm = (struct nfsrvdsremove *)arg;
 	dsrm->err = nfsrv_dsremove(dsrm->dvp, dsrm->fname, dsrm->tcred,
 	    dsrm->p);
 	dsrm->done = 1;
 	NFSD_DEBUG(4, "start_dsremove: err=%d\n", dsrm->err);
 }
 
 /*
  * Remove a pNFS data file from a Data Server.
  * nfsrv_pnfsremovesetup() must have been called before the MDS file was
  * removed to set up the dvp and fill in the FH.
  *
  * dvp       - array of "mirrorcnt" DS directory vnodes, one per mirror.
  * mirrorcnt - number of entries in dvp (assumed >= 1, since
  *             dvp[mirrorcnt - 1] is used unconditionally).
  * fname     - name of the data file within each DS directory.
  * fhp       - file handle of the MDS file; used to free its layouts.
  * p         - calling thread.
  *
  * The first mirrorcnt - 1 removals are handed to nfs_pnfsio() when
  * nfs_pnfsiothreads != 0, falling back to a direct call if that fails;
  * the last one is always done directly by this thread.  The first mirror
  * whose removal fails with an error for which nfsds_failerr() is true is
  * disabled afterwards.
  */
 static void
 nfsrv_pnfsremove(struct vnode **dvp, int mirrorcnt, char *fname, fhandle_t *fhp,
     NFSPROC_T *p)
 {
 	struct ucred *tcred;
 	struct nfsrvdsremove *dsrm, *tdsrm;
 	struct nfsdevice *ds;
 	struct nfsmount *nmp;
 	int failpos, i, ret, timo;
 
 	tcred = newnfs_getcred();
 	dsrm = NULL;
 	/*
 	 * One argument structure is needed for every mirror except the
 	 * last.  The subtraction must be parenthesized, otherwise the
 	 * size is (sizeof * mirrorcnt) - 1 bytes.
 	 */
 	if (mirrorcnt > 1)
 		dsrm = malloc(sizeof(*dsrm) * (mirrorcnt - 1), M_TEMP,
 		    M_WAITOK);
 	/*
 	 * Remove the file on each DS mirror, using kernel process(es) for the
 	 * additional mirrors.
 	 */
 	failpos = -1;
 	for (tdsrm = dsrm, i = 0; i < mirrorcnt - 1; i++, tdsrm++) {
 		tdsrm->tcred = tcred;
 		tdsrm->p = p;
 		tdsrm->dvp = dvp[i];
 		strlcpy(tdsrm->fname, fname, PNFS_FILENAME_LEN + 1);
 		tdsrm->inprog = 0;
 		tdsrm->done = 0;
 		tdsrm->err = 0;
 		ret = EIO;
 		if (nfs_pnfsiothreads != 0) {
 			ret = nfs_pnfsio(start_dsremove, tdsrm);
 			NFSD_DEBUG(4, "nfsrv_pnfsremove: nfs_pnfsio=%d\n", ret);
 		}
 		if (ret != 0) {
 			/* Could not queue the task, so do it here. */
 			ret = nfsrv_dsremove(dvp[i], fname, tcred, p);
 			if (failpos == -1 && nfsds_failerr(ret))
 				failpos = i;
 		}
 	}
 	ret = nfsrv_dsremove(dvp[mirrorcnt - 1], fname, tcred, p);
 	if (failpos == -1 && mirrorcnt > 1 && nfsds_failerr(ret))
 		failpos = mirrorcnt - 1;
 	timo = hz / 50;		/* Wait for 20msec. */
 	if (timo < 1)
 		timo = 1;
 	/* Wait for kernel task(s) to complete. */
 	for (tdsrm = dsrm, i = 0; i < mirrorcnt - 1; i++, tdsrm++) {
 		while (tdsrm->inprog != 0 && tdsrm->done == 0)
 			tsleep(&tdsrm->tsk, PVFS, "srvdsrm", timo);
 		if (failpos == -1 && nfsds_failerr(tdsrm->err))
 			failpos = i;
 	}
 
 	/*
 	 * If failpos has been set, that mirror has failed, so it needs
 	 * to be disabled.
 	 */
 	if (failpos >= 0) {
 		nmp = VFSTONFS(dvp[failpos]->v_mount);
 		NFSLOCKMNT(nmp);
 		if ((nmp->nm_privflag & (NFSMNTP_FORCEDISM |
 		     NFSMNTP_CANCELRPCS)) == 0) {
 			/*
 			 * NFSMNTP_CANCELRPCS is set while the mount lock is
 			 * dropped and cleared, with a wakeup(), afterwards.
 			 */
 			nmp->nm_privflag |= NFSMNTP_CANCELRPCS;
 			NFSUNLOCKMNT(nmp);
 			ds = nfsrv_deldsnmp(PNFSDOP_DELDSSERVER, nmp, p);
 			NFSD_DEBUG(4, "dsremovefail fail=%d ds=%p\n", failpos,
 			    ds);
 			if (ds != NULL)
 				nfsrv_killrpcs(nmp);
 			NFSLOCKMNT(nmp);
 			nmp->nm_privflag &= ~NFSMNTP_CANCELRPCS;
 			wakeup(nmp);
 		}
 		NFSUNLOCKMNT(nmp);
 	}
 
 	/* Get rid of all layouts for the file. */
 	nfsrv_freefilelayouts(fhp);
 
 	NFSFREECRED(tcred);
 	free(dsrm, M_TEMP);
 }
 
 /*
  * Write the file handle *fhp into bufp as a NUL-terminated string of
  * lower case hexadecimal digits, two per byte of the handle.
  * bufp must have room for 2 * sizeof(*fhp) + 1 bytes.
  * Returns the length of the string, not counting the terminating NUL.
  */
 static int
 nfsrv_putfhname(fhandle_t *fhp, char *bufp)
 {
 	static const char hexdigits[] = "0123456789abcdef";
 	const uint8_t *src;
 	char *dst;
 	size_t left;
 
 	src = (const uint8_t *)fhp;
 	dst = bufp;
 	for (left = sizeof(*fhp); left > 0; left--, src++) {
 		/* High nibble first, then low nibble. */
 		*dst++ = hexdigits[(*src >> 4) & 0xf];
 		*dst++ = hexdigits[*src & 0xf];
 	}
 	*dst = '\0';
 	return ((int)(dst - bufp));
 }
 
 /*
  * Refresh the Metadata file's attributes from the DS file when a
  * Read/Write layout is returned.
  * This is a thin wrapper that calls nfsrv_proxyds() with
  * procedure == NFSPROC_LAYOUTRETURN, so that it does a
  * nfsrv_getattrdsrpc() and nfsrv_setextattr() on the DS file.
  * The attributes are returned in *nap; the return value is that of
  * nfsrv_proxyds().
  */
 int
 nfsrv_updatemdsattr(struct vnode *vp, struct nfsvattr *nap, NFSPROC_T *p)
 {
 	struct ucred *rootcred;
 	int ret;
 
 	/* Use root credentials, so that it won't fail with EACCES. */
 	rootcred = newnfs_getcred();
 	ret = nfsrv_proxyds(vp, 0, 0, rootcred, p, NFSPROC_LAYOUTRETURN,
 	    NULL, NULL, NULL, nap, NULL, NULL, 0, NULL);
 	NFSFREECRED(rootcred);
 
 	return (ret);
 }
 
 /*
  * Give the DS file the same NFSv4 ACL as the MDS file "vp" by handing
  * "aclp" to nfsrv_proxyds() with procedure == NFSPROC_SETACL.
  * Returns whatever nfsrv_proxyds() returns.
  */
 static int
 nfsrv_dssetacl(struct vnode *vp, struct acl *aclp, struct ucred *cred,
     NFSPROC_T *p)
 {
 
 	return (nfsrv_proxyds(vp, 0, 0, cred, p, NFSPROC_SETACL,
 	    NULL, NULL, NULL, NULL, aclp, NULL, 0, NULL));
 }
 
 /*
  * Proxy an operation on the MDS file "vp" to the DS data file(s) that
  * hold its data.  "ioproc" selects the operation:
  *   NFSPROC_READDS   - read "cnt" bytes at "off" from the first mirror;
  *                      the reply mbuf chain is returned via mpp/mpp2.
  *   NFSPROC_WRITEDS  - write (off, cnt, mpp, cp); nfsrv_writedsrpc() is
  *                      given all mirrors.
  *   NFSPROC_SETATTR  - set the attributes in "nap"; all mirrors.
  *   NFSPROC_SETACL   - set the ACL "aclp"; all mirrors.
  *   NFSPROC_SEEKDS   - seek on the first mirror (offp, content, eofp).
  *   NFSPROC_ALLOCATE - "off" and "*offp" are passed through to
  *                      nfsrv_allocatedsrpc(); all mirrors.
  *   anything else    - (NFSPROC_GETATTR, NFSPROC_LAYOUTRETURN) get the
  *                      attributes into "nap" from the last mirror.
  * For NFSPROC_GETATTR the "pnfsd.dsattr" extended attribute is tried
  * first and the DS is only contacted when that is missing or may be
  * stale.
  * Returns ENOENT when vp is not an exported regular file, when this is
  * not a pNFS server (nfsrv_devidcnt == 0) or when the extended attributes
  * that locate the DS file cannot be used; otherwise the result of the
  * DS operation.
  */
 static int
 nfsrv_proxyds(struct vnode *vp, off_t off, int cnt, struct ucred *cred,
     struct thread *p, int ioproc, struct mbuf **mpp, char *cp,
     struct mbuf **mpp2, struct nfsvattr *nap, struct acl *aclp,
     off_t *offp, int content, bool *eofp)
 {
 	struct nfsmount *nmp[NFSDEV_MAXMIRRORS], *failnmp;
 	fhandle_t fh[NFSDEV_MAXMIRRORS];
 	struct vnode *dvp[NFSDEV_MAXMIRRORS];
 	struct nfsdevice *ds;
 	struct pnfsdsattr dsattr;
 	struct opnfsdsattr odsattr;
 	char *buf;
 	int buflen, error, failpos, i, mirrorcnt, origmircnt, trycnt;
 
 	NFSD_DEBUG(4, "in nfsrv_proxyds\n");
 	/*
 	 * If not a regular file, not exported or not a pNFS server,
 	 * just return ENOENT.
 	 */
 	if (vp->v_type != VREG || (vp->v_mount->mnt_flag & MNT_EXPORTED) == 0 ||
 	    nfsrv_devidcnt == 0)
 		return (ENOENT);
 
 	buflen = 1024;
 	buf = malloc(buflen, M_TEMP, M_WAITOK);
 	error = 0;
 
 	/*
 	 * For Getattr, get the Change attribute (va_filerev) and size (va_size)
 	 * from the MetaData file's extended attribute.
 	 */
 	if (ioproc == NFSPROC_GETATTR) {
 		error = vn_extattr_get(vp, IO_NODELOCKED,
 		    EXTATTR_NAMESPACE_SYSTEM, "pnfsd.dsattr", &buflen, buf,
 		    p);
 		if (error == 0) {
 			/*
 			 * The attribute exists in two layouts, told apart
 			 * by size: struct opnfsdsattr, which has no
 			 * dsa_bytes field, and struct pnfsdsattr.
 			 */
 			if (buflen == sizeof(odsattr)) {
 				NFSBCOPY(buf, &odsattr, buflen);
 				nap->na_filerev = odsattr.dsa_filerev;
 				nap->na_size = odsattr.dsa_size;
 				nap->na_atime = odsattr.dsa_atime;
 				nap->na_mtime = odsattr.dsa_mtime;
 				/*
 				 * Fake na_bytes by rounding up na_size.
 				 * Since we don't know the block size, just
 				 * use BLKDEV_IOSIZE.
 				 */
 				nap->na_bytes = (odsattr.dsa_size +
 				    BLKDEV_IOSIZE - 1) & ~(BLKDEV_IOSIZE - 1);
 			} else if (buflen == sizeof(dsattr)) {
 				NFSBCOPY(buf, &dsattr, buflen);
 				nap->na_filerev = dsattr.dsa_filerev;
 				nap->na_size = dsattr.dsa_size;
 				nap->na_atime = dsattr.dsa_atime;
 				nap->na_mtime = dsattr.dsa_mtime;
 				nap->na_bytes = dsattr.dsa_bytes;
 			} else
 				error = ENXIO;
 		}
 		if (error == 0) {
 			/*
 			 * If nfsrv_pnfsgetdsattr is 0 or nfsrv_checkdsattr()
 			 * returns 0, just return now.  nfsrv_checkdsattr()
 			 * returns 0 if there is no Read/Write layout
 			 * plus either an Open/Write_access or Write
 			 * delegation issued to a client for the file.
 			 */
 			if (nfsrv_pnfsgetdsattr == 0 ||
 			    nfsrv_checkdsattr(vp, p) == 0) {
 				free(buf, M_TEMP);
 				return (error);
 			}
 		}
 
 		/*
 		 * Clear ENOATTR so the code below will attempt to do a
 		 * nfsrv_getattrdsrpc() to get the attributes and (re)create
 		 * the extended attribute.
 		 */
 		if (error == ENOATTR)
 			error = 0;
 	}
 
 	/*
 	 * origmircnt records the mirror count seen on the first pass and
 	 * bounds the number of retries; trycnt counts the single-mirror
 	 * operations that failed and want a retry.
 	 */
 	origmircnt = -1;
 	trycnt = 0;
 tryagain:
 	if (error == 0) {
 		buflen = 1024;
 		if (ioproc == NFSPROC_READDS && NFSVOPISLOCKED(vp) ==
 		    LK_EXCLUSIVE)
 			printf("nfsrv_proxyds: Readds vp exclusively locked\n");
 		/*
 		 * Get the DS directory vnodes (returned locked LK_SHARED)
 		 * and the DS file handles for every available mirror.
 		 */
 		error = nfsrv_dsgetsockmnt(vp, LK_SHARED, buf, &buflen,
 		    &mirrorcnt, p, dvp, fh, NULL, NULL, NULL, NULL, NULL,
 		    NULL, NULL);
 		if (error == 0) {
 			for (i = 0; i < mirrorcnt; i++)
 				nmp[i] = VFSTONFS(dvp[i]->v_mount);
 		} else
 			printf("pNFS: proxy getextattr sockaddr=%d\n", error);
 	} else
 		printf("pNFS: nfsrv_dsgetsockmnt=%d\n", error);
 	if (error == 0) {
 		failpos = -1;
 		if (origmircnt == -1)
 			origmircnt = mirrorcnt;
 		/*
 		 * If failpos is set to a mirror#, then that mirror has
 		 * failed and will be disabled. For Read, Getattr and Seek, the
 		 * function only tries one mirror, so if that mirror has
 		 * failed, it will need to be retried. As such, increment
 		 * trycnt for these cases.
 		 * For Write, Setattr and Setacl, the function tries all
 		 * mirrors and will not return an error for the case where
 		 * one mirror has failed. For these cases, the functioning
 		 * mirror(s) will have been modified, so a retry isn't
 		 * necessary. These functions will set failpos for the
 		 * failed mirror#.
 		 */
 		if (ioproc == NFSPROC_READDS) {
 			error = nfsrv_readdsrpc(fh, off, cnt, cred, p, nmp[0],
 			    mpp, mpp2);
 			if (nfsds_failerr(error) && mirrorcnt > 1) {
 				/*
 				 * Setting failpos will cause the mirror
 				 * to be disabled and then a retry of this
 				 * read is required.
 				 */
 				failpos = 0;
 				error = 0;
 				trycnt++;
 			}
 		} else if (ioproc == NFSPROC_WRITEDS)
 			error = nfsrv_writedsrpc(fh, off, cnt, cred, p, vp,
 			    &nmp[0], mirrorcnt, mpp, cp, &failpos);
 		else if (ioproc == NFSPROC_SETATTR)
 			error = nfsrv_setattrdsrpc(fh, cred, p, vp, &nmp[0],
 			    mirrorcnt, nap, &failpos);
 		else if (ioproc == NFSPROC_SETACL)
 			error = nfsrv_setacldsrpc(fh, cred, p, vp, &nmp[0],
 			    mirrorcnt, aclp, &failpos);
 		else if (ioproc == NFSPROC_SEEKDS) {
 			error = nfsrv_seekdsrpc(fh, offp, content, eofp, cred,
 			    p, nmp[0]);
 			if (nfsds_failerr(error) && mirrorcnt > 1) {
 				/*
 				 * Setting failpos will cause the mirror
 				 * to be disabled and then a retry of this
 				 * seek is required.
 				 */
 				failpos = 0;
 				error = 0;
 				trycnt++;
 			}
 		} else if (ioproc == NFSPROC_ALLOCATE)
 			error = nfsrv_allocatedsrpc(fh, off, *offp, cred, p, vp,
 			    &nmp[0], mirrorcnt, &failpos);
 		else {
 			/* Getattr and Layoutreturn use the last mirror. */
 			error = nfsrv_getattrdsrpc(&fh[mirrorcnt - 1], cred, p,
 			    vp, nmp[mirrorcnt - 1], nap);
 			if (nfsds_failerr(error) && mirrorcnt > 1) {
 				/*
 				 * Setting failpos will cause the mirror
 				 * to be disabled and then a retry of this
 				 * getattr is required.
 				 */
 				failpos = mirrorcnt - 1;
 				error = 0;
 				trycnt++;
 			}
 		}
 		ds = NULL;
 		if (failpos >= 0) {
 			/*
 			 * Disable the failed mirror, unless a forced dismount
 			 * or another cancel is already in progress on it.
 			 * NFSMNTP_CANCELRPCS is held across the unlocked
 			 * section and cleared, with a wakeup(), afterwards.
 			 */
 			failnmp = nmp[failpos];
 			NFSLOCKMNT(failnmp);
 			if ((failnmp->nm_privflag & (NFSMNTP_FORCEDISM |
 			     NFSMNTP_CANCELRPCS)) == 0) {
 				failnmp->nm_privflag |= NFSMNTP_CANCELRPCS;
 				NFSUNLOCKMNT(failnmp);
 				ds = nfsrv_deldsnmp(PNFSDOP_DELDSSERVER,
 				    failnmp, p);
 				NFSD_DEBUG(4, "dsldsnmp fail=%d ds=%p\n",
 				    failpos, ds);
 				if (ds != NULL)
 					nfsrv_killrpcs(failnmp);
 				NFSLOCKMNT(failnmp);
 				failnmp->nm_privflag &= ~NFSMNTP_CANCELRPCS;
 				wakeup(failnmp);
 			}
 			NFSUNLOCKMNT(failnmp);
 		}
 		/* Drop the locks nfsrv_dsgetsockmnt() took on the DS dirs. */
 		for (i = 0; i < mirrorcnt; i++)
 			NFSVOPUNLOCK(dvp[i]);
 		NFSD_DEBUG(4, "nfsrv_proxyds: aft RPC=%d trya=%d\n", error,
 		    trycnt);
 		/* Try the Read/Getattr again if a mirror was deleted. */
 		if (ds != NULL && trycnt > 0 && trycnt < origmircnt)
 			goto tryagain;
 	} else {
 		/* Return ENOENT for any Extended Attribute error. */
 		error = ENOENT;
 	}
 	free(buf, M_TEMP);
 	NFSD_DEBUG(4, "nfsrv_proxyds: error=%d\n", error);
 	return (error);
 }
 
 /*
  * Get the DS mount point, fh and directory from the "pnfsd.dsfile" extended
  * attribute.
  * vp - The MDS file vnode; it must be locked.
  * lktype - If non-zero, the lock type used for each DS directory vnode,
  *          which is then left locked upon success.  If zero, the DS
  *          directory vnodes are returned unlocked.
  * buf, buflenp - Buffer that receives the extended attribute and its
  *          size; *buflenp is updated by vn_extattr_get().
  * mirrorcntp - Set to the number of mirrors found upon success.
  * dvpp - If non-NULL, an array filled in with the DS directory vnodes.
  * fhp - If non-NULL, an array filled in with the DS file handles.
  * devid - If non-NULL, filled in with NFSX_V4DEVICEID bytes per mirror.
  * fnamep - If non-NULL, receives the DS file name of the first mirror
  *          found.
  * nvpp - If non-NULL, *nvpp is set to the DS file vnode returned by
  *          nfsrv_pnfslookupds() for the first mirror that is looked up
  *          and *dsdirp to that mirror's directory index.
  * newnmpp - If it points to a non-NULL nmp, that is the destination and needs
  *           to be checked.  If it points to a NULL nmp, then it returns
  *           a suitable destination.
  * curnmp - If non-NULL, it is the source mount for the copy.
  * ippos - If non-NULL, set to the index of the entry that matches curnmp
  *          or, when curnmp is NULL, of an entry with the IP address
  *          0.0.0.0; -1 if there is none.
  * Returns 0 upon success.  ENOATTR is returned for a malformed attribute,
  * ENXIO if curnmp is not in the mirror list, EEXIST if *newnmpp already
  * holds a mirror and ENOENT if no DS was found for any entry.  Errors
  * from vn_extattr_get(), vn_lock() and nfsrv_pnfslookupds() are passed
  * through.
  */
 int
 nfsrv_dsgetsockmnt(struct vnode *vp, int lktype, char *buf, int *buflenp,
     int *mirrorcntp, NFSPROC_T *p, struct vnode **dvpp, fhandle_t *fhp,
     char *devid, char *fnamep, struct vnode **nvpp, struct nfsmount **newnmpp,
     struct nfsmount *curnmp, int *ippos, int *dsdirp)
 {
 	struct vnode *dvp, *nvp = NULL, **tdvpp;
 	struct mount *mp;
 	struct nfsmount *nmp, *newnmp;
 	struct sockaddr *sad;
 	struct sockaddr_in *sin;
 	struct nfsdevice *ds, *tds, *fndds;
 	struct pnfsdsfile *pf;
 	uint32_t dsdir;
 	int error, fhiszero, fnd, gotone, i, mirrorcnt;
 
 	ASSERT_VOP_LOCKED(vp, "nfsrv_dsgetsockmnt vp");
 	*mirrorcntp = 1;
 	tdvpp = dvpp;
 	if (nvpp != NULL)
 		*nvpp = NULL;
 	if (dvpp != NULL)
 		*dvpp = NULL;
 	if (ippos != NULL)
 		*ippos = -1;
 	if (newnmpp != NULL)
 		newnmp = *newnmpp;
 	else
 		newnmp = NULL;
 	mp = vp->v_mount;
 	error = vn_extattr_get(vp, IO_NODELOCKED, EXTATTR_NAMESPACE_SYSTEM,
 	    "pnfsd.dsfile", buflenp, buf, p);
 	/*
 	 * The attribute is an array of struct pnfsdsfile, one per mirror,
 	 * so its size must be an exact multiple of the structure size.
 	 */
 	mirrorcnt = *buflenp / sizeof(*pf);
 	if (error == 0 && (mirrorcnt < 1 || mirrorcnt > NFSDEV_MAXMIRRORS ||
 	    *buflenp != sizeof(*pf) * mirrorcnt))
 		error = ENOATTR;
 
 	pf = (struct pnfsdsfile *)buf;
 	/* If curnmp != NULL, check for a match in the mirror list. */
 	if (curnmp != NULL && error == 0) {
 		fnd = 0;
 		for (i = 0; i < mirrorcnt; i++, pf++) {
 			sad = (struct sockaddr *)&pf->dsf_sin;
 			if (nfsaddr2_match(sad, curnmp->nm_nam)) {
 				if (ippos != NULL)
 					*ippos = i;
 				fnd = 1;
 				break;
 			}
 		}
 		if (fnd == 0)
 			error = ENXIO;
 	}
 
 	/* gotone counts the entries for which a DS was found. */
 	gotone = 0;
 	pf = (struct pnfsdsfile *)buf;
 	NFSD_DEBUG(4, "nfsrv_dsgetsockmnt: mirrorcnt=%d err=%d\n", mirrorcnt,
 	    error);
 	for (i = 0; i < mirrorcnt && error == 0; i++, pf++) {
 		fhiszero = 0;
 		sad = (struct sockaddr *)&pf->dsf_sin;
 		sin = &pf->dsf_sin;
 		dsdir = pf->dsf_dir;
 		if (dsdir >= nfsrv_dsdirsize) {
 			printf("nfsrv_dsgetsockmnt: dsdir=%d\n", dsdir);
 			error = ENOATTR;
 		} else if (nvpp != NULL && newnmp != NULL &&
 		    nfsaddr2_match(sad, newnmp->nm_nam))
 			error = EEXIST;
 		if (error == 0) {
 			/*
 			 * Without a curnmp, note the position of an entry
 			 * with the IP address 0.0.0.0.
 			 */
 			if (ippos != NULL && curnmp == NULL &&
 			    sad->sa_family == AF_INET &&
 			    sin->sin_addr.s_addr == 0)
 				*ippos = i;
 			if (NFSBCMP(&zerofh, &pf->dsf_fh, sizeof(zerofh)) == 0)
 				fhiszero = 1;
 			/* Use the socket address to find the mount point. */
 			fndds = NULL;
 			NFSDDSLOCK();
 			/* Find a match for the IP address. */
 			TAILQ_FOREACH(ds, &nfsrv_devidhead, nfsdev_list) {
 				if (ds->nfsdev_nmp != NULL) {
 					dvp = ds->nfsdev_dvp;
 					nmp = VFSTONFS(dvp->v_mount);
 					if (nmp != ds->nfsdev_nmp)
 						printf("different2 nmp %p %p\n",
 						    nmp, ds->nfsdev_nmp);
 					if (nfsaddr2_match(sad, nmp->nm_nam)) {
 						fndds = ds;
 						break;
 					}
 				}
 			}
 			if (fndds != NULL && newnmpp != NULL &&
 			    newnmp == NULL) {
 				/* Search for a place to make a mirror copy. */
 				TAILQ_FOREACH(tds, &nfsrv_devidhead,
 				    nfsdev_list) {
 					if (tds->nfsdev_nmp != NULL &&
 					    fndds != tds &&
 					    ((tds->nfsdev_mdsisset == 0 &&
 					      fndds->nfsdev_mdsisset == 0) ||
 					     (tds->nfsdev_mdsisset != 0 &&
 					      fndds->nfsdev_mdsisset != 0 &&
 					      fsidcmp(&tds->nfsdev_mdsfsid,
 					      &mp->mnt_stat.f_fsid) == 0))) {
 						*newnmpp = tds->nfsdev_nmp;
 						break;
 					}
 				}
 				if (tds != NULL) {
 					/*
 					 * Move this entry to the end of the
 					 * list, so it won't be selected as
 					 * easily the next time.
 					 */
 					TAILQ_REMOVE(&nfsrv_devidhead, tds,
 					    nfsdev_list);
 					TAILQ_INSERT_TAIL(&nfsrv_devidhead, tds,
 					    nfsdev_list);
 				}
 			}
 			NFSDDSUNLOCK();
 			if (fndds != NULL) {
 				dvp = fndds->nfsdev_dsdir[dsdir];
 				/*
 				 * Lock dvp if the caller asked for it or if
 				 * a Lookup needs to be done below.
 				 */
 				if (lktype != 0 || fhiszero != 0 ||
 				    (nvpp != NULL && *nvpp == NULL)) {
 					if (fhiszero != 0)
 						error = vn_lock(dvp,
 						    LK_EXCLUSIVE);
 					else if (lktype != 0)
 						error = vn_lock(dvp, lktype);
 					else
 						error = vn_lock(dvp, LK_SHARED);
 					/*
 					 * If the file handle is all 0's, try to
 					 * do a Lookup against the DS to acquire
 					 * it.
 					 * If lktype == 0 or the Lookup fails,
 					 * unlock dvp after the call.
 					 */
 					if (error == 0 && (fhiszero != 0 ||
 					    (nvpp != NULL && *nvpp == NULL))) {
 						error = nfsrv_pnfslookupds(vp,
 						    dvp, pf, &nvp, p);
 						if (error == 0) {
 							if (fhiszero != 0)
 								nfsrv_pnfssetfh(
 								    vp, pf,
 								    devid,
 								    fnamep,
 								    nvp, p);
 							/*
 							 * NOTE(review): dsdirp
 							 * is dereferenced
 							 * without a NULL check;
 							 * callers that pass
 							 * nvpp presumably always
 							 * pass dsdirp -- confirm.
 							 */
 							if (nvpp != NULL &&
 							    *nvpp == NULL) {
 								*nvpp = nvp;
 								*dsdirp = dsdir;
 							} else
 								vput(nvp);
 						}
 						if (error != 0 || lktype == 0)
 							NFSVOPUNLOCK(dvp);
 					}
 				}
 				if (error == 0) {
 					gotone++;
 					NFSD_DEBUG(4, "gotone=%d\n", gotone);
 					if (devid != NULL) {
 						NFSBCOPY(fndds->nfsdev_deviceid,
 						    devid, NFSX_V4DEVICEID);
 						devid += NFSX_V4DEVICEID;
 					}
 					if (dvpp != NULL)
 						*tdvpp++ = dvp;
 					if (fhp != NULL)
 						NFSBCOPY(&pf->dsf_fh, fhp++,
 						    NFSX_MYFH);
 					if (fnamep != NULL && gotone == 1)
 						strlcpy(fnamep,
 						    pf->dsf_filename,
 						    sizeof(pf->dsf_filename));
 				} else
 					NFSD_DEBUG(4, "nfsrv_dsgetsockmnt "
 					    "err=%d\n", error);
 			}
 		}
 	}
 	if (error == 0 && gotone == 0)
 		error = ENOENT;
 
 	NFSD_DEBUG(4, "eo nfsrv_dsgetsockmnt: gotone=%d err=%d\n", gotone,
 	    error);
 	if (error == 0)
 		*mirrorcntp = gotone;
 	else {
 		if (gotone > 0 && dvpp != NULL) {
 			/*
 			 * If the error didn't occur on the first one and
 			 * dvpp != NULL, the one(s) prior to the failure will
 			 * have locked dvp's that need to be unlocked.
 			 * NOTE(review): when lktype == 0 the code above
 			 * appears to leave those dvp's unlocked, so this
 			 * would unlock vnodes that are not locked -- confirm.
 			 */
 			for (i = 0; i < gotone; i++) {
 				NFSVOPUNLOCK(*dvpp);
 				*dvpp++ = NULL;
 			}
 		}
 		/*
 		 * If it found the vnode to be copied from before a failure,
 		 * it needs to be vput()'d.
 		 */
 		if (nvpp != NULL && *nvpp != NULL) {
 			vput(*nvpp);
 			*nvpp = NULL;
 		}
 	}
 	return (error);
 }
 
 /*
  * Store the DS file attributes held in *nap (change attribute, size,
  * access and modify times and bytes used) in the "pnfsd.dsattr" extended
  * attribute of the MDS file "vp".
  * vp must be exclusively locked.  Returns the result of vn_extattr_set();
  * a failure is also logged.
  */
 static int
 nfsrv_setextattr(struct vnode *vp, struct nfsvattr *nap, NFSPROC_T *p)
 {
 	struct pnfsdsattr newattr;
 	int ret;
 
 	ASSERT_VOP_ELOCKED(vp, "nfsrv_setextattr vp");
 
 	/* Copy over the fields that are kept in the attribute. */
 	newattr.dsa_bytes = nap->na_bytes;
 	newattr.dsa_mtime = nap->na_mtime;
 	newattr.dsa_atime = nap->na_atime;
 	newattr.dsa_size = nap->na_size;
 	newattr.dsa_filerev = nap->na_filerev;
 
 	ret = vn_extattr_set(vp, IO_NODELOCKED, EXTATTR_NAMESPACE_SYSTEM,
 	    "pnfsd.dsattr", sizeof(newattr), (char *)&newattr, p);
 	if (ret == 0)
 		return (0);
 	printf("pNFS: setextattr=%d\n", ret);
 	return (ret);
 }
 
 /*
  * Do a Read RPC against the DS file with file handle *fhp on the DS
  * mount "nmp", for "len" bytes starting at offset "off".
  * When data is returned, *mpp is set to the head of an mbuf chain that
  * holds only the data (plus XDR padding to a multiple of 4 bytes) and
  * *mpendp to the last mbuf of that chain.  Otherwise *mpp is NULL and
  * *mpendp is not set.
  * Returns 0, the error from newnfs_request(), the status in the reply
  * (nd_repstat) or ENOENT if the reply's mbuf chain is inconsistent.
  */
 static int
 nfsrv_readdsrpc(fhandle_t *fhp, off_t off, int len, struct ucred *cred,
     NFSPROC_T *p, struct nfsmount *nmp, struct mbuf **mpp, struct mbuf **mpendp)
 {
 	uint32_t *tl;
 	struct nfsrv_descript *nd;
 	nfsv4stateid_t st;
 	struct mbuf *m, *m2;
 	int error = 0, retlen, tlen, trimlen;
 
 	NFSD_DEBUG(4, "in nfsrv_readdsrpc\n");
 	nd = malloc(sizeof(*nd), M_TEMP, M_WAITOK | M_ZERO);
 	*mpp = NULL;
 	/*
 	 * Use a stateid where other is an alternating 01010 pattern and
 	 * seqid is 0xffffffff.  This value is not defined as special by
 	 * the RFC and is used by the FreeBSD NFS server to indicate an
 	 * MDS->DS proxy operation.
 	 */
 	st.other[0] = 0x55555555;
 	st.other[1] = 0x55555555;
 	st.other[2] = 0x55555555;
 	st.seqid = 0xffffffff;
 	nfscl_reqstart(nd, NFSPROC_READDS, nmp, (u_int8_t *)fhp, sizeof(*fhp),
 	    NULL, NULL, 0, 0);
 	nfsm_stateidtom(nd, &st, NFSSTATEID_PUTSTATEID);
 	/* The Read arguments: 64bit offset followed by the 32bit count. */
 	NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED * 3);
 	txdr_hyper(off, tl);
 	*(tl + 2) = txdr_unsigned(len);
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error != 0) {
 		free(nd, M_TEMP);
 		return (error);
 	}
 	if (nd->nd_repstat == 0) {
 		/*
 		 * NOTE(review): NFSM_DISSECT() and NFSM_STRSIZ() presumably
 		 * set error and jump to nfsmout for a malformed reply --
 		 * confirm against the macro definitions.
 		 */
 		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
 		NFSM_STRSIZ(retlen, len);
 		if (retlen > 0) {
 			/* Trim off the pre-data XDR from the mbuf chain. */
 			m = nd->nd_mrep;
 			while (m != NULL && m != nd->nd_md) {
 				if (m->m_next == nd->nd_md) {
 					m->m_next = NULL;
 					m_freem(nd->nd_mrep);
 					nd->nd_mrep = m = nd->nd_md;
 				} else
 					m = m->m_next;
 			}
 			if (m == NULL) {
 				printf("nfsrv_readdsrpc: busted mbuf list\n");
 				error = ENOENT;
 				goto nfsmout;
 			}
 
 			/*
 			 * Now, adjust first mbuf so that any XDR before the
 			 * read data is skipped over.
 			 */
 			trimlen = nd->nd_dpos - mtod(m, char *);
 			if (trimlen > 0) {
 				m->m_len -= trimlen;
 				NFSM_DATAP(m, trimlen);
 			}
 
 			/*
 			 * Truncate the mbuf chain at retlen bytes of data,
 			 * plus XDR padding that brings the length up to a
 			 * multiple of 4.
 			 */
 			tlen = NFSM_RNDUP(retlen);
 			do {
 				if (m->m_len >= tlen) {
 					m->m_len = tlen;
 					tlen = 0;
 					m2 = m->m_next;
 					m->m_next = NULL;
 					m_freem(m2);
 					break;
 				}
 				tlen -= m->m_len;
 				m = m->m_next;
 			} while (m != NULL);
 			/* The chain ended before tlen bytes were found. */
 			if (tlen > 0) {
 				printf("nfsrv_readdsrpc: busted mbuf list\n");
 				error = ENOENT;
 				goto nfsmout;
 			}
 			/* Hand the chain to the caller; don't free it below. */
 			*mpp = nd->nd_mrep;
 			*mpendp = m;
 			nd->nd_mrep = NULL;
 		}
 	} else
 		error = nd->nd_repstat;
 nfsmout:
 	/* If nd->nd_mrep is already NULL, this is a no-op. */
 	m_freem(nd->nd_mrep);
 	free(nd, M_TEMP);
 	NFSD_DEBUG(4, "nfsrv_readdsrpc error=%d\n", error);
 	return (error);
 }
 
 /*
  * Do a write RPC on a DS data file, using this structure for the arguments,
  * so that this function can be executed by a separate kernel process.
  * The fields mirror the arguments of nfsrv_writedsdorpc(); the structure
  * is filled in by nfsrv_writedsrpc() and consumed by start_writedsdorpc().
  */
 struct nfsrvwritedsdorpc {
 	/* Set to 1 by start_writedsdorpc() once the RPC has completed. */
 	int			done;
 	/*
 	 * Cleared by nfsrv_writedsrpc() before queueing.  Presumably set
 	 * by nfs_pnfsio() when the task is queued -- confirm there.
 	 */
 	int			inprog;
 	/* Task entry handed to the taskqueue. */
 	struct task		tsk;
 	/* File handle of the DS data file (a copy, not a pointer). */
 	fhandle_t		fh;
 	/* Byte offset of the write. */
 	off_t			off;
 	/* Number of bytes to write. */
 	int			len;
 	/* Mount of the DS the write is sent to. */
 	struct nfsmount		*nmp;
 	/* Credentials used for the RPC. */
 	struct ucred		*cred;
 	/* Thread passed to nfsrv_writedsdorpc(). */
 	NFSPROC_T		*p;
 	/* Copy of the data, as an mbuf chain made with m_copym(). */
 	struct mbuf		*m;
 	/* Return value of nfsrv_writedsdorpc(). */
 	int			err;
 };
 
 /*
  * Do a Write RPC, followed by a Getattr in the same request, against the
  * DS file with file handle *fhp on the DS mount "nmp".
  * "len" bytes at offset "off" are written FileSync; the data is the mbuf
  * chain "m", which is linked onto the end of the request.
  * "nap" is handed to nfsv4_loadattr() to receive the attributes in the
  * reply; callers in this file pass NULL when they have no use for them.
  * Returns 0, the error from newnfs_request() or nfsv4_loadattr(), the
  * status in the reply (nd_repstat) or NFSERR_IO if the DS did not commit
  * the data FileSync.
  */
 static int
 nfsrv_writedsdorpc(struct nfsmount *nmp, fhandle_t *fhp, off_t off, int len,
     struct nfsvattr *nap, struct mbuf *m, struct ucred *cred, NFSPROC_T *p)
 {
 	uint32_t *tl;
 	struct nfsrv_descript *nd;
 	nfsattrbit_t attrbits;
 	nfsv4stateid_t st;
 	int commit, error, retlen;
 
 	nd = malloc(sizeof(*nd), M_TEMP, M_WAITOK | M_ZERO);
 	nfscl_reqstart(nd, NFSPROC_WRITE, nmp, (u_int8_t *)fhp,
 	    sizeof(fhandle_t), NULL, NULL, 0, 0);
 
 	/*
 	 * Use a stateid where other is an alternating 01010 pattern and
 	 * seqid is 0xffffffff.  This value is not defined as special by
 	 * the RFC and is used by the FreeBSD NFS server to indicate an
 	 * MDS->DS proxy operation.
 	 */
 	st.other[0] = 0x55555555;
 	st.other[1] = 0x55555555;
 	st.other[2] = 0x55555555;
 	st.seqid = 0xffffffff;
 	nfsm_stateidtom(nd, &st, NFSSTATEID_PUTSTATEID);
 	/* The Write arguments: 64bit offset, stable_how and byte count. */
 	NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER + 2 * NFSX_UNSIGNED);
 	txdr_hyper(off, tl);
 	tl += 2;
 	/*
 	 * Do all writes FileSync, since the server doesn't hold onto dirty
 	 * buffers.  Since clients should be accessing the DS servers directly
 	 * using the pNFS layouts, this just needs to work correctly as a
 	 * fallback.
 	 */
 	*tl++ = txdr_unsigned(NFSWRITE_FILESYNC);
 	*tl = txdr_unsigned(len);
 	NFSD_DEBUG(4, "nfsrv_writedsdorpc: len=%d\n", len);
 
 	/* Put data in mbuf chain. */
 	nd->nd_mb->m_next = m;
 
 	/* Set nd_mb and nd_bpos to end of data. */
 	while (m->m_next != NULL)
 		m = m->m_next;
 	nd->nd_mb = m;
 	nfsm_set(nd, m->m_len);
 	NFSD_DEBUG(4, "nfsrv_writedsdorpc: lastmb len=%d\n", m->m_len);
 
 	/* Do a Getattr for the attributes that change upon writing. */
 	NFSZERO_ATTRBIT(&attrbits);
 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_SIZE);
 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_CHANGE);
 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEACCESS);
 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEMODIFY);
 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_SPACEUSED);
 	NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(NFSV4OP_GETATTR);
 	(void) nfsrv_putattrbit(nd, &attrbits);
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p,
 	    cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error != 0) {
 		free(nd, M_TEMP);
 		return (error);
 	}
 	NFSD_DEBUG(4, "nfsrv_writedsdorpc: aft writerpc=%d\n", nd->nd_repstat);
 	/* Get rid of weak cache consistency data for now. */
 	if ((nd->nd_flag & (ND_NOMOREDATA | ND_NFSV4 | ND_V4WCCATTR)) ==
 	    (ND_NFSV4 | ND_V4WCCATTR)) {
 		error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, NULL, NULL,
 		    NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL);
 		NFSD_DEBUG(4, "nfsrv_writedsdorpc: wcc attr=%d\n", error);
 		if (error != 0)
 			goto nfsmout;
 		/*
 		 * Get rid of Op# and status for next op.
 		 * A non-zero status means no further results follow.
 		 */
 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 		if (*++tl != 0)
 			nd->nd_flag |= ND_NOMOREDATA;
 	}
 	if (nd->nd_repstat == 0) {
 		/* The Write reply: byte count, commit level and verifier. */
 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED + NFSX_VERF);
 		retlen = fxdr_unsigned(int, *tl++);
 		commit = fxdr_unsigned(int, *tl);
 		if (commit != NFSWRITE_FILESYNC)
 			error = NFSERR_IO;
 		NFSD_DEBUG(4, "nfsrv_writedsdorpc:retlen=%d commit=%d err=%d\n",
 		    retlen, commit, error);
 	} else
 		error = nd->nd_repstat;
 	/* We have no use for the Write Verifier since we use FileSync. */
 
 	/*
 	 * Get the Change, Size, Access Time and Modify Time attributes and set
 	 * on the Metadata file, so its attributes will be what the file's
 	 * would be if it had been written.
 	 */
 	if (error == 0) {
 		/* Skip over the Op# and status of the Getattr. */
 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 		error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, NULL, NULL,
 		    NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL);
 	}
 	NFSD_DEBUG(4, "nfsrv_writedsdorpc: aft loadattr=%d\n", error);
 nfsmout:
 	m_freem(nd->nd_mrep);
 	free(nd, M_TEMP);
 	NFSD_DEBUG(4, "nfsrv_writedsdorpc error=%d\n", error);
 	return (error);
 }
 
 /*
  * Task entry point that runs nfsrv_writedsdorpc() with the arguments
  * stored in the struct nfsrvwritedsdorpc that "arg" points to.
  * No attributes are requested (nap is NULL).  The result is left in
  * the structure's err field and its done field is set to 1.
  * "pending" is not used.
  */
 static void
 start_writedsdorpc(void *arg, int pending)
 {
 	struct nfsrvwritedsdorpc *wr = arg;
 
 	wr->err = nfsrv_writedsdorpc(wr->nmp, &wr->fh, wr->off, wr->len,
 	    NULL, wr->m, wr->cred, wr->p);
 	wr->done = 1;
 	NFSD_DEBUG(4, "start_writedsdorpc: err=%d\n", wr->err);
 }
 
 /*
  * Do a Write RPC against every mirrored DS for a file.
  *
  * fhp and nmpp are walked in step, one entry per DS.  The data to write
  * is the len bytes that start at cp, which is converted to an offset from
  * the start of the data in the first mbuf of *mpp.  A separate copy of
  * that data is made for each DS.
  *
  * All but the last DS are handed to nfs_pnfsio(), so that a separate
  * kernel process can do the RPC.  If nfs_pnfsiothreads is 0 or
  * nfs_pnfsio() fails, the RPC is done synchronously instead.  The last DS
  * is always done synchronously and is the only one that fills in "na",
  * which is then saved via nfsrv_setextattr().
  *
  * If *failposp is -1, it is set to the index of the first DS for which
  * nfsds_failerr() reports a failure.  Returns 0, the first RPC error that
  * was not recorded in *failposp, or the error from nfsrv_setextattr().
  */
 static int
 nfsrv_writedsrpc(fhandle_t *fhp, off_t off, int len, struct ucred *cred,
     NFSPROC_T *p, struct vnode *vp, struct nfsmount **nmpp, int mirrorcnt,
     struct mbuf **mpp, char *cp, int *failposp)
 {
 	struct nfsrvwritedsdorpc *drpc, *tdrpc = NULL;
 	struct nfsvattr na;
 	struct mbuf *m;
 	int error, i, offs, ret, timo;
 
 	NFSD_DEBUG(4, "in nfsrv_writedsrpc\n");
 	KASSERT(*mpp != NULL, ("nfsrv_writedsrpc: NULL mbuf chain"));
 	drpc = NULL;
 	if (mirrorcnt > 1)
 		tdrpc = drpc = malloc(sizeof(*drpc) * (mirrorcnt - 1), M_TEMP,
 		    M_WAITOK);
 
 	/* Calculate offset in mbuf chain that data starts. */
 	offs = cp - mtod(*mpp, char *);
 	NFSD_DEBUG(4, "nfsrv_writedsrpc: mcopy offs=%d len=%d\n", offs, len);
 
 	/*
 	 * Do the write RPC for every DS, using a separate kernel process
 	 * for every DS except the last one.
 	 */
 	error = 0;
 	for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) {
 		tdrpc->done = 0;
 		NFSBCOPY(fhp, &tdrpc->fh, sizeof(*fhp));
 		tdrpc->off = off;
 		tdrpc->len = len;
 		tdrpc->nmp = *nmpp;
 		tdrpc->cred = cred;
 		tdrpc->p = p;
 		tdrpc->inprog = 0;
 		tdrpc->err = 0;
 		tdrpc->m = m_copym(*mpp, offs, NFSM_RNDUP(len), M_WAITOK);
 		/* Non-zero "ret" selects the synchronous fallback below. */
 		ret = EIO;
 		if (nfs_pnfsiothreads != 0) {
 			ret = nfs_pnfsio(start_writedsdorpc, tdrpc);
 			NFSD_DEBUG(4, "nfsrv_writedsrpc: nfs_pnfsio=%d\n",
 			    ret);
 		}
 		if (ret != 0) {
 			ret = nfsrv_writedsdorpc(*nmpp, fhp, off, len, NULL,
 			    tdrpc->m, cred, p);
 			if (nfsds_failerr(ret) && *failposp == -1)
 				*failposp = i;
 			else if (error == 0 && ret != 0)
 				error = ret;
 		}
 		nmpp++;
 		fhp++;
 	}
 	/* The last DS is always done by this thread and fills in "na". */
 	m = m_copym(*mpp, offs, NFSM_RNDUP(len), M_WAITOK);
 	ret = nfsrv_writedsdorpc(*nmpp, fhp, off, len, &na, m, cred, p);
 	if (nfsds_failerr(ret) && *failposp == -1 && mirrorcnt > 1)
 		*failposp = mirrorcnt - 1;
 	else if (error == 0 && ret != 0)
 		error = ret;
 	/*
 	 * When the last RPC fails with an error that is recorded in
 	 * *failposp, "error" remains 0 but "na" has not been filled in.
 	 * Only save the attributes when that RPC actually succeeded, so
 	 * that uninitialized stack data is never stored.
 	 */
 	if (error == 0 && ret == 0)
 		error = nfsrv_setextattr(vp, &na, p);
 	NFSD_DEBUG(4, "nfsrv_writedsrpc: aft setextat=%d\n", error);
 	tdrpc = drpc;
 	timo = hz / 50;		/* Wait for 20msec. */
 	if (timo < 1)
 		timo = 1;
 	for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) {
 		/* Wait for RPCs on separate threads to complete. */
 		while (tdrpc->inprog != 0 && tdrpc->done == 0)
 			tsleep(&tdrpc->tsk, PVFS, "srvwrds", timo);
 		if (nfsds_failerr(tdrpc->err) && *failposp == -1)
 			*failposp = i;
 		else if (error == 0 && tdrpc->err != 0)
 			error = tdrpc->err;
 	}
 	free(drpc, M_TEMP);
 	return (error);
 }
 
 /*
  * Do an allocate RPC on a DS data file, using this structure for the
  * arguments, so that this function can be executed by a separate kernel
  * process.
  * NOTE(review): done, inprog and tsk lead every *dsdorpc structure in this
  * file; presumably nfs_pnfsio() relies on that layout - confirm before
  * reordering fields.
  */
 struct nfsrvallocatedsdorpc {
 	int			done;	/* Set to 1 by start_allocatedsdorpc(). */
 	int			inprog;	/* Cleared by caller; polled with done. */
 	struct task		tsk;	/* Its address is the tsleep() channel. */
 	fhandle_t		fh;	/* Copy of the DS file handle. */
 	off_t			off;	/* Offset argument for the RPC. */
 	off_t			len;	/* Length argument for the RPC. */
 	struct nfsmount		*nmp;	/* Mount point for the DS. */
 	struct ucred		*cred;
 	NFSPROC_T		*p;
 	int			err;	/* Return value of the RPC function. */
 };
 
 /*
  * Do an Allocate RPC, followed by a Getattr, against one DS.
  * When the reply status is 0, the attributes in the reply are parsed by
  * nfsv4_loadattr() using "nap".  Callers in this file pass NULL for "nap"
  * for every DS except the last one.
  * Returns 0, the error from newnfs_request(), the reply status
  * (nd_repstat) or the error from parsing the reply.
  */
 static int
 nfsrv_allocatedsdorpc(struct nfsmount *nmp, fhandle_t *fhp, off_t off,
     off_t len, struct nfsvattr *nap, struct ucred *cred, NFSPROC_T *p)
 {
 	uint32_t *tl;
 	struct nfsrv_descript *nd;
 	nfsattrbit_t attrbits;
 	nfsv4stateid_t st;
 	int error;
 
 	nd = malloc(sizeof(*nd), M_TEMP, M_WAITOK | M_ZERO);
 	nfscl_reqstart(nd, NFSPROC_ALLOCATE, nmp, (u_int8_t *)fhp,
 	    sizeof(fhandle_t), NULL, NULL, 0, 0);
 
 	/*
 	 * Use a stateid where other is an alternating 01010 pattern and
 	 * seqid is 0xffffffff.  This value is not defined as special by
 	 * the RFC and is used by the FreeBSD NFS server to indicate an
 	 * MDS->DS proxy operation.
 	 */
 	st.other[0] = 0x55555555;
 	st.other[1] = 0x55555555;
 	st.other[2] = 0x55555555;
 	st.seqid = 0xffffffff;
 	nfsm_stateidtom(nd, &st, NFSSTATEID_PUTSTATEID);
 	/* Room for the offset and length, plus the op# of the Getattr. */
 	NFSM_BUILD(tl, uint32_t *, 2 * NFSX_HYPER + NFSX_UNSIGNED);
 	txdr_hyper(off, tl); tl += 2;
 	txdr_hyper(len, tl); tl += 2;
 	NFSD_DEBUG(4, "nfsrv_allocatedsdorpc: len=%jd\n", (intmax_t)len);
 
 	*tl = txdr_unsigned(NFSV4OP_GETATTR);
 	NFSGETATTR_ATTRBIT(&attrbits);
 	nfsrv_putattrbit(nd, &attrbits);
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p,
 	    cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error != 0) {
 		/* Only nd is released on this path; nd_mrep is not touched. */
 		free(nd, M_TEMP);
 		return (error);
 	}
 	NFSD_DEBUG(4, "nfsrv_allocatedsdorpc: aft allocaterpc=%d\n",
 	    nd->nd_repstat);
 	if (nd->nd_repstat == 0) {
 		/*
 		 * Step over two words before the attributes (presumably the
 		 * Op# and status of the Getattr).
 		 * NOTE(review): NFSM_DISSECT() appears to be what jumps to
 		 * nfsmout on a short reply - confirm against the macro.
 		 */
 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 		error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, NULL, NULL,
 		    NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL);
 	} else
 		error = nd->nd_repstat;
 	NFSD_DEBUG(4, "nfsrv_allocatedsdorpc: aft loadattr=%d\n", error);
 nfsmout:
 	/* Common exit once a reply exists: free the reply and nd. */
 	m_freem(nd->nd_mrep);
 	free(nd, M_TEMP);
 	NFSD_DEBUG(4, "nfsrv_allocatedsdorpc error=%d\n", error);
 	return (error);
 }
 
 /*
  * Task entry point used to run nfsrv_allocatedsdorpc() for one DS.
  * "arg" is the struct nfsrvallocatedsdorpc that carries the arguments.
  * The result of the RPC is stored in its err field and done is set to 1
  * once the call has returned.
  */
 static void
 start_allocatedsdorpc(void *arg, int pending)
 {
 	struct nfsrvallocatedsdorpc *al = arg;
 
 	al->err = nfsrv_allocatedsdorpc(al->nmp, &al->fh, al->off, al->len,
 	    NULL, al->cred, al->p);
 	al->done = 1;
 	NFSD_DEBUG(4, "start_allocatedsdorpc: err=%d\n", al->err);
 }
 
 /*
  * Do an Allocate RPC against every mirrored DS for a file.
  *
  * fhp and nmpp are walked in step, one entry per DS.  All but the last DS
  * are handed to nfs_pnfsio(), so that a separate kernel process can do
  * the RPC.  If nfs_pnfsiothreads is 0 or nfs_pnfsio() fails, the RPC is
  * done synchronously instead.  The last DS is always done synchronously
  * and is the only one that fills in "na", which is then saved via
  * nfsrv_setextattr().
  *
  * If *failposp is -1, it is set to the index of the first DS for which
  * nfsds_failerr() reports a failure.  Returns 0, the first RPC error that
  * was not recorded in *failposp, or the error from nfsrv_setextattr().
  */
 static int
 nfsrv_allocatedsrpc(fhandle_t *fhp, off_t off, off_t len, struct ucred *cred,
     NFSPROC_T *p, struct vnode *vp, struct nfsmount **nmpp, int mirrorcnt,
     int *failposp)
 {
 	struct nfsrvallocatedsdorpc *drpc, *tdrpc = NULL;
 	struct nfsvattr na;
 	int error, i, ret, timo;
 
 	NFSD_DEBUG(4, "in nfsrv_allocatedsrpc\n");
 	drpc = NULL;
 	if (mirrorcnt > 1)
 		tdrpc = drpc = malloc(sizeof(*drpc) * (mirrorcnt - 1), M_TEMP,
 		    M_WAITOK);
 
 	/*
 	 * Do the allocate RPC for every DS, using a separate kernel process
 	 * for every DS except the last one.
 	 */
 	error = 0;
 	for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) {
 		tdrpc->done = 0;
 		NFSBCOPY(fhp, &tdrpc->fh, sizeof(*fhp));
 		tdrpc->off = off;
 		tdrpc->len = len;
 		tdrpc->nmp = *nmpp;
 		tdrpc->cred = cred;
 		tdrpc->p = p;
 		tdrpc->inprog = 0;
 		tdrpc->err = 0;
 		/* Non-zero "ret" selects the synchronous fallback below. */
 		ret = EIO;
 		if (nfs_pnfsiothreads != 0) {
 			ret = nfs_pnfsio(start_allocatedsdorpc, tdrpc);
 			NFSD_DEBUG(4, "nfsrv_allocatedsrpc: nfs_pnfsio=%d\n",
 			    ret);
 		}
 		if (ret != 0) {
 			ret = nfsrv_allocatedsdorpc(*nmpp, fhp, off, len, NULL,
 			    cred, p);
 			if (nfsds_failerr(ret) && *failposp == -1)
 				*failposp = i;
 			else if (error == 0 && ret != 0)
 				error = ret;
 		}
 		nmpp++;
 		fhp++;
 	}
 	/* The last DS is always done by this thread and fills in "na". */
 	ret = nfsrv_allocatedsdorpc(*nmpp, fhp, off, len, &na, cred, p);
 	if (nfsds_failerr(ret) && *failposp == -1 && mirrorcnt > 1)
 		*failposp = mirrorcnt - 1;
 	else if (error == 0 && ret != 0)
 		error = ret;
 	/*
 	 * When the last RPC fails with an error that is recorded in
 	 * *failposp, "error" remains 0 but "na" has not been filled in.
 	 * Only save the attributes when that RPC actually succeeded, so
 	 * that uninitialized stack data is never stored.
 	 */
 	if (error == 0 && ret == 0)
 		error = nfsrv_setextattr(vp, &na, p);
 	NFSD_DEBUG(4, "nfsrv_allocatedsrpc: aft setextat=%d\n", error);
 	tdrpc = drpc;
 	timo = hz / 50;		/* Wait for 20msec. */
 	if (timo < 1)
 		timo = 1;
 	for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) {
 		/* Wait for RPCs on separate threads to complete. */
 		while (tdrpc->inprog != 0 && tdrpc->done == 0)
 			tsleep(&tdrpc->tsk, PVFS, "srvalds", timo);
 		if (nfsds_failerr(tdrpc->err) && *failposp == -1)
 			*failposp = i;
 		else if (error == 0 && tdrpc->err != 0)
 			error = tdrpc->err;
 	}
 	free(drpc, M_TEMP);
 	return (error);
 }
 
 /*
  * Do a Setattr RPC, followed by a Getattr of the attributes that change
  * due to writing, against one DS.
  * "nap" supplies the attributes to set (via nfscl_fillsattr()) and the
  * attributes found in the reply are loaded into "dsnap".
  * Returns 0, the error from newnfs_request(), the reply status
  * (nd_repstat) or the error from parsing the reply.
  */
 static int
 nfsrv_setattrdsdorpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p,
     struct vnode *vp, struct nfsmount *nmp, struct nfsvattr *nap,
     struct nfsvattr *dsnap)
 {
 	uint32_t *tl;
 	struct nfsrv_descript *nd;
 	nfsv4stateid_t st;
 	nfsattrbit_t attrbits;
 	int error;
 
 	NFSD_DEBUG(4, "in nfsrv_setattrdsdorpc\n");
 	nd = malloc(sizeof(*nd), M_TEMP, M_WAITOK | M_ZERO);
 	/*
 	 * Use a stateid where other is an alternating 01010 pattern and
 	 * seqid is 0xffffffff.  This value is not defined as special by
 	 * the RFC and is used by the FreeBSD NFS server to indicate an
 	 * MDS->DS proxy operation.
 	 */
 	st.other[0] = 0x55555555;
 	st.other[1] = 0x55555555;
 	st.other[2] = 0x55555555;
 	st.seqid = 0xffffffff;
 	nfscl_reqstart(nd, NFSPROC_SETATTR, nmp, (u_int8_t *)fhp, sizeof(*fhp),
 	    NULL, NULL, 0, 0);
 	nfsm_stateidtom(nd, &st, NFSSTATEID_PUTSTATEID);
 	nfscl_fillsattr(nd, &nap->na_vattr, vp, NFSSATTR_FULL, 0);
 
 	/* Do a Getattr for the attributes that change due to writing. */
 	NFSZERO_ATTRBIT(&attrbits);
 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_SIZE);
 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_CHANGE);
 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEACCESS);
 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEMODIFY);
 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_SPACEUSED);
 	NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
 	*tl = txdr_unsigned(NFSV4OP_GETATTR);
 	(void) nfsrv_putattrbit(nd, &attrbits);
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error != 0) {
 		/* Only nd is released on this path; nd_mrep is not touched. */
 		free(nd, M_TEMP);
 		return (error);
 	}
 	NFSD_DEBUG(4, "nfsrv_setattrdsdorpc: aft setattrrpc=%d\n",
 	    nd->nd_repstat);
 	/* Get rid of weak cache consistency data for now. */
 	if ((nd->nd_flag & (ND_NOMOREDATA | ND_NFSV4 | ND_V4WCCATTR)) ==
 	    (ND_NFSV4 | ND_V4WCCATTR)) {
 		/* These attributes are overwritten by the load further down. */
 		error = nfsv4_loadattr(nd, NULL, dsnap, NULL, NULL, 0, NULL,
 		    NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL);
 		NFSD_DEBUG(4, "nfsrv_setattrdsdorpc: wcc attr=%d\n", error);
 		if (error != 0)
 			goto nfsmout;
 		/*
 		 * Get rid of Op# and status for next op.
 		 */
 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 		/* A non-zero second word (the status) ends the parsing. */
 		if (*++tl != 0)
 			nd->nd_flag |= ND_NOMOREDATA;
 	}
 	/* Parse the attribute bitmap that is part of the Setattr reply. */
 	error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL);
 	if (error != 0)
 		goto nfsmout;
 	if (nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 	/*
 	 * Get the Change, Size, Access Time and Modify Time attributes and set
 	 * on the Metadata file, so its attributes will be what the file's
 	 * would be if it had been written.
 	 */
 	if (error == 0) {
 		/* Step over two words, then load the Getattr attributes. */
 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 		error = nfsv4_loadattr(nd, NULL, dsnap, NULL, NULL, 0, NULL,
 		    NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL);
 	}
 	NFSD_DEBUG(4, "nfsrv_setattrdsdorpc: aft setattr loadattr=%d\n", error);
 nfsmout:
 	/* Common exit once a reply exists: free the reply and nd. */
 	m_freem(nd->nd_mrep);
 	free(nd, M_TEMP);
 	NFSD_DEBUG(4, "nfsrv_setattrdsdorpc error=%d\n", error);
 	return (error);
 }
 
 /*
  * Arguments and result for a Setattr RPC on a DS data file, so that
  * nfsrv_setattrdsdorpc() can be run via start_setattrdsdorpc().
  * NOTE(review): done, inprog and tsk lead every *dsdorpc structure in this
  * file; presumably nfs_pnfsio() relies on that layout - confirm before
  * reordering fields.
  */
 struct nfsrvsetattrdsdorpc {
 	int			done;	/* Set to 1 by start_setattrdsdorpc(). */
 	int			inprog;	/* Cleared by caller; polled with done. */
 	struct task		tsk;	/* Its address is the tsleep() channel. */
 	fhandle_t		fh;	/* Copy of the DS file handle. */
 	struct nfsmount		*nmp;	/* Mount point for the DS. */
 	struct vnode		*vp;
 	struct ucred		*cred;
 	NFSPROC_T		*p;
 	struct nfsvattr		na;	/* Copy of the attributes to set. */
 	struct nfsvattr		dsna;	/* Attributes loaded from the reply. */
 	int			err;	/* Return value of the RPC function. */
 };
 
 /*
  * Task entry point used to run nfsrv_setattrdsdorpc() for one DS.
  * "arg" is the struct nfsrvsetattrdsdorpc that carries the arguments.
  * The result of the RPC is stored in its err field and done is set to 1
  * once the call has returned.
  */
 static void
 start_setattrdsdorpc(void *arg, int pending)
 {
 	struct nfsrvsetattrdsdorpc *sa = arg;
 
 	sa->err = nfsrv_setattrdsdorpc(&sa->fh, sa->cred, sa->p, sa->vp,
 	    sa->nmp, &sa->na, &sa->dsna);
 	sa->done = 1;
 }
 
 /*
  * Do a Setattr RPC against every mirrored DS for a file.
  *
  * fhp and nmpp are walked in step, one entry per DS, and "nap" holds the
  * attributes to set.  All but the last DS are handed to nfs_pnfsio(), so
  * that a separate kernel process can do the RPC.  If nfs_pnfsiothreads is
  * 0 or nfs_pnfsio() fails, the RPC is done synchronously instead.  The
  * last DS is always done synchronously and the attributes it returns in
  * "na" are then saved via nfsrv_setextattr().
  *
  * If *failposp is -1, it is set to the index of the first DS for which
  * nfsds_failerr() reports a failure.  Returns 0, the first RPC error that
  * was not recorded in *failposp, or the error from nfsrv_setextattr().
  */
 static int
 nfsrv_setattrdsrpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p,
     struct vnode *vp, struct nfsmount **nmpp, int mirrorcnt,
     struct nfsvattr *nap, int *failposp)
 {
 	struct nfsrvsetattrdsdorpc *drpc, *tdrpc = NULL;
 	struct nfsvattr na;
 	int error, i, ret, timo;
 
 	NFSD_DEBUG(4, "in nfsrv_setattrdsrpc\n");
 	drpc = NULL;
 	if (mirrorcnt > 1)
 		tdrpc = drpc = malloc(sizeof(*drpc) * (mirrorcnt - 1), M_TEMP,
 		    M_WAITOK);
 
 	/*
 	 * Do the setattr RPC for every DS, using a separate kernel process
 	 * for every DS except the last one.
 	 */
 	error = 0;
 	for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) {
 		tdrpc->done = 0;
 		tdrpc->inprog = 0;
 		NFSBCOPY(fhp, &tdrpc->fh, sizeof(*fhp));
 		tdrpc->nmp = *nmpp;
 		tdrpc->vp = vp;
 		tdrpc->cred = cred;
 		tdrpc->p = p;
 		tdrpc->na = *nap;
 		tdrpc->err = 0;
 		/* Non-zero "ret" selects the synchronous fallback below. */
 		ret = EIO;
 		if (nfs_pnfsiothreads != 0) {
 			ret = nfs_pnfsio(start_setattrdsdorpc, tdrpc);
 			NFSD_DEBUG(4, "nfsrv_setattrdsrpc: nfs_pnfsio=%d\n",
 			    ret);
 		}
 		if (ret != 0) {
 			ret = nfsrv_setattrdsdorpc(fhp, cred, p, vp, *nmpp, nap,
 			    &na);
 			if (nfsds_failerr(ret) && *failposp == -1)
 				*failposp = i;
 			else if (error == 0 && ret != 0)
 				error = ret;
 		}
 		nmpp++;
 		fhp++;
 	}
 	/* The last DS is always done by this thread and fills in "na". */
 	ret = nfsrv_setattrdsdorpc(fhp, cred, p, vp, *nmpp, nap, &na);
 	if (nfsds_failerr(ret) && *failposp == -1 && mirrorcnt > 1)
 		*failposp = mirrorcnt - 1;
 	else if (error == 0 && ret != 0)
 		error = ret;
 	/*
 	 * When the last RPC fails with an error that is recorded in
 	 * *failposp, "error" remains 0 but "na" may be unset or only
 	 * partly loaded.  Only save the attributes when that RPC actually
 	 * succeeded, so that bogus data is never stored.
 	 */
 	if (error == 0 && ret == 0)
 		error = nfsrv_setextattr(vp, &na, p);
 	NFSD_DEBUG(4, "nfsrv_setattrdsrpc: aft setextat=%d\n", error);
 	tdrpc = drpc;
 	timo = hz / 50;		/* Wait for 20msec. */
 	if (timo < 1)
 		timo = 1;
 	for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) {
 		/* Wait for RPCs on separate threads to complete. */
 		while (tdrpc->inprog != 0 && tdrpc->done == 0)
 			tsleep(&tdrpc->tsk, PVFS, "srvsads", timo);
 		if (nfsds_failerr(tdrpc->err) && *failposp == -1)
 			*failposp = i;
 		else if (error == 0 && tdrpc->err != 0)
 			error = tdrpc->err;
 	}
 	free(drpc, M_TEMP);
 	return (error);
 }
 
 /*
  * Set an NFSv4 ACL on the DS file, by doing a Setattr of the ACL
  * attribute.
  * Returns the error from newnfs_request() or, once a reply has been
  * received, the status in the reply.
  */
 static int
 nfsrv_setacldsdorpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p,
     struct vnode *vp, struct nfsmount *nmp, struct acl *aclp)
 {
 	struct nfsrv_descript *nd;
 	nfsattrbit_t aclbits;
 	nfsv4stateid_t proxyst;
 	int error, i;
 
 	NFSD_DEBUG(4, "in nfsrv_setacldsdorpc\n");
 
 	/*
 	 * The stateid that marks an MDS->DS proxy operation for the
 	 * FreeBSD NFS server: "other" is an alternating 01010 bit pattern
 	 * and seqid is all ones.  The RFC does not define this value as
 	 * special.
 	 */
 	for (i = 0; i < 3; i++)
 		proxyst.other[i] = 0x55555555;
 	proxyst.seqid = 0xffffffff;
 
 	nd = malloc(sizeof(*nd), M_TEMP, M_WAITOK | M_ZERO);
 	nfscl_reqstart(nd, NFSPROC_SETACL, nmp, (u_int8_t *)fhp, sizeof(*fhp),
 	    NULL, NULL, 0, 0);
 	nfsm_stateidtom(nd, &proxyst, NFSSTATEID_PUTSTATEID);
 
 	/*
 	 * Only the ACL attribute is filled in.  nfsv4_fillattr() uses its
 	 * "vp" argument just for vnode_type(), so the metadata "vp" can be
 	 * passed in, since it is of the same type (VREG) as the DS file.
 	 */
 	NFSZERO_ATTRBIT(&aclbits);
 	NFSSETBIT_ATTRBIT(&aclbits, NFSATTRBIT_ACL);
 	nfsv4_fillattr(nd, NULL, vp, aclp, NULL, NULL, 0, &aclbits, NULL,
 	    NULL, 0, 0, 0, 0, 0, NULL);
 
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error == 0) {
 		/* A reply was received, so report its status and free it. */
 		NFSD_DEBUG(4, "nfsrv_setacldsdorpc: aft setaclrpc=%d\n",
 		    nd->nd_repstat);
 		error = nd->nd_repstat;
 		m_freem(nd->nd_mrep);
 	}
 	free(nd, M_TEMP);
 	return (error);
 }
 
 /*
  * Arguments and result for a Setattr of an ACL on a DS data file, so that
  * nfsrv_setacldsdorpc() can be run via start_setacldsdorpc().
  * NOTE(review): done, inprog and tsk lead every *dsdorpc structure in this
  * file; presumably nfs_pnfsio() relies on that layout - confirm before
  * reordering fields.
  */
 struct nfsrvsetacldsdorpc {
 	int			done;	/* Set to 1 by start_setacldsdorpc(). */
 	int			inprog;	/* Cleared by caller; polled with done. */
 	struct task		tsk;	/* Its address is the tsleep() channel. */
 	fhandle_t		fh;	/* Copy of the DS file handle. */
 	struct nfsmount		*nmp;	/* Mount point for the DS. */
 	struct vnode		*vp;
 	struct ucred		*cred;
 	NFSPROC_T		*p;
 	struct acl		*aclp;	/* The ACL to set; not copied. */
 	int			err;	/* Return value of the RPC function. */
 };
 
 /*
  * Task entry point used to run nfsrv_setacldsdorpc() for one DS.
  * "arg" is the struct nfsrvsetacldsdorpc that carries the arguments.
  * The result of the RPC is stored in its err field and done is set to 1
  * once the call has returned.
  */
 static void
 start_setacldsdorpc(void *arg, int pending)
 {
 	struct nfsrvsetacldsdorpc *sacl = arg;
 
 	sacl->err = nfsrv_setacldsdorpc(&sacl->fh, sacl->cred, sacl->p,
 	    sacl->vp, sacl->nmp, sacl->aclp);
 	sacl->done = 1;
 }
 
 /*
  * Do a Setattr of the ACL against every mirrored DS for a file.
  *
  * fhp and nmpp hold one entry per DS.  All but the last DS are handed to
  * nfs_pnfsio(), so that a separate kernel process can do the RPC, with a
  * synchronous call as the fallback.  The last DS is always done
  * synchronously.
  *
  * If *failposp is -1, it is set to the index of the first DS for which
  * nfsds_failerr() reports a failure.  Returns 0 or the first RPC error
  * that was not recorded in *failposp.
  */
 static int
 nfsrv_setacldsrpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p,
     struct vnode *vp, struct nfsmount **nmpp, int mirrorcnt, struct acl *aclp,
     int *failposp)
 {
 	struct nfsrvsetacldsdorpc *drpc, *cur;
 	int error, i, ret, timo;
 
 	NFSD_DEBUG(4, "in nfsrv_setacldsrpc\n");
 	if (mirrorcnt > 1)
 		drpc = malloc(sizeof(*drpc) * (mirrorcnt - 1), M_TEMP,
 		    M_WAITOK);
 	else
 		drpc = NULL;
 
 	/* Start, or do, the RPC for every DS except the last one. */
 	error = 0;
 	for (i = 0; i < mirrorcnt - 1; i++) {
 		cur = &drpc[i];
 		cur->done = 0;
 		cur->inprog = 0;
 		NFSBCOPY(&fhp[i], &cur->fh, sizeof(*fhp));
 		cur->nmp = nmpp[i];
 		cur->vp = vp;
 		cur->cred = cred;
 		cur->p = p;
 		cur->aclp = aclp;
 		cur->err = 0;
 		if (nfs_pnfsiothreads != 0) {
 			ret = nfs_pnfsio(start_setacldsdorpc, cur);
 			NFSD_DEBUG(4, "nfsrv_setacldsrpc: nfs_pnfsio=%d\n",
 			    ret);
 		} else
 			ret = EIO;
 		if (ret == 0)
 			continue;
 
 		/* Could not be handed off, so do it here instead. */
 		ret = nfsrv_setacldsdorpc(&fhp[i], cred, p, vp, nmpp[i], aclp);
 		if (nfsds_failerr(ret) && *failposp == -1)
 			*failposp = i;
 		else if (error == 0 && ret != 0)
 			error = ret;
 	}
 
 	/* At this point "i" is the index of the last DS. */
 	ret = nfsrv_setacldsdorpc(&fhp[i], cred, p, vp, nmpp[i], aclp);
 	if (nfsds_failerr(ret) && *failposp == -1 && mirrorcnt > 1)
 		*failposp = mirrorcnt - 1;
 	else if (error == 0 && ret != 0)
 		error = ret;
 	NFSD_DEBUG(4, "nfsrv_setacldsrpc: aft setextat=%d\n", error);
 
 	/* Poll every 20msec, but for at least one tick. */
 	timo = (hz >= 50) ? hz / 50 : 1;
 	for (i = 0; i < mirrorcnt - 1; i++) {
 		cur = &drpc[i];
 		/* Sleep until an RPC that was handed off has completed. */
 		while (cur->inprog != 0 && cur->done == 0)
 			tsleep(&cur->tsk, PVFS, "srvacds", timo);
 		if (nfsds_failerr(cur->err) && *failposp == -1)
 			*failposp = i;
 		else if (error == 0 && cur->err != 0)
 			error = cur->err;
 	}
 	free(drpc, M_TEMP);
 	return (error);
 }
 
 /*
  * Do a Getattr against the DS for the attributes that change when a file
  * is written (Size, Change, Access Time, Modify Time and Space Used).
  * The attributes are loaded into "nap" and, if "vp" is exclusively
  * locked, they are also saved via nfsrv_setextattr().
  * Returns 0, the error from newnfs_request(), the reply status or the
  * error from loading/saving the attributes.
  */
 static int
 nfsrv_getattrdsrpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p,
     struct vnode *vp, struct nfsmount *nmp, struct nfsvattr *nap)
 {
 	struct nfsrv_descript *nd;
 	nfsattrbit_t wrbits;
 	int error;
 
 	NFSD_DEBUG(4, "in nfsrv_getattrdsrpc\n");
 	nd = malloc(sizeof(*nd), M_TEMP, M_WAITOK | M_ZERO);
 	nfscl_reqstart(nd, NFSPROC_GETATTR, nmp, (u_int8_t *)fhp,
 	    sizeof(fhandle_t), NULL, NULL, 0, 0);
 
 	/* Ask only for the attributes that writing changes. */
 	NFSZERO_ATTRBIT(&wrbits);
 	NFSSETBIT_ATTRBIT(&wrbits, NFSATTRBIT_SIZE);
 	NFSSETBIT_ATTRBIT(&wrbits, NFSATTRBIT_CHANGE);
 	NFSSETBIT_ATTRBIT(&wrbits, NFSATTRBIT_TIMEACCESS);
 	NFSSETBIT_ATTRBIT(&wrbits, NFSATTRBIT_TIMEMODIFY);
 	NFSSETBIT_ATTRBIT(&wrbits, NFSATTRBIT_SPACEUSED);
 	(void) nfsrv_putattrbit(nd, &wrbits);
 
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error != 0) {
 		free(nd, M_TEMP);
 		return (error);
 	}
 	NFSD_DEBUG(4, "nfsrv_getattrdsrpc: aft getattrrpc=%d\n",
 	    nd->nd_repstat);
 
 	error = nd->nd_repstat;
 	if (error == 0) {
 		error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0,
 		    NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL,
 		    NULL, NULL);
 		/*
 		 * The extended attribute can only be updated while the
 		 * vp is exclusively locked, which should be the case for
 		 * Close, Delegreturn, LayoutCommit and LayoutReturn.
 		 * That way, the updated values should have been saved
 		 * by the time nfsrv_checkdsattr() returns 0 and lets the
 		 * cached attributes be used without calling this function.
 		 */
 		if (error == 0 && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) {
 			error = nfsrv_setextattr(vp, nap, p);
 			NFSD_DEBUG(4, "nfsrv_getattrdsrpc: aft setextat=%d\n",
 			    error);
 		}
 	}
 	m_freem(nd->nd_mrep);
 	free(nd, M_TEMP);
 	NFSD_DEBUG(4, "nfsrv_getattrdsrpc error=%d\n", error);
 	return (error);
 }
 
 /*
  * Do a Seek RPC against a DS.
  * *offp is the starting offset on entry and, when the reply status is 0,
  * is replaced by the offset in the reply, with *eofp set from the eof
  * flag in the reply.
  * Returns 0, the error from newnfs_request(), the reply status or the
  * error from parsing the reply.
  */
 static int
 nfsrv_seekdsrpc(fhandle_t *fhp, off_t *offp, int content, bool *eofp,
     struct ucred *cred, NFSPROC_T *p, struct nfsmount *nmp)
 {
 	struct nfsrv_descript *nd;
 	nfsv4stateid_t proxyst;
 	uint32_t *tl;
 	int error, i;
 
 	NFSD_DEBUG(4, "in nfsrv_seekdsrpc\n");
 
 	/*
 	 * The stateid that marks an MDS->DS proxy operation for the
 	 * FreeBSD NFS server: "other" is an alternating 01010 bit pattern
 	 * and seqid is all ones.  The RFC does not define this value as
 	 * special.
 	 */
 	for (i = 0; i < 3; i++)
 		proxyst.other[i] = 0x55555555;
 	proxyst.seqid = 0xffffffff;
 
 	nd = malloc(sizeof(*nd), M_TEMP, M_WAITOK | M_ZERO);
 	nfscl_reqstart(nd, NFSPROC_SEEKDS, nmp, (u_int8_t *)fhp,
 	    sizeof(fhandle_t), NULL, NULL, 0, 0);
 	nfsm_stateidtom(nd, &proxyst, NFSSTATEID_PUTSTATEID);
 
 	/* Arguments are the offset followed by the content type. */
 	NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + NFSX_UNSIGNED);
 	txdr_hyper(*offp, tl);
 	tl += 2;
 	*tl = txdr_unsigned(content);
 
 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
 	if (error != 0) {
 		free(nd, M_TEMP);
 		return (error);
 	}
 	NFSD_DEBUG(4, "nfsrv_seekdsrpc: aft seekrpc=%d\n", nd->nd_repstat);
 
 	if (nd->nd_repstat != 0)
 		error = nd->nd_repstat;
 	else {
 		/* The reply is the eof flag followed by the offset. */
 		NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED + NFSX_HYPER);
 		*eofp = (*tl++ == newnfs_true);
 		*offp = fxdr_hyper(tl);
 	}
 nfsmout:
 	m_freem(nd->nd_mrep);
 	free(nd, M_TEMP);
 	NFSD_DEBUG(4, "nfsrv_seekdsrpc error=%d\n", error);
 	return (error);
 }
 
 /*
  * Get the device id and file handle for a DS file.
  * This is a wrapper around nfsrv_dsgetsockmnt() that supplies a
  * temporary 1024 byte buffer and only asks for the mirror count, file
  * handle and devid.  Returns the value from nfsrv_dsgetsockmnt().
  */
 int
 nfsrv_dsgetdevandfh(struct vnode *vp, NFSPROC_T *p, int *mirrorcntp,
     fhandle_t *fhp, char *devid)
 {
 	char *tmpbuf;
 	int error, tmplen = 1024;
 
 	tmpbuf = malloc(tmplen, M_TEMP, M_WAITOK);
 	error = nfsrv_dsgetsockmnt(vp, 0, tmpbuf, &tmplen, mirrorcntp, p, NULL,
 	    fhp, devid, NULL, NULL, NULL, NULL, NULL, NULL);
 	free(tmpbuf, M_TEMP);
 	return (error);
 }
 
 /*
  * Do a Lookup against the DS for the filename in pf->dsf_filename, using
  * the directory vnode "dvp".
  * On success, the vnode that was found is returned in *nvpp.
  * Returns the value from VOP_LOOKUP().
  */
 static int
 nfsrv_pnfslookupds(struct vnode *vp, struct vnode *dvp, struct pnfsdsfile *pf,
     struct vnode **nvpp, NFSPROC_T *p)
 {
 	struct nameidata nid;
 	struct vnode *foundvp;
 	struct ucred *lcred;
 	u_long *hashp;
 	char *namebuf;
 	int error;
 
 	/* Fill in the component name for a lookup of the last component. */
 	lcred = newnfs_getcred();
 	nid.ni_cnd.cn_cred = lcred;
 	nid.ni_cnd.cn_thread = p;
 	nid.ni_cnd.cn_nameiop = LOOKUP;
 	nid.ni_cnd.cn_flags = ISLASTCN | LOCKPARENT | LOCKLEAF | SAVENAME;
 	nid.ni_cnd.cn_lkflags = LK_SHARED | LK_RETRY;
 
 	/* Get a path buffer and copy the DS file's name into it. */
 	nfsvno_setpathbuf(&nid, &namebuf, &hashp);
 	strlcpy(namebuf, pf->dsf_filename, NAME_MAX);
 	nid.ni_cnd.cn_nameptr = namebuf;
 	nid.ni_cnd.cn_namelen = strlen(pf->dsf_filename);
 	NFSD_DEBUG(4, "nfsrv_pnfslookupds: filename=%s\n", namebuf);
 
 	error = VOP_LOOKUP(dvp, &foundvp, &nid.ni_cnd);
 	NFSD_DEBUG(4, "nfsrv_pnfslookupds: aft LOOKUP=%d\n", error);
 
 	/* The credential and path buffer are only needed for the lookup. */
 	NFSFREECRED(lcred);
 	nfsvno_relpathbuf(&nid);
 	if (error == 0)
 		*nvpp = foundvp;
 	NFSD_DEBUG(4, "eo nfsrv_pnfslookupds=%d\n", error);
 	return (error);
 }
 
 /*
  * Set the file handle in "pf" to the one for "nvp" and, when it is safe
  * to do so, save "pf" in the "pnfsd.dsfile" extended attribute of "vp".
  */
 static void
 nfsrv_pnfssetfh(struct vnode *vp, struct pnfsdsfile *pf, char *devid,
     char *fnamep, struct vnode *nvp, NFSPROC_T *p)
 {
 	struct nfsnode *dsnp;
 	bool cansave;
 	int rc;
 
 	dsnp = VTONFS(nvp);
 	NFSBCOPY(dsnp->n_fhp->nfh_fh, &pf->dsf_fh, NFSX_MYFH);
 
 	/*
 	 * vn_extattr_set() requires that the vnode be exclusively locked
 	 * and that vn_start_write() has been done.  The latter may not be
 	 * the case if devid != NULL or fnamep != NULL or the vnode is only
 	 * shared locked.  When the attribute is not saved now, it will be
 	 * saved on a future call.
 	 */
 	if (devid != NULL || fnamep != NULL)
 		cansave = false;
 	else
 		cansave = (NFSVOPISLOCKED(vp) == LK_EXCLUSIVE);
 
 	rc = 0;
 	if (cansave)
 		rc = vn_extattr_set(vp, IO_NODELOCKED,
 		    EXTATTR_NAMESPACE_SYSTEM, "pnfsd.dsfile", sizeof(*pf),
 		    (char *)pf, p);
 	NFSD_DEBUG(4, "eo nfsrv_pnfssetfh=%d\n", rc);
 }
 
 /*
  * Make the RPCs that are waiting on "nmp" fail.  This is done for the
  * mount point of a DS once that DS has failed.
  */
 void
 nfsrv_killrpcs(struct nfsmount *nmp)
 {
 
 	/*
 	 * newnfs_nmcancelreqs() makes any RPCs in progress on the mount
 	 * point fail.  As a result, a process that holds a vnode lock on
 	 * the mounted-on vnode while waiting for an RPC to complete (such
 	 * as "df" or a non-forced "umount") will fail and unlock the
 	 * mounted-on vnode, which allows a forced dismount to succeed.
 	 * The caller should have set the NFSMNTP_CANCELRPCS flag before
 	 * calling this function.
 	 */
 	newnfs_nmcancelreqs(nmp);
 }
 
 /*
  * Sum up the statfs info for each of the DSs, so that the client will
  * receive the total for all DSs.
  * The DSs assigned to the file system of "mp" are used.  If there are
  * none, the DSs that are not assigned to any file system are used instead.
  * f_blocks, f_bfree and f_bavail are added into "sf", converted to units
  * of sf->f_bsize when the block sizes differ.
  * Returns ENXIO if there are no DSs, otherwise 0 or the first error from
  * VFS_STATFS().
  */
 static int
 nfsrv_pnfsstatfs(struct statfs *sf, struct mount *mp)
 {
 	struct statfs *tsf;
 	struct nfsdevice *ds;
 	struct vnode **dvpp, **tdvpp, *dvp;
 	uint64_t tot;
 	int cnt, error = 0, i, maxcnt;
 
 	/*
 	 * nfsrv_devidcnt is read before NFSDDSLOCK() is acquired.  Take a
 	 * single snapshot of it, so that the size of the array and the
 	 * bound used while filling it in are always the same value.
 	 */
 	maxcnt = nfsrv_devidcnt;
 	if (maxcnt <= 0)
 		return (ENXIO);
 	dvpp = mallocarray(maxcnt, sizeof(*dvpp), M_TEMP, M_WAITOK);
 	tsf = malloc(sizeof(*tsf), M_TEMP, M_WAITOK);
 
 	/* Get an array of the dvps for the DSs. */
 	tdvpp = dvpp;
 	i = 0;
 	NFSDDSLOCK();
 	/* First, search for matches for same file system. */
 	TAILQ_FOREACH(ds, &nfsrv_devidhead, nfsdev_list) {
 		if (ds->nfsdev_nmp != NULL && ds->nfsdev_mdsisset != 0 &&
 		    fsidcmp(&ds->nfsdev_mdsfsid, &mp->mnt_stat.f_fsid) == 0) {
 			/*
 			 * Stop when the array is full.  "i" is only
 			 * incremented after a store, so it never exceeds
 			 * the number of valid entries.
 			 */
 			if (i >= maxcnt)
 				break;
 			*tdvpp++ = ds->nfsdev_dvp;
 			i++;
 		}
 	}
 	/*
 	 * If no matches for same file system, total all servers not assigned
 	 * to a file system.
 	 */
 	if (i == 0) {
 		TAILQ_FOREACH(ds, &nfsrv_devidhead, nfsdev_list) {
 			if (ds->nfsdev_nmp != NULL &&
 			    ds->nfsdev_mdsisset == 0) {
 				if (i >= maxcnt)
 					break;
 				*tdvpp++ = ds->nfsdev_dvp;
 				i++;
 			}
 		}
 	}
 	NFSDDSUNLOCK();
 	cnt = i;
 
 	/* Do a VFS_STATFS() for each of the DSs and sum them up. */
 	tdvpp = dvpp;
 	for (i = 0; i < cnt && error == 0; i++) {
 		dvp = *tdvpp++;
 		error = VFS_STATFS(dvp->v_mount, tsf);
 		if (error == 0) {
 			/* The first DS chooses the block size, if not set. */
 			if (sf->f_bsize == 0) {
 				if (tsf->f_bsize > 0)
 					sf->f_bsize = tsf->f_bsize;
 				else
 					sf->f_bsize = 8192;
 			}
 			if (tsf->f_blocks > 0) {
 				if (sf->f_bsize != tsf->f_bsize) {
 					tot = tsf->f_blocks * tsf->f_bsize;
 					sf->f_blocks += (tot / sf->f_bsize);
 				} else
 					sf->f_blocks += tsf->f_blocks;
 			}
 			if (tsf->f_bfree > 0) {
 				if (sf->f_bsize != tsf->f_bsize) {
 					tot = tsf->f_bfree * tsf->f_bsize;
 					sf->f_bfree += (tot / sf->f_bsize);
 				} else
 					sf->f_bfree += tsf->f_bfree;
 			}
 			if (tsf->f_bavail > 0) {
 				if (sf->f_bsize != tsf->f_bsize) {
 					tot = tsf->f_bavail * tsf->f_bsize;
 					sf->f_bavail += (tot / sf->f_bsize);
 				} else
 					sf->f_bavail += tsf->f_bavail;
 			}
 		}
 	}
 	free(tsf, M_TEMP);
 	free(dvpp, M_TEMP);
 	return (error);
 }
 
 /*
  * Set an NFSv4 acl on "vp" and then on the DS file(s), if any.
  * Returns NFSERR_ATTRNOTSUPP if ACLs are not in use or not supported for
  * the vnode, or if the ACL has too many entries.  An ENOENT return from
  * nfsrv_dssetacl() is not treated as an error.
  */
 int
 nfsrv_setacl(struct vnode *vp, NFSACL_T *aclp, struct ucred *cred, NFSPROC_T *p)
 {
 	int error;
 
 	if (nfsrv_useacl == 0 || nfs_supportsnfsv4acls(vp) == 0)
 		error = NFSERR_ATTRNOTSUPP;
 	else if (aclp->acl_cnt > (ACL_MAX_ENTRIES - 6) / 2) {
 		/*
 		 * For NFSv4 ACLs, chmod(2) may have to add entries, by
 		 * splitting every entry into two and appending the
 		 * "canonical six" entries at the end, so there must be
 		 * room left for that.
 		 * Cribbed out of kern/vfs_acl.c - Rick M.
 		 */
 		error = NFSERR_ATTRNOTSUPP;
 	} else {
 		error = VOP_SETACL(vp, ACL_TYPE_NFS4, aclp, cred, p);
 		if (error == 0) {
 			error = nfsrv_dssetacl(vp, aclp, cred, p);
 			if (error == ENOENT)
 				error = 0;
 		}
 	}
 
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Seek vnode op call, which is actually done via VOP_IOCTL().
  * The vnode must be locked when this function is called.  It is always
  * unlocked and released (vput() or NFSVOPUNLOCK() plus vrele()) before
  * the function returns.
  */
 int
 nfsvno_seek(struct nfsrv_descript *nd, struct vnode *vp, u_long cmd,
     off_t *offp, int content, bool *eofp, struct ucred *cred, NFSPROC_T *p)
 {
 	struct nfsvattr at;
 	bool chkeof;
 	int error, ret;
 
 	ASSERT_VOP_LOCKED(vp, "nfsvno_seek vp");
 
 	/*
 	 * Try the seek on a DS file first.  ENOENT means that there is no
 	 * DS file to seek on, so anything else is the final result.
 	 */
 	error = nfsrv_proxyds(vp, 0, 0, cred, p, NFSPROC_SEEKDS, NULL,
 	    NULL, NULL, NULL, NULL, offp, content, eofp);
 	if (error != ENOENT) {
 		vput(vp);
 		return (error);
 	}
 
 	/*
 	 * Do the seek locally via VOP_IOCTL().  When *offp == file_size,
 	 * VOP_IOCTL() returns ENXIO, whereas the correct NFSv4.2 reply
 	 * for that case is *eofp == true with no error.
 	 */
 	NFSVOPUNLOCK(vp);
 	error = VOP_IOCTL(vp, cmd, offp, 0, cred, p);
 	*eofp = false;
 
 	/* These are the results for which the offset may be at EOF. */
 	if (error == ENXIO)
 		chkeof = true;
 	else
 		chkeof = (error == 0 && cmd == FIOSEEKHOLE);
 	if (chkeof) {
 		ret = nfsvno_getattr(vp, &at, nd, p, 0, NULL);
 		if (ret != 0) {
 			if (error == 0)
 				error = ret;
 		} else if (*offp == at.na_size) {
 			*eofp = true;
 			error = 0;
 		}
 	}
 	vrele(vp);
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Allocate vnode op call.
  * The vnode must be exclusively locked.  Returns NFSERR_IO if the
  * allocation has not been completed after 20 VOP_ALLOCATE() calls.
  */
 int
 nfsvno_allocate(struct vnode *vp, off_t off, off_t len, struct ucred *cred,
     NFSPROC_T *p)
 {
 	int error, tries;
 
 	ASSERT_VOP_ELOCKED(vp, "nfsvno_allocate vp");
 
 	/*
 	 * Try the allocate on a DS file first.  ENOENT means that there
 	 * is no DS file to allocate on, so anything else is the result.
 	 */
 	error = nfsrv_proxyds(vp, off, 0, cred, p, NFSPROC_ALLOCATE, NULL,
 	    NULL, NULL, NULL, NULL, &len, 0, NULL);
 	if (error != ENOENT)
 		return (error);
 
 	/*
 	 * Call VOP_ALLOCATE() until it fails or there is nothing left to
 	 * allocate, but only for a reasonable number of times.
 	 */
 	error = 0;
 	for (tries = 0; error == 0 && len > 0 && tries < 20; tries++)
 		error = VOP_ALLOCATE(vp, &off, &len);
 	if (error == 0 && len > 0)
 		error = NFSERR_IO;
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Get Extended Attribute vnode op into an mbuf list.
  * The size of the attribute is acquired first.  NFSERR_NOXATTR is
  * returned if that fails and NFSERR_XATTR2BIG is returned if the size is
  * greater than maxresp - NFS_MAXXDR.
  * Upon success, *lenp is set to the length of the data and *mpp and
  * *mpendp to the mbufs returned by the iovec creation function
  * (presumably the head and tail of the list; both are NULL when there is
  * no data).  For a failure after the size check, the mbuf list is freed
  * and *lenp is set to 0.
  */
 int
 nfsvno_getxattr(struct vnode *vp, char *name, uint32_t maxresp,
     struct ucred *cred, uint64_t flag, int maxextsiz, struct thread *p,
     struct mbuf **mpp, struct mbuf **mpendp, int *lenp)
 {
 	struct iovec *iv;
 	struct uio io, *uiop = &io;
 	struct mbuf *m, *m2;
 	int alen, error, len, tlen;
 	size_t siz;
 
 	/* First, find out the size of the extended attribute. */
 	error = VOP_GETEXTATTR(vp, EXTATTR_NAMESPACE_USER, name, NULL,
 	    &siz, cred, p);
 	if (error != 0)
 		return (NFSERR_NOXATTR);
 	/*
 	 * NOTE(review): this subtraction assumes maxresp >= NFS_MAXXDR -
 	 * confirm that callers guarantee it.
 	 */
 	if (siz > maxresp - NFS_MAXXDR)
 		return (NFSERR_XATTR2BIG);
 	/* tlen is the length after rounding up via NFSM_RNDUP(). */
 	len = siz;
 	tlen = NFSM_RNDUP(len);
 	if (tlen > 0) {
 		/*
 		 * If cnt > MCLBYTES and the reply will not be saved, use
 		 * ext_pgs mbufs for TLS.
 		 * For NFSv4.0, we do not know for sure if the reply will
 		 * be saved, so do not use ext_pgs mbufs for NFSv4.0.
 		 * Always use ext_pgs mbufs if ND_EXTPG is set.
 		 */
 		if ((flag & ND_EXTPG) != 0 || (tlen > MCLBYTES &&
 		    (flag & (ND_TLS | ND_SAVEREPLY)) == ND_TLS &&
 		    (flag & (ND_NFSV4 | ND_NFSV41)) != ND_NFSV4))
 			uiop->uio_iovcnt = nfsrv_createiovec_extpgs(tlen,
 			    maxextsiz, &m, &m2, &iv);
 		else
 			uiop->uio_iovcnt = nfsrv_createiovec(tlen, &m, &m2,
 			    &iv);
 		uiop->uio_iov = iv;
 	} else {
 		/* Nothing to read, so there are no mbufs and no iovec. */
 		uiop->uio_iovcnt = 0;
 		uiop->uio_iov = iv = NULL;
 		m = m2 = NULL;
 	}
 	uiop->uio_offset = 0;
 	uiop->uio_resid = tlen;
 	uiop->uio_rw = UIO_READ;
 	uiop->uio_segflg = UIO_SYSSPACE;
 	uiop->uio_td = p;
 #ifdef MAC
 	error = mac_vnode_check_getextattr(cred, vp, EXTATTR_NAMESPACE_USER,
 	    name);
 	if (error != 0)
 		goto out;
 #endif
 
 	/* Now, read the attribute's data into the mbuf list. */
 	if (tlen > 0)
 		error = VOP_GETEXTATTR(vp, EXTATTR_NAMESPACE_USER, name, uiop,
 		    NULL, cred, p);
 	if (error != 0)
 		goto out;
 	if (uiop->uio_resid > 0) {
 		/*
 		 * Less than tlen bytes were read.  Recalculate len and tlen
 		 * from what was read and adjust the mbuf list to match.
 		 */
 		alen = tlen;
 		len = tlen - uiop->uio_resid;
 		tlen = NFSM_RNDUP(len);
 		if (alen != tlen)
 			printf("nfsvno_getxattr: weird size read\n");
 		if (tlen == 0) {
 			m_freem(m);
 			m = m2 = NULL;
 		} else if (alen != tlen || tlen != len)
 			m2 = nfsrv_adj(m, alen - tlen, tlen - len);
 	}
 	*lenp = len;
 	*mpp = m;
 	*mpendp = m2;
 
 out:
 	/* For an error, the mbuf list is not returned to the caller. */
 	if (error != 0) {
 		if (m != NULL)
 			m_freem(m);
 		*lenp = 0;
 	}
 	free(iv, M_TEMP);
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Set Extended attribute vnode op from an mbuf list.
  *
  * Writes "len" bytes as the value of the attribute "name" in the
  * EXTATTR_NAMESPACE_USER namespace of "vp".  When len > 0 the iovec array
  * describing the data is built by nfsrv_createiovecw() from "m" and "cp";
  * when len == 0 the uio carries no iovecs at all.
  *
  * Returns 0 on success, otherwise the error from the MAC check (only when
  * MAC is configured), nfsrv_createiovecw() or VOP_SETEXTATTR().
  */
 int
 nfsvno_setxattr(struct vnode *vp, char *name, int len, struct mbuf *m,
     char *cp, struct ucred *cred, struct thread *p)
 {
 	struct iovec *iv;
 	struct uio uio, *uiop = &uio;
 	int cnt, error;
 
 	/* Without MAC there is no policy check, so start from "no error". */
 	error = 0;
 #ifdef MAC
 	error = mac_vnode_check_setextattr(cred, vp, EXTATTR_NAMESPACE_USER,
 	    name);
 #endif
 	if (error != 0)
 		goto out;
 
 	/* Describe a kernel-space write of "len" bytes starting at offset 0. */
 	uiop->uio_rw = UIO_WRITE;
 	uiop->uio_segflg = UIO_SYSSPACE;
 	uiop->uio_td = p;
 	uiop->uio_offset = 0;
 	uiop->uio_resid = len;
 	if (len > 0) {
 		error = nfsrv_createiovecw(len, m, cp, &iv, &cnt);
 		uiop->uio_iov = iv;
 		uiop->uio_iovcnt = cnt;
 	} else {
 		uiop->uio_iov = iv = NULL;
 		uiop->uio_iovcnt = 0;
 	}
 	if (error == 0) {
 		error = VOP_SETEXTATTR(vp, EXTATTR_NAMESPACE_USER, name, uiop,
 		    cred, p);
 		/*
 		 * iv is released only on this path.
 		 * NOTE(review): assumes nfsrv_createiovecw() leaves nothing
 		 * to free when it fails -- confirm.
 		 */
 		free(iv, M_TEMP);
 	}
 
 out:
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Remove Extended attribute vnode op.
  */
 int
 nfsvno_rmxattr(struct nfsrv_descript *nd, struct vnode *vp, char *name,
     struct ucred *cred, struct thread *p)
 {
 	int error;
 
 	/*
 	 * Get rid of any delegations.  I am not sure why this is required,
 	 * but RFC-8276 says so.
 	 */
 	error = nfsrv_checkremove(vp, 0, nd, nd->nd_clientid, p);
 	if (error != 0)
 		goto out;
 #ifdef MAC
 	error = mac_vnode_check_deleteextattr(cred, vp, EXTATTR_NAMESPACE_USER,
 	    name);
 	if (error != 0)
 		goto out;
 #endif
 
 	error = VOP_DELETEEXTATTR(vp, EXTATTR_NAMESPACE_USER, name, cred, p);
 	if (error == EOPNOTSUPP)
 		error = VOP_SETEXTATTR(vp, EXTATTR_NAMESPACE_USER, name, NULL,
 		    cred, p);
 out:
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * List Extended Attribute vnode op into a malloc'd buffer.
  *
  * On entry *lenp is the number of bytes wanted beyond "cookie".  The list
  * is always read from offset 0, so on success *lenp is the number of bytes
  * placed in *bufp counted from the start of the list (it includes the
  * first "cookie" bytes).  *eofp is set to true when the read reaches the
  * end of the list and to false when it was truncated.
  *
  * *bufp is allocated with malloc(M_TEMP) and is freed here on error; it
  * stays NULL when the list is no longer than "cookie".
  * NOTE(review): presumably the caller frees *bufp on success -- confirm.
  *
  * Returns 0 on success, NFSERR_NOXATTR if the size query fails,
  * NFSERR_XATTR2BIG if more than 10Mbytes would be needed, or the error
  * from the MAC check / VOP_LISTEXTATTR().
  */
 int
 nfsvno_listxattr(struct vnode *vp, uint64_t cookie, struct ucred *cred,
     struct thread *p, u_char **bufp, uint32_t *lenp, bool *eofp)
 {
 	struct iovec iv;
 	struct uio io;
 	int error;
 	size_t siz;
 
 	*bufp = NULL;
 	/* First, find out the size of the extended attribute. */
 	error = VOP_LISTEXTATTR(vp, EXTATTR_NAMESPACE_USER, NULL, &siz, cred,
 	    p);
 	/* Any failure of the size query is reported as NFSERR_NOXATTR. */
 	if (error != 0)
 		return (NFSERR_NOXATTR);
 	/* Nothing at or beyond the cookie: report an empty, final reply. */
 	if (siz <= cookie) {
 		*lenp = 0;
 		*eofp = true;
 		goto out;
 	}
 	/* Clamp the read to cookie + *lenp bytes; clamping means not EOF. */
 	if (siz > cookie + *lenp) {
 		siz = cookie + *lenp;
 		*eofp = false;
 	} else
 		*eofp = true;
 	/* Just choose a sanity limit of 10Mbytes for malloc(M_TEMP). */
 	if (siz > 10 * 1024 * 1024) {
 		error = NFSERR_XATTR2BIG;
 		goto out;
 	}
 	*bufp = malloc(siz, M_TEMP, M_WAITOK);
 	/* Single-segment kernel-space read of siz bytes from offset 0. */
 	iv.iov_base = *bufp;
 	iv.iov_len = siz;
 	io.uio_iovcnt = 1;
 	io.uio_iov = &iv;
 	io.uio_offset = 0;
 	io.uio_resid = siz;
 	io.uio_rw = UIO_READ;
 	io.uio_segflg = UIO_SYSSPACE;
 	io.uio_td = p;
 #ifdef MAC
 	error = mac_vnode_check_listextattr(cred, vp, EXTATTR_NAMESPACE_USER);
 	if (error != 0)
 		goto out;
 #endif
 
 	error = VOP_LISTEXTATTR(vp, EXTATTR_NAMESPACE_USER, &io, NULL, cred,
 	    p);
 	if (error != 0)
 		goto out;
 	/* A short read shrinks the reported length by the unread residue. */
 	if (io.uio_resid > 0)
 		siz -= io.uio_resid;
 	*lenp = siz;
 
 out:
 	if (error != 0) {
 		free(*bufp, M_TEMP);
 		*bufp = NULL;
 	}
 	NFSEXITCODE(error);
 	return (error);
 }
 
 /*
  * Trim trailing data off the mbuf list being built.
  *
  * Everything after "mb" in the chain is freed and "mb" is shortened so
  * that it ends at "bpos"; nd's build position (nd_mb, nd_bpos) is then
  * reset to that point.  For an M_EXTPG mbuf, "bextpg" is the index of the
  * page holding bpos and "bextpgsiz" is the number of bytes left in that
  * page after bpos (this is what the second KASSERT checks); both are also
  * stored in nd.  For other mbufs these two arguments are not used.
  */
 void
 nfsm_trimtrailing(struct nfsrv_descript *nd, struct mbuf *mb, char *bpos,
     int bextpg, int bextpgsiz)
 {
 	vm_page_t pg;
 	int fullpgsiz, i;
 
 	/* Drop every mbuf that follows mb. */
 	if (mb->m_next != NULL) {
 		m_freem(mb->m_next);
 		mb->m_next = NULL;
 	}
 	if ((mb->m_flags & M_EXTPG) != 0) {
 		KASSERT(bextpg >= 0 && bextpg < mb->m_epg_npgs,
 		    ("nfsm_trimtrailing: bextpg out of range"));
 		KASSERT(bpos == (char *)(void *)
 		    PHYS_TO_DMAP(mb->m_epg_pa[bextpg]) + PAGE_SIZE - bextpgsiz,
 		    ("nfsm_trimtrailing: bextpgsiz bad!"));
 
 		/* First, get rid of any pages after this position. */
 		for (i = mb->m_epg_npgs - 1; i > bextpg; i--) {
 			pg = PHYS_TO_VM_PAGE(mb->m_epg_pa[i]);
 			vm_page_unwire_noq(pg);
 			vm_page_free(pg);
 		}
 		mb->m_epg_npgs = bextpg + 1;
 		/* Only the first page is shortened by the leading offset. */
 		if (bextpg == 0)
 			fullpgsiz = PAGE_SIZE - mb->m_epg_1st_off;
 		else
 			fullpgsiz = PAGE_SIZE;
 		mb->m_epg_last_len = fullpgsiz - bextpgsiz;
 		/* Recompute m_len as the sum of the remaining page lengths. */
 		mb->m_len = m_epg_pagelen(mb, 0, mb->m_epg_1st_off);
 		for (i = 1; i < mb->m_epg_npgs; i++)
 			mb->m_len += m_epg_pagelen(mb, i, 0);
 		nd->nd_bextpgsiz = bextpgsiz;
 		nd->nd_bextpg = bextpg;
 	} else
 		mb->m_len = bpos - mtod(mb, char *);
 	nd->nd_mb = mb;
 	nd->nd_bpos = bpos;
 }
 
 extern int (*nfsd_call_nfsd)(struct thread *, struct nfssvc_args *);
 
 /*
  * Called once to initialize data structures...
  *
  * Module event handler for the NFS server.
  *  MOD_LOAD:   initializes the mutexes, the export lock, the reply cache
  *              and the server state, then installs the nfsd hooks.  A
  *              repeated load is a no-op because of the static "loaded"
  *              flag.
  *  MOD_UNLOAD: refused with EBUSY while newnfs_numnfsd is non-zero;
  *              otherwise removes the hooks, discards state and caches and
  *              destroys the locks and hash tables.
  *  Any other event type yields EOPNOTSUPP.
  */
 static int
 nfsd_modevent(module_t mod, int type, void *data)
 {
 	int error = 0, i;
 	static int loaded = 0;
 
 	switch (type) {
 	case MOD_LOAD:
 		/* Already initialized: return success without redoing it. */
 		if (loaded)
 			goto out;
 		newnfs_portinit();
 		for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
 			mtx_init(&nfsrchash_table[i].mtx, "nfsrtc", NULL,
 			    MTX_DEF);
 			mtx_init(&nfsrcahash_table[i].mtx, "nfsrtca", NULL,
 			    MTX_DEF);
 		}
 		mtx_init(&nfsrc_udpmtx, "nfsuc", NULL, MTX_DEF);
 		mtx_init(&nfs_v4root_mutex, "nfs4rt", NULL, MTX_DEF);
 		mtx_init(&nfsv4root_mnt.mnt_mtx, "nfs4mnt", NULL, MTX_DEF);
 		mtx_init(&nfsrv_dontlistlock_mtx, "nfs4dnl", NULL, MTX_DEF);
 		mtx_init(&nfsrv_recalllock_mtx, "nfs4rec", NULL, MTX_DEF);
 		lockinit(&nfsv4root_mnt.mnt_explock, PVFS, "explock", 0, 0);
 		nfsrvd_initcache();
 		nfsd_init();
 		/* nfsrvd_init() is called with the NFSD lock held. */
 		NFSD_LOCK();
 		nfsrvd_init(0);
 		NFSD_UNLOCK();
 		nfsd_mntinit();
 #ifdef VV_DISABLEDELEG
 		vn_deleg_ops.vndeleg_recall = nfsd_recalldelegation;
 		vn_deleg_ops.vndeleg_disable = nfsd_disabledelegation;
 #endif
 		/* Install the hooks through which the server is entered. */
 		nfsd_call_servertimer = nfsrv_servertimer;
 		nfsd_call_nfsd = nfssvc_nfsd;
 		loaded = 1;
 		break;
 
 	case MOD_UNLOAD:
 		if (newnfs_numnfsd != 0) {
 			error = EBUSY;
 			break;
 		}
 
 		/* Remove the hooks before tearing anything down. */
 #ifdef VV_DISABLEDELEG
 		vn_deleg_ops.vndeleg_recall = NULL;
 		vn_deleg_ops.vndeleg_disable = NULL;
 #endif
 		nfsd_call_servertimer = NULL;
 		nfsd_call_nfsd = NULL;
 
 		/* Clean out all NFSv4 state. */
 		nfsrv_throwawayallstate(curthread);
 
 		/* Clean the NFS server reply cache */
 		nfsrvd_cleancache();
 
 		/* Free up the krpc server pool. */
 		if (nfsrvd_pool != NULL)
 			svcpool_destroy(nfsrvd_pool);
 
 		/* and get rid of the locks */
 		for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
 			mtx_destroy(&nfsrchash_table[i].mtx);
 			mtx_destroy(&nfsrcahash_table[i].mtx);
 		}
 		mtx_destroy(&nfsrc_udpmtx);
 		mtx_destroy(&nfs_v4root_mutex);
 		mtx_destroy(&nfsv4root_mnt.mnt_mtx);
 		mtx_destroy(&nfsrv_dontlistlock_mtx);
 		mtx_destroy(&nfsrv_recalllock_mtx);
 		for (i = 0; i < nfsrv_sessionhashsize; i++)
 			mtx_destroy(&nfssessionhash[i].mtx);
 		/* The layout hash may be NULL, so it is checked first. */
 		if (nfslayouthash != NULL) {
 			for (i = 0; i < nfsrv_layouthashsize; i++)
 				mtx_destroy(&nfslayouthash[i].mtx);
 			free(nfslayouthash, M_NFSDSESSION);
 		}
 		lockdestroy(&nfsv4root_mnt.mnt_explock);
 		free(nfsclienthash, M_NFSDCLIENT);
 		free(nfslockhash, M_NFSDLOCKFILE);
 		free(nfssessionhash, M_NFSDSESSION);
 		loaded = 0;
 		break;
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 
 out:
 	NFSEXITCODE(error);
 	return (error);
 }
 /*
  * Module glue: names the module "nfsd" and routes its module events to
  * nfsd_modevent().
  */
 static moduledata_t nfsd_mod = {
 	"nfsd",
 	nfsd_modevent,
 	NULL,
 };
 DECLARE_MODULE(nfsd, nfsd_mod, SI_SUB_VFS, SI_ORDER_ANY);
 
 /* So that loader and kldload(2) can find us, wherever we are.. */
 MODULE_VERSION(nfsd, 1);
 /* Modules that nfsd is declared to depend on. */
 MODULE_DEPEND(nfsd, nfscommon, 1, 1, 1);
 MODULE_DEPEND(nfsd, nfslockd, 1, 1, 1);
 MODULE_DEPEND(nfsd, krpc, 1, 1, 1);
 MODULE_DEPEND(nfsd, nfssvc, 1, 1, 1);
diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c
index e22c987f8a08..5185723b8d10 100644
--- a/sys/kern/kern_event.c
+++ b/sys/kern/kern_event.c
@@ -1,2729 +1,2730 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
  * Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org>
  * Copyright (c) 2009 Apple, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ktrace.h"
 #include "opt_kqueue.h"
 
 #ifdef COMPAT_FREEBSD11
 #define	_WANT_FREEBSD11_KEVENT
 #endif
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/rwlock.h>
 #include <sys/proc.h>
 #include <sys/malloc.h>
 #include <sys/unistd.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/fcntl.h>
 #include <sys/kthread.h>
 #include <sys/selinfo.h>
 #include <sys/queue.h>
 #include <sys/event.h>
 #include <sys/eventvar.h>
 #include <sys/poll.h>
 #include <sys/protosw.h>
 #include <sys/resourcevar.h>
 #include <sys/sigio.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/syscallsubr.h>
 #include <sys/taskqueue.h>
 #include <sys/uio.h>
 #include <sys/user.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 #include <machine/atomic.h>
 
 #include <vm/uma.h>
 
 static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
 
 /*
  * This lock is used if multiple kq locks are required.  This possibly
  * should be made into a per proc lock.
  *
  * KQ_GLOBAL_LOCK/KQ_GLOBAL_UNLOCK track ownership in the caller-supplied
  * "haslck" variable (which must be an lvalue): the lock is taken only if
  * haslck is zero and released only if it is non-zero, and haslck is
  * updated to reflect the new state.
  */
 static struct mtx	kq_global;
 MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
 #define KQ_GLOBAL_LOCK(lck, haslck)	do {	\
 	if (!haslck)				\
 		mtx_lock(lck);			\
 	haslck = 1;				\
 } while (0)
 #define KQ_GLOBAL_UNLOCK(lck, haslck)	do {	\
 	if (haslck)				\
 		mtx_unlock(lck);			\
 	haslck = 0;				\
 } while (0)
 
 TASKQUEUE_DEFINE_THREAD(kqueue_ctx);
 
 static int	kevent_copyout(void *arg, struct kevent *kevp, int count);
 static int	kevent_copyin(void *arg, struct kevent *kevp, int count);
 static int	kqueue_register(struct kqueue *kq, struct kevent *kev,
 		    struct thread *td, int mflag);
 static int	kqueue_acquire(struct file *fp, struct kqueue **kqp);
 static void	kqueue_release(struct kqueue *kq, int locked);
 static void	kqueue_destroy(struct kqueue *kq);
 static void	kqueue_drain(struct kqueue *kq, struct thread *td);
 static int	kqueue_expand(struct kqueue *kq, struct filterops *fops,
 		    uintptr_t ident, int mflag);
 static void	kqueue_task(void *arg, int pending);
 static int	kqueue_scan(struct kqueue *kq, int maxevents,
 		    struct kevent_copyops *k_ops,
 		    const struct timespec *timeout,
 		    struct kevent *keva, struct thread *td);
 static void 	kqueue_wakeup(struct kqueue *kq);
 static struct filterops *kqueue_fo_find(int filt);
 static void	kqueue_fo_release(int filt);
 struct g_kevent_args;
 static int	kern_kevent_generic(struct thread *td,
 		    struct g_kevent_args *uap,
 		    struct kevent_copyops *k_ops, const char *struct_name);
 
 static fo_ioctl_t	kqueue_ioctl;
 static fo_poll_t	kqueue_poll;
 static fo_kqfilter_t	kqueue_kqfilter;
 static fo_stat_t	kqueue_stat;
 static fo_close_t	kqueue_close;
 static fo_fill_kinfo_t	kqueue_fill_kinfo;
 
 /*
  * File operations vector installed on kqueue descriptors by kern_kqueue().
  * read, write, truncate, chmod, chown and sendfile are routed to the
  * invfo_* handlers; the rest are implemented by the kqueue_* functions.
  */
 static struct fileops kqueueops = {
 	.fo_read = invfo_rdwr,
 	.fo_write = invfo_rdwr,
 	.fo_truncate = invfo_truncate,
 	.fo_ioctl = kqueue_ioctl,
 	.fo_poll = kqueue_poll,
 	.fo_kqfilter = kqueue_kqfilter,
 	.fo_stat = kqueue_stat,
 	.fo_close = kqueue_close,
 	.fo_chmod = invfo_chmod,
 	.fo_chown = invfo_chown,
 	.fo_sendfile = invfo_sendfile,
 	.fo_fill_kinfo = kqueue_fill_kinfo,
 };
 
 static int 	knote_attach(struct knote *kn, struct kqueue *kq);
 static void 	knote_drop(struct knote *kn, struct thread *td);
 static void 	knote_drop_detached(struct knote *kn, struct thread *td);
 static void 	knote_enqueue(struct knote *kn);
 static void 	knote_dequeue(struct knote *kn);
 static void 	knote_init(void);
 static struct 	knote *knote_alloc(int mflag);
 static void 	knote_free(struct knote *kn);
 
 static void	filt_kqdetach(struct knote *kn);
 static int	filt_kqueue(struct knote *kn, long hint);
 static int	filt_procattach(struct knote *kn);
 static void	filt_procdetach(struct knote *kn);
 static int	filt_proc(struct knote *kn, long hint);
 static int	filt_fileattach(struct knote *kn);
 static void	filt_timerexpire(void *knx);
 static int	filt_timerattach(struct knote *kn);
 static void	filt_timerdetach(struct knote *kn);
 static void	filt_timerstart(struct knote *kn, sbintime_t to);
 static void	filt_timertouch(struct knote *kn, struct kevent *kev,
 		    u_long type);
 static int	filt_timervalidate(struct knote *kn, sbintime_t *to);
 static int	filt_timer(struct knote *kn, long hint);
 static int	filt_userattach(struct knote *kn);
 static void	filt_userdetach(struct knote *kn);
 static int	filt_user(struct knote *kn, long hint);
 static void	filt_usertouch(struct knote *kn, struct kevent *kev,
 		    u_long type);
 
 /*
  * Filter operation tables for the filters implemented in this file.
  */
 
 /* Descriptor-based filters: attach is delegated to the file's fo_kqfilter. */
 static struct filterops file_filtops = {
 	.f_isfd = 1,
 	.f_attach = filt_fileattach,
 };
 /* EVFILT_READ on a kqueue descriptor; installed by kqueue_kqfilter(). */
 static struct filterops kqread_filtops = {
 	.f_isfd = 1,
 	.f_detach = filt_kqdetach,
 	.f_event = filt_kqueue,
 };
 /* XXX - move to kern_proc.c?  */
 static struct filterops proc_filtops = {
 	.f_isfd = 0,
 	.f_attach = filt_procattach,
 	.f_detach = filt_procdetach,
 	.f_event = filt_proc,
 };
 /* Timer filter; the only one here besides user_filtops with f_touch. */
 static struct filterops timer_filtops = {
 	.f_isfd = 0,
 	.f_attach = filt_timerattach,
 	.f_detach = filt_timerdetach,
 	.f_event = filt_timer,
 	.f_touch = filt_timertouch,
 };
 /* User-triggered filter; f_isfd is left at its zero default. */
 static struct filterops user_filtops = {
 	.f_attach = filt_userattach,
 	.f_detach = filt_userdetach,
 	.f_event = filt_user,
 	.f_touch = filt_usertouch,
 };
 
 static uma_zone_t	knote_zone;
 /*
  * kq_ncallouts counts the callouts currently held by timer knotes; it is
  * incremented in filt_timerattach() (which fails with ENOMEM once it has
  * reached kq_calloutmax) and decremented in filt_timerdetach().
  */
 static unsigned int	kq_ncallouts = 0;
 static unsigned int 	kq_calloutmax = 4 * 1024;
 SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
     &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
 
 /*
  * KNOTE_ACTIVATE marks a knote KN_ACTIVE and enqueues it unless it is
  * already queued or is disabled.  With "islock" non-zero the caller must
  * already own the kq lock (asserted); otherwise the lock is taken and
  * released around the update.
  */
 /* XXX - ensure not influx ? */
 #define KNOTE_ACTIVATE(kn, islock) do { 				\
 	if ((islock))							\
 		mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED);		\
 	else								\
 		KQ_LOCK((kn)->kn_kq);					\
 	(kn)->kn_status |= KN_ACTIVE;					\
 	if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)		\
 		knote_enqueue((kn));					\
 	if (!(islock))							\
 		KQ_UNLOCK((kn)->kn_kq);					\
 } while(0)
 /* Acquire the per-kqueue mutex. */
 #define KQ_LOCK(kq) do {						\
 	mtx_lock(&(kq)->kq_lock);					\
 } while (0)
 /* If KQ_FLUXWAIT is set, clear it and wake up sleepers on the kqueue. */
 #define KQ_FLUX_WAKEUP(kq) do {						\
 	if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) {		\
 		(kq)->kq_state &= ~KQ_FLUXWAIT;				\
 		wakeup((kq));						\
 	}								\
 } while (0)
 /* Release the kq mutex after issuing any pending flux wakeup. */
 #define KQ_UNLOCK_FLUX(kq) do {						\
 	KQ_FLUX_WAKEUP(kq);						\
 	mtx_unlock(&(kq)->kq_lock);					\
 } while (0)
 /* Release the per-kqueue mutex. */
 #define KQ_UNLOCK(kq) do {						\
 	mtx_unlock(&(kq)->kq_lock);					\
 } while (0)
 /* Assert that the current thread owns the kq mutex. */
 #define KQ_OWNED(kq) do {						\
 	mtx_assert(&(kq)->kq_lock, MA_OWNED);				\
 } while (0)
 /* Assert that the current thread does not own the kq mutex. */
 #define KQ_NOTOWNED(kq) do {						\
 	mtx_assert(&(kq)->kq_lock, MA_NOTOWNED);			\
 } while (0)
 
 /*
  * Lock the knlist that "kn" is on, if there is one.
  * Returns that knlist, or NULL when the knote is on no list (in which case
  * nothing has been locked).
  */
 static struct knlist *
 kn_list_lock(struct knote *kn)
 {
 	struct knlist *list = kn->kn_knlist;
 
 	if (list == NULL)
 		return (NULL);
 	list->kl_lock(list->kl_lockarg);
 	return (list);
 }
 
 /*
  * Unlock a knlist previously returned by kn_list_lock(); a NULL list is
  * accepted and ignored.  If the list is marked kl_autodestroy and is empty
  * at the time of the unlock, it is destroyed and its memory freed.
  */
 static void
 kn_list_unlock(struct knlist *knl)
 {
 	bool do_free;
 
 	if (knl == NULL)
 		return;
 	/* Decide while the list lock is still held; act after dropping it. */
 	do_free = knl->kl_autodestroy && knlist_empty(knl);
 	knl->kl_unlock(knl->kl_lockarg);
 	if (do_free) {
 		knlist_destroy(knl);
 		free(knl, M_KQUEUE);
 	}
 }
 
 /* Return true if the knote's in-flux count (kn_influx) is greater than 0. */
 static bool
 kn_in_flux(struct knote *kn)
 {
 
 	return (kn->kn_influx > 0);
 }
 
 /*
  * Take one in-flux reference on the knote by incrementing kn_influx.
  * The kq lock must be held (asserted).
  */
 static void
 kn_enter_flux(struct knote *kn)
 {
 
 	KQ_OWNED(kn->kn_kq);
 	MPASS(kn->kn_influx < INT_MAX);
 	kn->kn_influx++;
 }
 
 /*
  * Drop one in-flux reference on the knote by decrementing kn_influx.
  * The kq lock must be held (asserted).  Returns true when the count has
  * dropped to zero.
  */
 static bool
 kn_leave_flux(struct knote *kn)
 {
 
 	KQ_OWNED(kn->kn_kq);
 	MPASS(kn->kn_influx > 0);
 	kn->kn_influx--;
 	return (kn->kn_influx == 0);
 }
 
 /*
  * knlist lock assertions.  KNL_ASSERT_LOCK() picks the locked or unlocked
  * form depending on "islocked".  The checks call the list's own
  * kl_assert_lock callback and compile to nothing without INVARIANTS.
  */
 #define	KNL_ASSERT_LOCK(knl, islocked) do {				\
 	if (islocked)							\
 		KNL_ASSERT_LOCKED(knl);				\
 	else								\
 		KNL_ASSERT_UNLOCKED(knl);				\
 } while (0)
 #ifdef INVARIANTS
 #define	KNL_ASSERT_LOCKED(knl) do {					\
 	knl->kl_assert_lock((knl)->kl_lockarg, LA_LOCKED);		\
 } while (0)
 #define	KNL_ASSERT_UNLOCKED(knl) do {					\
 	knl->kl_assert_lock((knl)->kl_lockarg, LA_UNLOCKED);		\
 } while (0)
 #else /* !INVARIANTS */
 #define	KNL_ASSERT_LOCKED(knl) do {} while(0)
 #define	KNL_ASSERT_UNLOCKED(knl) do {} while (0)
 #endif /* INVARIANTS */
 
 #ifndef	KN_HASHSIZE
 #define	KN_HASHSIZE		64		/* XXX should be tunable */
 #endif
 
 /*
  * Hash "val" into a bucket index by folding its second byte onto the low
  * bits and masking the result with "mask".  Every use of an argument is
  * parenthesized so that an expression such as (a | b) hashes as a single
  * value.
  */
 #define KN_HASH(val, mask)	(((val) ^ ((val) >> 8)) & (mask))
 
 /*
  * Attach routine of null_filtops: always refuses the attach by returning
  * ENXIO.  The knote is not examined.
  */
 static int
 filt_nullattach(struct knote *kn)
 {
 
 	return (ENXIO);
 }
 
 /*
  * Filter ops whose attach always fails (see filt_nullattach()); used in
  * sysfilt_ops for the EVFILT_AIO, EVFILT_LIO and EVFILT_SENDFILE slots.
  */
 struct filterops null_filtops = {
 	.f_isfd = 0,
 	.f_attach = filt_nullattach,
 };
 
 /* XXX - make SYSINIT to add these, and move into respective modules. */
 extern struct filterops sig_filtops;
 extern struct filterops fs_filtops;
 
 /*
  * Table for all system-defined filters.
  *
  * One entry per filter, in the order given by the per-entry comments.
  * Each entry holds the filter ops pointer, then for_nolock; for_refcnt is
  * never given an initializer and so starts at zero.  The table is
  * protected by filterops_lock.
  * NOTE(review): for_nolock presumably marks entries that need no
  * reference counting -- confirm against kqueue_fo_find().
  */
 static struct mtx	filterops_lock;
 MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops",
 	MTX_DEF);
 static struct {
 	struct filterops *for_fop;
 	int for_nolock;
 	int for_refcnt;
 } sysfilt_ops[EVFILT_SYSCOUNT] = {
 	{ &file_filtops, 1 },			/* EVFILT_READ */
 	{ &file_filtops, 1 },			/* EVFILT_WRITE */
 	{ &null_filtops },			/* EVFILT_AIO */
 	{ &file_filtops, 1 },			/* EVFILT_VNODE */
 	{ &proc_filtops, 1 },			/* EVFILT_PROC */
 	{ &sig_filtops, 1 },			/* EVFILT_SIGNAL */
 	{ &timer_filtops, 1 },			/* EVFILT_TIMER */
 	{ &file_filtops, 1 },			/* EVFILT_PROCDESC */
 	{ &fs_filtops, 1 },			/* EVFILT_FS */
 	{ &null_filtops },			/* EVFILT_LIO */
 	{ &user_filtops, 1 },			/* EVFILT_USER */
 	{ &null_filtops },			/* EVFILT_SENDFILE */
 	{ &file_filtops, 1 },                   /* EVFILT_EMPTY */
 };
 
 /*
  * Simple redirection for all cdevsw style objects to call their fo_kqfilter
  * method.
  *
  * Returns whatever fo_kqfilter() returns for the knote's file (kn_fp).
  */
 static int
 filt_fileattach(struct knote *kn)
 {
 
 	return (fo_kqfilter(kn->kn_fp, kn));
 }
 
 /*
  * fo_kqfilter method for kqueue descriptors: lets one kqueue be watched
  * from a knote.  Only EVFILT_READ is accepted; any other filter gets
  * EINVAL.  On success the knote is flagged KN_KQUEUE, switched to
  * kqread_filtops and added to the watched kqueue's kq_sel.si_note list.
  * The watched kqueue is taken from kn->kn_fp; "fp" itself is not used.
  */
 /*ARGSUSED*/
 static int
 kqueue_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct kqueue *kq = kn->kn_fp->f_data;
 
 	if (kn->kn_filter != EVFILT_READ)
 		return (EINVAL);
 
 	kn->kn_status |= KN_KQUEUE;
 	kn->kn_fop = &kqread_filtops;
 	knlist_add(&kq->kq_sel.si_note, kn, 0);
 
 	return (0);
 }
 
 /*
  * Detach routine of kqread_filtops: removes the knote from the watched
  * kqueue's kq_sel.si_note list, undoing the knlist_add() done in
  * kqueue_kqfilter().
  */
 static void
 filt_kqdetach(struct knote *kn)
 {
 	struct kqueue *kq = kn->kn_fp->f_data;
 
 	knlist_remove(&kq->kq_sel.si_note, kn, 0);
 }
 
 /*
  * Event routine of kqread_filtops: copies the watched kqueue's kq_count
  * into kn_data and reports the knote as active when that count is
  * positive.  "hint" is not used.
  */
 /*ARGSUSED*/
 static int
 filt_kqueue(struct knote *kn, long hint)
 {
 	struct kqueue *kq = kn->kn_fp->f_data;
 
 	kn->kn_data = kq->kq_count;
 	return (kn->kn_data > 0);
 }
 
 /*
  * Attach a knote to the process whose pid is kn_id.
  *
  * The process is looked up with pfind_any() when NOTE_EXIT is requested
  * and with pfind() otherwise.  Returns ESRCH if no process is found, the
  * error from p_cansee() if the caller may not see it, and 0 on success.
  * The process is unlocked with PROC_UNLOCK() on every path after a
  * successful lookup.
  */
 /* XXX - move to kern_proc.c?  */
 static int
 filt_procattach(struct knote *kn)
 {
 	struct proc *p;
 	int error;
 	bool exiting, immediate;
 
 	exiting = immediate = false;
 	if (kn->kn_sfflags & NOTE_EXIT)
 		p = pfind_any(kn->kn_id);
 	else
 		p = pfind(kn->kn_id);
 	if (p == NULL)
 		return (ESRCH);
 	if (p->p_flag & P_WEXIT)
 		exiting = true;
 
 	if ((error = p_cansee(curthread, p))) {
 		PROC_UNLOCK(p);
 		return (error);
 	}
 
 	kn->kn_ptr.p_proc = p;
 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
 
 	/*
 	 * Internal flag indicating registration done by kernel for the
 	 * purposes of getting a NOTE_CHILD notification.
 	 */
 	if (kn->kn_flags & EV_FLAG2) {
 		kn->kn_flags &= ~EV_FLAG2;
 		kn->kn_data = kn->kn_sdata;		/* ppid */
 		kn->kn_fflags = NOTE_CHILD;
 		kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK);
 		immediate = true; /* Force immediate activation of child note. */
 	}
 	/*
 	 * Internal flag indicating registration done by kernel (for other than
 	 * NOTE_CHILD).
 	 */
 	if (kn->kn_flags & EV_FLAG1) {
 		kn->kn_flags &= ~EV_FLAG1;
 	}
 
 	/* Third argument 1: passed as the "islocked" flag of knlist_add(). */
 	knlist_add(p->p_klist, kn, 1);
 
 	/*
 	 * Immediately activate any child notes or, in the case of a zombie
 	 * target process, exit notes.  The latter is necessary to handle the
 	 * case where the target process, e.g. a child, dies before the kevent
 	 * is registered.
 	 */
 	if (immediate || (exiting && filt_proc(kn, NOTE_EXIT)))
 		KNOTE_ACTIVATE(kn, 0);
 
 	PROC_UNLOCK(p);
 
 	return (0);
 }
 
 /*
  * The knote may be attached to a different process, which may exit,
  * leaving nothing for the knote to be attached to.  So when the process
  * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
  * it will be deleted when read out.  However, as part of the knote deletion,
  * this routine is called, so a check is needed to avoid actually performing
  * a detach, because the original process does not exist any more.
  *
  * NOTE(review): no such check is visible in this function; it removes the
  * knote from kn_knlist unconditionally and clears the process pointer.
  * Presumably the check lives in knlist_remove() or in the caller --
  * confirm.
  */
 /* XXX - move to kern_proc.c?  */
 static void
 filt_procdetach(struct knote *kn)
 {
 
 	knlist_remove(kn->kn_knlist, kn, 0);
 	kn->kn_ptr.p_proc = NULL;
 }
 
 /*
  * Event routine of proc_filtops.
  *
  * "hint" carries the process event; only the bits in NOTE_PCTRLMASK are
  * looked at.  An event the user asked for (kn_sfflags) is accumulated in
  * kn_fflags.  Returns non-zero when the knote should be reported: always
  * for NOTE_EXIT, otherwise when any event has been recorded.  Returns 0
  * right away if the knote no longer points at a process.
  */
 /* XXX - move to kern_proc.c?  */
 static int
 filt_proc(struct knote *kn, long hint)
 {
 	struct proc *p;
 	u_int event;
 
 	p = kn->kn_ptr.p_proc;
 	if (p == NULL) /* already activated, from attach filter */
 		return (0);
 
 	/* Mask off extra data. */
 	event = (u_int)hint & NOTE_PCTRLMASK;
 
 	/* If the user is interested in this event, record it. */
 	if (kn->kn_sfflags & event)
 		kn->kn_fflags |= event;
 
 	/* Process is gone, so flag the event as finished. */
 	if (event == NOTE_EXIT) {
 		kn->kn_flags |= EV_EOF | EV_ONESHOT;
 		kn->kn_ptr.p_proc = NULL;
 		/* The exit status is reported only if NOTE_EXIT was wanted. */
 		if (kn->kn_fflags & NOTE_EXIT)
 			kn->kn_data = KW_EXITCODE(p->p_xexit, p->p_xsig);
 		/* Nothing recorded at all: flag the knote with EV_DROP. */
 		if (kn->kn_fflags == 0)
 			kn->kn_flags |= EV_DROP;
 		return (1);
 	}
 
 	return (kn->kn_fflags != 0);
 }
 
 /*
  * Called when the process forked. It mostly does the same as the
  * knote(), activating all knotes registered to be activated when the
  * process forked. Additionally, for each knote attached to the
  * parent, check whether user wants to track the new process. If so
  * attach a new knote to it, and immediately report an event with the
  * child's pid.
  *
  * "list" must be non-NULL and locked on entry (asserted); it is locked
  * again on return, although it is dropped temporarily for each knote that
  * has NOTE_TRACK set.  "pid" is used as the ident of the knotes registered
  * for tracking.
  */
 void
 knote_fork(struct knlist *list, int pid)
 {
 	struct kqueue *kq;
 	struct knote *kn;
 	struct kevent kev;
 	int error;
 
 	MPASS(list != NULL);
 	KNL_ASSERT_LOCKED(list);
 	if (SLIST_EMPTY(&list->kl_list))
 		return;
 
 	memset(&kev, 0, sizeof(kev));
 	SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
 		kq = kn->kn_kq;
 		KQ_LOCK(kq);
 		/* Skip knotes that are in flux, unless KN_SCAN is set. */
 		if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
 			KQ_UNLOCK(kq);
 			continue;
 		}
 
 		/*
 		 * The same as knote(), activate the event.
 		 */
 		if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
 			if (kn->kn_fop->f_event(kn, NOTE_FORK))
 				KNOTE_ACTIVATE(kn, 1);
 			KQ_UNLOCK(kq);
 			continue;
 		}
 
 		/*
 		 * The NOTE_TRACK case. In addition to the activation
 		 * of the event, we need to register new events to
 		 * track the child. Drop the locks in preparation for
 		 * the call to kqueue_register().
 		 */
 		kn_enter_flux(kn);
 		KQ_UNLOCK(kq);
 		list->kl_unlock(list->kl_lockarg);
 
 		/*
 		 * Activate existing knote and register tracking knotes with
 		 * new process.
 		 *
 		 * First register a knote to get just the child notice. This
 		 * must be a separate note from a potential NOTE_EXIT
 		 * notification since both NOTE_CHILD and NOTE_EXIT are defined
 		 * to use the data field (in conflicting ways).
 		 */
 		kev.ident = pid;
 		kev.filter = kn->kn_filter;
 		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT |
 		    EV_FLAG2;
 		kev.fflags = kn->kn_sfflags;
 		kev.data = kn->kn_id;		/* parent */
 		kev.udata = kn->kn_kevent.udata;/* preserve udata */
 		error = kqueue_register(kq, &kev, NULL, M_NOWAIT);
 		/* A failed registration is reported through NOTE_TRACKERR. */
 		if (error)
 			kn->kn_fflags |= NOTE_TRACKERR;
 
 		/*
 		 * Then register another knote to track other potential events
 		 * from the new process.
 		 */
 		kev.ident = pid;
 		kev.filter = kn->kn_filter;
 		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
 		kev.fflags = kn->kn_sfflags;
 		kev.data = kn->kn_id;		/* parent */
 		kev.udata = kn->kn_kevent.udata;/* preserve udata */
 		error = kqueue_register(kq, &kev, NULL, M_NOWAIT);
 		if (error)
 			kn->kn_fflags |= NOTE_TRACKERR;
 		if (kn->kn_fop->f_event(kn, NOTE_FORK))
 			KNOTE_ACTIVATE(kn, 0);
 		/* Retake the list lock, then leave flux under the kq lock. */
 		list->kl_lock(list->kl_lockarg);
 		KQ_LOCK(kq);
 		kn_leave_flux(kn);
 		KQ_UNLOCK_FLUX(kq);
 	}
 }
 
 /*
  * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
  * interval timer support code.
  */
 
 /* All of the timer unit (precision) bits accepted in fflags. */
 #define NOTE_TIMER_PRECMASK						\
     (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS)
 
 /*
  * Convert the timer value "data", expressed in the unit selected by the
  * NOTE_TIMER_PRECMASK bits of "flags", to an sbintime_t.  No unit bit at
  * all means milliseconds.
  *
  * Returns SBT_MAX when the whole-second part exceeds SBT_MAX / SBT_1S
  * (this check is compiled in only under __LP64__), and -1 when the unit
  * bits match none of the supported cases.
  *
  * Note that the *_TO_SBT helper macros defined inside the function are not
  * #undef'd and so remain visible for the rest of the file.
  */
 static sbintime_t
 timer2sbintime(int64_t data, int flags)
 {
 	int64_t secs;
 
         /*
          * Macros for converting to the fractional second portion of an
          * sbintime_t using 64bit multiplication to improve precision.
          */
 #define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32)
 #define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32)
 #define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32)
 	switch (flags & NOTE_TIMER_PRECMASK) {
 	case NOTE_SECONDS:
 #ifdef __LP64__
 		if (data > (SBT_MAX / SBT_1S))
 			return (SBT_MAX);
 #endif
 		return ((sbintime_t)data << 32);
 	case NOTE_MSECONDS: /* FALLTHROUGH */
 	case 0:
 		/* Split into whole seconds plus a sub-second remainder. */
 		if (data >= 1000) {
 			secs = data / 1000;
 #ifdef __LP64__
 			if (secs > (SBT_MAX / SBT_1S))
 				return (SBT_MAX);
 #endif
 			return (secs << 32 | MS_TO_SBT(data % 1000));
 		}
 		return (MS_TO_SBT(data));
 	case NOTE_USECONDS:
 		if (data >= 1000000) {
 			secs = data / 1000000;
 #ifdef __LP64__
 			if (secs > (SBT_MAX / SBT_1S))
 				return (SBT_MAX);
 #endif
 			return (secs << 32 | US_TO_SBT(data % 1000000));
 		}
 		return (US_TO_SBT(data));
 	case NOTE_NSECONDS:
 		if (data >= 1000000000) {
 			secs = data / 1000000000;
 #ifdef __LP64__
 			if (secs > (SBT_MAX / SBT_1S))
 				return (SBT_MAX);
 #endif
 			return (secs << 32 | NS_TO_SBT(data % 1000000000));
 		}
 		return (NS_TO_SBT(data));
 	default:
 		break;
 	}
 	/* More than one unit bit set, or an unknown combination. */
 	return (-1);
 }
 
 /*
  * Per-knote state of an EVFILT_TIMER knote.  It is allocated in
  * filt_timerattach(), stored in kn_ptr.p_v and freed in
  * filt_timerdetach().
  */
 struct kq_timer_cb_data {
 	struct callout c;
 	sbintime_t next;	/* next timer event fires at */
 	sbintime_t to;		/* precalculated timer period, 0 for abs */
 };
 
 /*
  * Callout handler for timer knotes; "knx" is the knote.
  *
  * Counts the expiration in kn_data and activates the knote.  The callout
  * is then re-armed for the next period unless the knote is EV_ONESHOT or
  * has no period (kc->to == 0, which filt_timerstart() sets for
  * NOTE_ABSTIME timers).
  */
 static void
 filt_timerexpire(void *knx)
 {
 	struct knote *kn;
 	struct kq_timer_cb_data *kc;
 
 	kn = knx;
 	kn->kn_data++;
 	KNOTE_ACTIVATE(kn, 0);	/* XXX - handle locking */
 
 	if ((kn->kn_flags & EV_ONESHOT) != 0)
 		return;
 	kc = kn->kn_ptr.p_v;
 	if (kc->to == 0)
 		return;
 	/* Advance from the previous deadline, not from the current time. */
 	kc->next += kc->to;
 	callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn,
 	    PCPU_GET(cpuid), C_ABSOLUTE);
 }
 
 /*
  * data contains amount of time to sleep
  *
  * Validate the timer parameters of a knote (kn_sdata and kn_sfflags) and
  * store the resulting timeout in *to.  For NOTE_ABSTIME the boot time is
  * subtracted from the converted value.
  *
  * Returns EINVAL for negative data, for fflags other than the unit bits
  * and NOTE_ABSTIME, and when the computed timeout is negative (which
  * includes timer2sbintime() returning -1); returns 0 otherwise.
  * Side effect: a zero kn_sdata on a knote without EV_ONESHOT is rewritten
  * to 1.
  */
 static int
 filt_timervalidate(struct knote *kn, sbintime_t *to)
 {
 	struct bintime bt;
 	sbintime_t sbt;
 
 	if (kn->kn_sdata < 0)
 		return (EINVAL);
 	if (kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0)
 		kn->kn_sdata = 1;
 	/*
 	 * The only fflags values supported are the timer unit
 	 * (precision) and the absolute time indicator.
 	 */
 	if ((kn->kn_sfflags & ~(NOTE_TIMER_PRECMASK | NOTE_ABSTIME)) != 0)
 		return (EINVAL);
 
 	*to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags);
 	if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) {
 		getboottimebin(&bt);
 		sbt = bttosbt(bt);
 		*to -= sbt;
 	}
 	if (*to < 0)
 		return (EINVAL);
 	return (0);
 }
 
 /*
  * Attach routine of timer_filtops.
  *
  * Validates the timer parameters, reserves one slot in kq_ncallouts
  * (failing with ENOMEM once kq_calloutmax is reached), allocates the
  * per-knote callout data and starts the timer.  Returns 0 on success or
  * the error from filt_timervalidate().
  */
 static int
 filt_timerattach(struct knote *kn)
 {
 	struct kq_timer_cb_data *kc;
 	sbintime_t to;
 	unsigned int ncallouts;
 	int error;
 
 	error = filt_timervalidate(kn, &to);
 	if (error != 0)
 		return (error);
 
 	/* Compare-and-set loop: bump the count only while below the limit. */
 	do {
 		ncallouts = kq_ncallouts;
 		if (ncallouts >= kq_calloutmax)
 			return (ENOMEM);
 	} while (!atomic_cmpset_int(&kq_ncallouts, ncallouts, ncallouts + 1));
 
 	if ((kn->kn_sfflags & NOTE_ABSTIME) == 0)
 		kn->kn_flags |= EV_CLEAR;	/* automatically set */
 	kn->kn_status &= ~KN_DETACHED;		/* knlist_add clears it */
 	kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK);
 	callout_init(&kc->c, 1);
 	filt_timerstart(kn, to);
 
 	return (0);
 }
 
 /*
  * Arm the callout of a timer knote.
  *
  * With NOTE_ABSTIME "to" is used as the deadline itself and no period is
  * recorded; otherwise the deadline is "to" past the current uptime and
  * "to" is kept as the period.  The callout is scheduled on the current
  * CPU with an absolute expiry time.
  */
 static void
 filt_timerstart(struct knote *kn, sbintime_t to)
 {
 	struct kq_timer_cb_data *cbdata = kn->kn_ptr.p_v;
 	bool absolute = (kn->kn_sfflags & NOTE_ABSTIME) != 0;
 
 	cbdata->next = absolute ? to : to + sbinuptime();
 	cbdata->to = absolute ? 0 : to;
 	callout_reset_sbt_on(&cbdata->c, cbdata->next, 0, filt_timerexpire,
 	    kn, PCPU_GET(cpuid), C_ABSOLUTE);
 }
 
 /*
  * Detach routine of timer_filtops: drains the callout, frees the
  * per-knote timer data, gives back the kq_ncallouts slot taken in
  * filt_timerattach() and marks the knote KN_DETACHED.
  */
 static void
 filt_timerdetach(struct knote *kn)
 {
 	struct kq_timer_cb_data *kc;
 	unsigned int old __unused;
 
 	kc = kn->kn_ptr.p_v;
 	/* Drain before freeing the memory that holds the callout. */
 	callout_drain(&kc->c);
 	free(kc, M_KQUEUE);
 	old = atomic_fetchadd_int(&kq_ncallouts, -1);
 	KASSERT(old > 0, ("Number of callouts cannot become negative"));
 	kn->kn_status |= KN_DETACHED;	/* knlist_remove sets it */
 }
 
 /*
  * Touch routine of timer_filtops.
  *
  * EVENT_REGISTER: when the kevent has EV_ADD set, the existing timer is
  *   drained, any pending expiration record is discarded and the timer is
  *   restarted with the new data/fflags.  If those fail validation the
  *   knote gets EV_ERROR with the error code stored in kn_data.
  * EVENT_PROCESS: copies the knote's kevent out to *kev and, for EV_CLEAR
  *   knotes, resets kn_data and kn_fflags.
  * Any other type panics.
  */
 static void
 filt_timertouch(struct knote *kn, struct kevent *kev, u_long type)
 {
 	struct kq_timer_cb_data *kc;	
 	struct kqueue *kq;
 	sbintime_t to;
 	int error;
 
 	switch (type) {
 	case EVENT_REGISTER:
 		/* Handle re-added timers that update data/fflags */
 		if (kev->flags & EV_ADD) {
 			kc = kn->kn_ptr.p_v;
 
 			/* Drain any existing callout. */
 			callout_drain(&kc->c);
 
 			/* Throw away any existing undelivered record
 			 * of the timer expiration. This is done under
 			 * the presumption that if a process is
 			 * re-adding this timer with new parameters,
 			 * it is no longer interested in what may have
 			 * happened under the old parameters. If it is
 			 * interested, it can wait for the expiration,
 			 * delete the old timer definition, and then
 			 * add the new one.
 			 *
 			 * This has to be done while the kq is locked:
 			 *   - if enqueued, dequeue
 			 *   - make it no longer active
 			 *   - clear the count of expiration events
 			 */
 			kq = kn->kn_kq;
 			KQ_LOCK(kq);
 			if (kn->kn_status & KN_QUEUED)
 				knote_dequeue(kn);
 
 			kn->kn_status &= ~KN_ACTIVE;
 			kn->kn_data = 0;
 			KQ_UNLOCK(kq);
 			
 			/* Reschedule timer based on new data/fflags */
 			kn->kn_sfflags = kev->fflags;
 			kn->kn_sdata = kev->data;
 			error = filt_timervalidate(kn, &to);
 			if (error != 0) {
 			  	kn->kn_flags |= EV_ERROR;
 				kn->kn_data = error;
 			} else
 			  	filt_timerstart(kn, to);
 		}
 		break;
 
         case EVENT_PROCESS:
 		*kev = kn->kn_kevent;
 		if (kn->kn_flags & EV_CLEAR) {
 			kn->kn_data = 0;
 			kn->kn_fflags = 0;
 		}
 		break;
 
 	default:
 		panic("filt_timertouch() - invalid type (%ld)", type);
 		break;
 	}
 }
 
 /*
  * Event routine of timer_filtops: the knote is active while kn_data (the
  * expiration count maintained by filt_timerexpire()) is non-zero.
  * "hint" is not used.
  */
 static int
 filt_timer(struct knote *kn, long hint)
 {
 
 	return (kn->kn_data != 0);
 }
 
 /*
  * Attach routine of user_filtops.  Always succeeds.
  */
 static int
 filt_userattach(struct knote *kn)
 {
 
 	/*
 	 * There is no kernel object behind an EVFILT_USER knote, so the
 	 * hook pointer stays empty; kn_hookid starts out as 1 only if
 	 * NOTE_TRIGGER was already requested.
 	 */
 	kn->kn_hook = NULL;
 	kn->kn_hookid = (kn->kn_fflags & NOTE_TRIGGER) != 0 ? 1 : 0;
 	return (0);
 }
 
 /* Detach routine of user_filtops: intentionally does nothing. */
 static void
 filt_userdetach(__unused struct knote *kn)
 {
 
 	/*
 	 * EVFILT_USER knotes are not attached to anything in the kernel.
 	 */
 }
 
 /*
  * Event routine of user_filtops: returns kn_hookid, which is set to 1 by
  * NOTE_TRIGGER (see filt_userattach() and filt_usertouch()) and reset to
  * 0 when the knote is cleared.
  */
 static int
 filt_user(struct knote *kn, __unused long hint)
 {
 
 	return (kn->kn_hookid);
 }
 
 /*
  * Touch routine of user_filtops.
  *
  * EVENT_REGISTER: NOTE_TRIGGER sets kn_hookid to 1.  The control bits in
  *   kev->fflags (NOTE_FFCTRLMASK) choose how the remaining flag bits are
  *   combined into kn_sfflags: left alone, AND, OR or copy.  kn_sdata takes
  *   kev->data, and EV_CLEAR in kev->flags resets the trigger state.
  *   Note that kev->fflags is modified in place (masked with
  *   NOTE_FFLAGSMASK).
  * EVENT_PROCESS: copies the knote's kevent to *kev with fflags and data
  *   taken from kn_sfflags/kn_sdata, then resets the trigger state for
  *   EV_CLEAR knotes.
  * Any other type panics.
  */
 static void
 filt_usertouch(struct knote *kn, struct kevent *kev, u_long type)
 {
 	u_int ffctrl;
 
 	switch (type) {
 	case EVENT_REGISTER:
 		if (kev->fflags & NOTE_TRIGGER)
 			kn->kn_hookid = 1;
 
 		/* Separate the control bits from the user flag bits. */
 		ffctrl = kev->fflags & NOTE_FFCTRLMASK;
 		kev->fflags &= NOTE_FFLAGSMASK;
 		switch (ffctrl) {
 		case NOTE_FFNOP:
 			break;
 
 		case NOTE_FFAND:
 			kn->kn_sfflags &= kev->fflags;
 			break;
 
 		case NOTE_FFOR:
 			kn->kn_sfflags |= kev->fflags;
 			break;
 
 		case NOTE_FFCOPY:
 			kn->kn_sfflags = kev->fflags;
 			break;
 
 		default:
 			/* XXX Return error? */
 			break;
 		}
 		kn->kn_sdata = kev->data;
 		if (kev->flags & EV_CLEAR) {
 			kn->kn_hookid = 0;
 			kn->kn_data = 0;
 			kn->kn_fflags = 0;
 		}
 		break;
 
         case EVENT_PROCESS:
 		*kev = kn->kn_kevent;
 		kev->fflags = kn->kn_sfflags;
 		kev->data = kn->kn_sdata;
 		if (kn->kn_flags & EV_CLEAR) {
 			kn->kn_hookid = 0;
 			kn->kn_data = 0;
 			kn->kn_fflags = 0;
 		}
 		break;
 
 	default:
 		panic("filt_usertouch() - invalid type (%ld)", type);
 		break;
 	}
 }
 
 int
 sys_kqueue(struct thread *td, struct kqueue_args *uap)
 {
 
 	return (kern_kqueue(td, 0, NULL));
 }
 
 /*
  * Initialize the members of a kqueue that need more than zeroing: the
  * pending-event queue head, the kq mutex, the knote list embedded in
  * kq_sel (which shares the kq mutex), and the deferred wakeup task.
  */
 static void
 kqueue_init(struct kqueue *kq)
 {
 
 	TAILQ_INIT(&kq->kq_head);
 	mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK);
 	knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
 	TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
 }
 
 /*
  * Create a new kqueue and install it in the calling process' file
  * descriptor table.
  *
  * 'flags' and 'fcaps' are handed to falloc_caps().  On success the new
  * descriptor is stored in td->td_retval[0] and 0 is returned.  Returns
  * ENOMEM when chgkqcnt() refuses the RLIMIT_KQUEUES charge, or the error
  * from falloc_caps().
  */
 int
 kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps)
 {
 	struct filedesc *fdp;
 	struct kqueue *kq;
 	struct file *fp;
 	struct ucred *cred;
 	int error, fd;
 
 	cred = td->td_ucred;
 	fdp = td->td_proc->p_fd;
 
 	/* Account for the new kqueue before allocating anything. */
 	if (chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES)) == 0)
 		return (ENOMEM);
 
 	error = falloc_caps(td, &fp, &fd, flags, fcaps);
 	if (error != 0) {
 		/* Roll back the accounting done above. */
 		chgkqcnt(cred->cr_ruidinfo, -1, 0);
 		return (error);
 	}
 
 	/* The allocation above left us holding an extra reference on fp. */
 	kq = malloc(sizeof(*kq), M_KQUEUE, M_WAITOK | M_ZERO);
 	kqueue_init(kq);
 	kq->kq_cred = crhold(cred);
 	kq->kq_fdp = fdp;
 
 	/* Link the kqueue onto the per-filedesc list. */
 	FILEDESC_XLOCK(fdp);
 	TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
 	FILEDESC_XUNLOCK(fdp);
 
 	/* Publish the file, then drop the extra reference. */
 	finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
 	fdrop(fp, td);
 
 	td->td_retval[0] = fd;
 	return (0);
 }
 
 /*
  * ABI-neutral copy of the kevent system call arguments.  Both
  * sys_kevent() and freebsd11_kevent() fill one of these from their own
  * argument structure before calling kern_kevent_generic(); the list
  * pointers are untyped because the element layout differs per caller.
  */
 struct g_kevent_args {
 	int	fd;			/* kqueue descriptor */
 	void	*changelist;		/* user pointer to the changes */
 	int	nchanges;		/* number of entries in changelist */
 	void	*eventlist;		/* user pointer for returned events */
 	int	nevents;		/* capacity of eventlist */
 	const struct timespec *timeout;	/* user pointer; may be NULL */
 };
 
 /*
  * kevent(2) system call entry point.  Packages the native argument
  * structure and the native copyin/copyout helpers, then defers to
  * kern_kevent_generic().
  */
 int
 sys_kevent(struct thread *td, struct kevent_args *uap)
 {
 	struct g_kevent_args gk_args = {
 		.fd = uap->fd,
 		.nchanges = uap->nchanges,
 		.changelist = uap->changelist,
 		.nevents = uap->nevents,
 		.eventlist = uap->eventlist,
 		.timeout = uap->timeout,
 	};
 	struct kevent_copyops k_ops = {
 		.arg = uap,
 		.kevent_size = sizeof(struct kevent),
 		.k_copyin = kevent_copyin,
 		.k_copyout = kevent_copyout,
 	};
 
 	return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent"));
 }
 
 /*
  * Common body of the kevent system calls.
  *
  * Copies in the optional timeout, emits ktrace records for the change
  * list (before) and the returned events (after, on success only) when
  * KTRACE is configured, and runs kern_kevent() with the caller-provided
  * copy operations.  'struct_name' labels the ktrace records.
  */
 static int
 kern_kevent_generic(struct thread *td, struct g_kevent_args *uap,
     struct kevent_copyops *k_ops, const char *struct_name)
 {
 	struct timespec ts, *tsp;
 #ifdef KTRACE
 	struct kevent *eventlist = uap->eventlist;
 #endif
 	int error;
 
 	tsp = NULL;
 	if (uap->timeout != NULL) {
 		error = copyin(uap->timeout, &ts, sizeof(ts));
 		if (error != 0)
 			return (error);
 		tsp = &ts;
 	}
 
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_STRUCT_ARRAY))
 		ktrstructarray(struct_name, UIO_USERSPACE, uap->changelist,
 		    uap->nchanges, k_ops->kevent_size);
 #endif
 
 	error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
 	    k_ops, tsp);
 
 #ifdef KTRACE
 	/* td_retval[0] holds the number of events that were returned. */
 	if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY))
 		ktrstructarray(struct_name, UIO_USERSPACE, eventlist,
 		    td->td_retval[0], k_ops->kevent_size);
 #endif
 
 	return (error);
 }
 
 /*
  * Copy 'count' items into the destination list pointed to by uap->eventlist.
  */
 static int
 kevent_copyout(void *arg, struct kevent *kevp, int count)
 {
 	struct kevent_args *uap;
 	int error;
 
 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
 	uap = (struct kevent_args *)arg;
 
 	error = copyout(kevp, uap->eventlist, count * sizeof *kevp);
 	if (error == 0)
 		uap->eventlist += count;
 	return (error);
 }
 
 /*
  * Copy 'count' items from the list pointed to by uap->changelist.
  */
 static int
 kevent_copyin(void *arg, struct kevent *kevp, int count)
 {
 	struct kevent_args *uap;
 	int error;
 
 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
 	uap = (struct kevent_args *)arg;
 
 	error = copyin(uap->changelist, kevp, count * sizeof *kevp);
 	if (error == 0)
 		uap->changelist += count;
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD11
 /*
  * Copy 'count' kevents from the kernel buffer 'kevp' out to the user
  * buffer at uap->eventlist, converting each one to the FreeBSD 11
  * layout.  The user pointer is advanced after every successful entry.
  *
  * Returns 0 on success (including count == 0) or the first copyout()
  * error.
  */
 static int
 kevent11_copyout(void *arg, struct kevent *kevp, int count)
 {
 	struct freebsd11_kevent_args *uap;
 	struct kevent_freebsd11 kev11;
 	int error, i;
 
 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
 	uap = (struct freebsd11_kevent_args *)arg;
 
 	/* Must be defined even when the loop body never runs. */
 	error = 0;
 	for (i = 0; i < count; i++) {
 		kev11.ident = kevp->ident;
 		kev11.filter = kevp->filter;
 		kev11.flags = kevp->flags;
 		kev11.fflags = kevp->fflags;
 		kev11.data = kevp->data;
 		kev11.udata = kevp->udata;
 		error = copyout(&kev11, uap->eventlist, sizeof(kev11));
 		if (error != 0)
 			break;
 		uap->eventlist++;
 		kevp++;
 	}
 	return (error);
 }
 
 /*
  * Copy 'count' items from the list pointed to by uap->changelist,
  * converting each one from the FreeBSD 11 layout to the current
  * struct kevent.  The 'ext' member, which the old layout does not carry,
  * is zeroed.  The user pointer is advanced after every successful entry.
  *
  * Returns 0 on success (including count == 0) or the first copyin()
  * error.
  */
 static int
 kevent11_copyin(void *arg, struct kevent *kevp, int count)
 {
 	struct freebsd11_kevent_args *uap;
 	struct kevent_freebsd11 kev11;
 	int error, i;
 
 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
 	uap = (struct freebsd11_kevent_args *)arg;
 
 	/* Must be defined even when the loop body never runs. */
 	error = 0;
 	for (i = 0; i < count; i++) {
 		error = copyin(uap->changelist, &kev11, sizeof(kev11));
 		if (error != 0)
 			break;
 		kevp->ident = kev11.ident;
 		kevp->filter = kev11.filter;
 		kevp->flags = kev11.flags;
 		kevp->fflags = kev11.fflags;
 		kevp->data = (uintptr_t)kev11.data;
 		kevp->udata = kev11.udata;
 		bzero(&kevp->ext, sizeof(kevp->ext));
 		uap->changelist++;
 		kevp++;
 	}
 	return (error);
 }
 
 /*
  * FreeBSD 11 compatible kevent system call entry point.  Identical to
  * sys_kevent() except that the converting copyin/copyout helpers and the
  * old structure size are supplied.
  */
 int
 freebsd11_kevent(struct thread *td, struct freebsd11_kevent_args *uap)
 {
 	struct g_kevent_args gk_args = {
 		.fd = uap->fd,
 		.nchanges = uap->nchanges,
 		.changelist = uap->changelist,
 		.nevents = uap->nevents,
 		.eventlist = uap->eventlist,
 		.timeout = uap->timeout,
 	};
 	struct kevent_copyops k_ops = {
 		.arg = uap,
 		.kevent_size = sizeof(struct kevent_freebsd11),
 		.k_copyin = kevent11_copyin,
 		.k_copyout = kevent11_copyout,
 	};
 
 	return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent_freebsd11"));
 }
 #endif
 
 int
 kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
     struct kevent_copyops *k_ops, const struct timespec *timeout)
 {
 	cap_rights_t rights;
 	struct file *fp;
 	int error;
 
 	cap_rights_init_zero(&rights);
 	if (nchanges > 0)
 		cap_rights_set_one(&rights, CAP_KQUEUE_CHANGE);
 	if (nevents > 0)
 		cap_rights_set_one(&rights, CAP_KQUEUE_EVENT);
 	error = fget(td, fd, &rights, &fp);
 	if (error != 0)
 		return (error);
 
 	error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout);
 	fdrop(fp, td);
 
 	return (error);
 }
 
 static int
 kqueue_kevent(struct kqueue *kq, struct thread *td, int nchanges, int nevents,
     struct kevent_copyops *k_ops, const struct timespec *timeout)
 {
 	struct kevent keva[KQ_NEVENTS];
 	struct kevent *kevp, *changes;
 	int i, n, nerrors, error;
 
 	nerrors = 0;
 	while (nchanges > 0) {
 		n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges;
 		error = k_ops->k_copyin(k_ops->arg, keva, n);
 		if (error)
 			return (error);
 		changes = keva;
 		for (i = 0; i < n; i++) {
 			kevp = &changes[i];
 			if (!kevp->filter)
 				continue;
 			kevp->flags &= ~EV_SYSFLAGS;
 			error = kqueue_register(kq, kevp, td, M_WAITOK);
 			if (error || (kevp->flags & EV_RECEIPT)) {
 				if (nevents == 0)
 					return (error);
 				kevp->flags = EV_ERROR;
 				kevp->data = error;
 				(void)k_ops->k_copyout(k_ops->arg, kevp, 1);
 				nevents--;
 				nerrors++;
 			}
 		}
 		nchanges -= n;
 	}
 	if (nerrors) {
 		td->td_retval[0] = nerrors;
 		return (0);
 	}
 
 	return (kqueue_scan(kq, nevents, k_ops, timeout, keva, td));
 }
 
 /*
  * Perform a kevent operation on an already looked-up file.  A reference
  * on the kqueue behind 'fp' is taken for the duration of the call;
  * kqueue_acquire() fails if the file is not a (live) kqueue.
  */
 int
 kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents,
     struct kevent_copyops *k_ops, const struct timespec *timeout)
 {
 	struct kqueue *kq;
 	int error;
 
 	error = kqueue_acquire(fp, &kq);
 	if (error == 0) {
 		error = kqueue_kevent(kq, td, nchanges, nevents, k_ops,
 		    timeout);
 		kqueue_release(kq, 0);
 	}
 	return (error);
 }
 
 /*
  * Run a kevent() call against a throw-away kqueue that lives on the
  * caller's stack and is never bound to a file descriptor.  This gives
  * one-shot polling semantics comparable to poll() and select().
  *
  * 'nevents' is used both as the number of changes to register and as
  * the maximum number of events to return; no timeout is passed.
  */
 int
 kern_kevent_anonymous(struct thread *td, int nevents,
     struct kevent_copyops *k_ops)
 {
 	struct kqueue kq = {};
 	int error;
 
 	kqueue_init(&kq);
 	/* The single reference is ours; kqueue_drain() expects exactly 1. */
 	kq.kq_refcnt = 1;
 
 	error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL);
 
 	kqueue_drain(&kq, td);
 	kqueue_destroy(&kq);
 	return (error);
 }
 
 /*
  * Register 'filtops' as the implementation of system filter 'filt'.
  *
  * Filter numbers are negative; ~filt is the index into sysfilt_ops[].
  * Returns EINVAL when 'filt' is out of range, EEXIST when the slot is
  * already occupied by a real implementation, and 0 on success.
  */
 int
 kqueue_add_filteropts(int filt, struct filterops *filtops)
 {
 	int error;
 
 	error = 0;
 	/*
 	 * Zero has to be rejected together with the positive values:
 	 * ~0 is -1, which would index in front of sysfilt_ops[].
 	 */
 	if (filt >= 0 || filt + EVFILT_SYSCOUNT < 0) {
 		printf(
 "trying to add a filterop that is out of range: %d is beyond %d\n",
 		    ~filt, EVFILT_SYSCOUNT);
 		return (EINVAL);
 	}
 	mtx_lock(&filterops_lock);
 	if (sysfilt_ops[~filt].for_fop != &null_filtops &&
 	    sysfilt_ops[~filt].for_fop != NULL)
 		error = EEXIST;
 	else {
 		sysfilt_ops[~filt].for_fop = filtops;
 		sysfilt_ops[~filt].for_refcnt = 0;
 	}
 	mtx_unlock(&filterops_lock);
 
 	return (error);
 }
 
 /*
  * Unregister the implementation of system filter 'filt', putting the
  * null filter back in its slot.
  *
  * Returns EINVAL when 'filt' is out of range or nothing is registered,
  * EBUSY while the slot's reference count is non-zero, and 0 on success.
  */
 int
 kqueue_del_filteropts(int filt)
 {
 	int error;
 
 	error = 0;
 	/*
 	 * Zero has to be rejected together with the positive values:
 	 * ~0 is -1, which would index in front of sysfilt_ops[].
 	 */
 	if (filt >= 0 || filt + EVFILT_SYSCOUNT < 0)
 		return (EINVAL);
 
 	mtx_lock(&filterops_lock);
 	if (sysfilt_ops[~filt].for_fop == &null_filtops ||
 	    sysfilt_ops[~filt].for_fop == NULL)
 		error = EINVAL;
 	else if (sysfilt_ops[~filt].for_refcnt != 0)
 		error = EBUSY;
 	else {
 		sysfilt_ops[~filt].for_fop = &null_filtops;
 		sysfilt_ops[~filt].for_refcnt = 0;
 	}
 	mtx_unlock(&filterops_lock);
 
 	return (error);
 }
 
 /*
  * Look up the filterops for system filter 'filt'.
  *
  * Returns NULL when 'filt' is out of range.  Slots flagged for_nolock
  * are returned without locking or reference counting.  For all other
  * slots the reference count is bumped under filterops_lock (to be
  * dropped with kqueue_fo_release()) and an empty slot is populated with
  * the null filter.
  */
 static struct filterops *
 kqueue_fo_find(int filt)
 {
 	struct filterops *fop;
 
 	/*
 	 * Zero has to be rejected together with the positive values:
 	 * ~0 is -1, which would index in front of sysfilt_ops[].
 	 */
 	if (filt >= 0 || filt + EVFILT_SYSCOUNT < 0)
 		return (NULL);
 
 	if (sysfilt_ops[~filt].for_nolock)
 		return (sysfilt_ops[~filt].for_fop);
 
 	mtx_lock(&filterops_lock);
 	sysfilt_ops[~filt].for_refcnt++;
 	if (sysfilt_ops[~filt].for_fop == NULL)
 		sysfilt_ops[~filt].for_fop = &null_filtops;
 	/*
 	 * Snapshot the pointer while the lock is still held so that the
 	 * value returned is the one the reference was taken against.
 	 */
 	fop = sysfilt_ops[~filt].for_fop;
 	mtx_unlock(&filterops_lock);
 
 	return (fop);
 }
 
 /*
  * Drop a reference obtained through kqueue_fo_find().  Out-of-range
  * filters and slots flagged for_nolock carry no reference count and are
  * ignored.
  */
 static void
 kqueue_fo_release(int filt)
 {
 
 	/*
 	 * Zero has to be rejected together with the positive values:
 	 * ~0 is -1, which would index in front of sysfilt_ops[].
 	 */
 	if (filt >= 0 || filt + EVFILT_SYSCOUNT < 0)
 		return;
 
 	if (sysfilt_ops[~filt].for_nolock)
 		return;
 
 	mtx_lock(&filterops_lock);
 	KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
 	    ("filter object refcount not valid on release"));
 	sysfilt_ops[~filt].for_refcnt--;
 	mtx_unlock(&filterops_lock);
 }
 
 /*
  * Apply a single change 'kev' to 'kq': add, modify, enable/disable or
  * delete the knote identified by (kev->ident, kev->filter).
  *
  * A ref to kq (obtained via kqueue_acquire) must be held.
  *
  * 'mflag' is the malloc wait flag used for the spare knote and for
  * growing the kqueue's lookup tables.  'td' must be non-NULL for
  * fd-based filters.
  *
  * Returns 0 on success or an errno value: EINVAL (EV_ENABLE together
  * with EV_DISABLE, unknown filter, or a kqueue registered on itself),
  * EBADF (ident larger than INT_MAX for an fd filter), ENOMEM (no spare
  * knote), ENOENT (no matching knote and EV_ADD not set), or whatever
  * fget(), kqueue_expand(), knote_attach() or the filter's f_attach
  * return.
  */
 static int
 kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td,
     int mflag)
 {
 	struct filterops *fops;
 	struct file *fp;
 	struct knote *kn, *tkn;
 	struct knlist *knl;
 	int error, filt, event;
 	int haskqglobal, filedesc_unlock;
 
 	/* Asking for both enable and disable at once is contradictory. */
 	if ((kev->flags & (EV_ENABLE | EV_DISABLE)) == (EV_ENABLE | EV_DISABLE))
 		return (EINVAL);
 
 	fp = NULL;
 	kn = NULL;
 	knl = NULL;
 	error = 0;
 	haskqglobal = 0;
 	filedesc_unlock = 0;
 
 	filt = kev->filter;
 	fops = kqueue_fo_find(filt);
 	if (fops == NULL)
 		return EINVAL;
 
 	if (kev->flags & EV_ADD) {
 		/*
 		 * Prevent waiting with locks.  Non-sleepable
 		 * allocation failures are handled in the loop, only
 		 * if the spare knote appears to be actually required.
 		 */
 		tkn = knote_alloc(mflag);
 	} else {
 		tkn = NULL;
 	}
 
 	/*
 	 * Lookup phase.  We come back here whenever the locks had to be
 	 * dropped (table expansion, or waiting for a knote in flux).
 	 */
 findkn:
 	if (fops->f_isfd) {
 		KASSERT(td != NULL, ("td is NULL"));
 		if (kev->ident > INT_MAX)
 			error = EBADF;
 		else
 			error = fget(td, kev->ident, &cap_event_rights, &fp);
 		if (error)
 			goto done;
 
 		/*
 		 * First try to grow the fd table without sleeping; if that
 		 * fails, drop the file reference, retry with the caller's
 		 * mflag and restart the lookup.
 		 */
 		if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
 		    kev->ident, M_NOWAIT) != 0) {
 			/* try again */
 			fdrop(fp, td);
 			fp = NULL;
 			error = kqueue_expand(kq, fops, kev->ident, mflag);
 			if (error)
 				goto done;
 			goto findkn;
 		}
 
 		if (fp->f_type == DTYPE_KQUEUE) {
 			/*
 			 * If we add some intelligence about what we are doing,
 			 * we should be able to support events on ourselves.
 			 * We need to know when we are doing this to prevent
 			 * getting both the knlist lock and the kq lock since
 			 * they are the same thing.
 			 */
 			if (fp->f_data == kq) {
 				error = EINVAL;
 				goto done;
 			}
 
 			/*
 			 * Pre-lock the filedesc before the global
 			 * lock mutex, see the comment in
 			 * kqueue_close().
 			 */
 			FILEDESC_XLOCK(td->td_proc->p_fd);
 			filedesc_unlock = 1;
 			KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
 		}
 
 		/* fd filters are indexed directly by descriptor number. */
 		KQ_LOCK(kq);
 		if (kev->ident < kq->kq_knlistsize) {
 			SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
 				if (kev->filter == kn->kn_filter)
 					break;
 		}
 	} else {
 		if ((kev->flags & EV_ADD) == EV_ADD) {
 			error = kqueue_expand(kq, fops, kev->ident, mflag);
 			if (error != 0)
 				goto done;
 		}
 
 		KQ_LOCK(kq);
 
 		/*
 		 * If possible, find an existing knote to use for this kevent.
 		 */
 		if (kev->filter == EVFILT_PROC &&
 		    (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) {
 			/* This is an internal creation of a process tracking
 			 * note. Don't attempt to coalesce this with an
 			 * existing note.
 			 */
 			;			
 		} else if (kq->kq_knhashmask != 0) {
 			struct klist *list;
 
 			/* Non-fd filters live in a hash keyed by ident. */
 			list = &kq->kq_knhash[
 			    KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
 			SLIST_FOREACH(kn, list, kn_link)
 				if (kev->ident == kn->kn_id &&
 				    kev->filter == kn->kn_filter)
 					break;
 		}
 	}
 
 	/* knote is in the process of changing, wait for it to stabilize. */
 	if (kn != NULL && kn_in_flux(kn)) {
 		KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
 		if (filedesc_unlock) {
 			FILEDESC_XUNLOCK(td->td_proc->p_fd);
 			filedesc_unlock = 0;
 		}
 		kq->kq_state |= KQ_FLUXWAIT;
 		/* PDROP: the kq lock is released when msleep() returns. */
 		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
 		if (fp != NULL) {
 			fdrop(fp, td);
 			fp = NULL;
 		}
 		goto findkn;
 	}
 
 	/*
 	 * kn now contains the matching knote, or NULL if no match
 	 */
 	if (kn == NULL) {
 		if (kev->flags & EV_ADD) {
 			/* Consume the spare knote allocated up front. */
 			kn = tkn;
 			tkn = NULL;
 			if (kn == NULL) {
 				KQ_UNLOCK(kq);
 				error = ENOMEM;
 				goto done;
 			}
 			kn->kn_fp = fp;
 			kn->kn_kq = kq;
 			kn->kn_fop = fops;
 			/*
 			 * apply reference counts to knote structure, and
 			 * do not release it at the end of this routine.
 			 */
 			fops = NULL;
 			fp = NULL;
 
 			/*
 			 * The requested fflags/data are kept in the kn_s*
 			 * fields; the copy stored in kn_kevent starts out
 			 * with both cleared.
 			 */
 			kn->kn_sfflags = kev->fflags;
 			kn->kn_sdata = kev->data;
 			kev->fflags = 0;
 			kev->data = 0;
 			kn->kn_kevent = *kev;
 			kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE |
 			    EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT);
 			kn->kn_status = KN_DETACHED;
 			if ((kev->flags & EV_DISABLE) != 0)
 				kn->kn_status |= KN_DISABLED;
 			kn_enter_flux(kn);
 
 			error = knote_attach(kn, kq);
 			KQ_UNLOCK(kq);
 			if (error != 0) {
 				/*
 				 * Hand the knote back as the spare so that
 				 * it is freed at 'done'.
 				 * NOTE(review): fp and fops were set to
 				 * NULL above, so the file and filter
 				 * references now held only through kn do
 				 * not appear to be dropped on this path;
 				 * confirm against knote_free().
 				 */
 				tkn = kn;
 				goto done;
 			}
 
 			if ((error = kn->kn_fop->f_attach(kn)) != 0) {
 				knote_drop_detached(kn, td);
 				goto done;
 			}
 			knl = kn_list_lock(kn);
 			goto done_ev_add;
 		} else {
 			/* No matching knote and the EV_ADD flag is not set. */
 			KQ_UNLOCK(kq);
 			error = ENOENT;
 			goto done;
 		}
 	}
 
 	/* An existing knote was found; kq is still locked here. */
 	if (kev->flags & EV_DELETE) {
 		kn_enter_flux(kn);
 		KQ_UNLOCK(kq);
 		knote_drop(kn, td);
 		goto done;
 	}
 
 	if (kev->flags & EV_FORCEONESHOT) {
 		kn->kn_flags |= EV_ONESHOT;
 		KNOTE_ACTIVATE(kn, 1);
 	}
 
 	if ((kev->flags & EV_ENABLE) != 0)
 		kn->kn_status &= ~KN_DISABLED;
 	else if ((kev->flags & EV_DISABLE) != 0)
 		kn->kn_status |= KN_DISABLED;
 
 	/*
 	 * The user may change some filter values after the initial EV_ADD,
 	 * but doing so will not reset any filter which has already been
 	 * triggered.
 	 */
 	kn->kn_status |= KN_SCAN;
 	kn_enter_flux(kn);
 	KQ_UNLOCK(kq);
 	knl = kn_list_lock(kn);
 	kn->kn_kevent.udata = kev->udata;
 	/* Non-fd filters may supply their own update logic via f_touch. */
 	if (!fops->f_isfd && fops->f_touch != NULL) {
 		fops->f_touch(kn, kev, EVENT_REGISTER);
 	} else {
 		kn->kn_sfflags = kev->fflags;
 		kn->kn_sdata = kev->data;
 	}
 
 done_ev_add:
 	/*
 	 * We can get here with kn->kn_knlist == NULL.  This can happen when
 	 * the initial attach event decides that the event is "completed" 
 	 * already, e.g., filt_procattach() is called on a zombie process.  It
 	 * will call filt_proc() which will remove it from the list, and NULL
 	 * kn_knlist.
 	 *
 	 * KN_DISABLED will be stable while the knote is in flux, so the
 	 * unlocked read will not race with an update.
 	 */
 	if ((kn->kn_status & KN_DISABLED) == 0)
 		event = kn->kn_fop->f_event(kn, 0);
 	else
 		event = 0;
 
 	/* Queue the knote if it is active, enabled and not yet queued. */
 	KQ_LOCK(kq);
 	if (event)
 		kn->kn_status |= KN_ACTIVE;
 	if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) ==
 	    KN_ACTIVE)
 		knote_enqueue(kn);
 	kn->kn_status &= ~KN_SCAN;
 	kn_leave_flux(kn);
 	kn_list_unlock(knl);
 	KQ_UNLOCK_FLUX(kq);
 
 	/* Common exit: release whatever is still held by this routine. */
 done:
 	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
 	if (filedesc_unlock)
 		FILEDESC_XUNLOCK(td->td_proc->p_fd);
 	if (fp != NULL)
 		fdrop(fp, td);
 	knote_free(tkn);
 	if (fops != NULL)
 		kqueue_fo_release(filt);
 	return (error);
 }
 
 /*
  * Take a reference on the kqueue behind 'fp' and return it in *kqp.
  *
  * Returns EBADF if the file is not a kqueue, has no kqueue attached, or
  * the kqueue is already closing; 0 otherwise.  Note that *kqp is
  * written as soon as the file has been validated, even if the kqueue
  * then turns out to be closing.
  */
 static int
 kqueue_acquire(struct file *fp, struct kqueue **kqp)
 {
 	struct kqueue *kq;
 	int error;
 
 	kq = fp->f_data;
 	if (fp->f_type != DTYPE_KQUEUE || kq == NULL)
 		return (EBADF);
 	*kqp = kq;
 
 	KQ_LOCK(kq);
 	if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
 		error = EBADF;
 	} else {
 		kq->kq_refcnt++;
 		error = 0;
 	}
 	KQ_UNLOCK(kq);
 
 	return (error);
 }
 
 /*
  * Drop a reference taken with kqueue_acquire().  'locked' says whether
  * the caller already holds the kq lock; if not, it is taken and
  * released here.  When the count falls to 1, sleepers on &kq->kq_refcnt
  * are woken up.
  */
 static void
 kqueue_release(struct kqueue *kq, int locked)
 {
 	if (locked) {
 		KQ_OWNED(kq);
 	} else {
 		KQ_LOCK(kq);
 	}
 
 	kq->kq_refcnt--;
 	if (kq->kq_refcnt == 1)
 		wakeup(&kq->kq_refcnt);
 
 	if (!locked) {
 		KQ_UNLOCK(kq);
 	}
 }
 
 /*
  * Queue the kqueue's task on taskqueue_kqueue_ctx unless it is already
  * marked as scheduled.  The kq lock must be held, and the kqueue must
  * not be draining its task.
  */
 static void
 kqueue_schedtask(struct kqueue *kq)
 {
 
 	KQ_OWNED(kq);
 	KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN),
 	    ("scheduling kqueue task while draining"));
 
 	/* Nothing to do when a run is already pending. */
 	if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED)
 		return;
 
 	taskqueue_enqueue(taskqueue_kqueue_ctx, &kq->kq_task);
 	kq->kq_state |= KQ_TASKSCHED;
 }
 
 /*
  * Expand the kq to make sure we have storage for fops/ident pair.
  *
  * For fd-based filters the direct-indexed kq_knlist array is grown in
  * multiples of KQEXTENT until it covers 'ident'; for all other filters
  * the kq_knhash table is created on first use.  Memory is allocated
  * with the kq lock dropped and the need for it is re-checked once the
  * lock is held, so a racing expansion simply causes the new allocation
  * to be discarded.
  *
  * 'mflag' is the malloc wait flag.  Must be called without the kq lock.
  *
  * Return 0 on success (or no work necessary), return errno on failure:
  * ENOMEM when the allocation fails, EBADF when the kqueue is closing.
  */
 static int
 kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident,
     int mflag)
 {
 	struct klist *list, *tmp_knhash, *to_free;
 	u_long tmp_knhashmask;
 	int error, fd, size;
 
 	KQ_NOTOWNED(kq);
 
 	error = 0;
 	to_free = NULL;
 	if (fops->f_isfd) {
 		fd = ident;
 		/* Unlocked peek; re-validated below under the lock. */
 		if (kq->kq_knlistsize <= fd) {
 			size = kq->kq_knlistsize;
 			while (size <= fd)
 				size += KQEXTENT;
 			list = malloc(size * sizeof(*list), M_KQUEUE, mflag);
 			if (list == NULL)
 				return ENOMEM;
 			KQ_LOCK(kq);
 			if ((kq->kq_state & KQ_CLOSING) != 0) {
 				to_free = list;
 				error = EBADF;
 			} else if (kq->kq_knlistsize > fd) {
 				/* Somebody else already grew the array. */
 				to_free = list;
 			} else {
 				/* Carry over the old heads, zero the rest. */
 				if (kq->kq_knlist != NULL) {
 					bcopy(kq->kq_knlist, list,
 					    kq->kq_knlistsize * sizeof(*list));
 					to_free = kq->kq_knlist;
 					kq->kq_knlist = NULL;
 				}
 				bzero((caddr_t)list +
 				    kq->kq_knlistsize * sizeof(*list),
 				    (size - kq->kq_knlistsize) * sizeof(*list));
 				kq->kq_knlistsize = size;
 				kq->kq_knlist = list;
 			}
 			KQ_UNLOCK(kq);
 		}
 	} else {
 		/* Unlocked peek; re-validated below under the lock. */
 		if (kq->kq_knhashmask == 0) {
 			tmp_knhash = hashinit_flags(KN_HASHSIZE, M_KQUEUE,
 			    &tmp_knhashmask, (mflag & M_WAITOK) != 0 ?
 			    HASH_WAITOK : HASH_NOWAIT);
 			if (tmp_knhash == NULL)
 				return (ENOMEM);
 			KQ_LOCK(kq);
 			if ((kq->kq_state & KQ_CLOSING) != 0) {
 				to_free = tmp_knhash;
 				error = EBADF;
 			} else if (kq->kq_knhashmask == 0) {
 				kq->kq_knhash = tmp_knhash;
 				kq->kq_knhashmask = tmp_knhashmask;
 			} else {
 				/* Somebody else already created the hash. */
 				to_free = tmp_knhash;
 			}
 			KQ_UNLOCK(kq);
 		}
 	}
 	/* to_free may be NULL here when nothing needs to be released. */
 	free(to_free, M_KQUEUE);
 
 	KQ_NOTOWNED(kq);
 	return (error);
 }
 
 /*
  * Taskqueue handler scheduled by kqueue_schedtask().  With the global
  * kq lock and the kq lock held, it notifies the knotes hanging off this
  * kqueue's own knlist (kq_sel.si_note), clears KQ_TASKSCHED and wakes a
  * drainer sleeping on &kq->kq_state if KQ_TASKDRAIN is set.  'pending'
  * is unused.
  */
 static void
 kqueue_task(void *arg, int pending)
 {
 	struct kqueue *kq = arg;
 	int haskqglobal = 0;
 
 	KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
 	KQ_LOCK(kq);
 
 	KNOTE_LOCKED(&kq->kq_sel.si_note, 0);
 
 	kq->kq_state &= ~KQ_TASKSCHED;
 	if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN)
 		wakeup(&kq->kq_state);
 
 	KQ_UNLOCK(kq);
 	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
 }
 
 /*
  * Scan, update kn_data (if not ONESHOT), and copyout triggered events.
  * We treat KN_MARKER knotes as if they are in flux.
  *
  * Up to 'maxevents' events are collected into the caller-supplied
  * buffer 'keva' (flushed through k_ops->k_copyout every KQ_NEVENTS
  * entries and once more at the end).  The number of events gathered is
  * stored in td->td_retval[0].
  *
  * 'tsp' selects the waiting behaviour when the queue is empty: NULL
  * passes a zero deadline to msleep_sbt() (presumably "no timeout"), an
  * all-zero timespec makes the call return immediately, and any other
  * value is converted to an absolute deadline.  A negative or
  * denormalized timespec yields EINVAL.
  *
  * Returns 0, EINVAL, EINTR (ERESTART is mapped to EINTR), or the error
  * from the sleep or from k_copyout.
  */
 static int
 kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
     const struct timespec *tsp, struct kevent *keva, struct thread *td)
 {
 	struct kevent *kevp;
 	struct knote *kn, *marker;
 	struct knlist *knl;
 	sbintime_t asbt, rsbt;
 	int count, error, haskqglobal, influx, nkev, touch;
 
 	count = maxevents;
 	nkev = 0;
 	error = 0;
 	haskqglobal = 0;
 
 	if (maxevents == 0)
 		goto done_nl;
 
 	/*
 	 * Work out the sleep deadline: asbt is the absolute time (0 for
 	 * none, -1 for "do not sleep"), rsbt the precision.
 	 */
 	rsbt = 0;
 	if (tsp != NULL) {
 		if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 ||
 		    tsp->tv_nsec >= 1000000000) {
 			error = EINVAL;
 			goto done_nl;
 		}
 		if (timespecisset(tsp)) {
 			if (tsp->tv_sec <= INT32_MAX) {
 				rsbt = tstosbt(*tsp);
 				if (TIMESEL(&asbt, rsbt))
 					asbt += tc_tick_sbt;
 				/* Guard the addition against overflow. */
 				if (asbt <= SBT_MAX - rsbt)
 					asbt += rsbt;
 				else
 					asbt = 0;
 				rsbt >>= tc_precexp;
 			} else
 				asbt = 0;
 		} else
 			asbt = -1;
 	} else
 		asbt = 0;
 	/*
 	 * The marker is a dummy knote appended to the queue so that one
 	 * pass over the entries present at this moment can be told apart
 	 * from entries queued afterwards.
 	 */
 	marker = knote_alloc(M_WAITOK);
 	marker->kn_status = KN_MARKER;
 	KQ_LOCK(kq);
 
 retry:
 	kevp = keva;
 	if (kq->kq_count == 0) {
 		if (asbt == -1) {
 			error = EWOULDBLOCK;
 		} else {
 			kq->kq_state |= KQ_SLEEP;
 			error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH,
 			    "kqread", asbt, rsbt, C_ABSOLUTE);
 		}
 		if (error == 0)
 			goto retry;
 		/* don't restart after signals... */
 		if (error == ERESTART)
 			error = EINTR;
 		else if (error == EWOULDBLOCK)
 			error = 0;
 		goto done;
 	}
 
 	TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
 	influx = 0;
 	while (count) {
 		KQ_OWNED(kq);
 		kn = TAILQ_FIRST(&kq->kq_head);
 
 		/*
 		 * Another scanner's marker, or a knote being modified: wake
 		 * anyone waiting on our own flux changes, then sleep until
 		 * the head of the queue settles.
 		 */
 		if ((kn->kn_status == KN_MARKER && kn != marker) ||
 		    kn_in_flux(kn)) {
 			if (influx) {
 				influx = 0;
 				KQ_FLUX_WAKEUP(kq);
 			}
 			kq->kq_state |= KQ_FLUXWAIT;
 			error = msleep(kq, &kq->kq_lock, PSOCK,
 			    "kqflxwt", 0);
 			continue;
 		}
 
 		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
 		/* Disabled knotes are dequeued without being reported. */
 		if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
 			kn->kn_status &= ~KN_QUEUED;
 			kq->kq_count--;
 			continue;
 		}
 		if (kn == marker) {
 			/*
 			 * Reached our own marker: the pass is complete.  If
 			 * nothing was collected, go back and wait again.
 			 */
 			KQ_FLUX_WAKEUP(kq);
 			if (count == maxevents)
 				goto retry;
 			goto done;
 		}
 		KASSERT(!kn_in_flux(kn),
 		    ("knote %p is unexpectedly in flux", kn));
 
 		if ((kn->kn_flags & EV_DROP) == EV_DROP) {
 			kn->kn_status &= ~KN_QUEUED;
 			kn_enter_flux(kn);
 			kq->kq_count--;
 			KQ_UNLOCK(kq);
 			/*
 			 * We don't need to lock the list since we've
 			 * marked it as in flux.
 			 */
 			knote_drop(kn, td);
 			KQ_LOCK(kq);
 			continue;
 		} else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
 			kn->kn_status &= ~KN_QUEUED;
 			kn_enter_flux(kn);
 			kq->kq_count--;
 			KQ_UNLOCK(kq);
 			/*
 			 * We don't need to lock the list since we've
 			 * marked the knote as being in flux.
 			 */
 			*kevp = kn->kn_kevent;
 			knote_drop(kn, td);
 			KQ_LOCK(kq);
 			kn = NULL;
 		} else {
 			/*
 			 * Regular knote: re-evaluate the filter with the kq
 			 * lock dropped and the knote's list locked.
 			 */
 			kn->kn_status |= KN_SCAN;
 			kn_enter_flux(kn);
 			KQ_UNLOCK(kq);
 			if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
 				KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
 			knl = kn_list_lock(kn);
 			if (kn->kn_fop->f_event(kn, 0) == 0) {
 				/* No longer ready: dequeue and move on. */
 				KQ_LOCK(kq);
 				KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
 				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE |
 				    KN_SCAN);
 				kn_leave_flux(kn);
 				kq->kq_count--;
 				kn_list_unlock(knl);
 				influx = 1;
 				continue;
 			}
 			touch = (!kn->kn_fop->f_isfd &&
 			    kn->kn_fop->f_touch != NULL);
 			if (touch)
 				kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS);
 			else
 				*kevp = kn->kn_kevent;
 			KQ_LOCK(kq);
 			KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
 			if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
 				/* 
 				 * Manually clear knotes who weren't 
 				 * 'touch'ed.
 				 */
 				if (touch == 0 && kn->kn_flags & EV_CLEAR) {
 					kn->kn_data = 0;
 					kn->kn_fflags = 0;
 				}
 				if (kn->kn_flags & EV_DISPATCH)
 					kn->kn_status |= KN_DISABLED;
 				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
 				kq->kq_count--;
 			} else
 				/* Still pending: put it back on the queue. */
 				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
 			
 			kn->kn_status &= ~KN_SCAN;
 			kn_leave_flux(kn);
 			kn_list_unlock(knl);
 			influx = 1;
 		}
 
 		/* we are returning a copy to the user */
 		kevp++;
 		nkev++;
 		count--;
 
 		/* The staging buffer is full: flush it with the lock dropped. */
 		if (nkev == KQ_NEVENTS) {
 			influx = 0;
 			KQ_UNLOCK_FLUX(kq);
 			error = k_ops->k_copyout(k_ops->arg, keva, nkev);
 			nkev = 0;
 			kevp = keva;
 			KQ_LOCK(kq);
 			if (error)
 				break;
 		}
 	}
 	TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
 done:
 	KQ_OWNED(kq);
 	KQ_UNLOCK_FLUX(kq);
 	knote_free(marker);
 done_nl:
 	KQ_NOTOWNED(kq);
 	/* Flush whatever is left in the staging buffer. */
 	if (nkev != 0)
 		error = k_ops->k_copyout(k_ops->arg, keva, nkev);
 	td->td_retval[0] = maxevents - count;
 	return (error);
 }
 
 /*
  * ioctl handler for kqueue files.  No command is currently supported:
  * the FIOASYNC/FIOSETOWN/FIOGETOWN handling below is compiled out for
  * the reasons given in the comment, so every request returns ENOTTY.
  */
 /*ARGSUSED*/
 static int
 kqueue_ioctl(struct file *fp, u_long cmd, void *data,
 	struct ucred *active_cred, struct thread *td)
 {
 	/*
 	 * Enabling sigio causes two major problems:
 	 * 1) infinite recursion:
 	 * Synopsis: kevent is being used to track signals and has FIOASYNC
 	 * set.  On receipt of a signal this will cause a kqueue to recurse
 	 * into itself over and over.  Sending the sigio causes the kqueue
 	 * to become ready, which in turn posts sigio again, forever.
 	 * Solution: this can be solved by setting a flag in the kqueue that
 	 * we have a SIGIO in progress.
 	 * 2) locking problems:
 	 * Synopsis: Kqueue is a leaf subsystem, but adding signalling puts
 	 * us above the proc and pgrp locks.
 	 * Solution: Post a signal using an async mechanism, being sure to
 	 * record a generation count in the delivery so that we do not deliver
 	 * a signal to the wrong process.
 	 *
 	 * Note, these two mechanisms are somewhat mutually exclusive!
 	 */
 #if 0
 	struct kqueue *kq;
 
 	kq = fp->f_data;
 	switch (cmd) {
 	case FIOASYNC:
 		if (*(int *)data) {
 			kq->kq_state |= KQ_ASYNC;
 		} else {
 			kq->kq_state &= ~KQ_ASYNC;
 		}
 		return (0);
 
 	case FIOSETOWN:
 		return (fsetown(*(int *)data, &kq->kq_sigio));
 
 	case FIOGETOWN:
 		*(int *)data = fgetown(&kq->kq_sigio);
 		return (0);
 	}
 #endif
 
 	return (ENOTTY);
 }
 
 /*
  * poll handler for kqueue files.  A kqueue is readable
  * (POLLIN/POLLRDNORM) when it has queued events; otherwise the thread is
  * recorded for a later select wakeup.  Returns POLLERR if the kqueue
  * cannot be acquired.
  */
 /*ARGSUSED*/
 static int
 kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
 	struct thread *td)
 {
 	struct kqueue *kq;
 	int rdmask, revents;
 
 	if (kqueue_acquire(fp, &kq) != 0)
 		return (POLLERR);
 
 	revents = 0;
 	rdmask = events & (POLLIN | POLLRDNORM);
 
 	KQ_LOCK(kq);
 	if (rdmask != 0) {
 		if (kq->kq_count != 0) {
 			revents |= rdmask;
 		} else {
 			selrecord(td, &kq->kq_sel);
 			if (SEL_WAITING(&kq->kq_sel))
 				kq->kq_state |= KQ_SEL;
 		}
 	}
 	/* Drop our reference while still holding the lock. */
 	kqueue_release(kq, 1);
 	KQ_UNLOCK(kq);
 
 	return (revents);
 }
 
 /*
  * stat handler for kqueue files.  The result is all zeroes apart from
  * st_mode, which is reported as S_IFIFO.
  */
 /*ARGSUSED*/
 static int
 kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
 	struct thread *td)
 {
 
 	bzero((void *)st, sizeof(*st));
 	/*
 	 * kq_count is deliberately not exported: a value read without the
 	 * lock is meaningless, and a caller that wants to know about
 	 * pending events is better served by calling kevent.
 	 *
 	 * XXX - This is needed for libc_r.
 	 */
 	st->st_mode = S_IFIFO;
 	return (0);
 }
 
 /*
  * Quiesce a kqueue prior to destruction.
  *
  * Marks the kqueue KQ_CLOSING (so kqueue_acquire() and kqueue_expand()
  * start failing), waits until the caller's reference is the only one
  * left, drops every knote found in the fd-indexed list and in the hash,
  * waits for a scheduled kqueue task to finish, and finally wakes any
  * select/poll waiters.  The caller must hold exactly one reference and
  * must not hold the kq lock.
  */
 static void
 kqueue_drain(struct kqueue *kq, struct thread *td)
 {
 	struct knote *kn;
 	int i;
 
 	KQ_LOCK(kq);
 
 	KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
 	    ("kqueue already closing"));
 	kq->kq_state |= KQ_CLOSING;
 	/* kqueue_release() issues the wakeup once the count reaches 1. */
 	if (kq->kq_refcnt > 1)
 		msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);
 
 	KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));
 
 	KASSERT(knlist_empty(&kq->kq_sel.si_note),
 	    ("kqueue's knlist not empty"));
 
 	/* Drop the knotes registered by file descriptor. */
 	for (i = 0; i < kq->kq_knlistsize; i++) {
 		while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
 			/* Wait for a knote in flux, then look again. */
 			if (kn_in_flux(kn)) {
 				kq->kq_state |= KQ_FLUXWAIT;
 				msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0);
 				continue;
 			}
 			kn_enter_flux(kn);
 			KQ_UNLOCK(kq);
 			knote_drop(kn, td);
 			KQ_LOCK(kq);
 		}
 	}
 	/* Drop the knotes kept in the hash table. */
 	if (kq->kq_knhashmask != 0) {
 		for (i = 0; i <= kq->kq_knhashmask; i++) {
 			while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
 				if (kn_in_flux(kn)) {
 					kq->kq_state |= KQ_FLUXWAIT;
 					msleep(kq, &kq->kq_lock, PSOCK,
 					       "kqclo2", 0);
 					continue;
 				}
 				kn_enter_flux(kn);
 				KQ_UNLOCK(kq);
 				knote_drop(kn, td);
 				KQ_LOCK(kq);
 			}
 		}
 	}
 
 	/* kqueue_task() wakes us through &kq->kq_state when it has run. */
 	if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) {
 		kq->kq_state |= KQ_TASKDRAIN;
 		msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0);
 	}
 
 	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
 		selwakeuppri(&kq->kq_sel, PSOCK);
 		if (!SEL_WAITING(&kq->kq_sel))
 			kq->kq_state &= ~KQ_SEL;
 	}
 
 	KQ_UNLOCK(kq);
 }
 
 /*
  * Release the resources embedded in a drained kqueue: the select info,
  * the kqueue's own knlist, the mutex, both knote lookup tables and the
  * SIGIO ownership.  The kqueue must already be detached from its file
  * descriptor table (kq_fdp == NULL).  The structure itself is not freed
  * here; that is left to the caller.
  */
 static void
 kqueue_destroy(struct kqueue *kq)
 {
 
 	KASSERT(kq->kq_fdp == NULL,
 	    ("kqueue still attached to a file descriptor"));
 	seldrain(&kq->kq_sel);
 	knlist_destroy(&kq->kq_sel.si_note);
 	mtx_destroy(&kq->kq_lock);
 
 	/* free() accepts NULL, so tables that were never set up are fine. */
 	free(kq->kq_knhash, M_KQUEUE);
 	free(kq->kq_knlist, M_KQUEUE);
 
 	funsetown(&kq->kq_sigio);
 }
 
 /*
  * close handler for kqueue files.  Drains the kqueue, unlinks it from
  * the owning filedesc's kqueue list, destroys it, returns the
  * per-uid accounting charge and credential reference taken when the
  * kqueue was created, and frees the structure.
  *
  * Returns the error from kqueue_acquire() if the kqueue cannot be
  * referenced, 0 otherwise.
  */
 /*ARGSUSED*/
 static int
 kqueue_close(struct file *fp, struct thread *td)
 {
 	struct kqueue *kq = fp->f_data;
 	struct filedesc *fdp;
 	int error;
 	int filedesc_unlock;
 
 	if ((error = kqueue_acquire(fp, &kq)))
 		return error;
 	kqueue_drain(kq, td);
 
 	/*
 	 * We could be called due to the knote_drop() doing fdrop(),
 	 * called from kqueue_register().  In this case the global
 	 * lock is owned, and filedesc sx is locked before, to not
 	 * take the sleepable lock after non-sleepable.
 	 */
 	fdp = kq->kq_fdp;
 	kq->kq_fdp = NULL;
 	/* Only take (and later release) the lock if we do not hold it. */
 	if (!sx_xlocked(FILEDESC_LOCK(fdp))) {
 		FILEDESC_XLOCK(fdp);
 		filedesc_unlock = 1;
 	} else
 		filedesc_unlock = 0;
 	TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list);
 	if (filedesc_unlock)
 		FILEDESC_XUNLOCK(fdp);
 
 	/* kqueue_destroy() leaves kq itself (and kq_cred) intact. */
 	kqueue_destroy(kq);
 	chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0);
 	crfree(kq->kq_cred);
 	free(kq, M_KQUEUE);
 	fp->f_data = NULL;
 
 	return (0);
 }
 
 /*
  * Fill in the kinfo_file record for a kqueue descriptor.  Only the type
  * field is set; 'fp' and 'fdp' are not consulted.  Always returns 0.
  */
 static int
 kqueue_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 
 	/* The type is the only kqueue-specific information exported. */
 	kif->kf_type = KF_TYPE_KQUEUE;
 	return (0);
 }
 
 /*
  * Notify everyone interested that the kqueue has events pending.  With
  * the kq lock held, this wakes threads sleeping in kqueue_scan(),
  * select/poll waiters, knotes attached to this kqueue itself (through
  * the deferred task), and the SIGIO owner when KQ_ASYNC is set.
  */
 static void
 kqueue_wakeup(struct kqueue *kq)
 {
 	KQ_OWNED(kq);
 
 	/* Threads blocked waiting for events. */
 	if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) {
 		kq->kq_state &= ~KQ_SLEEP;
 		wakeup(kq);
 	}
 
 	/* select()/poll() waiters. */
 	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
 		selwakeuppri(&kq->kq_sel, PSOCK);
 		if (!SEL_WAITING(&kq->kq_sel))
 			kq->kq_state &= ~KQ_SEL;
 	}
 
 	/* Other kqueues watching this one. */
 	if (!knlist_empty(&kq->kq_sel.si_note))
 		kqueue_schedtask(kq);
 
 	/* Asynchronous I/O notification. */
 	if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC)
 		pgsigio(&kq->kq_sigio, SIGIO, 0);
 }
 
 /*
  * Walk down a list of knotes, activating them if their event has triggered.
  *
  * 'list' may be NULL, in which case nothing happens.  'hint' is passed
  * through to each filter's f_event routine.  'lockflags' is a mask of
  * KNF_LISTLOCKED (the caller already holds the list lock) and
  * KNF_NOKQLOCK (call f_event with the kq lock dropped and the knote
  * marked in flux instead).
  *
  * There is a possibility to optimize in the case of one kq watching another.
  * Instead of scheduling a task to wake it up, you could pass enough state
  * down the chain to make up the parent kqueue.  Make this code functional
  * first.
  */
 void
 knote(struct knlist *list, long hint, int lockflags)
 {
 	struct kqueue *kq;
 	struct knote *kn, *tkn;
 	int error;
 
 	if (list == NULL)
 		return;
 
 	KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED);
 
 	if ((lockflags & KNF_LISTLOCKED) == 0)
 		list->kl_lock(list->kl_lockarg); 
 
 	/*
 	 * If we unlock the list lock (and enter influx), we can
 	 * eliminate the kqueue scheduling, but this will introduce
 	 * four lock/unlock's for each knote to test.  Also, marker
 	 * would be needed to keep iteration position, since filters
 	 * or other threads could remove events.
 	 */
 	SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) {
 		kq = kn->kn_kq;
 		KQ_LOCK(kq);
 		if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
 			/*
 			 * Do not process the influx notes, except for
 			 * the influx coming from the kq unlock in the
 			 * kqueue_scan().  In the later case, we do
 			 * not interfere with the scan, since the code
 			 * fragment in kqueue_scan() locks the knlist,
 			 * and cannot proceed until we finished.
 			 */
 			KQ_UNLOCK(kq);
 		} else if ((lockflags & KNF_NOKQLOCK) != 0) {
 			/* Run the filter unlocked, protected by flux. */
 			kn_enter_flux(kn);
 			KQ_UNLOCK(kq);
 			error = kn->kn_fop->f_event(kn, hint);
 			KQ_LOCK(kq);
 			kn_leave_flux(kn);
 			if (error)
 				KNOTE_ACTIVATE(kn, 1);
 			KQ_UNLOCK_FLUX(kq);
 		} else {
 			/* Run the filter with the kq lock held. */
 			if (kn->kn_fop->f_event(kn, hint))
 				KNOTE_ACTIVATE(kn, 1);
 			KQ_UNLOCK(kq);
 		}
 	}
 	if ((lockflags & KNF_LISTLOCKED) == 0)
 		list->kl_unlock(list->kl_lockarg); 
 }
 
 /*
  * Add a knote to a knlist.
  *
  * islocked tells whether the caller already holds the knlist lock; if not,
  * it is taken around the list insertion only.  The asserts require that
  * the knote is in flux and currently marked KN_DETACHED.  The kq lock is
  * taken afterwards to record the owning list and clear KN_DETACHED.
  */
 void
 knlist_add(struct knlist *knl, struct knote *kn, int islocked)
 {
 
 	KNL_ASSERT_LOCK(knl, islocked);
 	KQ_NOTOWNED(kn->kn_kq);
 	KASSERT(kn_in_flux(kn), ("knote %p not in flux", kn));
 	KASSERT((kn->kn_status & KN_DETACHED) != 0,
 	    ("knote %p was not detached", kn));
 	if (!islocked)
 		knl->kl_lock(knl->kl_lockarg);
 	SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
 	if (!islocked)
 		knl->kl_unlock(knl->kl_lockarg);
 	/* The knote's own fields are updated under the kq lock. */
 	KQ_LOCK(kn->kn_kq);
 	kn->kn_knlist = knl;
 	kn->kn_status &= ~KN_DETACHED;
 	KQ_UNLOCK(kn->kn_kq);
 }
 
 /*
  * Unlink a knote from a knlist and mark it KN_DETACHED.
  *
  * knlislocked / kqislocked state which of the two locks the caller already
  * holds; holding the kq lock without the knlist lock is rejected by the
  * first assertion.  When the kq lock is not held the knote must be in flux.
  */
 static void
 knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked,
     int kqislocked)
 {
 
 	KASSERT(!kqislocked || knlislocked, ("kq locked w/o knl locked"));
 	KNL_ASSERT_LOCK(knl, knlislocked);
 	mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
 	KASSERT(kqislocked || kn_in_flux(kn), ("knote %p not in flux", kn));
 	KASSERT((kn->kn_status & KN_DETACHED) == 0,
 	    ("knote %p was already detached", kn));
 	if (!knlislocked)
 		knl->kl_lock(knl->kl_lockarg);
 	SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
 	kn->kn_knlist = NULL;
 	/*
 	 * NOTE(review): the unlock goes through kn_list_unlock() rather than
 	 * kl_unlock directly; presumably it also handles autodestroy lists
 	 * -- confirm against its definition.
 	 */
 	if (!knlislocked)
 		kn_list_unlock(knl);
 	if (!kqislocked)
 		KQ_LOCK(kn->kn_kq);
 	kn->kn_status |= KN_DETACHED;
 	if (!kqislocked)
 		KQ_UNLOCK(kn->kn_kq);
 }
 
 /*
  * Remove a knote from the specified knlist.
  *
  * islocked states whether the caller holds the knlist lock.  The kq lock
  * is always treated as not held (the last argument below is 0).
  */
 void
 knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
 {
 
 	knlist_remove_kq(knl, kn, islocked, 0);
 }
 
 /*
  * Report whether a knlist has no knotes on it.  The knlist lock must be
  * held by the caller (asserted).  Returns non-zero when the list is empty.
  */
 int
 knlist_empty(struct knlist *knl)
 {
 	int is_empty;
 
 	KNL_ASSERT_LOCKED(knl);
 	is_empty = SLIST_EMPTY(&knl->kl_list);
 	return (is_empty);
 }
 
 /*
  * Shared fallback mutex: knlist_init() uses it as the lock argument when
  * the caller does not supply a lock of its own.
  */
 static struct mtx knlist_lock;
 MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
     MTX_DEF);
 /* Forward declarations of the default mutex lock/unlock callbacks. */
 static void knlist_mtx_lock(void *arg);
 static void knlist_mtx_unlock(void *arg);
 
 /*
  * Default kl_lock callback: arg is the struct mtx to acquire.
  */
 static void
 knlist_mtx_lock(void *arg)
 {
 	struct mtx *m;
 
 	m = arg;
 	mtx_lock(m);
 }
 
 /*
  * Default kl_unlock callback: arg is the struct mtx to release.
  */
 static void
 knlist_mtx_unlock(void *arg)
 {
 	struct mtx *m;
 
 	m = arg;
 	mtx_unlock(m);
 }
 
 /*
  * Default kl_assert_lock callback for mutex-protected lists: assert that
  * the mutex in arg is owned when what is LA_LOCKED, and not owned for any
  * other value of what.
  */
 static void
 knlist_mtx_assert_lock(void *arg, int what)
 {
 
 	mtx_assert((struct mtx *)arg,
 	    what == LA_LOCKED ? MA_OWNED : MA_NOTOWNED);
 }
 
 /*
  * kl_lock callback for rwlock-protected lists: take arg as a reader.
  */
 static void
 knlist_rw_rlock(void *arg)
 {
 	struct rwlock *rw;
 
 	rw = arg;
 	rw_rlock(rw);
 }
 
 /*
  * kl_unlock callback for rwlock-protected lists: drop the read lock on arg.
  */
 static void
 knlist_rw_runlock(void *arg)
 {
 	struct rwlock *rw;
 
 	rw = arg;
 	rw_runlock(rw);
 }
 
 /*
  * kl_assert_lock callback for rwlock-protected lists: assert that the
  * rwlock in arg is locked when what is LA_LOCKED, and unlocked for any
  * other value of what.
  */
 static void
 knlist_rw_assert_lock(void *arg, int what)
 {
 
 	rw_assert((struct rwlock *)arg,
 	    what == LA_LOCKED ? RA_LOCKED : RA_UNLOCKED);
 }
 
 /*
  * Initialize a knlist.
  *
  * lock is the opaque argument handed to the three callbacks; a NULL lock
  * selects the shared knlist_lock mutex.  Each NULL callback is replaced by
  * its mutex-based default.  The list starts empty with autodestroy off.
  */
 void
 knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
     void (*kl_unlock)(void *),
     void (*kl_assert_lock)(void *, int))
 {
 
 	knl->kl_lockarg = (lock != NULL) ? lock : &knlist_lock;
 
 	knl->kl_lock = (kl_lock != NULL) ? kl_lock : knlist_mtx_lock;
 	knl->kl_unlock = (kl_unlock != NULL) ? kl_unlock : knlist_mtx_unlock;
 	knl->kl_assert_lock = (kl_assert_lock != NULL) ? kl_assert_lock :
 	    knlist_mtx_assert_lock;
 
 	knl->kl_autodestroy = 0;
 	SLIST_INIT(&knl->kl_list);
 }
 
 /*
  * Initialize a knlist protected by the given mutex, using the default
  * mutex callbacks.  A NULL lock makes knlist_init() fall back to the
  * shared knlist_lock.
  */
 void
 knlist_init_mtx(struct knlist *knl, struct mtx *lock)
 {
 
 	knlist_init(knl, lock, NULL, NULL, NULL);
 }
 
 /*
  * Allocate a knlist from M_KQUEUE with M_WAITOK and initialize it as a
  * mutex-protected list using lock.  Returns the new list.
  */
 struct knlist *
 knlist_alloc(struct mtx *lock)
 {
 	struct knlist *list;
 
 	list = malloc(sizeof(*list), M_KQUEUE, M_WAITOK);
 	knlist_init_mtx(list, lock);
 	return (list);
 }
 
 /*
  * Initialize a knlist protected by an rwlock that the knlist code takes
  * and releases in read mode (knlist_rw_rlock / knlist_rw_runlock).
  */
 void
 knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock)
 {
 
 	knlist_init(knl, lock, knlist_rw_rlock, knlist_rw_runlock,
 	    knlist_rw_assert_lock);
 }
 
 /*
  * Tear down a knlist.  Nothing is released here; the function only
  * asserts that no knotes remain on the list.
  */
 void
 knlist_destroy(struct knlist *knl)
 {
 
 	KASSERT(KNLIST_EMPTY(knl),
 	    ("destroying knlist %p with knotes on it", knl));
 }
 
 /*
  * Mark a knlist for automatic destruction.  The knlist lock must be held
  * (asserted).  If the list is already empty it is destroyed and freed to
  * M_KQUEUE immediately; otherwise only the autodestroy flag is set.
  */
 void
 knlist_detach(struct knlist *knl)
 {
 
 	KNL_ASSERT_LOCKED(knl);
 	knl->kl_autodestroy = 1;
 	if (!knlist_empty(knl))
 		return;
 
 	knlist_destroy(knl);
 	free(knl, M_KQUEUE);
 }
 
 /*
  * Remove every knote from a knlist.
  *
  * Even if we are locked, we may need to drop the lock to allow any influx
  * knotes time to "settle".
  *
  * islocked states whether the caller holds the knlist lock on entry; the
  * same lock state is restored on return.  With killkn set, each removed
  * knote is dropped via knote_drop_detached(); otherwise it is only flagged
  * EV_EOF | EV_ONESHOT.  Must not be used on an autodestroy list (asserted).
  */
 void
 knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
 {
 	struct knote *kn, *kn2;
 	struct kqueue *kq;
 
 	KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl));
 	if (islocked)
 		KNL_ASSERT_LOCKED(knl);
 	else {
 		KNL_ASSERT_UNLOCKED(knl);
 		/*
 		 * The retry path below jumps here regardless of islocked,
 		 * because it always releases the knlist lock before sleeping.
 		 */
 again:		/* need to reacquire lock since we have dropped it */
 		knl->kl_lock(knl->kl_lockarg);
 	}
 
 	SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
 		kq = kn->kn_kq;
 		KQ_LOCK(kq);
 		/* In-flux knotes are skipped now and waited for below. */
 		if (kn_in_flux(kn)) {
 			KQ_UNLOCK(kq);
 			continue;
 		}
 		knlist_remove_kq(knl, kn, 1, 1);
 		if (killkn) {
 			kn_enter_flux(kn);
 			KQ_UNLOCK(kq);
 			knote_drop_detached(kn, td);
 		} else {
 			/* Make sure cleared knotes disappear soon */
 			kn->kn_flags |= EV_EOF | EV_ONESHOT;
 			KQ_UNLOCK(kq);
 		}
 		kq = NULL;
 	}
 
 	if (!SLIST_EMPTY(&knl->kl_list)) {
 		/* there are still in flux knotes remaining */
 		kn = SLIST_FIRST(&knl->kl_list);
 		kq = kn->kn_kq;
 		KQ_LOCK(kq);
 		KASSERT(kn_in_flux(kn), ("knote removed w/o list lock"));
 		knl->kl_unlock(knl->kl_lockarg);
 		kq->kq_state |= KQ_FLUXWAIT;
 		/* PDROP: the kq lock is not held when msleep() returns. */
 		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
 		kq = NULL;
 		goto again;
 	}
 
 	if (islocked)
 		KNL_ASSERT_LOCKED(knl);
 	else {
 		knl->kl_unlock(knl->kl_lockarg);
 		KNL_ASSERT_UNLOCKED(knl);
 	}
 }
 
 /*
  * Remove all knotes referencing a specified fd.  Must be called with the
  * FILEDESC lock held (asserted below).  This prevents a race where a new
  * fd comes along and occupies the entry and we attach a knote to the fd.
  *
  * Every kqueue on the process's fd_kqlist is visited and each knote found
  * in its kq_knlist[fd] slot is dropped.
  */
 void
 knote_fdclose(struct thread *td, int fd)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 	struct kqueue *kq;
 	struct knote *kn;
 	int influx;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	/*
 	 * We shouldn't have to worry about new kevents appearing on fd
 	 * since filedesc is locked.
 	 */
 	TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
 		KQ_LOCK(kq);
 
 again:
 		/* influx records whether this pass has dropped a knote. */
 		influx = 0;
 		while (kq->kq_knlistsize > fd &&
 		    (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
 			if (kn_in_flux(kn)) {
 				/* someone else might be waiting on our knote */
 				if (influx)
 					wakeup(kq);
 				kq->kq_state |= KQ_FLUXWAIT;
 				msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
 				goto again;
 			}
 			kn_enter_flux(kn);
 			KQ_UNLOCK(kq);
 			influx = 1;
 			knote_drop(kn, td);
 			KQ_LOCK(kq);
 		}
 		KQ_UNLOCK_FLUX(kq);
 	}
 }
 
 /*
  * Link a knote into its kqueue's lookup table.
  *
  * The knote must be in flux; KQ_OWNED() is applied to kq.  Fd-based
  * filters are placed in kq_knlist[] indexed by kn_id, all others in the
  * kq_knhash[] bucket selected by KN_HASH().  Returns EBADF if the kqueue
  * is closing, ENOMEM if the target table is absent or too small, and 0 on
  * success.
  */
 static int
 knote_attach(struct knote *kn, struct kqueue *kq)
 {
 	struct klist *bucket;
 
 	KASSERT(kn_in_flux(kn), ("knote %p not marked influx", kn));
 	KQ_OWNED(kq);
 
 	if (kq->kq_state & KQ_CLOSING)
 		return (EBADF);
 
 	if (!kn->kn_fop->f_isfd) {
 		if (kq->kq_knhash == NULL)
 			return (ENOMEM);
 		bucket = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
 	} else {
 		if (kn->kn_id >= kq->kq_knlistsize)
 			return (ENOMEM);
 		bucket = &kq->kq_knlist[kn->kn_id];
 	}
 
 	SLIST_INSERT_HEAD(bucket, kn, kn_link);
 	return (0);
 }
 
 /*
  * Drop a knote: call the filter's f_detach first unless the knote is
  * already marked KN_DETACHED, then finish through knote_drop_detached().
  */
 static void
 knote_drop(struct knote *kn, struct thread *td)
 {
 
 	if ((kn->kn_status & KN_DETACHED) == 0)
 		kn->kn_fop->f_detach(kn);
 	knote_drop_detached(kn, td);
 }
 
 /*
  * Finish dropping a knote that is already detached from its knlist.
  *
  * The knote must be KN_DETACHED and its kn_influx count must be exactly 1
  * (both asserted); KQ_NOTOWNED() is applied before the kq lock is taken
  * here.  The knote is unlinked from the kqueue's lookup table and, if
  * queued, from the pending queue; then its file reference (fd-based
  * filters only) and filter ops reference are released and the knote is
  * freed.
  */
 static void
 knote_drop_detached(struct knote *kn, struct thread *td)
 {
 	struct kqueue *kq;
 	struct klist *list;
 
 	kq = kn->kn_kq;
 
 	KASSERT((kn->kn_status & KN_DETACHED) != 0,
 	    ("knote %p still attached", kn));
 	KQ_NOTOWNED(kq);
 
 	KQ_LOCK(kq);
 	KASSERT(kn->kn_influx == 1,
 	    ("knote_drop called on %p with influx %d", kn, kn->kn_influx));
 
 	/* Same table selection as knote_attach(). */
 	if (kn->kn_fop->f_isfd)
 		list = &kq->kq_knlist[kn->kn_id];
 	else
 		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
 
 	if (!SLIST_EMPTY(list))
 		SLIST_REMOVE(list, kn, knote, kn_link);
 	if (kn->kn_status & KN_QUEUED)
 		knote_dequeue(kn);
 	KQ_UNLOCK_FLUX(kq);
 
 	if (kn->kn_fop->f_isfd) {
 		fdrop(kn->kn_fp, td);
 		kn->kn_fp = NULL;
 	}
 	kqueue_fo_release(kn->kn_kevent.filter);
 	kn->kn_fop = NULL;
 	knote_free(kn);
 }
 
 /*
  * Append a knote to its kqueue's pending queue, mark it KN_QUEUED, bump
  * the queue count and wake up the kqueue.  KQ_OWNED() is applied to the
  * kqueue and the knote must not already be queued (asserted).
  */
 static void
 knote_enqueue(struct knote *kn)
 {
 	struct kqueue *owner;
 
 	owner = kn->kn_kq;
 	KQ_OWNED(owner);
 	KASSERT(!(kn->kn_status & KN_QUEUED), ("knote already queued"));
 
 	kn->kn_status |= KN_QUEUED;
 	owner->kq_count += 1;
 	TAILQ_INSERT_TAIL(&owner->kq_head, kn, kn_tqe);
 	kqueue_wakeup(owner);
 }
 
 /*
  * Take a knote off its kqueue's pending queue, clear KN_QUEUED and
  * decrement the queue count.  KQ_OWNED() is applied to the kqueue and the
  * knote must currently be queued (asserted).
  */
 static void
 knote_dequeue(struct knote *kn)
 {
 	struct kqueue *owner;
 
 	owner = kn->kn_kq;
 	KQ_OWNED(owner);
 	KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
 
 	kn->kn_status &= ~KN_QUEUED;
 	owner->kq_count -= 1;
 	TAILQ_REMOVE(&owner->kq_head, kn, kn_tqe);
 }
 
 /*
  * Create the UMA zone ("KNOTE") that knote_alloc() and knote_free() use.
  * Registered below to run at SI_SUB_PSEUDO.
  */
 static void
 knote_init(void)
 {
 
 	knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, 0);
 }
 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
 
 /*
  * Allocate a zero-filled knote from knote_zone.  mflag is passed to
  * uma_zalloc() with M_ZERO added.
  */
 static struct knote *
 knote_alloc(int mflag)
 {
 	struct knote *kn;
 
 	kn = uma_zalloc(knote_zone, mflag | M_ZERO);
 	return (kn);
 }
 
 /*
  * Return a knote to knote_zone.
  */
 static void
 knote_free(struct knote *kn)
 {
 
 	uma_zfree(knote_zone, kn);
 }
 
 /*
  * Register the kev w/ the kq specified by fd.
  *
  * The descriptor is looked up with the CAP_KQUEUE_CHANGE right, the kqueue
  * behind it is acquired, and kev, td and mflag are handed on to
  * kqueue_register().  Returns 0 or the first error encountered; the file
  * reference obtained from fget() is dropped on every path after a
  * successful lookup.
  */
 int 
 kqfd_register(int fd, struct kevent *kev, struct thread *td, int mflag)
 {
 	struct kqueue *kq;
 	struct file *fp;
 	cap_rights_t rights;
 	int error;
 
-	error = fget(td, fd, cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &fp);
+	error = fget(td, fd, cap_rights_init_one(&rights, CAP_KQUEUE_CHANGE),
+	    &fp);
 	if (error != 0)
 		return (error);
 	if ((error = kqueue_acquire(fp, &kq)) != 0)
 		goto noacquire;
 
 	error = kqueue_register(kq, kev, td, mflag);
 	kqueue_release(kq, 0);
 
 noacquire:
 	fdrop(fp, td);
 	return (error);
 }
diff --git a/sys/kern/tty.c b/sys/kern/tty.c
index 4b1f7ca52abe..00b4df675311 100644
--- a/sys/kern/tty.c
+++ b/sys/kern/tty.c
@@ -1,2454 +1,2454 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2008 Ed Schouten <ed@FreeBSD.org>
  * All rights reserved.
  *
  * Portions of this software were developed under sponsorship from Snow
  * B.V., the Netherlands.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 #include "opt_printf.h"
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/conf.h>
 #include <sys/cons.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #ifdef COMPAT_43TTY
 #include <sys/ioctl_compat.h>
 #endif /* COMPAT_43TTY */
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/poll.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/serial.h>
 #include <sys/signal.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/tty.h>
 #include <sys/ttycom.h>
 #define TTYDEFCHARS
 #include <sys/ttydefaults.h>
 #undef TTYDEFCHARS
 #include <sys/ucred.h>
 #include <sys/vnode.h>
 
 #include <fs/devfs/devfs.h>
 
 #include <machine/stdarg.h>
 
 /* malloc(9) type for this file's allocations. */
 static MALLOC_DEFINE(M_TTY, "tty", "tty device");
 
 static void tty_rel_free(struct tty *tp);
 
 /*
  * List of tty structures, the sx lock initialized for it, and an element
  * counter.  NOTE(review): presumably tty_list_sx guards both tty_list and
  * tty_list_count -- confirm against the users further down the file.
  */
 static TAILQ_HEAD(, tty) tty_list = TAILQ_HEAD_INITIALIZER(tty_list);
 static struct sx tty_list_sx;
 SX_SYSINIT(tty_list, &tty_list_sx, "tty list");
 static unsigned int tty_list_count = 0;
 
 /* Character device of /dev/console. */
 static struct cdev	*dev_console;
 static const char	*dev_console_filename;
 
 /*
  * Flags that are supported and stored by this implementation.
  */
 #define TTYSUP_IFLAG	(IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK|ISTRIP|\
 			INLCR|IGNCR|ICRNL|IXON|IXOFF|IXANY|IMAXBEL)
 #define TTYSUP_OFLAG	(OPOST|ONLCR|TAB3|ONOEOT|OCRNL|ONOCR|ONLRET)
 #define TTYSUP_LFLAG	(ECHOKE|ECHOE|ECHOK|ECHO|ECHONL|ECHOPRT|\
 			ECHOCTL|ISIG|ICANON|ALTWERASE|IEXTEN|TOSTOP|\
 			FLUSHO|NOKERNINFO|NOFLSH)
 #define TTYSUP_CFLAG	(CIGNORE|CSIZE|CSTOPB|CREAD|PARENB|PARODD|\
 			HUPCL|CLOCAL|CCTS_OFLOW|CRTS_IFLOW|CDTR_IFLOW|\
 			CDSR_OFLOW|CCAR_OFLOW|CNO_RTSDTR)
 
 /*
  * Non-zero when the unit number of cdev d has TTYUNIT_CALLOUT set.  The tp
  * argument is not used by the expansion.
  */
 #define	TTY_CALLOUT(tp,d) (dev2unit(d) & TTYUNIT_CALLOUT)
 
 /* Backing variable for the kern.tty_drainwait knob; default is 5 minutes. */
 static int  tty_drainwait = 5 * 60;
 SYSCTL_INT(_kern, OID_AUTO, tty_drainwait, CTLFLAG_RWTUN,
     &tty_drainwait, 0, "Default output drain timeout in seconds");
 
 /*
  * Set TTY buffer sizes.
  *
  * TTYBUF_MAX is the upper bound tty_watermarks() applies to the requested
  * input and output queue sizes.  TTY_PRBUF_SIZE follows PRINTF_BUFR_SIZE
  * when the kernel configuration defines it and is 256 otherwise.
  */
 
 #define	TTYBUF_MAX	65536
 
 #ifdef PRINTF_BUFR_SIZE
 #define	TTY_PRBUF_SIZE	PRINTF_BUFR_SIZE
 #else
 #define	TTY_PRBUF_SIZE	256
 #endif
 
 /*
  * Allocate buffer space if necessary, and set low watermarks, based on speed.
  * Note that the ttyxxxq_setsize() functions may drop and then reacquire the tty
  * lock during memory allocation.  They will return ENXIO if the tty disappears
  * while unlocked.
  */
 static int
 tty_watermarks(struct tty *tp)
 {
 	size_t bs = 0;
 	int error;
 
 	/* Provide an input buffer for 2 seconds of data. */
 	if (tp->t_termios.c_cflag & CREAD)
 		bs = MIN(tp->t_termios.c_ispeed / 5, TTYBUF_MAX);
 	error = ttyinq_setsize(&tp->t_inq, tp, bs);
 	if (error != 0)
 		return (error);
 
 	/* Set low watermark at 10% (when 90% is available). */
 	tp->t_inlow = (ttyinq_getallocatedsize(&tp->t_inq) * 9) / 10;
 
 	/* Provide an output buffer for 2 seconds of data. */
 	bs = MIN(tp->t_termios.c_ospeed / 5, TTYBUF_MAX);
 	error = ttyoutq_setsize(&tp->t_outq, tp, bs);
 	if (error != 0)
 		return (error);
 
 	/* Set low watermark at 10% (when 90% is available). */
 	tp->t_outlow = (ttyoutq_getallocatedsize(&tp->t_outq) * 9) / 10;
 
 	return (0);
 }
 
 /*
  * Wait for pending output to leave the output queue and the device.
  *
  * leaving is non-zero when called on the close path (see ttydev_leave()),
  * which selects the short "one second without progress" timeout; otherwise
  * tp->t_drainwait seconds are used, with 0 meaning no timeout.  Returns 0
  * once the queue is empty and the driver reports not busy, or immediately
  * when a getc_inject hook is installed; otherwise the error from
  * tty_timedwait() (EWOULDBLOCK on timeout).
  */
 static int
 tty_drain(struct tty *tp, int leaving)
 {
 	sbintime_t timeout_at;
 	size_t bytes;
 	int error;
 
 	if (ttyhook_hashook(tp, getc_inject))
 		/* buffer is inaccessible */
 		return (0);
 
 	/*
 	 * For close(), use the recent historic timeout of "1 second without
 	 * making progress".  For tcdrain(), use t_drainwait as the timeout,
 	 * with zero meaning "no timeout" which gives POSIX behavior.
 	 */
 	if (leaving)
 		timeout_at = getsbinuptime() + SBT_1S;
 	else if (tp->t_drainwait != 0)
 		timeout_at = getsbinuptime() + SBT_1S * tp->t_drainwait;
 	else
 		timeout_at = 0;
 
 	/*
 	 * Poll the output buffer and the hardware for completion, at 10 Hz.
 	 * Polling is required for devices which are not able to signal an
 	 * interrupt when the transmitter becomes idle (most USB serial devs).
 	 * The unusual structure of this loop ensures we check for busy one more
 	 * time after tty_timedwait() returns EWOULDBLOCK, so that success has
 	 * higher priority than timeout if the IO completed in the last 100mS.
 	 */
 	error = 0;
 	bytes = ttyoutq_bytesused(&tp->t_outq);
 	for (;;) {
 		if (ttyoutq_bytesused(&tp->t_outq) == 0 && !ttydevsw_busy(tp))
 			return (0);
 		/* A pending EWOULDBLOCK from the previous pass ends the wait. */
 		if (error != 0)
 			return (error);
 		ttydevsw_outwakeup(tp);
 		error = tty_timedwait(tp, &tp->t_outwait, hz / 10);
 		if (error != 0 && error != EWOULDBLOCK)
 			return (error);
 		else if (timeout_at == 0 || getsbinuptime() < timeout_at)
 			error = 0;
 		else if (leaving && ttyoutq_bytesused(&tp->t_outq) < bytes) {
 			/* In close, making progress, grant an extra second. */
 			error = 0;
 			timeout_at += SBT_1S;
 			bytes = ttyoutq_bytesused(&tp->t_outq);
 		}
 	}
 }
 
 /*
  * Though ttydev_enter() and ttydev_leave() seem to be related, they
  * don't have to be used together. ttydev_enter() is used by the cdev
  * operations to prevent an actual operation from being processed when
  * the TTY has been abandoned. ttydev_leave() is used by ttydev_open()
  * and ttydev_close() to determine whether per-TTY data should be
  * deallocated.
  */
 
 /*
  * Lock the tty for a cdev operation.  Returns 0 with the tty lock held
  * when the device is present and opened; otherwise releases the lock and
  * returns ENXIO.
  */
 static __inline int
 ttydev_enter(struct tty *tp)
 {
 
 	tty_lock(tp);
 
 	if (!tty_gone(tp) && tty_opened(tp))
 		return (0);
 
 	/* Device is already gone. */
 	tty_unlock(tp);
 	return (ENXIO);
 }
 
 /*
  * Release the tty after an open or close attempt.
  *
  * Called with the tty lock held (asserted).  If the tty is still opened,
  * or another open/close is in progress (TF_OPENCLOSE), the lock is simply
  * dropped.  Otherwise the tty is shut down: output is drained, the line
  * discipline closed, the i/o queues freed, knotes cleared and the driver's
  * close routine called, finishing with tty_rel_free().
  * NOTE(review): no unlock is visible on the teardown path; presumably
  * tty_rel_free() releases the lock -- confirm against its definition.
  */
 static void
 ttydev_leave(struct tty *tp)
 {
 
 	tty_assert_locked(tp);
 
 	if (tty_opened(tp) || tp->t_flags & TF_OPENCLOSE) {
 		/* Device is still opened somewhere. */
 		tty_unlock(tp);
 		return;
 	}
 
 	/* Keep concurrent opens out while the teardown is in progress. */
 	tp->t_flags |= TF_OPENCLOSE;
 
 	/* Remove console TTY. */
 	if (constty == tp)
 		constty_clear();
 
 	/* Drain any output. */
 	if (!tty_gone(tp))
 		tty_drain(tp, 1);
 
 	ttydisc_close(tp);
 
 	/* Free i/o queues now since they might be large. */
 	ttyinq_free(&tp->t_inq);
 	tp->t_inlow = 0;
 	ttyoutq_free(&tp->t_outq);
 	tp->t_outlow = 0;
 
 	knlist_clear(&tp->t_inpoll.si_note, 1);
 	knlist_clear(&tp->t_outpoll.si_note, 1);
 
 	if (!tty_gone(tp))
 		ttydevsw_close(tp);
 
 	tp->t_flags &= ~TF_OPENCLOSE;
 	/* Wake threads waiting on t_dcdwait, e.g. for TF_OPENCLOSE in ttydev_open(). */
 	cv_broadcast(&tp->t_dcdwait);
 	tty_rel_free(tp);
 }
 
 /*
  * Operations that are exposed through the character device in /dev.
  */
 /*
  * d_open handler for tty character devices.
  *
  * Serializes against concurrent open/close through TF_OPENCLOSE, enforces
  * mutual exclusion between the "tty"/console and "cua" (callout) nodes and
  * the TF_EXCLUDE flag, initializes the tty on first open, and (for
  * blocking opens without CLOCAL) waits for carrier.  Returns 0 on success,
  * ENXIO if the device is gone, EBUSY on an exclusion conflict, or the
  * error from the driver, buffer sizing or an interrupted wait.
  */
 static int
 ttydev_open(struct cdev *dev, int oflags, int devtype __unused,
     struct thread *td)
 {
 	struct tty *tp;
 	int error;
 
 	tp = dev->si_drv1;
 	error = 0;
 	tty_lock(tp);
 	if (tty_gone(tp)) {
 		/* Device is already gone. */
 		tty_unlock(tp);
 		return (ENXIO);
 	}
 
 	/*
 	 * Block when other processes are currently opening or closing
 	 * the TTY.
 	 */
 	while (tp->t_flags & TF_OPENCLOSE) {
 		error = tty_wait(tp, &tp->t_dcdwait);
 		if (error != 0) {
 			tty_unlock(tp);
 			return (error);
 		}
 	}
 	tp->t_flags |= TF_OPENCLOSE;
 
 	/*
 	 * Make sure the "tty" and "cua" device cannot be opened at the
 	 * same time.  The console is a "tty" device.
 	 */
 	if (TTY_CALLOUT(tp, dev)) {
 		if (tp->t_flags & (TF_OPENED_CONS | TF_OPENED_IN)) {
 			error = EBUSY;
 			goto done;
 		}
 	} else {
 		if (tp->t_flags & TF_OPENED_OUT) {
 			error = EBUSY;
 			goto done;
 		}
 	}
 
 	/* Exclusive-use ttys admit only callers holding PRIV_TTY_EXCLUSIVE. */
 	if (tp->t_flags & TF_EXCLUDE && priv_check(td, PRIV_TTY_EXCLUSIVE)) {
 		error = EBUSY;
 		goto done;
 	}
 
 	/* First open: bring the device and line discipline up. */
 	if (!tty_opened(tp)) {
 		/* Set proper termios flags. */
 		if (TTY_CALLOUT(tp, dev))
 			tp->t_termios = tp->t_termios_init_out;
 		else
 			tp->t_termios = tp->t_termios_init_in;
 		ttydevsw_param(tp, &tp->t_termios);
 		/* Prevent modem control on callout devices and /dev/console. */
 		if (TTY_CALLOUT(tp, dev) || dev == dev_console)
 			tp->t_termios.c_cflag |= CLOCAL;
 
 		if ((tp->t_termios.c_cflag & CNO_RTSDTR) == 0)
 			ttydevsw_modem(tp, SER_DTR|SER_RTS, 0);
 
 		error = ttydevsw_open(tp);
 		if (error != 0)
 			goto done;
 
 		ttydisc_open(tp);
 		error = tty_watermarks(tp);
 		if (error != 0)
 			goto done;
 	}
 
 	/* Wait for Carrier Detect. */
 	if ((oflags & O_NONBLOCK) == 0 &&
 	    (tp->t_termios.c_cflag & CLOCAL) == 0) {
 		while ((ttydevsw_modem(tp, 0, 0) & SER_DCD) == 0) {
 			error = tty_wait(tp, &tp->t_dcdwait);
 			if (error != 0)
 				goto done;
 		}
 	}
 
 	/* Record which kind of node now holds the tty open. */
 	if (dev == dev_console)
 		tp->t_flags |= TF_OPENED_CONS;
 	else if (TTY_CALLOUT(tp, dev))
 		tp->t_flags |= TF_OPENED_OUT;
 	else
 		tp->t_flags |= TF_OPENED_IN;
 	MPASS((tp->t_flags & (TF_OPENED_CONS | TF_OPENED_IN)) == 0 ||
 	    (tp->t_flags & TF_OPENED_OUT) == 0);
 
 	/*
 	 * Common exit: ttydev_leave() drops the lock if the tty ended up
 	 * opened, and otherwise tears down whatever a failed open set up.
 	 */
 done:	tp->t_flags &= ~TF_OPENCLOSE;
 	cv_broadcast(&tp->t_dcdwait);
 	ttydev_leave(tp);
 
 	return (error);
 }
 
 /*
  * d_close handler for tty character devices.
  *
  * Clears the open flag(s) belonging to the node being closed.  If the tty
  * is still open through another node nothing else happens.  Otherwise
  * exclusive mode is cleared, sleeping threads are woken, and
  * ttydev_leave() is called to finish the close.  Always returns 0.
  */
 static int
 ttydev_close(struct cdev *dev, int fflag, int devtype __unused,
     struct thread *td __unused)
 {
 	struct tty *tp = dev->si_drv1;
 
 	tty_lock(tp);
 
 	/*
 	 * Don't actually close the device if it is being used as the
 	 * console.
 	 */
 	MPASS((tp->t_flags & (TF_OPENED_CONS | TF_OPENED_IN)) == 0 ||
 	    (tp->t_flags & TF_OPENED_OUT) == 0);
 	if (dev == dev_console)
 		tp->t_flags &= ~TF_OPENED_CONS;
 	else
 		tp->t_flags &= ~(TF_OPENED_IN|TF_OPENED_OUT);
 
 	/* Some other node still holds the tty open. */
 	if (tp->t_flags & TF_OPENED) {
 		tty_unlock(tp);
 		return (0);
 	}
 
 	/* If revoking, flush output now to avoid draining it later. */
 	if (fflag & FREVOKE)
 		tty_flush(tp, FWRITE);
 
 	tp->t_flags &= ~TF_EXCLUDE;
 
 	/* Properly wake up threads that are stuck - revoke(). */
 	tp->t_revokecnt++;
 	tty_wakeup(tp, FREAD|FWRITE);
 	cv_broadcast(&tp->t_bgwait);
 	cv_broadcast(&tp->t_dcdwait);
 
 	ttydev_leave(tp);
 
 	return (0);
 }
 
 /*
  * Return non-zero when tp is the controlling terminal of process p: the
  * process shares the tty's session and has P_CONTROLT set.  The tty lock
  * must be held (asserted).
  */
 static __inline int
 tty_is_ctty(struct tty *tp, struct proc *p)
 {
 
 	tty_assert_locked(tp);
 
 	if (p->p_session != tp->t_session)
 		return (0);
 	return ((p->p_flag & P_CONTROLT) != 0);
 }
 
 /*
  * Apply job control to a thread that wants to access the tty.
  *
  * sig must be SIGTTIN or SIGTTOU (asserted) and the tty lock must be held.
  * Returns 0 when the access may proceed, EIO when it must be refused, or
  * the error from tty_wait() if the sleep was interrupted.  While the
  * caller's process group is in the background of its controlling terminal,
  * the group is signalled and the thread sleeps on t_bgwait, re-evaluating
  * the conditions after each wakeup.
  */
 int
 tty_wait_background(struct tty *tp, struct thread *td, int sig)
 {
 	struct proc *p;
 	struct pgrp *pg;
 	ksiginfo_t ksi;
 	int error;
 
 	MPASS(sig == SIGTTIN || sig == SIGTTOU);
 	tty_assert_locked(tp);
 
 	p = td->td_proc;
 	for (;;) {
 		pg = p->p_pgrp;
 		PGRP_LOCK(pg);
 		PROC_LOCK(p);
 
 		/*
 		 * pg may no longer be our process group.
 		 * Re-check after locking.
 		 */
 		if (p->p_pgrp != pg) {
 			PROC_UNLOCK(p);
 			PGRP_UNLOCK(pg);
 			continue;
 		}
 
 		/*
 		 * The process should only sleep, when:
 		 * - This terminal is the controlling terminal
 		 * - Its process group is not the foreground process
 		 *   group
 		 * - The parent process isn't waiting for the child to
 		 *   exit
 		 * - the signal to send to the process isn't masked
 		 */
 		if (!tty_is_ctty(tp, p) || p->p_pgrp == tp->t_pgrp) {
 			/* Allow the action to happen. */
 			PROC_UNLOCK(p);
 			PGRP_UNLOCK(pg);
 			return (0);
 		}
 
 		if (SIGISMEMBER(p->p_sigacts->ps_sigignore, sig) ||
 		    SIGISMEMBER(td->td_sigmask, sig)) {
 			/* Only allow them in write()/ioctl(). */
 			PROC_UNLOCK(p);
 			PGRP_UNLOCK(pg);
 			return (sig == SIGTTOU ? 0 : EIO);
 		}
 
 		if ((p->p_flag & P_PPWAIT) != 0 ||
 		    (pg->pg_flags & PGRP_ORPHANED) != 0) {
 			/* Don't allow the action to happen. */
 			PROC_UNLOCK(p);
 			PGRP_UNLOCK(pg);
 			return (EIO);
 		}
 		PROC_UNLOCK(p);
 
 		/*
 		 * Send the signal and sleep until we're the new
 		 * foreground process group.
 		 *
 		 * ksi is filled in on the first pass only; sig is then
 		 * zeroed and later passes reuse ksi.ksi_signo.
 		 */
 		if (sig != 0) {
 			ksiginfo_init(&ksi);
 			ksi.ksi_code = SI_KERNEL;
 			ksi.ksi_signo = sig;
 			sig = 0;
 		}
 
 		pgsignal(pg, ksi.ksi_signo, 1, &ksi);
 		PGRP_UNLOCK(pg);
 
 		error = tty_wait(tp, &tp->t_bgwait);
 		if (error)
 			return (error);
 	}
 }
 
 static int
 ttydev_read(struct cdev *dev, struct uio *uio, int ioflag)
 {
 	struct tty *tp = dev->si_drv1;
 	int error;
 
 	error = ttydev_enter(tp);
 	if (error)
 		goto done;
 	error = ttydisc_read(tp, uio, ioflag);
 	tty_unlock(tp);
 
 	/*
 	 * The read() call should not throw an error when the device is
 	 * being destroyed. Silently convert it to an EOF.
 	 */
 done:	if (error == ENXIO)
 		error = 0;
 	return (error);
 }
 
 /*
  * d_write handler.
  *
  * With TOSTOP set the caller is first subjected to background job control
  * (SIGTTOU).  Writers are serialized through TF_BUSY_OUT, except that a
  * non-blocking write arriving while another write is in progress goes
  * straight to the line discipline.  Returns the error from
  * ttydev_enter(), job control, an interrupted wait, or ttydisc_write().
  */
 static int
 ttydev_write(struct cdev *dev, struct uio *uio, int ioflag)
 {
 	struct tty *tp = dev->si_drv1;
 	int error;
 
 	error = ttydev_enter(tp);
 	if (error)
 		return (error);
 
 	if (tp->t_termios.c_lflag & TOSTOP) {
 		error = tty_wait_background(tp, curthread, SIGTTOU);
 		if (error)
 			goto done;
 	}
 
 	if (ioflag & IO_NDELAY && tp->t_flags & TF_BUSY_OUT) {
 		/* Allow non-blocking writes to bypass serialization. */
 		error = ttydisc_write(tp, uio, ioflag);
 	} else {
 		/* Serialize write() calls. */
 		while (tp->t_flags & TF_BUSY_OUT) {
 			error = tty_wait(tp, &tp->t_outserwait);
 			if (error)
 				goto done;
 		}
 
 		tp->t_flags |= TF_BUSY_OUT;
 		error = ttydisc_write(tp, uio, ioflag);
 		tp->t_flags &= ~TF_BUSY_OUT;
 		/* Let the next serialized writer proceed. */
 		cv_signal(&tp->t_outserwait);
 	}
 
 done:	tty_unlock(tp);
 	return (error);
 }
 
 /*
  * d_ioctl handler.
  *
  * Commands listed in the switch below are subject to background job
  * control (SIGTTOU) before they run; all other commands skip that step.
  * For the TIOCSETA family the new termios in data is edited in place so
  * that every field marked in the matching lock-state termios keeps its
  * current value.  The request is then passed to tty_ioctl().
  */
 static int
 ttydev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
     struct thread *td)
 {
 	struct tty *tp = dev->si_drv1;
 	int error;
 
 	error = ttydev_enter(tp);
 	if (error)
 		return (error);
 
 	switch (cmd) {
 	case TIOCCBRK:
 	case TIOCCONS:
 	case TIOCDRAIN:
 	case TIOCEXCL:
 	case TIOCFLUSH:
 	case TIOCNXCL:
 	case TIOCSBRK:
 	case TIOCSCTTY:
 	case TIOCSETA:
 	case TIOCSETAF:
 	case TIOCSETAW:
 	case TIOCSPGRP:
 	case TIOCSTART:
 	case TIOCSTAT:
 	case TIOCSTI:
 	case TIOCSTOP:
 	case TIOCSWINSZ:
 #if 0
 	case TIOCSDRAINWAIT:
 	case TIOCSETD:
 #endif
 #ifdef COMPAT_43TTY
 	case  TIOCLBIC:
 	case  TIOCLBIS:
 	case  TIOCLSET:
 	case  TIOCSETC:
 	case OTIOCSETD:
 	case  TIOCSETN:
 	case  TIOCSETP:
 	case  TIOCSLTC:
 #endif /* COMPAT_43TTY */
 		/*
 		 * If the ioctl() causes the TTY to be modified, let it
 		 * wait in the background.
 		 */
 		error = tty_wait_background(tp, curthread, SIGTTOU);
 		if (error)
 			goto done;
 	}
 
 	if (cmd == TIOCSETA || cmd == TIOCSETAW || cmd == TIOCSETAF) {
 		struct termios *old = &tp->t_termios;
 		struct termios *new = (struct termios *)data;
 		/* Callout and regular nodes have separate lock-state masks. */
 		struct termios *lock = TTY_CALLOUT(tp, dev) ?
 		    &tp->t_termios_lock_out : &tp->t_termios_lock_in;
 		int cc;
 
 		/*
 		 * Lock state devices.  Just overwrite the values of the
 		 * commands that are currently in use.
 		 */
 		new->c_iflag = (old->c_iflag & lock->c_iflag) |
 		    (new->c_iflag & ~lock->c_iflag);
 		new->c_oflag = (old->c_oflag & lock->c_oflag) |
 		    (new->c_oflag & ~lock->c_oflag);
 		new->c_cflag = (old->c_cflag & lock->c_cflag) |
 		    (new->c_cflag & ~lock->c_cflag);
 		new->c_lflag = (old->c_lflag & lock->c_lflag) |
 		    (new->c_lflag & ~lock->c_lflag);
 		/* Control characters and speeds are locked as whole values. */
 		for (cc = 0; cc < NCCS; ++cc)
 			if (lock->c_cc[cc])
 				new->c_cc[cc] = old->c_cc[cc];
 		if (lock->c_ispeed)
 			new->c_ispeed = old->c_ispeed;
 		if (lock->c_ospeed)
 			new->c_ospeed = old->c_ospeed;
 	}
 
 	error = tty_ioctl(tp, cmd, data, fflag, td);
 done:	tty_unlock(tp);
 
 	return (error);
 }
 
 /*
  * d_poll handler.
  *
  * If the tty cannot be entered, the requested read events are reported
  * together with POLLHUP.  Otherwise read readiness is taken from
  * ttydisc_read_poll(); a zombie tty reports POLLHUP instead of write
  * readiness, which otherwise comes from ttydisc_write_poll().  When
  * nothing is ready the thread is recorded on the matching selinfo(s).
  */
 static int
 ttydev_poll(struct cdev *dev, int events, struct thread *td)
 {
 	struct tty *tp;
 	int revents;
 
 	tp = dev->si_drv1;
 	if (ttydev_enter(tp) != 0)
 		return ((events & (POLLIN|POLLRDNORM)) | POLLHUP);
 
 	revents = 0;
 
 	/* See if we can read something. */
 	if ((events & (POLLIN|POLLRDNORM)) != 0 &&
 	    ttydisc_read_poll(tp) > 0)
 		revents |= events & (POLLIN|POLLRDNORM);
 
 	if ((tp->t_flags & TF_ZOMBIE) != 0) {
 		/* Hangup flag on zombie state. */
 		revents |= POLLHUP;
 	} else if ((events & (POLLOUT|POLLWRNORM)) != 0 &&
 	    ttydisc_write_poll(tp) > 0) {
 		/* We can write something. */
 		revents |= events & (POLLOUT|POLLWRNORM);
 	}
 
 	/* Nothing ready: register for a wakeup on the requested sides. */
 	if (revents == 0) {
 		if ((events & (POLLIN|POLLRDNORM)) != 0)
 			selrecord(td, &tp->t_inpoll);
 		if ((events & (POLLOUT|POLLWRNORM)) != 0)
 			selrecord(td, &tp->t_outpoll);
 	}
 
 	tty_unlock(tp);
 
 	return (revents);
 }
 
 /*
  * d_mmap handler: mmap() is handled through the driver's mmap routine.
  * Returns -1 when the tty cannot be entered, otherwise whatever the
  * driver returns.
  */
 static int
 ttydev_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
     int nprot, vm_memattr_t *memattr)
 {
 	struct tty *tp;
 	int rv;
 
 	tp = dev->si_drv1;
 	if (ttydev_enter(tp) != 0)
 		return (-1);
 
 	rv = ttydevsw_mmap(tp, offset, paddr, nprot, memattr);
 	tty_unlock(tp);
 
 	return (rv);
 }
 
 /*
  * kqueue support.
  */
 
 /*
  * f_detach for the read filter: take the knote off the tty's input
  * knlist.  The tty is found through kn_hook; the knlist is passed as not
  * locked (last argument 0).
  */
 static void
 tty_kqops_read_detach(struct knote *kn)
 {
 	struct tty *tp = kn->kn_hook;
 
 	knlist_remove(&tp->t_inpoll.si_note, kn, 0);
 }
 
 /*
  * f_event for the read filter.  Called with the tty lock held (asserted).
  * A gone or zombie tty reports EV_EOF and always fires; otherwise kn_data
  * receives the value of ttydisc_read_poll() and the event fires when that
  * value is positive.
  */
 static int
 tty_kqops_read_event(struct knote *kn, long hint __unused)
 {
 	struct tty *tp;
 
 	tp = kn->kn_hook;
 	tty_assert_locked(tp);
 
 	if (!tty_gone(tp) && (tp->t_flags & TF_ZOMBIE) == 0) {
 		kn->kn_data = ttydisc_read_poll(tp);
 		return (kn->kn_data > 0);
 	}
 
 	kn->kn_flags |= EV_EOF;
 	return (1);
 }
 
 /*
  * f_detach for the write filter: take the knote off the tty's output
  * knlist.  The tty is found through kn_hook; the knlist is passed as not
  * locked (last argument 0).
  */
 static void
 tty_kqops_write_detach(struct knote *kn)
 {
 	struct tty *tp = kn->kn_hook;
 
 	knlist_remove(&tp->t_outpoll.si_note, kn, 0);
 }
 
 /*
  * f_event for the write filter.  Called with the tty lock held (asserted).
  * A gone tty reports EV_EOF and always fires; otherwise kn_data receives
  * the value of ttydisc_write_poll() and the event fires when that value is
  * positive.
  */
 static int
 tty_kqops_write_event(struct knote *kn, long hint __unused)
 {
 	struct tty *tp;
 
 	tp = kn->kn_hook;
 	tty_assert_locked(tp);
 
 	if (!tty_gone(tp)) {
 		kn->kn_data = ttydisc_write_poll(tp);
 		return (kn->kn_data > 0);
 	}
 
 	kn->kn_flags |= EV_EOF;
 	return (1);
 }
 
 /* Filter operations installed by ttydev_kqfilter() for EVFILT_READ. */
 static struct filterops tty_kqops_read = {
 	.f_isfd = 1,
 	.f_detach = tty_kqops_read_detach,
 	.f_event = tty_kqops_read_event,
 };
 
 /* Filter operations installed by ttydev_kqfilter() for EVFILT_WRITE. */
 static struct filterops tty_kqops_write = {
 	.f_isfd = 1,
 	.f_detach = tty_kqops_write_detach,
 	.f_event = tty_kqops_write_event,
 };
 
 /*
  * d_kqfilter handler: attach a knote to the tty.
  *
  * EVFILT_READ knotes go on the input knlist and EVFILT_WRITE knotes on the
  * output knlist, each with the matching filter ops and kn_hook pointing at
  * the tty.  Any other filter yields EINVAL and leaves the knote untouched.
  * The knlist is passed as already locked, the tty lock being held here.
  */
 static int
 ttydev_kqfilter(struct cdev *dev, struct knote *kn)
 {
 	struct tty *tp;
 	struct filterops *fop;
 	struct knlist *notes;
 	int error;
 
 	tp = dev->si_drv1;
 	error = ttydev_enter(tp);
 	if (error != 0)
 		return (error);
 
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
 		fop = &tty_kqops_read;
 		notes = &tp->t_inpoll.si_note;
 		break;
 	case EVFILT_WRITE:
 		fop = &tty_kqops_write;
 		notes = &tp->t_outpoll.si_note;
 		break;
 	default:
 		tty_unlock(tp);
 		return (EINVAL);
 	}
 
 	kn->kn_hook = tp;
 	kn->kn_fop = fop;
 	knlist_add(notes, kn, 1);
 
 	tty_unlock(tp);
 	return (0);
 }
 
 /* Character device switch wiring the ttydev_* handlers above together. */
 static struct cdevsw ttydev_cdevsw = {
 	.d_version	= D_VERSION,
 	.d_open		= ttydev_open,
 	.d_close	= ttydev_close,
 	.d_read		= ttydev_read,
 	.d_write	= ttydev_write,
 	.d_ioctl	= ttydev_ioctl,
 	.d_kqfilter	= ttydev_kqfilter,
 	.d_poll		= ttydev_poll,
 	.d_mmap		= ttydev_mmap,
 	.d_name		= "ttydev",
 	.d_flags	= D_TTY,
 };
 
 /*
  * Init/lock-state devices
  */
 
 /*
  * d_open for the init/lock-state devices: succeeds unless the tty is
  * gone, in which case ENODEV is returned.
  */
 static int
 ttyil_open(struct cdev *dev, int oflags __unused, int devtype __unused,
     struct thread *td)
 {
 	struct tty *tp = dev->si_drv1;
 	int error;
 
 	tty_lock(tp);
 	error = tty_gone(tp) ? ENODEV : 0;
 	tty_unlock(tp);
 
 	return (error);
 }
 
 /*
  * d_close for the init/lock-state devices: nothing to do, always
  * returns 0.
  */
 static int
 ttyil_close(struct cdev *dev __unused, int flag __unused, int mode __unused,
     struct thread *td __unused)
 {
 
 	return (0);
 }
 
 /*
  * Shared d_read/d_write for the init/lock-state devices: data transfer is
  * not supported, so ENODEV is always returned.
  */
 static int
 ttyil_rdwr(struct cdev *dev __unused, struct uio *uio __unused,
     int ioflag __unused)
 {
 
 	return (ENODEV);
 }
 
 /*
  * d_ioctl for the init/lock-state devices.
  *
  * The driver's cioctl routine gets the first chance; only when it answers
  * ENOIOCTL are the generic commands below applied.  dev->si_drv2 is
  * accessed as the struct termios this device node reads and writes.
  * Returns ENODEV if the tty is gone and ENOTTY for unsupported commands.
  */
 static int
 ttyil_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
     struct thread *td)
 {
 	struct tty *tp = dev->si_drv1;
 	int error;
 
 	tty_lock(tp);
 	if (tty_gone(tp)) {
 		error = ENODEV;
 		goto done;
 	}
 
 	error = ttydevsw_cioctl(tp, dev2unit(dev), cmd, data, td);
 	if (error != ENOIOCTL)
 		goto done;
 	error = 0;
 
 	switch (cmd) {
 	case TIOCGETA:
 		/* Obtain terminal flags through tcgetattr(). */
 		*(struct termios*)data = *(struct termios*)dev->si_drv2;
 		break;
 	case TIOCSETA:
 		/* Set terminal flags through tcsetattr(). */
 		error = priv_check(td, PRIV_TTY_SETA);
 		if (error)
 			break;
 		*(struct termios*)dev->si_drv2 = *(struct termios*)data;
 		break;
 	case TIOCGETD:
 		/* The line discipline reported is always TTYDISC. */
 		*(int *)data = TTYDISC;
 		break;
 	case TIOCGWINSZ:
 		/* These nodes have no window; report an all-zero size. */
 		bzero(data, sizeof(struct winsize));
 		break;
 	default:
 		error = ENOTTY;
 	}
 
 done:	tty_unlock(tp);
 	return (error);
 }
 
 /*
  * Character device switch for the init/lock-state devices; read and
  * write share the ttyil_rdwr handler.
  */
 static struct cdevsw ttyil_cdevsw = {
 	.d_version	= D_VERSION,
 	.d_open		= ttyil_open,
 	.d_close	= ttyil_close,
 	.d_read		= ttyil_rdwr,
 	.d_write	= ttyil_rdwr,
 	.d_ioctl	= ttyil_ioctl,
 	.d_name		= "ttyil",
 	.d_flags	= D_TTY,
 };
 
 /*
  * Fill the tty's initial "in" termios with the TTYDEF_* defaults and the
  * default control characters, then copy the result to the initial "out"
  * termios.
  */
 static void
 tty_init_termios(struct tty *tp)
 {
 	struct termios *in;
 
 	in = &tp->t_termios_init_in;
 
 	in->c_iflag = TTYDEF_IFLAG;
 	in->c_oflag = TTYDEF_OFLAG;
 	in->c_cflag = TTYDEF_CFLAG;
 	in->c_lflag = TTYDEF_LFLAG;
 	in->c_ispeed = in->c_ospeed = TTYDEF_SPEED;
 	memcpy(&in->c_cc, ttydefchars, sizeof(ttydefchars));
 
 	tp->t_termios_init_out = *in;
 }
 
 /*
  * Adjust the initial termios of a tty for console use: a non-zero speed s
  * replaces both speeds of the "in" and "out" settings, and CLOCAL is
  * forced on in both.
  */
 void
 tty_init_console(struct tty *tp, speed_t s)
 {
 	struct termios *in, *out;
 
 	in = &tp->t_termios_init_in;
 	out = &tp->t_termios_init_out;
 
 	if (s != 0) {
 		in->c_ispeed = s;
 		in->c_ospeed = s;
 		out->c_ispeed = s;
 		out->c_ospeed = s;
 	}
 
 	in->c_cflag |= CLOCAL;
 	out->c_cflag |= CLOCAL;
 }
 
 /*
  * Standard device routine implementations, mostly meant for
  * pseudo-terminal device drivers. When a driver creates a new terminal
  * device class, missing routines are patched.
  */
 
 /* Default open routine: nothing to do, always returns 0. */
 static int
 ttydevsw_defopen(struct tty *tp __unused)
 {
 
 	return (0);
 }
 
 /* Default close routine: intentionally empty. */
 static void
 ttydevsw_defclose(struct tty *tp __unused)
 {
 
 }
 
 /*
  * Default output wakeup routine.  Reaching it means output was queued on
  * a device whose driver supplied no outwakeup handler, so panic.
  */
 static void
 ttydevsw_defoutwakeup(struct tty *tp __unused)
 {
 
 	panic("Terminal device has output, while not implemented");
 }
 
 /* Default input wakeup routine: intentionally empty. */
 static void
 ttydevsw_definwakeup(struct tty *tp __unused)
 {
 
 }
 
 /*
  * Default ioctl routine: claims no command and returns ENOIOCTL, which
  * makes tty_ioctl() fall through to the generic handling.
  */
 static int
 ttydevsw_defioctl(struct tty *tp __unused, u_long cmd __unused,
     caddr_t data __unused, struct thread *td __unused)
 {
 
 	return (ENOIOCTL);
 }
 
 /*
  * Default cioctl routine: claims no command and returns ENOIOCTL, so
  * ttyil_ioctl() continues with its own command handling.
  */
 static int
 ttydevsw_defcioctl(struct tty *tp __unused, int unit __unused,
     u_long cmd __unused, caddr_t data __unused,
     struct thread *td __unused)
 {
 
 	return (ENOIOCTL);
 }
 
 /*
  * Default param routine.  Clamps both speeds in `t' to the range
  * B50..B115200 and forces CREAD on; always returns 0.
  */
 static int
 ttydevsw_defparam(struct tty *tp __unused, struct termios *t)
 {
 
 	/*
 	 * Pseudo-devices may pick their own baud rate, but it is capped at
 	 * 115200 to keep buffer usage in check, and raised to at least B50
 	 * so that a rate of 0 can never be configured.
 	 */
 	if (t->c_ispeed > B115200)
 		t->c_ispeed = B115200;
 	else if (t->c_ispeed < B50)
 		t->c_ispeed = B50;
 
 	if (t->c_ospeed > B115200)
 		t->c_ospeed = B115200;
 	else if (t->c_ospeed < B50)
 		t->c_ospeed = B50;
 
 	t->c_cflag |= CREAD;
 
 	return (0);
 }
 
 /*
  * Default modem routine: ignores the requested signal changes and always
  * reports SER_DCD, i.e. a simulated carrier, to keep the TTY layer happy.
  */
 static int
 ttydevsw_defmodem(struct tty *tp __unused,
     int sigon __unused, int sigoff __unused)
 {
 
 	return (SER_DCD);
 }
 
 /* Default mmap routine: always returns -1. */
 static int
 ttydevsw_defmmap(struct tty *tp __unused, vm_ooffset_t offset __unused,
     vm_paddr_t *paddr __unused, int nprot __unused,
     vm_memattr_t *memattr __unused)
 {
 
 	return (-1);
 }
 
 /* Default packet-mode notification routine: intentionally empty. */
 static void
 ttydevsw_defpktnotify(struct tty *tp __unused, char event __unused)
 {
 
 }
 
 /*
  * Default free routine.  A driver that lets its TTY be deallocated must
  * supply its own handler; ending up here panics.
  */
 static void
 ttydevsw_deffree(void *softc __unused)
 {
 
 	panic("Terminal device freed without a free-handler");
 }
 
 /* Default busy routine: always reports FALSE. */
 static bool
 ttydevsw_defbusy(struct tty *tp __unused)
 {
 
 	return (FALSE);
 }
 
 /*
  * TTY allocation and deallocation. TTY devices can be deallocated when
  * the driver doesn't use it anymore, when the TTY isn't a session's
  * controlling TTY and when the device node isn't opened through devfs.
  */
 
 /*
  * Allocate a TTY without a caller-supplied mutex; tty_alloc_mutex() then
  * falls back to the mutex embedded in the TTY itself.
  */
 struct tty *
 tty_alloc(struct ttydevsw *tsw, void *sc)
 {
 	struct tty *tp;
 
 	tp = tty_alloc_mutex(tsw, sc, NULL);
 	return (tp);
 }
 
 /*
  * Allocate and initialise a TTY for the driver described by `tsw', with
  * `sc' recorded as the driver softc.
  *
  * Routines the driver left NULL are replaced by the ttydevsw_def*()
  * defaults; note that this writes to `tsw' itself.  When `mutex' is NULL
  * the TTY is locked by its own embedded mutex, otherwise by the mutex
  * the caller passed in.
  */
 struct tty *
 tty_alloc_mutex(struct ttydevsw *tsw, void *sc, struct mtx *mutex)
 {
 	struct tty *tp;
 
 	/* Substitute the default for every routine the driver omitted. */
 #define	TSW_DEFAULT(op) do {				\
 	if (tsw->tsw_ ## op == NULL)			\
 		tsw->tsw_ ## op = ttydevsw_def ## op;	\
 } while (0)
 	TSW_DEFAULT(open);
 	TSW_DEFAULT(close);
 	TSW_DEFAULT(outwakeup);
 	TSW_DEFAULT(inwakeup);
 	TSW_DEFAULT(ioctl);
 	TSW_DEFAULT(cioctl);
 	TSW_DEFAULT(param);
 	TSW_DEFAULT(modem);
 	TSW_DEFAULT(mmap);
 	TSW_DEFAULT(pktnotify);
 	TSW_DEFAULT(free);
 	TSW_DEFAULT(busy);
 #undef TSW_DEFAULT
 
 	/* The TTY is allocated with TTY_PRBUF_SIZE extra bytes behind it. */
 	tp = malloc(sizeof(struct tty) + TTY_PRBUF_SIZE, M_TTY,
 	    M_WAITOK | M_ZERO);
 	tp->t_devsw = tsw;
 	tp->t_devswsoftc = sc;
 	tp->t_flags = tsw->tsw_flags;
 	tp->t_prbufsz = TTY_PRBUF_SIZE;
 	tp->t_drainwait = tty_drainwait;
 
 	tty_init_termios(tp);
 
 	cv_init(&tp->t_inwait, "ttyin");
 	cv_init(&tp->t_outwait, "ttyout");
 	cv_init(&tp->t_outserwait, "ttyosr");
 	cv_init(&tp->t_bgwait, "ttybg");
 	cv_init(&tp->t_dcdwait, "ttydcd");
 
 	/* Use the driver's mutex when given, else our embedded one. */
 	if (mutex == NULL) {
 		mtx_init(&tp->t_mtxobj, "ttymtx", NULL, MTX_DEF);
 		tp->t_mtx = &tp->t_mtxobj;
 	} else {
 		tp->t_mtx = mutex;
 	}
 
 	/* The knotes are protected by whichever mutex locks the TTY. */
 	knlist_init_mtx(&tp->t_inpoll.si_note, tp->t_mtx);
 	knlist_init_mtx(&tp->t_outpoll.si_note, tp->t_mtx);
 
 	return (tp);
 }
 
 /*
  * Final teardown of a TTY.  Passed to destroy_dev_sched_cb() by
  * tty_rel_free(); `arg' is the struct tty to release.
  */
 static void
 tty_dealloc(void *arg)
 {
 	struct tty *tp;
 
 	tp = arg;
 
 	/*
 	 * The I/O queues are normally released earlier by ttydev_leave(),
 	 * but that is not guaranteed to run between queue allocation and
 	 * this point: ioctls on a pty control device can allocate them
 	 * while the pty slave device was never opened, or after it was
 	 * closed.
 	 */
 	ttyinq_free(&tp->t_inq);
 	ttyoutq_free(&tp->t_outq);
 
 	/* Tear down the select/poll and kqueue state. */
 	seldrain(&tp->t_inpoll);
 	seldrain(&tp->t_outpoll);
 	knlist_destroy(&tp->t_inpoll.si_note);
 	knlist_destroy(&tp->t_outpoll.si_note);
 
 	cv_destroy(&tp->t_inwait);
 	cv_destroy(&tp->t_outwait);
 	cv_destroy(&tp->t_outserwait);
 	cv_destroy(&tp->t_bgwait);
 	cv_destroy(&tp->t_dcdwait);
 
 	/* A driver-supplied mutex is not ours to destroy. */
 	if (tp->t_mtx == &tp->t_mtxobj)
 		mtx_destroy(&tp->t_mtxobj);
 
 	ttydevsw_free(tp);
 	free(tp, M_TTY);
 }
 
 /*
  * Release the TTY if nothing refers to it any more.  Must be called with
  * the TTY locked; the lock is dropped on every path before returning.
  */
 static void
 tty_rel_free(struct tty *tp)
 {
 	struct cdev *cdev;
 
 	tty_assert_locked(tp);
 
 #define	TF_ACTIVITY	(TF_GONE|TF_OPENED|TF_HOOK|TF_OPENCLOSE)
 	/*
 	 * The TTY is still in use unless no session counts it and, of the
 	 * activity flags, TF_GONE alone is set.
 	 */
 	if ((tp->t_flags & TF_ACTIVITY) != TF_GONE || tp->t_sessioncnt != 0) {
 		tty_unlock(tp);
 		return;
 	}
 
 	/* Stop asynchronous I/O. */
 	funsetown(&tp->t_sigio);
 
 	/* Detach the device node while the lock is still held. */
 	cdev = tp->t_dev;
 	tp->t_dev = NULL;
 	tty_unlock(tp);
 
 	if (cdev == NULL)
 		return;
 
 	sx_xlock(&tty_list_sx);
 	TAILQ_REMOVE(&tty_list, tp, t_list);
 	tty_list_count--;
 	sx_xunlock(&tty_list_sx);
 
 	/* tty_dealloc() finishes the job via the scheduled callback. */
 	destroy_dev_sched_cb(cdev, tty_dealloc, tp);
 }
 
 /*
  * Forget process group `pg' if it is the TTY's foreground process group.
  * Called with the TTY locked; returns with it unlocked.
  */
 void
 tty_rel_pgrp(struct tty *tp, struct pgrp *pg)
 {
 
 	tty_assert_locked(tp);
 	MPASS(tp->t_sessioncnt > 0);
 
 	if (pg == tp->t_pgrp)
 		tp->t_pgrp = NULL;
 
 	tty_unlock(tp);
 }
 
 /*
  * Drop the reference session `sess' holds on the TTY and release the TTY
  * if that was the last user.  tty_rel_free() unlocks the TTY.
  */
 void
 tty_rel_sess(struct tty *tp, struct session *sess)
 {
 
 	MPASS(tp->t_sessioncnt > 0);
 
 	if (sess == tp->t_session) {
 		/* The current session is the one leaving. */
 		tp->t_session = NULL;
 		MPASS(tp->t_pgrp == NULL);
 	}
 
 	tp->t_sessioncnt--;
 	tty_rel_free(tp);
 }
 
 /*
  * Mark the TTY as gone: simulate loss of carrier, wake every sleeper,
  * set TF_GONE and release the TTY if it is otherwise unused.  Called
  * with the TTY locked; tty_rel_free() unlocks it.
  */
 void
 tty_rel_gone(struct tty *tp)
 {
 
 	tty_assert_locked(tp);
 	MPASS(!tty_gone(tp));
 
 	/* Pretend the carrier dropped. */
 	ttydisc_modem(tp, 0);
 
 	/* Nobody may stay blocked on a device that is going away. */
 	tty_wakeup(tp, FREAD | FWRITE);
 	cv_broadcast(&tp->t_bgwait);
 	cv_broadcast(&tp->t_dcdwait);
 
 	tp->t_flags |= TF_GONE;
 
 	tty_rel_free(tp);
 }
 
 /*
  * Detach the controlling TTY `tp' from the session of process `p'; used
  * for TIOCNOTTY.  The TTY lock is dropped on entry and re-taken after
  * proctree_lock has been acquired, and it is still held on return.
  *
  * Returns 0 on success, ENODEV if the TTY went away while unlocked,
  * ENOTTY if `tp' is not the session's controlling TTY, or EPERM if `p'
  * is not the session leader.
  */
 static int
 tty_drop_ctty(struct tty *tp, struct proc *p)
 {
 	struct session *session;
 	struct vnode *vp;
 
 	/*
 	 * This looks terrible, but it's generally safe as long as the tty
 	 * hasn't gone away while we had the lock dropped.  All of our sanity
 	 * checking that this operation is OK happens after we've picked it back
 	 * up, so other state changes are generally not fatal and the potential
 	 * for this particular operation to happen out-of-order in a
 	 * multithreaded scenario is likely a non-issue.
 	 */
 	tty_unlock(tp);
 	sx_xlock(&proctree_lock);
 	tty_lock(tp);
 	if (tty_gone(tp)) {
 		sx_xunlock(&proctree_lock);
 		return (ENODEV);
 	}
 
 	/*
 	 * If the session doesn't have a controlling TTY, or if we weren't
 	 * invoked on the controlling TTY, fail with ENOTTY.
 	 * NOTE(review): this comment used to say ENOIOCTL "as we've
 	 * historically done", but the code returns ENOTTY -- confirm which
 	 * one is intended.
 	 */
 	session = p->p_session;
 	if (session->s_ttyp == NULL || session->s_ttyp != tp) {
 		sx_xunlock(&proctree_lock);
 		return (ENOTTY);
 	}
 
 	/* Only the session leader may give up the controlling TTY. */
 	if (!SESS_LEADER(p)) {
 		sx_xunlock(&proctree_lock);
 		return (EPERM);
 	}
 
 	/* Sever the session's links to the TTY, remembering the vnode. */
 	PROC_LOCK(p);
 	SESS_LOCK(session);
 	vp = session->s_ttyvp;
 	session->s_ttyp = NULL;
 	session->s_ttyvp = NULL;
 	session->s_ttydp = NULL;
 	SESS_UNLOCK(session);
 
 	tp->t_sessioncnt--;
 	p->p_flag &= ~P_CONTROLT;
 	PROC_UNLOCK(p);
 	sx_xunlock(&proctree_lock);
 
 	/*
 	 * If we did have a vnode, release our reference.  Ordinarily we manage
 	 * these at the devfs layer, but we can't necessarily know that we were
 	 * invoked on the vnode referenced in the session (i.e. the vnode we
 	 * hold a reference to).  We explicitly don't check VBAD/VIRF_DOOMED here
 	 * to avoid a vnode leak -- in circumstances elsewhere where we'd hit a
 	 * VIRF_DOOMED vnode, release has been deferred until the controlling TTY
 	 * is either changed or released.
 	 */
 	if (vp != NULL)
 		devfs_ctty_unref(vp);
 	return (0);
 }
 
 /*
  * Exposing information about the current TTYs through sysctl.
  */
 
 static void
 tty_to_xtty(struct tty *tp, struct xtty *xt)
 {
 
 	tty_assert_locked(tp);
 
 	xt->xt_size = sizeof(struct xtty);
 	xt->xt_insize = ttyinq_getsize(&tp->t_inq);
 	xt->xt_incc = ttyinq_bytescanonicalized(&tp->t_inq);
 	xt->xt_inlc = ttyinq_bytesline(&tp->t_inq);
 	xt->xt_inlow = tp->t_inlow;
 	xt->xt_outsize = ttyoutq_getsize(&tp->t_outq);
 	xt->xt_outcc = ttyoutq_bytesused(&tp->t_outq);
 	xt->xt_outlow = tp->t_outlow;
 	xt->xt_column = tp->t_column;
 	xt->xt_pgid = tp->t_pgrp ? tp->t_pgrp->pg_id : 0;
 	xt->xt_sid = tp->t_session ? tp->t_session->s_sid : 0;
 	xt->xt_flags = tp->t_flags;
 	xt->xt_dev = tp->t_dev ? dev2udev(tp->t_dev) : (uint32_t)NODEV;
 }
 
 static int
 sysctl_kern_ttys(SYSCTL_HANDLER_ARGS)
 {
 	unsigned long lsize;
 	struct xtty *xtlist, *xt;
 	struct tty *tp;
 	int error;
 
 	sx_slock(&tty_list_sx);
 	lsize = tty_list_count * sizeof(struct xtty);
 	if (lsize == 0) {
 		sx_sunlock(&tty_list_sx);
 		return (0);
 	}
 
 	xtlist = xt = malloc(lsize, M_TTY, M_WAITOK);
 
 	TAILQ_FOREACH(tp, &tty_list, t_list) {
 		tty_lock(tp);
 		tty_to_xtty(tp, xt);
 		tty_unlock(tp);
 		xt++;
 	}
 	sx_sunlock(&tty_list_sx);
 
 	error = SYSCTL_OUT(req, xtlist, lsize);
 	free(xtlist, M_TTY);
 	return (error);
 }
 
 SYSCTL_PROC(_kern, OID_AUTO, ttys, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
 	0, 0, sysctl_kern_ttys, "S,xtty", "List of TTYs");
 
 /*
  * Device node creation. Device has been set up, now we can expose it to
  * the user.
  */
 
 /*
  * Create the device nodes for `tp' and add the TTY to the global list.
  *
  * The node name is formatted from `fmt' and prefixed with "tty" unless
  * TF_NOPREFIX is set.  With `cred' NULL the nodes belong to root:wheel
  * with owner read/write only; otherwise they belong to the credential's
  * real UID and group tty, with group write added.  Of `flags' only
  * TTYMK_CLONING is looked at.  TF_INITLOCK adds ".init"/".lock" nodes
  * and TF_CALLOUT adds the "cua" call-out nodes.
  *
  * Returns 0 on success or the error reported by make_dev_s().
  */
 int
 tty_makedevf(struct tty *tp, struct ucred *cred, int flags,
     const char *fmt, ...)
 {
 	va_list ap;
 	struct make_dev_args args;
 	struct cdev *dev, *init, *lock, *cua, *cinit, *clock;
 	const char *prefix = "tty";
 	char name[SPECNAMELEN - 3]; /* for "tty" and "cua". */
 	uid_t uid;
 	gid_t gid;
 	mode_t mode;
 	int error;
 
 	/* Remove "tty" prefix from devices like PTY's. */
 	if (tp->t_flags & TF_NOPREFIX)
 		prefix = "";
 
 	/* Format the unit part of the name; the radix argument is 32. */
 	va_start(ap, fmt);
 	vsnrprintf(name, sizeof name, 32, fmt, ap);
 	va_end(ap);
 
 	if (cred == NULL) {
 		/* System device. */
 		uid = UID_ROOT;
 		gid = GID_WHEEL;
 		mode = S_IRUSR|S_IWUSR;
 	} else {
 		/* User device. */
 		uid = cred->cr_ruid;
 		gid = GID_TTY;
 		mode = S_IRUSR|S_IWUSR|S_IWGRP;
 	}
 
 	/* Translate the TTYMK_* flag into make_dev flags. */
 	flags = flags & TTYMK_CLONING ? MAKEDEV_REF : 0;
 	flags |= MAKEDEV_CHECKNAME;
 
 	/* Master call-in device. */
 	make_dev_args_init(&args);
 	args.mda_flags = flags;
 	args.mda_devsw = &ttydev_cdevsw;
 	args.mda_cr = cred;
 	args.mda_uid = uid;
 	args.mda_gid = gid;
 	args.mda_mode = mode;
 	args.mda_si_drv1 = tp;
 	error = make_dev_s(&args, &dev, "%s%s", prefix, name);
 	if (error != 0)
 		return (error);
 	tp->t_dev = dev;
 
 	/* Start from NULL so the fail path knows which nodes exist. */
 	init = lock = cua = cinit = clock = NULL;
 
 	/* Slave call-in devices. */
 	if (tp->t_flags & TF_INITLOCK) {
 		args.mda_devsw = &ttyil_cdevsw;
 		args.mda_unit = TTYUNIT_INIT;
 		args.mda_si_drv1 = tp;
 		args.mda_si_drv2 = &tp->t_termios_init_in;
 		error = make_dev_s(&args, &init, "%s%s.init", prefix, name);
 		if (error != 0)
 			goto fail;
 		dev_depends(dev, init);
 
 		args.mda_unit = TTYUNIT_LOCK;
 		args.mda_si_drv2 = &tp->t_termios_lock_in;
 		error = make_dev_s(&args, &lock, "%s%s.lock", prefix, name);
 		if (error != 0)
 			goto fail;
 		dev_depends(dev, lock);
 	}
 
 	/* Call-out devices. */
 	if (tp->t_flags & TF_CALLOUT) {
 		/* Call-out nodes get their own fixed owner, group and mode. */
 		make_dev_args_init(&args);
 		args.mda_flags = flags;
 		args.mda_devsw = &ttydev_cdevsw;
 		args.mda_cr = cred;
 		args.mda_uid = UID_UUCP;
 		args.mda_gid = GID_DIALER;
 		args.mda_mode = 0660;
 		args.mda_unit = TTYUNIT_CALLOUT;
 		args.mda_si_drv1 = tp;
 		error = make_dev_s(&args, &cua, "cua%s", name);
 		if (error != 0)
 			goto fail;
 		dev_depends(dev, cua);
 
 		/* Slave call-out devices. */
 		if (tp->t_flags & TF_INITLOCK) {
 			args.mda_devsw = &ttyil_cdevsw;
 			args.mda_unit = TTYUNIT_CALLOUT | TTYUNIT_INIT;
 			args.mda_si_drv2 = &tp->t_termios_init_out;
 			error = make_dev_s(&args, &cinit, "cua%s.init", name);
 			if (error != 0)
 				goto fail;
 			dev_depends(dev, cinit);
 
 			args.mda_unit = TTYUNIT_CALLOUT | TTYUNIT_LOCK;
 			args.mda_si_drv2 = &tp->t_termios_lock_out;
 			error = make_dev_s(&args, &clock, "cua%s.lock", name);
 			if (error != 0)
 				goto fail;
 			dev_depends(dev, clock);
 		}
 	}
 
 	/* All nodes exist; make the TTY visible on the global list. */
 	sx_xlock(&tty_list_sx);
 	TAILQ_INSERT_TAIL(&tty_list, tp, t_list);
 	tty_list_count++;
 	sx_xunlock(&tty_list_sx);
 
 	return (0);
 
 	/*
 	 * NOTE(review): `cua' is never destroyed explicitly here and
 	 * tp->t_dev keeps pointing at the destroyed master node.  Presumably
 	 * the dev_depends() links let destroy_dev(dev) take dependent nodes
 	 * with it -- confirm, and if so check whether the explicit
 	 * destroy_dev() calls below on already-linked nodes are still safe.
 	 */
 fail:
 	destroy_dev(dev);
 	if (init)
 		destroy_dev(init);
 	if (lock)
 		destroy_dev(lock);
 	if (cinit)
 		destroy_dev(cinit);
 	if (clock)
 		destroy_dev(clock);
 
 	return (error);
 }
 
 /*
  * Signalling processes.
  */
 
 /*
  * Deliver signal `sig' to the leader of the session attached to the TTY,
  * if there is one.  The TTY must be locked.
  */
 void
 tty_signal_sessleader(struct tty *tp, int sig)
 {
 	struct session *sess;
 	struct proc *leader;
 
 	tty_assert_locked(tp);
 	MPASS(sig >= 1 && sig < NSIG);
 
 	/* A signal restarts output that was stopped or being discarded. */
 	tp->t_flags &= ~TF_STOPPED;
 	tp->t_termios.c_lflag &= ~FLUSHO;
 
 	sess = tp->t_session;
 	if (sess == NULL)
 		return;
 
 	/*
 	 * s_leader is read exactly once: the session leader may clear it
 	 * concurrently from killjobc(), and the session lock is not held
 	 * for this read.
 	 */
 	leader = atomic_load_ptr(&sess->s_leader);
 	if (leader == NULL)
 		return;
 
 	PROC_LOCK(leader);
 	kern_psignal(leader, sig);
 	PROC_UNLOCK(leader);
 }
 
 /*
  * Deliver signal `sig' to the TTY's foreground process group, if any.
  * For SIGINFO the kernel status line is printed first unless NOKERNINFO
  * is set.  The TTY must be locked.
  */
 void
 tty_signal_pgrp(struct tty *tp, int sig)
 {
 	ksiginfo_t ksi;
 
 	tty_assert_locked(tp);
 	MPASS(sig >= 1 && sig < NSIG);
 
 	/* A signal restarts output that was stopped or being discarded. */
 	tp->t_flags &= ~TF_STOPPED;
 	tp->t_termios.c_lflag &= ~FLUSHO;
 
 	if (sig == SIGINFO && (tp->t_termios.c_lflag & NOKERNINFO) == 0)
 		tty_info(tp);
 
 	if (tp->t_pgrp == NULL)
 		return;
 
 	ksiginfo_init(&ksi);
 	ksi.ksi_signo = sig;
 	ksi.ksi_code = SI_KERNEL;
 
 	PGRP_LOCK(tp->t_pgrp);
 	pgsignal(tp->t_pgrp, sig, 1, &ksi);
 	PGRP_UNLOCK(tp->t_pgrp);
 }
 
 /*
  * Wake up everything waiting on the TTY in the directions given by
  * `flags' (FREAD and/or FWRITE): sleepers, select/poll and kqueue.
  * SIGIO is posted first when asynchronous mode is on and an owner is set.
  */
 void
 tty_wakeup(struct tty *tp, int flags)
 {
 
 	if ((tp->t_flags & TF_ASYNC) != 0 && tp->t_sigio != NULL)
 		pgsigio(&tp->t_sigio, SIGIO, (tp->t_session != NULL));
 
 	if ((flags & FWRITE) != 0) {
 		cv_broadcast(&tp->t_outwait);
 		selwakeup(&tp->t_outpoll);
 		KNOTE_LOCKED(&tp->t_outpoll.si_note, 0);
 	}
 
 	if ((flags & FREAD) != 0) {
 		cv_broadcast(&tp->t_inwait);
 		selwakeup(&tp->t_inpoll);
 		KNOTE_LOCKED(&tp->t_inpoll.si_note, 0);
 	}
 }
 
 /*
  * Sleep interruptibly on `cv' with the TTY mutex as interlock.
  *
  * Returns ENXIO when the TTY is gone after waking up, ERESTART when the
  * revoke count changed during the sleep, and otherwise whatever
  * cv_wait_sig() returned.
  */
 int
 tty_wait(struct tty *tp, struct cv *cv)
 {
 	int old_revokecnt, error;
 
 	/* Sample the revoke count before going to sleep. */
 	old_revokecnt = tp->t_revokecnt;
 
 	tty_lock_assert(tp, MA_OWNED|MA_NOTRECURSED);
 	MPASS(!tty_gone(tp));
 
 	error = cv_wait_sig(cv, tp->t_mtx);
 
 	/* The device slipped away while we slept. */
 	if (tty_gone(tp))
 		return (ENXIO);
 
 	/* A revoke may have happened; have the system call restarted. */
 	if (old_revokecnt != tp->t_revokecnt)
 		return (ERESTART);
 
 	return (error);
 }
 
 /*
  * Like tty_wait(), but the sleep is bounded: `hz' is handed to
  * cv_timedwait_sig() as the timeout.
  *
  * Returns ENXIO when the TTY is gone after waking up, ERESTART when the
  * revoke count changed during the sleep, and otherwise whatever
  * cv_timedwait_sig() returned.
  */
 int
 tty_timedwait(struct tty *tp, struct cv *cv, int hz)
 {
 	int old_revokecnt, error;
 
 	/* Sample the revoke count before going to sleep. */
 	old_revokecnt = tp->t_revokecnt;
 
 	tty_lock_assert(tp, MA_OWNED|MA_NOTRECURSED);
 	MPASS(!tty_gone(tp));
 
 	error = cv_timedwait_sig(cv, tp->t_mtx, hz);
 
 	/* The device slipped away while we slept. */
 	if (tty_gone(tp))
 		return (ENXIO);
 
 	/* A revoke may have happened; have the system call restarted. */
 	if (old_revokecnt != tp->t_revokecnt)
 		return (ERESTART);
 
 	return (error);
 }
 
 /*
  * Discard queued data in the directions given by `flags' (FWRITE and/or
  * FREAD), wake up waiters and, unless the TTY is gone, notify the driver.
  */
 void
 tty_flush(struct tty *tp, int flags)
 {
 
 	if ((flags & FWRITE) != 0) {
 		/* Output side: leave the high watermark, drop the queue. */
 		tp->t_flags &= ~TF_HIWAT_OUT;
 		ttyoutq_flush(&tp->t_outq);
 		tty_wakeup(tp, FWRITE);
 		if (!tty_gone(tp)) {
 			ttydevsw_outwakeup(tp);
 			ttydevsw_pktnotify(tp, TIOCPKT_FLUSHWRITE);
 		}
 	}
 
 	if ((flags & FREAD) != 0) {
 		/* Input side: unblock input, then drop the queue. */
 		tty_hiwat_in_unblock(tp);
 		ttyinq_flush(&tp->t_inq);
 		tty_wakeup(tp, FREAD);
 		if (!tty_gone(tp)) {
 			ttydevsw_inwakeup(tp);
 			ttydevsw_pktnotify(tp, TIOCPKT_FLUSHREAD);
 		}
 	}
 }
 
 /*
  * Store a new window size and send SIGWINCH to the foreground process
  * group, but only when the size actually differs from the current one.
  */
 void
 tty_set_winsize(struct tty *tp, const struct winsize *wsz)
 {
 
 	if (memcmp(&tp->t_winsize, wsz, sizeof(*wsz)) != 0) {
 		tp->t_winsize = *wsz;
 		tty_signal_pgrp(tp, SIGWINCH);
 	}
 }
 
 /*
  * Driver-independent ioctl handling.  tty_ioctl() calls this for every
  * command the driver's own ioctl routine answered with ENOIOCTL.
  *
  * Several commands temporarily drop the TTY lock in order to take
  * another lock or call out; the lock is always held again on return.
  * Commands not listed here go to tty_ioctl_compat() when COMPAT_43TTY is
  * defined; otherwise ENOIOCTL is returned.
  */
 static int
 tty_generic_ioctl(struct tty *tp, u_long cmd, void *data, int fflag,
     struct thread *td)
 {
 	int error;
 
 	switch (cmd) {
 	/*
 	 * Modem commands.
 	 * The SER_* and TIOCM_* flags are the same, but one bit
 	 * shifted. I don't know why.
 	 */
 	case TIOCSDTR:
 		ttydevsw_modem(tp, SER_DTR, 0);
 		return (0);
 	case TIOCCDTR:
 		ttydevsw_modem(tp, 0, SER_DTR);
 		return (0);
 	case TIOCMSET: {
 		/* Set DTR/RTS to exactly the requested state. */
 		int bits = *(int *)data;
 		ttydevsw_modem(tp,
 		    (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1,
 		    ((~bits) & (TIOCM_DTR | TIOCM_RTS)) >> 1);
 		return (0);
 	}
 	case TIOCMBIS: {
 		int bits = *(int *)data;
 		ttydevsw_modem(tp, (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1, 0);
 		return (0);
 	}
 	case TIOCMBIC: {
 		int bits = *(int *)data;
 		ttydevsw_modem(tp, 0, (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1);
 		return (0);
 	}
 	case TIOCMGET:
 		*(int *)data = TIOCM_LE + (ttydevsw_modem(tp, 0, 0) << 1);
 		return (0);
 
 	case FIOASYNC:
 		if (*(int *)data)
 			tp->t_flags |= TF_ASYNC;
 		else
 			tp->t_flags &= ~TF_ASYNC;
 		return (0);
 	case FIONBIO:
 		/* This device supports non-blocking operation. */
 		return (0);
 	case FIONREAD:
 		*(int *)data = ttyinq_bytescanonicalized(&tp->t_inq);
 		return (0);
 	case FIONWRITE:
 	case TIOCOUTQ:
 		*(int *)data = ttyoutq_bytesused(&tp->t_outq);
 		return (0);
 	case FIOSETOWN:
 		if (tp->t_session != NULL && !tty_is_ctty(tp, td->td_proc))
 			/* Not allowed to set ownership. */
 			return (ENOTTY);
 
 		/* Temporarily unlock the TTY to set ownership. */
 		tty_unlock(tp);
 		error = fsetown(*(int *)data, &tp->t_sigio);
 		tty_lock(tp);
 		return (error);
 	case FIOGETOWN:
 		if (tp->t_session != NULL && !tty_is_ctty(tp, td->td_proc))
 			/* Not allowed to get ownership. */
 			return (ENOTTY);
 
 		/* Get ownership. */
 		*(int *)data = fgetown(&tp->t_sigio);
 		return (0);
 	case TIOCGETA:
 		/* Obtain terminal flags through tcgetattr(). */
 		*(struct termios*)data = tp->t_termios;
 		return (0);
 	case TIOCSETA:
 	case TIOCSETAW:
 	case TIOCSETAF: {
 		struct termios *t = data;
 
 		/*
 		 * Who makes up these funny rules? According to POSIX,
 		 * input baud rate is set equal to the output baud rate
 		 * when zero.
 		 */
 		if (t->c_ispeed == 0)
 			t->c_ispeed = t->c_ospeed;
 
 		/* Discard any unsupported bits. */
 		t->c_iflag &= TTYSUP_IFLAG;
 		t->c_oflag &= TTYSUP_OFLAG;
 		t->c_lflag &= TTYSUP_LFLAG;
 		t->c_cflag &= TTYSUP_CFLAG;
 
 		/*
 		 * Set terminal flags through tcsetattr().  The W and F
 		 * variants drain output first; F also flushes input.
 		 */
 		if (cmd == TIOCSETAW || cmd == TIOCSETAF) {
 			error = tty_drain(tp, 0);
 			if (error)
 				return (error);
 			if (cmd == TIOCSETAF)
 				tty_flush(tp, FREAD);
 		}
 
 		/*
 		 * Only call param() when the flags really change.
 		 */
 		if ((t->c_cflag & CIGNORE) == 0 &&
 		    (tp->t_termios.c_cflag != t->c_cflag ||
 		    ((tp->t_termios.c_iflag ^ t->c_iflag) &
 		    (IXON|IXOFF|IXANY)) ||
 		    tp->t_termios.c_ispeed != t->c_ispeed ||
 		    tp->t_termios.c_ospeed != t->c_ospeed)) {
 			error = ttydevsw_param(tp, t);
 			if (error)
 				return (error);
 
 			/* XXX: CLOCAL? */
 
 			tp->t_termios.c_cflag = t->c_cflag & ~CIGNORE;
 			tp->t_termios.c_ispeed = t->c_ispeed;
 			tp->t_termios.c_ospeed = t->c_ospeed;
 
 			/* Baud rate has changed - update watermarks. */
 			error = tty_watermarks(tp);
 			if (error)
 				return (error);
 		}
 
 		/* Copy new non-device driver parameters. */
 		tp->t_termios.c_iflag = t->c_iflag;
 		tp->t_termios.c_oflag = t->c_oflag;
 		tp->t_termios.c_lflag = t->c_lflag;
 		memcpy(&tp->t_termios.c_cc, t->c_cc, sizeof t->c_cc);
 
 		ttydisc_optimize(tp);
 
 		if ((t->c_lflag & ICANON) == 0) {
 			/*
 			 * When in non-canonical mode, wake up all
 			 * readers. Canonicalize any partial input. VMIN
 			 * and VTIME could also be adjusted.
 			 */
 			ttyinq_canonicalize(&tp->t_inq);
 			tty_wakeup(tp, FREAD);
 		}
 
 		/*
 		 * For packet mode: notify the PTY consumer that VSTOP
 		 * and VSTART may have been changed.
 		 */
 		if (tp->t_termios.c_iflag & IXON &&
 		    tp->t_termios.c_cc[VSTOP] == CTRL('S') &&
 		    tp->t_termios.c_cc[VSTART] == CTRL('Q'))
 			ttydevsw_pktnotify(tp, TIOCPKT_DOSTOP);
 		else
 			ttydevsw_pktnotify(tp, TIOCPKT_NOSTOP);
 		return (0);
 	}
 	case TIOCGETD:
 		/* For compatibility - we only support TTYDISC. */
 		*(int *)data = TTYDISC;
 		return (0);
 	case TIOCGPGRP:
 		if (!tty_is_ctty(tp, td->td_proc))
 			return (ENOTTY);
 
 		/* Report NO_PID when there is no foreground process group. */
 		if (tp->t_pgrp != NULL)
 			*(int *)data = tp->t_pgrp->pg_id;
 		else
 			*(int *)data = NO_PID;
 		return (0);
 	case TIOCGSID:
 		if (!tty_is_ctty(tp, td->td_proc))
 			return (ENOTTY);
 
 		MPASS(tp->t_session);
 		*(int *)data = tp->t_session->s_sid;
 		return (0);
 	case TIOCNOTTY:
 		return (tty_drop_ctty(tp, td->td_proc));
 	case TIOCSCTTY: {
 		struct proc *p = td->td_proc;
 
 		/*
 		 * XXX: This looks awful.  The TTY lock is dropped so that
 		 * proctree_lock can be taken before it.
 		 */
 		tty_unlock(tp);
 		sx_xlock(&proctree_lock);
 		tty_lock(tp);
 
 		if (!SESS_LEADER(p)) {
 			/* Only the session leader may do this. */
 			sx_xunlock(&proctree_lock);
 			return (EPERM);
 		}
 
 		if (tp->t_session != NULL && tp->t_session == p->p_session) {
 			/* This is already our controlling TTY. */
 			sx_xunlock(&proctree_lock);
 			return (0);
 		}
 
 		if (p->p_session->s_ttyp != NULL ||
 		    (tp->t_session != NULL && tp->t_session->s_ttyvp != NULL &&
 		    tp->t_session->s_ttyvp->v_type != VBAD)) {
 			/*
 			 * There is already a relation between a TTY and
 			 * a session, or the caller is not the session
 			 * leader.
 			 *
 			 * Allow the TTY to be stolen when the vnode is
 			 * invalid, but the reference to the TTY is
 			 * still active.  This allows immediate reuse of
 			 * TTYs of which the session leader has been
 			 * killed or the TTY revoked.
 			 */
 			sx_xunlock(&proctree_lock);
 			return (EPERM);
 		}
 
 		/* Connect the session to the TTY. */
 		tp->t_session = p->p_session;
 		tp->t_session->s_ttyp = tp;
 		tp->t_sessioncnt++;
 
 		/* Assign foreground process group. */
 		tp->t_pgrp = p->p_pgrp;
 		PROC_LOCK(p);
 		p->p_flag |= P_CONTROLT;
 		PROC_UNLOCK(p);
 
 		sx_xunlock(&proctree_lock);
 		return (0);
 	}
 	case TIOCSPGRP: {
 		struct pgrp *pg;
 
 		/*
 		 * XXX: Temporarily unlock the TTY to locate the process
 		 * group. This code would be lot nicer if we would ever
 		 * decompose proctree_lock.
 		 */
 		tty_unlock(tp);
 		sx_slock(&proctree_lock);
 		pg = pgfind(*(int *)data);
 		if (pg != NULL)
 			PGRP_UNLOCK(pg);
 		/* The group must exist and belong to the caller's session. */
 		if (pg == NULL || pg->pg_session != td->td_proc->p_session) {
 			sx_sunlock(&proctree_lock);
 			tty_lock(tp);
 			return (EPERM);
 		}
 		tty_lock(tp);
 
 		/*
 		 * Determine if this TTY is the controlling TTY after
 		 * relocking the TTY.
 		 */
 		if (!tty_is_ctty(tp, td->td_proc)) {
 			sx_sunlock(&proctree_lock);
 			return (ENOTTY);
 		}
 		tp->t_pgrp = pg;
 		sx_sunlock(&proctree_lock);
 
 		/* Wake up the background process groups. */
 		cv_broadcast(&tp->t_bgwait);
 		return (0);
 	}
 	case TIOCFLUSH: {
 		int flags = *(int *)data;
 
 		/* Zero means both directions; other bits are ignored. */
 		if (flags == 0)
 			flags = (FREAD|FWRITE);
 		else
 			flags &= (FREAD|FWRITE);
 		tty_flush(tp, flags);
 		return (0);
 	}
 	case TIOCDRAIN:
 		/* Drain TTY output. */
 		return tty_drain(tp, 0);
 	case TIOCGDRAINWAIT:
 		*(int *)data = tp->t_drainwait;
 		return (0);
 	case TIOCSDRAINWAIT:
 		error = priv_check(td, PRIV_TTY_DRAINWAIT);
 		if (error == 0)
 			tp->t_drainwait = *(int *)data;
 		return (error);
 	case TIOCCONS:
 		/* Set terminal as console TTY. */
 		if (*(int *)data) {
 			error = priv_check(td, PRIV_TTY_CONSOLE);
 			if (error)
 				return (error);
 
 			/*
 			 * XXX: constty should really need to be locked!
 			 * XXX: allow disconnected constty's to be stolen!
 			 */
 
 			if (constty == tp)
 				return (0);
 			if (constty != NULL)
 				return (EBUSY);
 
 			/* constty_set() is called with the TTY unlocked. */
 			tty_unlock(tp);
 			constty_set(tp);
 			tty_lock(tp);
 		} else if (constty == tp) {
 			constty_clear();
 		}
 		return (0);
 	case TIOCGWINSZ:
 		/* Obtain window size. */
 		*(struct winsize*)data = tp->t_winsize;
 		return (0);
 	case TIOCSWINSZ:
 		/* Set window size. */
 		tty_set_winsize(tp, data);
 		return (0);
 	case TIOCEXCL:
 		tp->t_flags |= TF_EXCLUDE;
 		return (0);
 	case TIOCNXCL:
 		tp->t_flags &= ~TF_EXCLUDE;
 		return (0);
 	case TIOCSTOP:
 		tp->t_flags |= TF_STOPPED;
 		ttydevsw_pktnotify(tp, TIOCPKT_STOP);
 		return (0);
 	case TIOCSTART:
 		tp->t_flags &= ~TF_STOPPED;
 		tp->t_termios.c_lflag &= ~FLUSHO;
 		ttydevsw_outwakeup(tp);
 		ttydevsw_pktnotify(tp, TIOCPKT_START);
 		return (0);
 	case TIOCSTAT:
 		tty_info(tp);
 		return (0);
 	case TIOCSTI:
 		/*
 		 * Injecting input requires PRIV_TTY_STI unless the
 		 * descriptor is open for reading and this is the caller's
 		 * controlling TTY.
 		 */
 		if ((fflag & FREAD) == 0 && priv_check(td, PRIV_TTY_STI))
 			return (EPERM);
 		if (!tty_is_ctty(tp, td->td_proc) &&
 		    priv_check(td, PRIV_TTY_STI))
 			return (EACCES);
 		ttydisc_rint(tp, *(char *)data, 0);
 		ttydisc_rint_done(tp);
 		return (0);
 	}
 
 #ifdef COMPAT_43TTY
 	return tty_ioctl_compat(tp, cmd, data, fflag, td);
 #else /* !COMPAT_43TTY */
 	return (ENOIOCTL);
 #endif /* COMPAT_43TTY */
 }
 
 /*
  * ioctl entry point of the TTY layer.  The driver's ioctl routine is
  * tried first; commands it does not claim (ENOIOCTL) are passed on to
  * tty_generic_ioctl().  The TTY must be locked.  Returns ENXIO when the
  * TTY is gone.
  */
 int
 tty_ioctl(struct tty *tp, u_long cmd, void *data, int fflag, struct thread *td)
 {
 	int error;
 
 	tty_assert_locked(tp);
 
 	if (tty_gone(tp))
 		return (ENXIO);
 
 	error = ttydevsw_ioctl(tp, cmd, data, td);
 	if (error != ENOIOCTL)
 		return (error);
 
 	return (tty_generic_ioctl(tp, cmd, data, fflag, td));
 }
 
 dev_t
 tty_udev(struct tty *tp)
 {
 
 	if (tp->t_dev)
 		return (dev2udev(tp->t_dev));
 	else
 		return (NODEV);
 }
 
 /*
  * Report whether the output queue has room for a log message: returns 1
  * when at least 256 bytes are left, which should be enough for one
  * message, and 0 otherwise.
  */
 int
 tty_checkoutq(struct tty *tp)
 {
 
 	if (ttyoutq_bytesleft(&tp->t_outq) >= 256)
 		return (1);
 
 	return (0);
 }
 
 /*
  * Enter the input high watermark state.  With input flow control in
  * effect (IXOFF set and VSTOP not disabled) the state is only entered
  * once the VSTOP character could be stored in the output queue.
  */
 void
 tty_hiwat_in_block(struct tty *tp)
 {
 	int flowctl;
 
 	flowctl = (tp->t_flags & TF_HIWAT_IN) == 0 &&
 	    (tp->t_termios.c_iflag & IXOFF) != 0 &&
 	    tp->t_termios.c_cc[VSTOP] != _POSIX_VDISABLE;
 
 	/* Flow control wanted, but VSTOP did not fit: retry later. */
 	if (flowctl && ttyoutq_write_nofrag(&tp->t_outq,
 	    &tp->t_termios.c_cc[VSTOP], 1) != 0)
 		return;
 
 	tp->t_flags |= TF_HIWAT_IN;
 }
 
 /*
  * Leave the input high watermark state.  With input flow control in
  * effect (IXOFF set and VSTART not disabled) the state is only left once
  * the VSTART character could be stored in the output queue.  The driver
  * is told about new input room unless the TTY is gone.
  */
 void
 tty_hiwat_in_unblock(struct tty *tp)
 {
 	int flowctl;
 
 	flowctl = (tp->t_flags & TF_HIWAT_IN) != 0 &&
 	    (tp->t_termios.c_iflag & IXOFF) != 0 &&
 	    tp->t_termios.c_cc[VSTART] != _POSIX_VDISABLE;
 
 	/* Without flow control there is nothing to send first. */
 	if (!flowctl || ttyoutq_write_nofrag(&tp->t_outq,
 	    &tp->t_termios.c_cc[VSTART], 1) == 0)
 		tp->t_flags &= ~TF_HIWAT_IN;
 
 	if (!tty_gone(tp))
 		ttydevsw_inwakeup(tp);
 }
 
 /*
  * TTY hooks interface.
  */
 
 /*
  * Stand-in rint() hook that forwards the single character `c' to the
  * hook's rint_bypass() routine.  Returns 0 when the character was
  * accepted and -1 otherwise; `flags' is ignored.
  */
 static int
 ttyhook_defrint(struct tty *tp, char c, int flags)
 {
 
 	return (ttyhook_rint_bypass(tp, &c, 1) == 1 ? 0 : -1);
 }
 
 /*
  * Attach hook `th' (with private data `softc') to the TTY behind file
  * descriptor `fd' of process `p'.  On success the TTY is stored in
  * `*rtp' and 0 is returned.
  *
  * Errors: whatever fget_unlocked() reports, EBADF for a bad file,
  * EINVAL when the descriptor is not a character device vnode, ENXIO when
  * the device cannot be referenced or does not match the file, ENOTTY
  * when it is not a TTY device and EBUSY when a hook is already attached.
  */
 int
 ttyhook_register(struct tty **rtp, struct proc *p, int fd, struct ttyhook *th,
     void *softc)
 {
 	struct tty *tp;
 	struct file *fp;
 	struct cdev *dev;
 	struct cdevsw *cdp;
 	struct filedesc *fdp;
 	cap_rights_t rights;
 	int error, ref;
 
 	/* Validate the file descriptor. */
 	fdp = p->p_fd;
-	error = fget_unlocked(fdp, fd, cap_rights_init(&rights, CAP_TTYHOOK),
+	error = fget_unlocked(fdp, fd, cap_rights_init_one(&rights, CAP_TTYHOOK),
 	    &fp);
 	if (error != 0)
 		return (error);
 	if (fp->f_ops == &badfileops) {
 		error = EBADF;
 		goto done1;
 	}
 
 	/*
 	 * Make sure the vnode is bound to a character device.
 	 * Unlocked check for the vnode type is ok there, because we
 	 * only shall prevent calling devvn_refthread on the file that
 	 * never has been opened over a character device.
 	 */
 	if (fp->f_type != DTYPE_VNODE || fp->f_vnode->v_type != VCHR) {
 		error = EINVAL;
 		goto done1;
 	}
 
 	/* Make sure it is a TTY. */
 	cdp = devvn_refthread(fp->f_vnode, &dev, &ref);
 	if (cdp == NULL) {
 		error = ENXIO;
 		goto done1;
 	}
 	if (dev != fp->f_data) {
 		error = ENXIO;
 		goto done2;
 	}
 	if (cdp != &ttydev_cdevsw) {
 		error = ENOTTY;
 		goto done2;
 	}
 	tp = dev->si_drv1;
 
 	/* Try to attach the hook to the TTY. */
 	error = EBUSY;
 	tty_lock(tp);
 	/* t_hook and TF_HOOK must always agree. */
 	MPASS((tp->t_hook == NULL) == ((tp->t_flags & TF_HOOK) == 0));
 	if (tp->t_flags & TF_HOOK)
 		goto done3;
 
 	tp->t_flags |= TF_HOOK;
 	tp->t_hook = th;
 	tp->t_hooksoftc = softc;
 	*rtp = tp;
 	error = 0;
 
 	/* Maybe we can switch into bypass mode now. */
 	ttydisc_optimize(tp);
 
 	/* Silently convert rint() calls to rint_bypass() when possible. */
 	if (!ttyhook_hashook(tp, rint) && ttyhook_hashook(tp, rint_bypass))
 		th->th_rint = ttyhook_defrint;
 
 	/* Unwind in reverse order of acquisition. */
 done3:	tty_unlock(tp);
 done2:	dev_relthread(dev, ref);
 done1:	fdrop(fp, curthread);
 	return (error);
 }
 
 /*
  * Detach the hook from the TTY.  Called with the TTY locked and a hook
  * attached; tty_rel_free() unlocks the TTY and may release it.
  */
 void
 ttyhook_unregister(struct tty *tp)
 {
 
 	tty_assert_locked(tp);
 	MPASS(tp->t_flags & TF_HOOK);
 
 	/* Drop the hook itself. */
 	tp->t_hook = NULL;
 	tp->t_flags &= ~TF_HOOK;
 
 	/* Without the hook, bypass mode may no longer be appropriate. */
 	ttydisc_optimize(tp);
 
 	/* The hook may have been the last thing keeping the TTY alive. */
 	tty_rel_free(tp);
 }
 
 /*
  * /dev/console handling.
  */
 
 /*
  * Open handler for /dev/console.  Looks up the TTY whose device name
  * matches the selected console name, binds it to the console device and
  * then opens it like any other TTY.  Returns ENXIO when no console name
  * has been selected or the console device has no TTY bound to it.
  */
 static int
 ttyconsdev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
 {
 	struct tty *tp;
 
 	/* No console device has been selected. */
 	if (dev_console_filename == NULL)
 		return (ENXIO);
 
 	/* Search the TTY list for the device with the console's name. */
 	sx_slock(&tty_list_sx);
 	TAILQ_FOREACH(tp, &tty_list, t_list) {
 		if (strcmp(dev_console_filename, tty_devname(tp)) != 0)
 			continue;
 		dev_console->si_drv1 = tp;
 		break;
 	}
 	sx_sunlock(&tty_list_sx);
 
 	/* The console has no TTY bound to it. */
 	if (dev_console->si_drv1 == NULL)
 		return (ENXIO);
 
 	return (ttydev_open(dev, oflags, devtype, td));
 }
 
 /*
  * Write handler for /dev/console: hand the data to log_console() first,
  * then perform the regular TTY write.
  */
 static int
 ttyconsdev_write(struct cdev *dev, struct uio *uio, int ioflag)
 {
 	int error;
 
 	log_console(uio);
 
 	error = ttydev_write(dev, uio, ioflag);
 	return (error);
 }
 
 /*
  * Device switch for /dev/console.  It differs from a normal TTY in two
  * entry points: open picks the TTY to use, and write additionally logs
  * the data in the kernel message buffer.  Everything else is the
  * regular TTY device code.
  */
 static struct cdevsw ttyconsdev_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_flags =	D_TTY,
 	.d_name =	"ttyconsdev",
 	.d_open =	ttyconsdev_open,
 	.d_close =	ttydev_close,
 	.d_read =	ttydev_read,
 	.d_write =	ttyconsdev_write,
 	.d_ioctl =	ttydev_ioctl,
 	.d_poll =	ttydev_poll,
 	.d_kqfilter =	ttydev_kqfilter,
 	.d_mmap =	ttydev_mmap,
 };
 
 /*
  * Create the "console" device node (root:wheel, mode 0600) and remember
  * it in dev_console.  Run through the SYSINIT below.
  */
 static void
 ttyconsdev_init(void *unused __unused)
 {
 	struct cdev *cons;
 
 	cons = make_dev_credf(MAKEDEV_ETERNAL, &ttyconsdev_cdevsw, 0,
 	    NULL, UID_ROOT, GID_WHEEL, 0600, "console");
 	dev_console = cons;
 }
 
 SYSINIT(tty, SI_SUB_DRIVERS, SI_ORDER_FIRST, ttyconsdev_init, NULL);
 
 /*
  * Select the TTY that /dev/console maps to by device name.  Only the
  * pointer is stored, not a copy of the string.
  */
 void
 ttyconsdev_select(const char *name)
 {
 
 	dev_console_filename = name;
 }
 
 /*
  * Debugging routines.
  */
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <ddb/ddb.h>
 #include <ddb/db_sym.h>
 
 /*
  * Table pairing TTY flag values with a one-character code, terminated by
  * a { 0, '\0' } entry.  Several entries name a combined flag followed by
  * its components so that the codes read as two-letter pairs; the order
  * of the entries therefore matters.
  */
 static const struct {
 	int flag;
 	char val;
 } ttystates[] = {
 #if 0
 	{ TF_NOPREFIX,		'N' },
 #endif
 	{ TF_INITLOCK,		'I' },
 	{ TF_CALLOUT,		'C' },
 
 	/* Keep these together -> 'Oi' and 'Oo'. */
 	{ TF_OPENED,		'O' },
 	{ TF_OPENED_IN,		'i' },
 	{ TF_OPENED_OUT,	'o' },
 	{ TF_OPENED_CONS,	'c' },
 
 	{ TF_GONE,		'G' },
 	{ TF_OPENCLOSE,		'B' },
 	{ TF_ASYNC,		'Y' },
 	{ TF_LITERAL,		'L' },
 
 	/* Keep these together -> 'Hi' and 'Ho'. */
 	{ TF_HIWAT,		'H' },
 	{ TF_HIWAT_IN,		'i' },
 	{ TF_HIWAT_OUT,		'o' },
 
 	{ TF_STOPPED,		'S' },
 	{ TF_EXCLUDE,		'X' },
 	{ TF_BYPASS,		'l' },
 	{ TF_ZOMBIE,		'Z' },
 	{ TF_HOOK,		's' },
 
 	/* Keep these together -> 'bi' and 'bo'. */
 	{ TF_BUSY,		'b' },
 	{ TF_BUSY_IN,		'i' },
 	{ TF_BUSY_OUT,		'o' },
 
 	/* End-of-table marker. */
 	{ 0,			'\0'},
 };
 
 /*
  * Bit-name string handed to the "%b" conversion when db_show_tty prints
  * t_flags.  Presumably the leading \20 selects hexadecimal output and
  * each following octal escape is the 1-based bit number of the name
  * after it -- confirm against the printf(9) "%b" description before
  * adding entries.
  */
 #define	TTY_FLAG_BITS \
 	"\20\1NOPREFIX\2INITLOCK\3CALLOUT\4OPENED_IN" \
 	"\5OPENED_OUT\6OPENED_CONS\7GONE\10OPENCLOSE" \
 	"\11ASYNC\12LITERAL\13HIWAT_IN\14HIWAT_OUT" \
 	"\15STOPPED\16EXCLUDE\17BYPASS\20ZOMBIE" \
 	"\21HOOK\22BUSY_IN\23BUSY_OUT"
 
 /*
  * Print "<sep>  <name>: " followed by the symbol `addr' resolves to and
  * a newline.  Expects a string named `sep' in the caller's scope.
  *
  * The body is wrapped in do/while (0) so that the expansion is a single
  * statement and stays intact under an unbraced if/else, and `addr' is
  * parenthesised so that the cast applies to the whole argument.
  */
 #define DB_PRINTSYM(name, addr) do {				\
 	db_printf("%s  " #name ": ", sep);			\
 	db_printsym((db_addr_t)(addr), DB_STGY_ANY);		\
 	db_printf("\n");					\
 } while (0)
 
 /*
  * Print the address of the ttydevsw table `tsw' followed by one line per
  * method giving the symbol its function pointer resolves to.  `sep' is
  * prepended to every line.  `tsw' is dereferenced unconditionally.
  */
 static void
 _db_show_devsw(const char *sep, const struct ttydevsw *tsw)
 {
 
 	db_printf("%sdevsw: ", sep);
 	db_printsym((db_addr_t)tsw, DB_STGY_ANY);
 	db_printf(" (%p)\n", tsw);
 	DB_PRINTSYM(open, tsw->tsw_open);
 	DB_PRINTSYM(close, tsw->tsw_close);
 	DB_PRINTSYM(outwakeup, tsw->tsw_outwakeup);
 	DB_PRINTSYM(inwakeup, tsw->tsw_inwakeup);
 	DB_PRINTSYM(ioctl, tsw->tsw_ioctl);
 	DB_PRINTSYM(param, tsw->tsw_param);
 	DB_PRINTSYM(modem, tsw->tsw_modem);
 	DB_PRINTSYM(mmap, tsw->tsw_mmap);
 	DB_PRINTSYM(pktnotify, tsw->tsw_pktnotify);
 	DB_PRINTSYM(free, tsw->tsw_free);
 }
 
 /*
  * Print the address of the hook table `th' and, when a table is present,
  * one line per hook callback giving the symbol it resolves to.  `sep' is
  * prepended to every line.
  */
 static void
 _db_show_hooks(const char *sep, const struct ttyhook *th)
 {
 
 	db_printf("%shook: ", sep);
 	db_printsym((db_addr_t)th, DB_STGY_ANY);
 	db_printf(" (%p)\n", th);
 
 	/* Only the header line is printed when no hook table is attached. */
 	if (th != NULL) {
 		DB_PRINTSYM(rint, th->th_rint);
 		DB_PRINTSYM(rint_bypass, th->th_rint_bypass);
 		DB_PRINTSYM(rint_done, th->th_rint_done);
 		DB_PRINTSYM(rint_poll, th->th_rint_poll);
 		DB_PRINTSYM(getc_inject, th->th_getc_inject);
 		DB_PRINTSYM(getc_capture, th->th_getc_capture);
 		DB_PRINTSYM(getc_poll, th->th_getc_poll);
 		DB_PRINTSYM(close, th->th_close);
 	}
 }
 
 /*
  * Print the flag words and the input/output speeds of the termios
  * structure `t' on a single line, labelled with `name'.
  */
 static void
 _db_show_termios(const char *name, const struct termios *t)
 {
 
 	db_printf("%s: iflag 0x%x oflag 0x%x cflag 0x%x "
 	    "lflag 0x%x ispeed %u ospeed %u\n", name,
 	    t->c_iflag, t->c_oflag, t->c_cflag, t->c_lflag,
 	    t->c_ispeed, t->c_ospeed);
 }
 
 /*
  * DDB command to show TTY statistics.
  *
  * Usage: show tty <addr>.  The address is interpreted as a struct tty
  * pointer without validation; the command prints a usage line and returns
  * when no address is given.
  */
 DB_SHOW_COMMAND(tty, db_show_tty)
 {
 	struct tty *tp;
 
 	if (!have_addr) {
 		db_printf("usage: show tty <addr>\n");
 		return;
 	}
 	tp = (struct tty *)addr;
 
 	db_printf("%p: %s\n", tp, tty_devname(tp));
 	db_printf("\tmtx: %p\n", tp->t_mtx);
 	/* %b decodes the flag word using the names in TTY_FLAG_BITS. */
 	db_printf("\tflags: 0x%b\n", tp->t_flags, TTY_FLAG_BITS);
 	db_printf("\trevokecnt: %u\n", tp->t_revokecnt);
 
 	/* Buffering mechanisms. */
 	db_printf("\tinq: %p begin %u linestart %u reprint %u end %u "
 	    "nblocks %u quota %u\n", &tp->t_inq, tp->t_inq.ti_begin,
 	    tp->t_inq.ti_linestart, tp->t_inq.ti_reprint, tp->t_inq.ti_end,
 	    tp->t_inq.ti_nblocks, tp->t_inq.ti_quota);
 	db_printf("\toutq: %p begin %u end %u nblocks %u quota %u\n",
 	    &tp->t_outq, tp->t_outq.to_begin, tp->t_outq.to_end,
 	    tp->t_outq.to_nblocks, tp->t_outq.to_quota);
 	db_printf("\tinlow: %zu\n", tp->t_inlow);
 	db_printf("\toutlow: %zu\n", tp->t_outlow);
 	_db_show_termios("\ttermios", &tp->t_termios);
 	db_printf("\twinsize: row %u col %u xpixel %u ypixel %u\n",
 	    tp->t_winsize.ws_row, tp->t_winsize.ws_col,
 	    tp->t_winsize.ws_xpixel, tp->t_winsize.ws_ypixel);
 	db_printf("\tcolumn: %u\n", tp->t_column);
 	db_printf("\twritepos: %u\n", tp->t_writepos);
 	db_printf("\tcompatflags: 0x%x\n", tp->t_compatflags);
 
 	/* Init/lock-state devices. */
 	_db_show_termios("\ttermios_init_in", &tp->t_termios_init_in);
 	_db_show_termios("\ttermios_init_out", &tp->t_termios_init_out);
 	_db_show_termios("\ttermios_lock_in", &tp->t_termios_lock_in);
 	_db_show_termios("\ttermios_lock_out", &tp->t_termios_lock_out);
 
 	/* Hooks */
 	_db_show_devsw("\t", tp->t_devsw);
 	_db_show_hooks("\t", tp->t_hook);
 
 	/* Process info. */
 	/* A missing process group is reported with gid 0. */
 	db_printf("\tpgrp: %p gid %d\n", tp->t_pgrp,
 	    tp->t_pgrp ? tp->t_pgrp->pg_id : 0);
 	db_printf("\tsession: %p", tp->t_session);
 	/* Session details are appended to the same line when present. */
 	if (tp->t_session != NULL)
 	    db_printf(" count %u leader %p tty %p sid %d login %s",
 		tp->t_session->s_count, tp->t_session->s_leader,
 		tp->t_session->s_ttyp, tp->t_session->s_sid,
 		tp->t_session->s_login);
 	db_printf("\n");
 	db_printf("\tsessioncnt: %u\n", tp->t_sessioncnt);
 	db_printf("\tdevswsoftc: %p\n", tp->t_devswsoftc);
 	db_printf("\thooksoftc: %p\n", tp->t_hooksoftc);
 	db_printf("\tdev: %p\n", tp->t_dev);
 }
 
 /*
  * DDB command to list TTYs: one line per entry on tty_list giving the
  * queue sizes and usage, column, session id, process group id and a
  * string of state characters taken from ttystates[].
  */
 DB_SHOW_ALL_COMMAND(ttys, db_show_all_ttys)
 {
 	struct tty *tp;
 	size_t inbytes, outbytes;
 	int idx, nprinted;
 
 	/* Header laid out to resemble `pstat -t'. */
 	db_printf("PTR        ");
 #if defined(__LP64__)
 	/* Extra padding for the wider pointer column. */
 	db_printf("        ");
 #endif
 	db_printf("      LINE   INQ  CAN  LIN  LOW  OUTQ  USE  LOW   "
 	    "COL  SESS  PGID STATE\n");
 
 	TAILQ_FOREACH(tp, &tty_list, t_list) {
 		inbytes = tp->t_inq.ti_nblocks * TTYINQ_DATASIZE;
 		outbytes = tp->t_outq.to_nblocks * TTYOUTQ_DATASIZE;
 
 		db_printf("%p %10s %5zu %4u %4u %4zu %5zu %4u %4zu %5u %5d "
 		    "%5d ",
 		    tp, tty_devname(tp), inbytes,
 		    tp->t_inq.ti_linestart - tp->t_inq.ti_begin,
 		    tp->t_inq.ti_end - tp->t_inq.ti_linestart,
 		    inbytes - tp->t_inlow,
 		    outbytes,
 		    tp->t_outq.to_end - tp->t_outq.to_begin,
 		    outbytes - tp->t_outlow,
 		    MIN(tp->t_column, 99999),
 		    tp->t_session ? tp->t_session->s_sid : 0,
 		    tp->t_pgrp ? tp->t_pgrp->pg_id : 0);
 
 		/* One character per flag that is set; "-" if none are. */
 		nprinted = 0;
 		for (idx = 0; ttystates[idx].flag != 0; idx++) {
 			if ((tp->t_flags & ttystates[idx].flag) == 0)
 				continue;
 			db_printf("%c", ttystates[idx].val);
 			nprinted++;
 		}
 		if (nprinted == 0)
 			db_printf("-");
 		db_printf("\n");
 	}
 }
 #endif /* DDB */
diff --git a/sys/kern/uipc_sem.c b/sys/kern/uipc_sem.c
index bb47fe9a3905..db1d84696df0 100644
--- a/sys/kern/uipc_sem.c
+++ b/sys/kern/uipc_sem.c
@@ -1,1110 +1,1111 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2002 Alfred Perlstein <alfred@FreeBSD.org>
  * Copyright (c) 2003-2005 SPARTA, Inc.
  * Copyright (c) 2005, 2016-2017 Robert N. M. Watson
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project in part by Network
  * Associates Laboratories, the Security Research Division of Network
  * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
  * as part of the DARPA CHATS research program.
  *
  * Portions of this software were developed by BAE Systems, the University of
  * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL
  * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent
  * Computing (TC) research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_posix.h"
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/condvar.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/fnv_hash.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/ksem.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/posix4.h>
 #include <sys/_semaphore.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/systm.h>
 #include <sys/sx.h>
 #include <sys/user.h>
 #include <sys/vnode.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 FEATURE(p1003_1b_semaphores, "POSIX P1003.1B semaphores support");
 /*
  * TODO
  *
  * - Resource limits?
  * - Replace global sem_lock with mtx_pool locks?
  * - Add a MAC check_create() hook for creating new named semaphores.
  */
 
 /*
  * Default for the CTL_P1003_1B_SEM_NSEMS_MAX setting installed at module
  * initialisation; may be overridden at build time.
  */
 #ifndef SEM_MAX
 #define	SEM_MAX	30
 #endif
 
 /*
  * Debug printf wrapper.  Call as DP(("fmt", args)); with the double
  * parentheses.  Expands to nothing unless SEM_DEBUG is defined.
  */
 #ifdef SEM_DEBUG
 #define	DP(x)	printf x
 #else
 #define	DP(x)
 #endif
 
 /*
  * One entry in the name dictionary: associates a path with a semaphore.
  */
 struct ksem_mapping {
 	char		*km_path;	/* path string; freed on removal */
 	Fnv32_t		km_fnv;		/* FNV hash of km_path */
 	struct ksem	*km_ksem;	/* referenced semaphore */
 	LIST_ENTRY(ksem_mapping) km_link; /* hash chain linkage */
 };
 
 static MALLOC_DEFINE(M_KSEM, "ksem", "semaphore file descriptor");
 /* Hash table of path -> semaphore mappings, created by hashinit(). */
 static LIST_HEAD(, ksem_mapping) *ksem_dictionary;
 /* Held around dictionary lookups, insertions and removals. */
 static struct sx ksem_dict_lock;
 /* Held around updates of nsems and ksem_dead. */
 static struct mtx ksem_count_lock;
 /* Held around accesses to per-semaphore fields (value, mode, times, ...). */
 static struct mtx sem_lock;
 /* Mask filled in by hashinit(); used by KSEM_HASH() to pick a chain. */
 static u_long ksem_hash;
 /* Set on module unload; makes ksem_alloc() refuse new semaphores. */
 static int ksem_dead;
 
 /* Select the dictionary hash chain for the given FNV hash value. */
 #define	KSEM_HASH(fnv)	(&ksem_dictionary[(fnv) & ksem_hash])
 
 static int nsems = 0;
 SYSCTL_DECL(_p1003_1b);
 SYSCTL_INT(_p1003_1b, OID_AUTO, nsems, CTLFLAG_RD, &nsems, 0,
     "Number of active kernel POSIX semaphores");
 
 static int	kern_sem_wait(struct thread *td, semid_t id, int tryflag,
 		    struct timespec *abstime);
 static int	ksem_access(struct ksem *ks, struct ucred *ucred);
 static struct ksem *ksem_alloc(struct ucred *ucred, mode_t mode,
 		    unsigned int value);
 static int	ksem_create(struct thread *td, const char *path,
 		    semid_t *semidp, mode_t mode, unsigned int value,
 		    int flags, int compat32);
 static void	ksem_drop(struct ksem *ks);
 static int	ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp,
     struct file **fpp);
 static struct ksem *ksem_hold(struct ksem *ks);
 static void	ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks);
 static struct ksem *ksem_lookup(char *path, Fnv32_t fnv);
 static void	ksem_module_destroy(void);
 static int	ksem_module_init(void);
 static int	ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred);
 static int	sem_modload(struct module *module, int cmd, void *arg);
 
 static fo_stat_t	ksem_stat;
 static fo_close_t	ksem_closef;
 static fo_chmod_t	ksem_chmod;
 static fo_chown_t	ksem_chown;
 static fo_fill_kinfo_t	ksem_fill_kinfo;
 
 /*
  * File descriptor operations for semaphore descriptors.  Only stat, close,
  * chmod, chown and fill_kinfo have semaphore-specific handlers; every other
  * slot is wired to an invfo_* routine (presumably the generic "operation
  * not supported" stubs -- confirm in the fileops code).
  */
 static struct fileops ksem_ops = {
 	.fo_read = invfo_rdwr,
 	.fo_write = invfo_rdwr,
 	.fo_truncate = invfo_truncate,
 	.fo_ioctl = invfo_ioctl,
 	.fo_poll = invfo_poll,
 	.fo_kqfilter = invfo_kqfilter,
 	.fo_stat = ksem_stat,
 	.fo_close = ksem_closef,
 	.fo_chmod = ksem_chmod,
 	.fo_chown = ksem_chown,
 	.fo_sendfile = invfo_sendfile,
 	.fo_fill_kinfo = ksem_fill_kinfo,
 	.fo_flags = DFLAG_PASSABLE
 };
 
 FEATURE(posix_sem, "POSIX semaphores");
 
 /*
  * fstat() handler for semaphore descriptors.  After the optional MAC check,
  * the stat buffer is zeroed and filled with the semaphore's timestamps,
  * owner, group and mode (reported as a regular file).  Returns 0 or the
  * MAC error.
  */
 static int
 ksem_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
     struct thread *td)
 {
 	struct ksem *ks = fp->f_data;
 #ifdef MAC
 	int error;
 
 	error = mac_posixsem_check_stat(active_cred, fp->f_cred, ks);
 	if (error != 0)
 		return (error);
 #endif
 
 	/* Everything not set explicitly below is reported as zero. */
 	bzero(sb, sizeof(*sb));
 
 	mtx_lock(&sem_lock);
 	sb->st_mode = S_IFREG | ks->ks_mode;		/* XXX */
 	sb->st_uid = ks->ks_uid;
 	sb->st_gid = ks->ks_gid;
 	sb->st_birthtim = ks->ks_birthtime;
 	sb->st_atim = ks->ks_atime;
 	sb->st_mtim = ks->ks_mtime;
 	sb->st_ctim = ks->ks_ctime;
 	mtx_unlock(&sem_lock);
 
 	return (0);
 }
 
 /*
  * fchmod() handler for semaphore descriptors.  Under sem_lock, run the
  * optional MAC check and then require VADMIN access for active_cred; on
  * success the permission bits of `mode' replace the semaphore's mode.
  * Returns 0 or the error from whichever check failed.
  */
 static int
 ksem_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 	struct ksem *ks;
 	int error;
 
 	ks = fp->f_data;
 	mtx_lock(&sem_lock);
 #ifdef MAC
 	error = mac_posixsem_check_setmode(active_cred, ks, mode);
 	if (error == 0)
 #endif
 		error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid,
 		    VADMIN, active_cred);
 	if (error == 0)
 		ks->ks_mode = mode & ACCESSPERMS;
 	mtx_unlock(&sem_lock);
 	return (error);
 }
 
 /*
  * fchown() handler for semaphore descriptors.  A uid or gid of -1 leaves
  * that ID unchanged.  Changing the owner to a uid other than the caller's,
  * or the group to one the caller is not a member of, requires
  * PRIV_VFS_CHOWN.  Returns 0 or the error from the MAC or privilege check.
  */
 static int
 ksem_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 	struct ksem *ks;
 	int error, need_priv;
 
 	ks = fp->f_data;
 	error = 0;
 	mtx_lock(&sem_lock);
 #ifdef MAC
 	error = mac_posixsem_check_setowner(active_cred, ks, uid, gid);
 #endif
 	if (error == 0) {
 		/* Substitute the current IDs for "no change" requests. */
 		if (uid == (uid_t)-1)
 			uid = ks->ks_uid;
 		if (gid == (gid_t)-1)
 			gid = ks->ks_gid;
 
 		need_priv =
 		    (uid != ks->ks_uid && uid != active_cred->cr_uid) ||
 		    (gid != ks->ks_gid && !groupmember(gid, active_cred));
 		if (need_priv)
 			error = priv_check_cred(active_cred, PRIV_VFS_CHOWN);
 		if (error == 0) {
 			ks->ks_uid = uid;
 			ks->ks_gid = gid;
 		}
 	}
 	mtx_unlock(&sem_lock);
 	return (error);
 }
 
 /*
  * Close handler for semaphore descriptors: detach the semaphore from the
  * file and drop the reference the file held on it.  Always returns 0.
  */
 static int
 ksem_closef(struct file *fp, struct thread *td)
 {
 	struct ksem *ks;
 
 	ks = fp->f_data;
 	fp->f_data = NULL;
 	ksem_drop(ks);
 
 	return (0);
 }
 
 /*
  * Fill in the kinfo_file record for a semaphore descriptor: type, current
  * value and mode, plus the semaphore's path when it has one.  For callers
  * whose prison path is not "/", a matching prison-path prefix is stripped
  * from the reported path.  Always returns 0.
  */
 static int
 ksem_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 	const char *path, *pr_path;
 	struct ksem *ks;
 	size_t pr_pathlen;
 
 	ks = fp->f_data;
 	kif->kf_type = KF_TYPE_SEM;
 
 	mtx_lock(&sem_lock);
 	kif->kf_un.kf_sem.kf_sem_value = ks->ks_value;
 	kif->kf_un.kf_sem.kf_sem_mode = S_IFREG | ks->ks_mode;	/* XXX */
 	mtx_unlock(&sem_lock);
 
 	/* Unlocked peek; the path is re-checked under the dictionary lock. */
 	if (ks->ks_path == NULL)
 		return (0);
 
 	sx_slock(&ksem_dict_lock);
 	path = ks->ks_path;
 	if (path != NULL) {
 		pr_path = curthread->td_ucred->cr_prison->pr_path;
 		if (strcmp(pr_path, "/") != 0) {
 			/* Return the jail-rooted pathname. */
 			pr_pathlen = strlen(pr_path);
 			if (strncmp(path, pr_path, pr_pathlen) == 0 &&
 			    path[pr_pathlen] == '/')
 				path += pr_pathlen;
 		}
 		strlcpy(kif->kf_path, path, sizeof(kif->kf_path));
 	}
 	sx_sunlock(&ksem_dict_lock);
 
 	return (0);
 }
 
 /*
  * ksem object management including creation and reference counting
  * routines.
  */
 /*
  * Allocate and initialise a semaphore owned by `ucred' with the given mode
  * and initial value.  The new object carries one reference.  Returns NULL
  * without allocating when the semaphore count has reached the configured
  * maximum or the module is being unloaded.
  */
 static struct ksem *
 ksem_alloc(struct ucred *ucred, mode_t mode, unsigned int value)
 {
 	struct ksem *ks;
 	int refused;
 
 	/* Reserve a slot in the global count before allocating. */
 	mtx_lock(&ksem_count_lock);
 	refused = (nsems == p31b_getcfg(CTL_P1003_1B_SEM_NSEMS_MAX) ||
 	    ksem_dead);
 	if (!refused)
 		nsems++;
 	mtx_unlock(&ksem_count_lock);
 	if (refused)
 		return (NULL);
 
 	ks = malloc(sizeof(*ks), M_KSEM, M_WAITOK | M_ZERO);
 	ks->ks_uid = ucred->cr_uid;
 	ks->ks_gid = ucred->cr_gid;
 	ks->ks_mode = mode;
 	ks->ks_value = value;
 	cv_init(&ks->ks_cv, "ksem");
 
 	/* All four timestamps start out equal to the creation time. */
 	vfs_timestamp(&ks->ks_birthtime);
 	ks->ks_atime = ks->ks_mtime = ks->ks_ctime = ks->ks_birthtime;
 
 	refcount_init(&ks->ks_ref, 1);
 #ifdef MAC
 	mac_posixsem_init(ks);
 	mac_posixsem_create(ucred, ks);
 #endif
 
 	return (ks);
 }
 
 /*
  * Take an additional reference on `ks' and return it, so the call can be
  * used directly in an assignment.
  */
 static struct ksem *
 ksem_hold(struct ksem *ks)
 {
 
 	refcount_acquire(&ks->ks_ref);
 	return (ks);
 }
 
 /*
  * Release one reference on `ks'.  When the last reference goes away the
  * semaphore is torn down and freed, and the global count is decremented.
  */
 static void
 ksem_drop(struct ksem *ks)
 {
 
 	/* Other references remain: nothing more to do. */
 	if (!refcount_release(&ks->ks_ref))
 		return;
 
 #ifdef MAC
 	mac_posixsem_destroy(ks);
 #endif
 	cv_destroy(&ks->ks_cv);
 	free(ks, M_KSEM);
 
 	mtx_lock(&ksem_count_lock);
 	nsems--;
 	mtx_unlock(&ksem_count_lock);
 }
 
 /*
  * Check whether `ucred' may both read and write the semaphore.  If the
  * ordinary permission check fails, the PRIV_SEM_WRITE privilege is tried
  * as a fallback.  Returns 0 when access is granted, otherwise the error
  * from the privilege check.
  */
 static int
 ksem_access(struct ksem *ks, struct ucred *ucred)
 {
 
 	if (vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid,
 	    VREAD | VWRITE, ucred) == 0)
 		return (0);
 	return (priv_check_cred(ucred, PRIV_SEM_WRITE));
 }
 
 /*
  * Dictionary management.  We maintain an in-kernel dictionary to map
  * paths to semaphore objects.  We use the FNV hash on the path to
  * store the mappings in a hash table.
  */
 /*
  * Find the semaphore registered under `path' (whose FNV hash is `fnv').
  * Returns the semaphore without taking a new reference, or NULL when the
  * path is not in the dictionary.
  */
 static struct ksem *
 ksem_lookup(char *path, Fnv32_t fnv)
 {
 	struct ksem_mapping *entry;
 
 	LIST_FOREACH(entry, KSEM_HASH(fnv), km_link) {
 		/* Compare hashes first; strcmp() only on a hash match. */
 		if (entry->km_fnv == fnv &&
 		    strcmp(entry->km_path, path) == 0)
 			return (entry->km_ksem);
 	}
 
 	return (NULL);
 }
 
 /*
  * Register `ks' in the dictionary under `path'.  The mapping takes its own
  * reference on the semaphore and keeps the `path' pointer itself (it is
  * not copied), which is also recorded in ks->ks_path.
  */
 static void
 ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks)
 {
 	struct ksem_mapping *entry;
 
 	entry = malloc(sizeof(*entry), M_KSEM, M_WAITOK);
 	entry->km_fnv = fnv;
 	entry->km_path = path;
 	entry->km_ksem = ksem_hold(ks);
 	LIST_INSERT_HEAD(KSEM_HASH(fnv), entry, km_link);
 	ks->ks_path = path;
 }
 
 /*
  * Remove the dictionary entry for `path' on behalf of `ucred'.  The caller
  * must pass the optional MAC unlink check and ksem_access().  On success
  * the mapping's reference on the semaphore is dropped and the stored path
  * string is freed.  Returns ENOENT when no entry matches.
  */
 static int
 ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred)
 {
 	struct ksem_mapping *entry;
 	int error;
 
 	/* Locate the mapping; `entry' is NULL if the chain is exhausted. */
 	LIST_FOREACH(entry, KSEM_HASH(fnv), km_link) {
 		if (entry->km_fnv == fnv &&
 		    strcmp(entry->km_path, path) == 0)
 			break;
 	}
 	if (entry == NULL)
 		return (ENOENT);
 
 #ifdef MAC
 	error = mac_posixsem_check_unlink(ucred, entry->km_ksem);
 	if (error != 0)
 		return (error);
 #endif
 	error = ksem_access(entry->km_ksem, ucred);
 	if (error != 0)
 		return (error);
 
 	/* Clear the back-pointer before the path string is freed. */
 	entry->km_ksem->ks_path = NULL;
 	LIST_REMOVE(entry, km_link);
 	ksem_drop(entry->km_ksem);
 	free(entry->km_path, M_KSEM);
 	free(entry, M_KSEM);
 	return (0);
 }
 
 /*
  * Copy the descriptor number `fd' out to the user address `semidp' as a
  * semaphore id.  When COMPAT_FREEBSD32 is configured and `compat32' is
  * non-zero the id is written as an int32_t; otherwise as a semid_t.
  * Returns the result of copyout().
  */
 static int
 ksem_create_copyout_semid(struct thread *td, semid_t *semidp, int fd,
     int compat32)
 {
 	semid_t semid;
 #ifdef COMPAT_FREEBSD32
 	int32_t semid32;
 #endif
 	void *ptr;
 	size_t ptrs;
 
 	/*
 	 * The if/else braces below only exist under COMPAT_FREEBSD32;
 	 * without it the native assignments run unconditionally.
 	 */
 #ifdef COMPAT_FREEBSD32
 	if (compat32) {
 		semid32 = fd;
 		ptr = &semid32;
 		ptrs = sizeof(semid32);
 	} else {
 #endif
 		semid = fd;
 		ptr = &semid;
 		ptrs = sizeof(semid);
 		compat32 = 0; /* silence gcc */
 #ifdef COMPAT_FREEBSD32
 	}
 #endif
 
 	return (copyout(ptr, semidp, ptrs));
 }
 
 /* Other helper routines. */
 /*
  * Common backend for the ksem_init and ksem_open system calls.
  *
  * Allocates a close-on-exec file descriptor, copies its number out to
  * `semidp', and attaches a semaphore to it:
  *  - name == NULL: a new anonymous semaphore (flagged KS_ANONYMOUS);
  *  - otherwise `name' is a user-space string that is copied in, prefixed
  *    with the caller's prison path unless that path is "/", and looked up
  *    in the dictionary.  A missing entry is created when O_CREAT is set;
  *    an existing one is opened unless O_CREAT|O_EXCL was requested.
  *
  * `mode' is filtered through the process's file creation mask and
  * `compat32' selects the width used for the copied-out id.  Returns 0 on
  * success or an errno value; on failure after the descriptor was allocated
  * the descriptor is closed again.
  */
 static int
 ksem_create(struct thread *td, const char *name, semid_t *semidp, mode_t mode,
     unsigned int value, int flags, int compat32)
 {
 	struct pwddesc *pdp;
 	struct ksem *ks;
 	struct file *fp;
 	char *path;
 	const char *pr_path;
 	size_t pr_pathlen;
 	Fnv32_t fnv;
 	int error, fd;
 
 	AUDIT_ARG_FFLAGS(flags);
 	AUDIT_ARG_MODE(mode);
 	AUDIT_ARG_VALUE(value);
 
 	if (value > SEM_VALUE_MAX)
 		return (EINVAL);
 
 	pdp = td->td_proc->p_pd;
 	mode = (mode & ~pdp->pd_cmask) & ACCESSPERMS;
 	error = falloc(td, &fp, &fd, O_CLOEXEC);
 	if (error) {
 		/* Anonymous creation reports any falloc failure as ENOSPC. */
 		if (name == NULL)
 			error = ENOSPC;
 		return (error);
 	}
 
 	/*
 	 * Go ahead and copyout the file descriptor now.  This is a bit
 	 * premature, but it is a lot easier to handle errors as opposed
 	 * to later when we've possibly created a new semaphore, etc.
 	 */
 	error = ksem_create_copyout_semid(td, semidp, fd, compat32);
 	if (error) {
 		fdclose(td, fp, fd);
 		fdrop(fp, td);
 		return (error);
 	}
 
 	if (name == NULL) {
 		/* Create an anonymous semaphore. */
 		ks = ksem_alloc(td->td_ucred, mode, value);
 		if (ks == NULL)
 			error = ENOSPC;
 		else
 			ks->ks_flags |= KS_ANONYMOUS;
 	} else {
 		path = malloc(MAXPATHLEN, M_KSEM, M_WAITOK);
 		pr_path = td->td_ucred->cr_prison->pr_path;
 
 		/* Construct a full pathname for jailed callers. */
 		pr_pathlen = strcmp(pr_path, "/") == 0 ? 0
 		    : strlcpy(path, pr_path, MAXPATHLEN);
 		error = copyinstr(name, path + pr_pathlen,
 		    MAXPATHLEN - pr_pathlen, NULL);
 
 		/* Require paths to start with a '/' character. */
 		if (error == 0 && path[pr_pathlen] != '/')
 			error = EINVAL;
 		if (error) {
 			fdclose(td, fp, fd);
 			fdrop(fp, td);
 			free(path, M_KSEM);
 			return (error);
 		}
 
 		AUDIT_ARG_UPATH1_CANON(path);
 		fnv = fnv_32_str(path, FNV1_32_INIT);
 		sx_xlock(&ksem_dict_lock);
 		ks = ksem_lookup(path, fnv);
 		if (ks == NULL) {
 			/* Object does not exist, create it if requested. */
 			if (flags & O_CREAT) {
 				ks = ksem_alloc(td->td_ucred, mode, value);
 				if (ks == NULL)
 					error = ENFILE;
 				else {
 					/*
 					 * The dictionary now owns the path
 					 * buffer; clear our pointer so it is
 					 * not freed below.
 					 */
 					ksem_insert(path, fnv, ks);
 					path = NULL;
 				}
 			} else
 				error = ENOENT;
 		} else {
 			/*
 			 * Object already exists, obtain a new
 			 * reference if requested and permitted.
 			 */
 			if ((flags & (O_CREAT | O_EXCL)) ==
 			    (O_CREAT | O_EXCL))
 				error = EEXIST;
 			else {
 #ifdef MAC
 				error = mac_posixsem_check_open(td->td_ucred,
 				    ks);
 				if (error == 0)
 #endif
 				error = ksem_access(ks, td->td_ucred);
 			}
 			if (error == 0)
 				ksem_hold(ks);
 #ifdef INVARIANTS
 			/* Keeps the KASSERT below honest on the error path. */
 			else
 				ks = NULL;
 #endif
 		}
 		sx_xunlock(&ksem_dict_lock);
 		if (path)
 			free(path, M_KSEM);
 	}
 
 	if (error) {
 		KASSERT(ks == NULL, ("ksem_create error with a ksem"));
 		fdclose(td, fp, fd);
 		fdrop(fp, td);
 		return (error);
 	}
 	KASSERT(ks != NULL, ("ksem_create w/o a ksem"));
 
 	/* Hand the semaphore reference over to the file. */
 	finit(fp, FREAD | FWRITE, DTYPE_SEM, ks, &ksem_ops);
 
 	fdrop(fp, td);
 
 	return (0);
 }
 
 /*
  * Translate a semaphore id into its file.  On success *fpp holds a
  * referenced file that the caller must fdrop().  Every failure -- a bad
  * descriptor or insufficient rights, a descriptor that is not a semaphore,
  * or a semaphore already marked KS_DEAD -- is reported as EINVAL.
  */
 static int
 ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp,
     struct file **fpp)
 {
 	struct ksem *ks;
 	struct file *fp;
 
 	if (fget(td, id, rightsp, &fp) != 0)
 		return (EINVAL);
 
 	if (fp->f_type == DTYPE_SEM) {
 		ks = fp->f_data;
 		if ((ks->ks_flags & KS_DEAD) == 0) {
 			*fpp = fp;
 			return (0);
 		}
 	}
 
 	/* Wrong type or destroyed semaphore: give the file back. */
 	fdrop(fp, td);
 	return (EINVAL);
 }
 
 /* System calls. */
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_init_args {
 	unsigned int	value;
 	semid_t		*idp;
 };
 #endif
 /*
  * ksem_init system call: create an anonymous semaphore with initial value
  * uap->value and requested mode S_IRWXU | S_IRWXG, storing its id at
  * uap->idp.
  */
 int
 sys_ksem_init(struct thread *td, struct ksem_init_args *uap)
 {
 
 	return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value,
 	    0, 0));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_open_args {
 	char		*name;
 	int		oflag;
 	mode_t		mode;
 	unsigned int	value;
 	semid_t		*idp;	
 };
 #endif
 /*
  * ksem_open system call: open or create the named semaphore uap->name and
  * store its id at uap->idp.  Only O_CREAT and O_EXCL are accepted in
  * uap->oflag; any other bit yields EINVAL.
  */
 int
 sys_ksem_open(struct thread *td, struct ksem_open_args *uap)
 {
 
 	DP((">>> ksem_open start, pid=%d\n", (int)td->td_proc->p_pid));
 
 	if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0)
 		return (EINVAL);
 	return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value,
 	    uap->oflag, 0));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_unlink_args {
 	char		*name;
 };
 #endif
 int
 sys_ksem_unlink(struct thread *td, struct ksem_unlink_args *uap)
 {
 	char *path;
 	const char *pr_path;
 	size_t pr_pathlen;
 	Fnv32_t fnv;
 	int error;
 
 	path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 	pr_path = td->td_ucred->cr_prison->pr_path;
 	pr_pathlen = strcmp(pr_path, "/") == 0 ? 0
 	    : strlcpy(path, pr_path, MAXPATHLEN);
 	error = copyinstr(uap->name, path + pr_pathlen, MAXPATHLEN - pr_pathlen,
 	    NULL);
 	if (error) {
 		free(path, M_TEMP);
 		return (error);
 	}
 
 	AUDIT_ARG_UPATH1_CANON(path);
 	fnv = fnv_32_str(path, FNV1_32_INIT);
 	sx_xlock(&ksem_dict_lock);
 	error = ksem_remove(path, fnv, td->td_ucred);
 	sx_xunlock(&ksem_dict_lock);
 	free(path, M_TEMP);
 
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_close_args {
 	semid_t		id;
 };
 #endif
 int
 sys_ksem_close(struct thread *td, struct ksem_close_args *uap)
 {
 	struct ksem *ks;
 	struct file *fp;
 	int error;
 
 	/* No capability rights required to close a semaphore. */
 	AUDIT_ARG_FD(uap->id);
 	error = ksem_get(td, uap->id, &cap_no_rights, &fp);
 	if (error)
 		return (error);
 	ks = fp->f_data;
 	if (ks->ks_flags & KS_ANONYMOUS) {
 		fdrop(fp, td);
 		return (EINVAL);
 	}
 	error = kern_close(td, uap->id);
 	fdrop(fp, td);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_post_args {
 	semid_t	id;
 };
 #endif
 /*
  * ksem_post system call: increment the semaphore identified by uap->id
  * (CAP_SEM_POST is required on the descriptor) and signal the condition
  * variable if any waiter is recorded.  Fails with EOVERFLOW when the value
  * is already SEM_VALUE_MAX.  The change time is updated on success.
  */
 int
 sys_ksem_post(struct thread *td, struct ksem_post_args *uap)
 {
 	cap_rights_t rights;
 	struct file *fp;
 	struct ksem *ks;
 	int error;
 
 	AUDIT_ARG_FD(uap->id);
 	error = ksem_get(td, uap->id,
-	    cap_rights_init(&rights, CAP_SEM_POST), &fp);
+	    cap_rights_init_one(&rights, CAP_SEM_POST), &fp);
 	if (error)
 		return (error);
 	ks = fp->f_data;
 
 	mtx_lock(&sem_lock);
 #ifdef MAC
 	error = mac_posixsem_check_post(td->td_ucred, fp->f_cred, ks);
 	if (error)
 		goto err;
 #endif
 	if (ks->ks_value == SEM_VALUE_MAX) {
 		error = EOVERFLOW;
 		goto err;
 	}
 	++ks->ks_value;
 	if (ks->ks_waiters > 0)
 		cv_signal(&ks->ks_cv);
 	error = 0;
 	vfs_timestamp(&ks->ks_ctime);
 	/* Common exit: also reached on success. */
 err:
 	mtx_unlock(&sem_lock);
 	fdrop(fp, td);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_wait_args {
 	semid_t		id;
 };
 #endif
 /*
  * ksem_wait system call: wait on the semaphore with no timeout.
  */
 int
 sys_ksem_wait(struct thread *td, struct ksem_wait_args *uap)
 {
 
 	return (kern_sem_wait(td, uap->id, 0, NULL));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_timedwait_args {
 	semid_t		id;
 	const struct timespec *abstime;
 };
 #endif
 /*
  * ksem_timedwait system call: wait on the semaphore until the absolute
  * time at uap->abstime.  A null pointer means wait without a timeout.
  * Returns the copyin() error, EINVAL for an out-of-range tv_nsec, or the
  * result of kern_sem_wait().
  */
 int
 sys_ksem_timedwait(struct thread *td, struct ksem_timedwait_args *uap)
 {
 	struct timespec abstime;
 	int error;
 
 	/*
 	 * We allow a null timespec (wait forever).
 	 */
 	if (uap->abstime == NULL)
 		return (kern_sem_wait(td, uap->id, 0, NULL));
 
 	error = copyin(uap->abstime, &abstime, sizeof(abstime));
 	if (error != 0)
 		return (error);
 	if (abstime.tv_nsec < 0 || abstime.tv_nsec >= 1000000000)
 		return (EINVAL);
 
 	return (kern_sem_wait(td, uap->id, 0, &abstime));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_trywait_args {
 	semid_t		id;
 };
 #endif
 /*
  * ksem_trywait system call: attempt to take the semaphore without
  * sleeping (tryflag set, no timeout).
  */
 int
 sys_ksem_trywait(struct thread *td, struct ksem_trywait_args *uap)
 {
 
 	return (kern_sem_wait(td, uap->id, 1, NULL));
 }
 
 /*
  * Common implementation of the wait, trywait and timedwait system calls.
  *
  * Decrements the semaphore identified by `id' (CAP_SEM_WAIT is required on
  * the descriptor).  While the value is zero:
  *  - with `tryflag' set, fail immediately with EAGAIN;
  *  - with `abstime' NULL, sleep interruptibly until signalled;
  *  - otherwise sleep until the absolute time `abstime', failing with
  *    ETIMEDOUT once it has passed.
  * Returns 0 on success or an errno value.
  */
 static int
 kern_sem_wait(struct thread *td, semid_t id, int tryflag,
     struct timespec *abstime)
 {
 	struct timespec ts1, ts2;
 	struct timeval tv;
 	cap_rights_t rights;
 	struct file *fp;
 	struct ksem *ks;
 	int error;
 
 	DP((">>> kern_sem_wait entered! pid=%d\n", (int)td->td_proc->p_pid));
 	AUDIT_ARG_FD(id);
-	error = ksem_get(td, id, cap_rights_init(&rights, CAP_SEM_WAIT), &fp);
+	error = ksem_get(td, id, cap_rights_init_one(&rights, CAP_SEM_WAIT),
+	    &fp);
 	if (error)
 		return (error);
 	ks = fp->f_data;
 	mtx_lock(&sem_lock);
 	DP((">>> kern_sem_wait critical section entered! pid=%d\n",
 	    (int)td->td_proc->p_pid));
 #ifdef MAC
 	error = mac_posixsem_check_wait(td->td_ucred, fp->f_cred, ks);
 	if (error) {
 		DP(("kern_sem_wait mac failed\n"));
 		goto err;
 	}
 #endif
 	DP(("kern_sem_wait value = %d, tryflag %d\n", ks->ks_value, tryflag));
 	vfs_timestamp(&ks->ks_atime);
 	while (ks->ks_value == 0) {
 		/* The waiter count brackets every attempt, trywait included. */
 		ks->ks_waiters++;
 		if (tryflag != 0)
 			error = EAGAIN;
 		else if (abstime == NULL)
 			error = cv_wait_sig(&ks->ks_cv, &sem_lock);
 		else {
 			/*
 			 * Recompute the remaining time from the absolute
 			 * deadline on every pass.  An EWOULDBLOCK return
 			 * from the timed sleep loops again, so the timeout
 			 * is reported as ETIMEDOUT once the remainder has
 			 * gone negative.
 			 */
 			for (;;) {
 				ts1 = *abstime;
 				getnanotime(&ts2);
 				timespecsub(&ts1, &ts2, &ts1);
 				TIMESPEC_TO_TIMEVAL(&tv, &ts1);
 				if (tv.tv_sec < 0) {
 					error = ETIMEDOUT;
 					break;
 				}
 				error = cv_timedwait_sig(&ks->ks_cv,
 				    &sem_lock, tvtohz(&tv));
 				if (error != EWOULDBLOCK)
 					break;
 			}
 		}
 		ks->ks_waiters--;
 		if (error)
 			goto err;
 		/* Woken without error: re-test the value before taking it. */
 	}
 	ks->ks_value--;
 	DP(("kern_sem_wait value post-decrement = %d\n", ks->ks_value));
 	error = 0;
 	/* Common exit: also reached on success. */
 err:
 	mtx_unlock(&sem_lock);
 	fdrop(fp, td);
 	DP(("<<< kern_sem_wait leaving, pid=%d, error = %d\n",
 	    (int)td->td_proc->p_pid, error));
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_getvalue_args {
 	semid_t		id;
 	int		*val;
 };
 #endif
 /*
  * ksem_getvalue system call: copy the current value of the semaphore
  * identified by uap->id out to uap->val (CAP_SEM_GETVALUE is required on
  * the descriptor).  The access time is updated.  Returns 0 or the error
  * from the lookup, the MAC check or copyout().
  */
 int
 sys_ksem_getvalue(struct thread *td, struct ksem_getvalue_args *uap)
 {
 	cap_rights_t rights;
 	struct file *fp;
 	struct ksem *ks;
 	int error, val;
 
 	AUDIT_ARG_FD(uap->id);
 	error = ksem_get(td, uap->id,
-	    cap_rights_init(&rights, CAP_SEM_GETVALUE), &fp);
+	    cap_rights_init_one(&rights, CAP_SEM_GETVALUE), &fp);
 	if (error)
 		return (error);
 	ks = fp->f_data;
 
 	mtx_lock(&sem_lock);
 #ifdef MAC
 	error = mac_posixsem_check_getvalue(td->td_ucred, fp->f_cred, ks);
 	if (error) {
 		mtx_unlock(&sem_lock);
 		fdrop(fp, td);
 		return (error);
 	}
 #endif
 	/* Snapshot under the lock; the copyout happens after unlocking. */
 	val = ks->ks_value;
 	vfs_timestamp(&ks->ks_atime);
 	mtx_unlock(&sem_lock);
 	fdrop(fp, td);
 	error = copyout(&val, uap->val, sizeof(val));
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_destroy_args {
 	semid_t		id;
 };
 #endif
 /*
  * ksem_destroy system call: destroy an anonymous semaphore.  Named
  * semaphores are rejected with EINVAL, and a semaphore with recorded
  * waiters with EBUSY.  Otherwise the semaphore is marked KS_DEAD and its
  * descriptor is closed.
  */
 int
 sys_ksem_destroy(struct thread *td, struct ksem_destroy_args *uap)
 {
 	struct file *fp;
 	struct ksem *ks;
 	int error;
 
 	/* No capability rights required to close a semaphore. */
 	AUDIT_ARG_FD(uap->id);
 	error = ksem_get(td, uap->id, &cap_no_rights, &fp);
 	if (error != 0)
 		return (error);
 
 	ks = fp->f_data;
 	if ((ks->ks_flags & KS_ANONYMOUS) == 0) {
 		fdrop(fp, td);
 		return (EINVAL);
 	}
 
 	mtx_lock(&sem_lock);
 	if (ks->ks_waiters != 0) {
 		mtx_unlock(&sem_lock);
 		error = EBUSY;
 	} else {
 		/* Later ksem_get() calls on this id now fail. */
 		ks->ks_flags |= KS_DEAD;
 		mtx_unlock(&sem_lock);
 		error = kern_close(td, uap->id);
 	}
 
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Native system calls registered by ksem_module_init() and unregistered
  * by ksem_module_destroy().
  */
 static struct syscall_helper_data ksem_syscalls[] = {
 	SYSCALL_INIT_HELPER(ksem_init),
 	SYSCALL_INIT_HELPER(ksem_open),
 	SYSCALL_INIT_HELPER(ksem_unlink),
 	SYSCALL_INIT_HELPER(ksem_close),
 	SYSCALL_INIT_HELPER(ksem_post),
 	SYSCALL_INIT_HELPER(ksem_wait),
 	SYSCALL_INIT_HELPER(ksem_timedwait),
 	SYSCALL_INIT_HELPER(ksem_trywait),
 	SYSCALL_INIT_HELPER(ksem_getvalue),
 	SYSCALL_INIT_HELPER(ksem_destroy),
 	SYSCALL_INIT_LAST
 };
 
 #ifdef COMPAT_FREEBSD32
 #include <compat/freebsd32/freebsd32.h>
 #include <compat/freebsd32/freebsd32_proto.h>
 #include <compat/freebsd32/freebsd32_signal.h>
 #include <compat/freebsd32/freebsd32_syscall.h>
 #include <compat/freebsd32/freebsd32_util.h>
 
 /*
  * 32-bit compat ksem_init: same as the native call except that
  * ksem_create() is asked (compat32 = 1) to copy the id out in the 32-bit
  * layout.
  */
 int
 freebsd32_ksem_init(struct thread *td, struct freebsd32_ksem_init_args *uap)
 {
 
 	return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value,
 	    0, 1));
 }
 
 /*
  * 32-bit compat ksem_open: same flag validation as the native call, with
  * the id copied out in the 32-bit layout (compat32 = 1).
  */
 int
 freebsd32_ksem_open(struct thread *td, struct freebsd32_ksem_open_args *uap)
 {
 
 	if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0)
 		return (EINVAL);
 	return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value,
 	    uap->oflag, 1));
 }
 
 /*
  * 32-bit compat ksem_timedwait: read a struct timespec32 from user space,
  * widen it to a native timespec and wait until that absolute time.  A null
  * pointer means wait without a timeout.
  */
 int
 freebsd32_ksem_timedwait(struct thread *td,
     struct freebsd32_ksem_timedwait_args *uap)
 {
 	struct timespec32 abstime32;
 	struct timespec abstime;
 	int error;
 
 	/*
 	 * We allow a null timespec (wait forever).
 	 */
 	if (uap->abstime == NULL)
 		return (kern_sem_wait(td, uap->id, 0, NULL));
 
 	error = copyin(uap->abstime, &abstime32, sizeof(abstime32));
 	if (error != 0)
 		return (error);
 	CP(abstime32, abstime, tv_sec);
 	CP(abstime32, abstime, tv_nsec);
 	if (abstime.tv_nsec < 0 || abstime.tv_nsec >= 1000000000)
 		return (EINVAL);
 
 	return (kern_sem_wait(td, uap->id, 0, &abstime));
 }
 
 /*
  * 32-bit compat system calls.  init, open and timedwait have dedicated
  * freebsd32_* wrappers; the rest are registered through
  * SYSCALL32_INIT_HELPER_COMPAT under the native names.
  */
 static struct syscall_helper_data ksem32_syscalls[] = {
 	SYSCALL32_INIT_HELPER(freebsd32_ksem_init),
 	SYSCALL32_INIT_HELPER(freebsd32_ksem_open),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_unlink),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_close),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_post),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_wait),
 	SYSCALL32_INIT_HELPER(freebsd32_ksem_timedwait),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_trywait),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_getvalue),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_destroy),
 	SYSCALL_INIT_LAST
 };
 #endif
 
 /*
  * Module setup: initialise the locks and the name dictionary, publish the
  * POSIX semaphore configuration values and register the system calls.
  * Returns 0 or the first registration error; nothing is undone here on
  * failure.
  */
 static int
 ksem_module_init(void)
 {
 	int error;
 
 	mtx_init(&sem_lock, "sem", NULL, MTX_DEF);
 	mtx_init(&ksem_count_lock, "ksem count", NULL, MTX_DEF);
 	sx_init(&ksem_dict_lock, "ksem dictionary");
 	ksem_dictionary = hashinit(1024, M_KSEM, &ksem_hash);
 
 	p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 200112L);
 	p31b_setcfg(CTL_P1003_1B_SEM_NSEMS_MAX, SEM_MAX);
 	p31b_setcfg(CTL_P1003_1B_SEM_VALUE_MAX, SEM_VALUE_MAX);
 
 	error = syscall_helper_register(ksem_syscalls, SY_THR_STATIC_KLD);
 #ifdef COMPAT_FREEBSD32
 	/* Only attempt the compat table if the native one registered. */
 	if (error == 0)
 		error = syscall32_helper_register(ksem32_syscalls,
 		    SY_THR_STATIC_KLD);
 #endif
 	return (error);
 }
 
 /*
  * Module teardown: unregister the system calls, withdraw the POSIX
  * semaphore configuration values, and destroy the dictionary and locks
  * set up by ksem_module_init().
  */
 static void
 ksem_module_destroy(void)
 {
 
 #ifdef COMPAT_FREEBSD32
 	syscall32_helper_unregister(ksem32_syscalls);
 #endif
 	syscall_helper_unregister(ksem_syscalls);
 
 	p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 0);
 	hashdestroy(ksem_dictionary, M_KSEM, ksem_hash);
 	sx_destroy(&ksem_dict_lock);
 	mtx_destroy(&ksem_count_lock);
 	mtx_destroy(&sem_lock);
 	p31b_unsetcfg(CTL_P1003_1B_SEM_VALUE_MAX);
 	p31b_unsetcfg(CTL_P1003_1B_SEM_NSEMS_MAX);
 }
 
 /*
  * Module event handler.
  *  MOD_LOAD:     initialise; on failure run the teardown and return the
  *                error.
  *  MOD_UNLOAD:   refuse with EOPNOTSUPP while any semaphore exists;
  *                otherwise mark the module dead and tear it down.
  *  MOD_SHUTDOWN: nothing to do.
  * Any other command yields EINVAL.
  */
 static int
 sem_modload(struct module *module, int cmd, void *arg)
 {
 	int error;
 
 	switch (cmd) {
 	case MOD_LOAD:
 		error = ksem_module_init();
 		if (error != 0)
 			ksem_module_destroy();
 		return (error);
 
 	case MOD_UNLOAD:
 		mtx_lock(&ksem_count_lock);
 		if (nsems != 0) {
 			mtx_unlock(&ksem_count_lock);
 			return (EOPNOTSUPP);
 		}
 		/* Stops ksem_alloc() from creating new semaphores. */
 		ksem_dead = 1;
 		mtx_unlock(&ksem_count_lock);
 		ksem_module_destroy();
 		return (0);
 
 	case MOD_SHUTDOWN:
 		return (0);
 
 	default:
 		return (EINVAL);
 	}
 }
 
 /* Module description: name "sem", handled by sem_modload(), no argument. */
 static moduledata_t sem_mod = {
         "sem",
         &sem_modload,
         NULL
 };
 
 DECLARE_MODULE(sem, sem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST);
 MODULE_VERSION(sem, 1);
diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
index afea72fd2700..ad369fc7b23e 100644
--- a/sys/kern/uipc_usrreq.c
+++ b/sys/kern/uipc_usrreq.c
@@ -1,2992 +1,2994 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California. All Rights Reserved.
  * Copyright (c) 2004-2009 Robert N. M. Watson All Rights Reserved.
  * Copyright (c) 2018 Matthew Macy
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	From: @(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
  */
 
 /*
  * UNIX Domain (Local) Sockets
  *
  * This is an implementation of UNIX (local) domain sockets.  Each socket has
  * an associated struct unpcb (UNIX protocol control block).  Stream sockets
  * may be connected to 0 or 1 other socket.  Datagram sockets may be
  * connected to 0, 1, or many other sockets.  Sockets may be created and
  * connected in pairs (socketpair(2)), or bound/connected to using the file
  * system name space.  For most purposes, only the receive socket buffer is
  * used, as sending on one socket delivers directly to the receive socket
  * buffer of a second socket.
  *
  * The implementation is substantially complicated by the fact that
  * "ancillary data", such as file descriptors or credentials, may be passed
  * across UNIX domain sockets.  The potential for passing UNIX domain sockets
  * over other UNIX domain sockets requires the implementation of a simple
  * garbage collector to find and tear down cycles of disconnected sockets.
  *
  * TODO:
  *	RDM
  *	rethink name space problems
  *	need a proper out-of-band
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/domain.h>
 #include <sys/eventhandler.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/queue.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/signalvar.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/taskqueue.h>
 #include <sys/un.h>
 #include <sys/unpcb.h>
 #include <sys/vnode.h>
 
 #include <net/vnet.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/uma.h>
 
 MALLOC_DECLARE(M_FILECAPS);
 
 /*
  * See unpcb.h for the locking key.
  */
 
 /* UMA zone that uipc_attach() allocates unpcbs from. */
 static uma_zone_t	unp_zone;
 /* Generation counter; bumped in uipc_attach() and uipc_detach(). */
 static unp_gen_t	unp_gencnt;	/* (l) */
 static u_int		unp_count;	/* (l) Count of local sockets. */
 static ino_t		unp_ino;	/* Prototype for fake inode numbers. */
 static int		unp_rights;	/* (g) File descriptors in flight. */
 static struct unp_head	unp_shead;	/* (l) List of stream sockets. */
 static struct unp_head	unp_dhead;	/* (l) List of datagram sockets. */
 static struct unp_head	unp_sphead;	/* (l) List of seqpacket sockets. */
 
 /*
  * One entry on the list of files whose close has been deferred to the
  * taskqueue; unp_defers_count is exported read-only as net.local.deferred.
  */
 struct unp_defer {
 	SLIST_ENTRY(unp_defer) ud_link;
 	struct file *ud_fp;
 };
 static SLIST_HEAD(, unp_defer) unp_defers;
 static int unp_defers_count;
 
 /*
  * Placeholder address copied out by uipc_accept() and uipc_peeraddr() when
  * there is no peer or the peer has no bound address.
  */
 static const struct sockaddr	sun_noname = { sizeof(sun_noname), AF_LOCAL };
 
 /*
  * Garbage collection of cyclic file descriptor/socket references occurs
  * asynchronously in a taskqueue context in order to avoid recursion and
  * reentrance in the UNIX domain socket, file descriptor, and socket layer
  * code.  See unp_gc() for a full description.
  */
 static struct timeout_task unp_gc_task;
 
 /*
  * The close of UNIX domain sockets attached as SCM_RIGHTS is postponed to
  * the taskqueue, to avoid arbitrary recursion depth: the attached sockets
  * might themselves have other sockets attached.
  */
 static struct task	unp_defer_task;
 
 /*
  * Both send and receive buffers are allocated PIPSIZ bytes of buffering for
  * stream sockets, although the total for sender and receiver is actually
  * only PIPSIZ.
  *
  * Datagram sockets really use the sendspace as the maximum datagram size,
  * and don't really want to reserve the sendspace.  Their recvspace should be
  * large enough for at least one max-size datagram plus address.
  *
  * uipc_attach() passes these defaults to soreserve() when a new socket has
  * no buffer high-water mark set yet.
  */
 #ifndef PIPSIZ
 #define	PIPSIZ	8192
 #endif
 static u_long	unpst_sendspace = PIPSIZ;
 static u_long	unpst_recvspace = PIPSIZ;
 static u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
 static u_long	unpdg_recvspace = 4*1024;
 static u_long	unpsp_sendspace = PIPSIZ;	/* really max datagram size */
 static u_long	unpsp_recvspace = PIPSIZ;
 
 /* Sysctl tree: net.local and one child node per socket type. */
 static SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Local domain");
 static SYSCTL_NODE(_net_local, SOCK_STREAM, stream,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "SOCK_STREAM");
 static SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "SOCK_DGRAM");
 static SYSCTL_NODE(_net_local, SOCK_SEQPACKET, seqpacket,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "SOCK_SEQPACKET");
 
 /* Read-write knobs for the default buffer sizes declared above. */
 SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
 	   &unpst_sendspace, 0, "Default stream send space.");
 SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
 	   &unpst_recvspace, 0, "Default stream receive space.");
 SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
 	   &unpdg_sendspace, 0, "Default datagram send space.");
 SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
 	   &unpdg_recvspace, 0, "Default datagram receive space.");
 SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, maxseqpacket, CTLFLAG_RW,
 	   &unpsp_sendspace, 0, "Default seqpacket send space.");
 SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, recvspace, CTLFLAG_RW,
 	   &unpsp_recvspace, 0, "Default seqpacket receive space.");
 /* Read-only counters. */
 SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0,
     "File descriptors in flight.");
 SYSCTL_INT(_net_local, OID_AUTO, deferred, CTLFLAG_RD,
     &unp_defers_count, 0,
     "File descriptors deferred to taskqueue for close.");
 
 /*
  * Locking and synchronization:
  *
  * Several types of locks exist in the local domain socket implementation:
  * - a global linkage lock
  * - a global connection list lock
  * - the mtxpool lock
  * - per-unpcb mutexes
  *
  * The linkage lock protects the global socket lists, the generation number
  * counter and garbage collector state.
  *
  * The connection list lock protects the list of referring sockets in a datagram
  * socket PCB.  This lock is also overloaded to protect a global list of
  * sockets whose buffers contain socket references in the form of SCM_RIGHTS
  * messages.  To avoid recursion, such references are released by a dedicated
  * thread.
  *
  * The mtxpool lock protects the vnode from being modified while referenced.
  * Lock ordering rules require that it be acquired before any PCB locks.
  *
  * The unpcb lock (unp_mtx) protects the most commonly referenced fields in the
  * unpcb.  This includes the unp_conn field, which either links two connected
  * PCBs together (for connected socket types) or points at the destination
  * socket (for connectionless socket types).  The operations of creating or
  * destroying a connection therefore involve locking multiple PCBs.  To avoid
  * lock order reversals, in some cases this involves dropping a PCB lock and
  * using a reference counter to maintain liveness.
  *
  * UNIX domain sockets each have an unpcb hung off of their so_pcb pointer,
  * allocated in pru_attach() and freed in pru_detach().  The validity of that
  * pointer is an invariant, so no lock is required to dereference the so_pcb
  * pointer if a valid socket reference is held by the caller.  In practice,
  * this is always true during operations performed on a socket.  Each unpcb
  * has a back-pointer to its socket, unp_socket, which will be stable under
  * the same circumstances.
  *
  * This pointer may only be safely dereferenced as long as a valid reference
  * to the unpcb is held.  Typically, this reference will be from the socket,
  * or from another unpcb when the referring unpcb's lock is held (in order
  * that the reference not be invalidated during use).  For example, to follow
  * unp->unp_conn->unp_socket, you need to hold a lock on unp_conn to guarantee
  * that detach is not run clearing unp_socket.
  *
  * Blocking with UNIX domain sockets is a tricky issue: unlike most network
  * protocols, bind() is a non-atomic operation, and connect() requires
  * potential sleeping in the protocol, due to potentially waiting on local or
  * distributed file systems.  We try to separate "lookup" operations, which
  * may sleep, and the IPC operations themselves, which typically can occur
  * with relative atomicity as locks can be held over the entire operation.
  *
  * Another tricky issue is simultaneous multi-threaded or multi-process
  * access to a single UNIX domain socket.  These are handled by the flags
  * UNP_CONNECTING and UNP_BINDING, which prevent concurrent connecting or
  * binding, both of which involve dropping UNIX domain socket locks in order
  * to perform namei() and other file system operations.
  */
 /* Global linkage lock and the lock for the deferred-close list. */
 static struct rwlock	unp_link_rwlock;
 static struct mtx	unp_defers_lock;
 
 /* Wrappers around the global linkage rwlock. */
 #define	UNP_LINK_LOCK_INIT()		rw_init(&unp_link_rwlock,	\
 					    "unp_link_rwlock")
 
 #define	UNP_LINK_LOCK_ASSERT()		rw_assert(&unp_link_rwlock,	\
 					    RA_LOCKED)
 #define	UNP_LINK_UNLOCK_ASSERT()	rw_assert(&unp_link_rwlock,	\
 					    RA_UNLOCKED)
 
 #define	UNP_LINK_RLOCK()		rw_rlock(&unp_link_rwlock)
 #define	UNP_LINK_RUNLOCK()		rw_runlock(&unp_link_rwlock)
 #define	UNP_LINK_WLOCK()		rw_wlock(&unp_link_rwlock)
 #define	UNP_LINK_WUNLOCK()		rw_wunlock(&unp_link_rwlock)
 #define	UNP_LINK_WLOCK_ASSERT()		rw_assert(&unp_link_rwlock,	\
 					    RA_WLOCKED)
 /* Non-zero when the current thread holds the link lock for writing. */
 #define	UNP_LINK_WOWNED()		rw_wowned(&unp_link_rwlock)
 
 /* Wrappers around the mutex protecting the deferred-close list. */
 #define	UNP_DEFERRED_LOCK_INIT()	mtx_init(&unp_defers_lock, \
 					    "unp_defer", NULL, MTX_DEF)
 #define	UNP_DEFERRED_LOCK()		mtx_lock(&unp_defers_lock)
 #define	UNP_DEFERRED_UNLOCK()		mtx_unlock(&unp_defers_lock)
 
 /*
  * The per-socket list of referring sockets (unp_refs) is protected by the
  * same mutex as the deferred-close list.  The expansions carry no trailing
  * semicolon so that the macros behave like ordinary function calls and can
  * be used as the body of an unbraced if/else.
  */
 #define UNP_REF_LIST_LOCK()		UNP_DEFERRED_LOCK()
 #define UNP_REF_LIST_UNLOCK()		UNP_DEFERRED_UNLOCK()
 
 /*
  * Wrappers around the per-unpcb mutex (unp_mtx).  The mutex is initialized
  * with MTX_DUPOK because code in this file holds two PCB locks at once
  * (see unp_pcb_lock_pair()).
  */
 #define UNP_PCB_LOCK_INIT(unp)		mtx_init(&(unp)->unp_mtx,	\
 					    "unp", "unp",	\
 					    MTX_DUPOK|MTX_DEF)
 #define	UNP_PCB_LOCK_DESTROY(unp)	mtx_destroy(&(unp)->unp_mtx)
 #define	UNP_PCB_LOCKPTR(unp)		(&(unp)->unp_mtx)
 #define	UNP_PCB_LOCK(unp)		mtx_lock(&(unp)->unp_mtx)
 #define	UNP_PCB_TRYLOCK(unp)		mtx_trylock(&(unp)->unp_mtx)
 #define	UNP_PCB_UNLOCK(unp)		mtx_unlock(&(unp)->unp_mtx)
 #define	UNP_PCB_OWNED(unp)		mtx_owned(&(unp)->unp_mtx)
 #define	UNP_PCB_LOCK_ASSERT(unp)	mtx_assert(&(unp)->unp_mtx, MA_OWNED)
 #define	UNP_PCB_UNLOCK_ASSERT(unp)	mtx_assert(&(unp)->unp_mtx, MA_NOTOWNED)
 
 /*
  * Forward declarations for file-local functions that are referenced before
  * their definitions (for example by localsw[] and localdomain).
  */
 static int	uipc_connect2(struct socket *, struct socket *);
 static int	uipc_ctloutput(struct socket *, struct sockopt *);
 static int	unp_connect(struct socket *, struct sockaddr *,
 		    struct thread *);
 static int	unp_connectat(int, struct socket *, struct sockaddr *,
 		    struct thread *);
 static int	unp_connect2(struct socket *so, struct socket *so2, int);
 static void	unp_disconnect(struct unpcb *unp, struct unpcb *unp2);
 static void	unp_dispose(struct socket *so);
 static void	unp_dispose_mbuf(struct mbuf *);
 static void	unp_shutdown(struct unpcb *);
 static void	unp_drop(struct unpcb *);
 static void	unp_gc(__unused void *, int);
 static void	unp_scan(struct mbuf *, void (*)(struct filedescent **, int));
 static void	unp_discard(struct file *);
 static void	unp_freerights(struct filedescent **, int);
 static void	unp_init(void);
 static int	unp_internalize(struct mbuf **, struct thread *);
 static void	unp_internalize_fp(struct file *);
 static int	unp_externalize(struct mbuf *, struct mbuf **, int);
 static int	unp_externalize_fp(struct file *);
 static struct mbuf	*unp_addsockcred(struct thread *, struct mbuf *, int);
 static void	unp_process_defers(void * __unused, int);
 
 static void
 unp_pcb_hold(struct unpcb *unp)
 {
 	u_int old __unused;
 
 	old = refcount_acquire(&unp->unp_refcount);
 	KASSERT(old > 0, ("%s: unpcb %p has no references", __func__, unp));
 }
 
 /*
  * Drop one reference on a locked unpcb.
  *
  * Returns false, with the PCB still locked, if other references remain.
  * Returns true if this was the last reference; in that case the PCB has
  * been unlocked, its mutex destroyed and its memory returned to unp_zone,
  * so the caller must not touch it again.
  */
 static __result_use_check bool
 unp_pcb_rele(struct unpcb *unp)
 {
 
 	UNP_PCB_LOCK_ASSERT(unp);
 
 	if (!refcount_release(&unp->unp_refcount))
 		return (false);
 
 	UNP_PCB_UNLOCK(unp);
 	UNP_PCB_LOCK_DESTROY(unp);
 	uma_zfree(unp_zone, unp);
 	return (true);
 }
 
 static void
 unp_pcb_rele_notlast(struct unpcb *unp)
 {
 	bool ret __unused;
 
 	ret = refcount_release(&unp->unp_refcount);
 	KASSERT(!ret, ("%s: unpcb %p has no references", __func__, unp));
 }
 
 /*
  * Lock two unpcbs, neither of which may already be held by the caller.
  * The PCB at the lower address is always locked first so that concurrent
  * callers agree on the order; if both arguments name the same PCB it is
  * locked only once.
  */
 static void
 unp_pcb_lock_pair(struct unpcb *unp, struct unpcb *unp2)
 {
 	struct unpcb *first, *second;
 
 	UNP_PCB_UNLOCK_ASSERT(unp);
 	UNP_PCB_UNLOCK_ASSERT(unp2);
 
 	if (unp == unp2) {
 		UNP_PCB_LOCK(unp);
 		return;
 	}
 
 	if ((uintptr_t)unp2 > (uintptr_t)unp) {
 		first = unp;
 		second = unp2;
 	} else {
 		first = unp2;
 		second = unp;
 	}
 	UNP_PCB_LOCK(first);
 	UNP_PCB_LOCK(second);
 }
 
 /*
  * Release the locks taken by unp_pcb_lock_pair().  When both arguments
  * refer to the same PCB only a single unlock is performed.
  */
 static void
 unp_pcb_unlock_pair(struct unpcb *unp, struct unpcb *unp2)
 {
 
 	UNP_PCB_UNLOCK(unp);
 	if (unp == unp2)
 		return;
 	UNP_PCB_UNLOCK(unp2);
 }
 
 /*
  * Try to lock the connected peer of an already locked socket.  In some cases
  * this requires that we unlock the current socket.  The pairbusy counter is
  * used to block concurrent connection attempts while the lock is dropped.  The
  * caller must be careful to revalidate PCB state.
  *
  * Returns the locked peer, or unp itself (no second lock taken) when the
  * socket is connected to itself.  Returns NULL, with only unp locked, when
  * there is no peer or the peer went away while unp's lock was dropped.
  */
 static struct unpcb *
 unp_pcb_lock_peer(struct unpcb *unp)
 {
 	struct unpcb *unp2;
 
 	UNP_PCB_LOCK_ASSERT(unp);
 	unp2 = unp->unp_conn;
 	if (unp2 == NULL)
 		return (NULL);
 	if (__predict_false(unp == unp2))
 		return (unp);
 
 	UNP_PCB_UNLOCK_ASSERT(unp2);
 
 	/* Fast path: the peer's lock is free. */
 	if (__predict_true(UNP_PCB_TRYLOCK(unp2)))
 		return (unp2);
 	/*
 	 * The peer is at a higher address, so blocking on it while holding
 	 * unp matches the order used by unp_pcb_lock_pair().
 	 */
 	if ((uintptr_t)unp2 > (uintptr_t)unp) {
 		UNP_PCB_LOCK(unp2);
 		return (unp2);
 	}
 	/*
 	 * Wrong order: drop unp and retake both locks, peer first.  The
 	 * reference keeps unp2's memory valid while unp is unlocked.
 	 */
 	unp->unp_pairbusy++;
 	unp_pcb_hold(unp2);
 	UNP_PCB_UNLOCK(unp);
 
 	UNP_PCB_LOCK(unp2);
 	UNP_PCB_LOCK(unp);
 	KASSERT(unp->unp_conn == unp2 || unp->unp_conn == NULL,
 	    ("%s: socket %p was reconnected", __func__, unp));
 	/* Last lock-dropper wakes anyone who flagged UNP_WAITING on unp. */
 	if (--unp->unp_pairbusy == 0 && (unp->unp_flags & UNP_WAITING) != 0) {
 		unp->unp_flags &= ~UNP_WAITING;
 		wakeup(unp);
 	}
 	if (unp_pcb_rele(unp2)) {
 		/* unp2 is unlocked. */
 		return (NULL);
 	}
 	/* The connection was torn down while unp was unlocked. */
 	if (unp->unp_conn == NULL) {
 		UNP_PCB_UNLOCK(unp2);
 		return (NULL);
 	}
 	return (unp2);
 }
 
 /*
  * Definitions of protocols supported in the LOCAL domain: one protosw entry
  * each for SOCK_STREAM, SOCK_DGRAM and SOCK_SEQPACKET, followed by the
  * domain descriptor that ties them together.
  */
 static struct domain localdomain;
 static struct pr_usrreqs uipc_usrreqs_dgram, uipc_usrreqs_stream;
 static struct pr_usrreqs uipc_usrreqs_seqpacket;
 static struct protosw localsw[] = {
 {
 	.pr_type =		SOCK_STREAM,
 	.pr_domain =		&localdomain,
 	.pr_flags =		PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS,
 	.pr_ctloutput =		&uipc_ctloutput,
 	.pr_usrreqs =		&uipc_usrreqs_stream
 },
 {
 	.pr_type =		SOCK_DGRAM,
 	.pr_domain =		&localdomain,
 	.pr_flags =		PR_ATOMIC|PR_ADDR|PR_RIGHTS,
 	.pr_ctloutput =		&uipc_ctloutput,
 	.pr_usrreqs =		&uipc_usrreqs_dgram
 },
 {
 	.pr_type =		SOCK_SEQPACKET,
 	.pr_domain =		&localdomain,
 
 	/*
 	 * XXXRW: For now, PR_ADDR because soreceive will bump into them
 	 * due to our use of sbappendaddr.  A new sbappend variant is needed
 	 * that supports both atomic record writes and control data.
 	 */
 	.pr_flags =		PR_ADDR|PR_ATOMIC|PR_CONNREQUIRED|PR_WANTRCVD|
 				    PR_RIGHTS,
 	.pr_ctloutput =		&uipc_ctloutput,
 	.pr_usrreqs =		&uipc_usrreqs_seqpacket,
 },
 };
 
 /* The AF_LOCAL domain; dom_protoswNPROTOSW points one past localsw[]. */
 static struct domain localdomain = {
 	.dom_family =		AF_LOCAL,
 	.dom_name =		"local",
 	.dom_init =		unp_init,
 	.dom_externalize =	unp_externalize,
 	.dom_dispose =		unp_dispose,
 	.dom_protosw =		localsw,
 	.dom_protoswNPROTOSW =	&localsw[nitems(localsw)]
 };
 DOMAIN_SET(local);
 
 /*
  * pru_abort: if the socket has a connected peer, take a reference on the
  * peer and hand it to unp_drop().  The socket's own PCB lock is released
  * before unp_drop() is called.
  */
 static void
 uipc_abort(struct socket *so)
 {
 	struct unpcb *unp, *peer;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_abort: unp == NULL"));
 	UNP_PCB_UNLOCK_ASSERT(unp);
 
 	UNP_PCB_LOCK(unp);
 	peer = unp->unp_conn;
 	if (peer == NULL) {
 		UNP_PCB_UNLOCK(unp);
 		return;
 	}
 	unp_pcb_hold(peer);
 	UNP_PCB_UNLOCK(unp);
 	unp_drop(peer);
 }
 
 /*
  * pru_accept: return the name of the connected peer in a freshly allocated
  * sockaddr.  If the peer has already gone away, or was never bound, the
  * placeholder sun_noname is returned instead.  Always returns 0.
  */
 static int
 uipc_accept(struct socket *so, struct sockaddr **nam)
 {
 	struct unpcb *unp, *peer;
 	const struct sockaddr *sa;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_accept: unp == NULL"));
 
 	/* Allocate before taking any PCB lock; M_WAITOK may sleep. */
 	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
 
 	UNP_PCB_LOCK(unp);
 	peer = unp_pcb_lock_peer(unp);
 	sa = &sun_noname;
 	if (peer != NULL && peer->unp_addr != NULL)
 		sa = (struct sockaddr *)peer->unp_addr;
 	bcopy(sa, *nam, sa->sa_len);
 	if (peer == NULL)
 		UNP_PCB_UNLOCK(unp);
 	else
 		unp_pcb_unlock_pair(unp, peer);
 	return (0);
 }
 
 static int
 uipc_attach(struct socket *so, int proto, struct thread *td)
 {
 	u_long sendspace, recvspace;
 	struct unpcb *unp;
 	int error;
 	bool locked;
 
 	KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL"));
 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
 		switch (so->so_type) {
 		case SOCK_STREAM:
 			sendspace = unpst_sendspace;
 			recvspace = unpst_recvspace;
 			break;
 
 		case SOCK_DGRAM:
 			sendspace = unpdg_sendspace;
 			recvspace = unpdg_recvspace;
 			break;
 
 		case SOCK_SEQPACKET:
 			sendspace = unpsp_sendspace;
 			recvspace = unpsp_recvspace;
 			break;
 
 		default:
 			panic("uipc_attach");
 		}
 		error = soreserve(so, sendspace, recvspace);
 		if (error)
 			return (error);
 	}
 	unp = uma_zalloc(unp_zone, M_NOWAIT | M_ZERO);
 	if (unp == NULL)
 		return (ENOBUFS);
 	LIST_INIT(&unp->unp_refs);
 	UNP_PCB_LOCK_INIT(unp);
 	unp->unp_socket = so;
 	so->so_pcb = unp;
 	refcount_init(&unp->unp_refcount, 1);
 
 	if ((locked = UNP_LINK_WOWNED()) == false)
 		UNP_LINK_WLOCK();
 
 	unp->unp_gencnt = ++unp_gencnt;
 	unp->unp_ino = ++unp_ino;
 	unp_count++;
 	switch (so->so_type) {
 	case SOCK_STREAM:
 		LIST_INSERT_HEAD(&unp_shead, unp, unp_link);
 		break;
 
 	case SOCK_DGRAM:
 		LIST_INSERT_HEAD(&unp_dhead, unp, unp_link);
 		break;
 
 	case SOCK_SEQPACKET:
 		LIST_INSERT_HEAD(&unp_sphead, unp, unp_link);
 		break;
 
 	default:
 		panic("uipc_attach");
 	}
 
 	if (locked == false)
 		UNP_LINK_WUNLOCK();
 
 	return (0);
 }
 
 /*
  * Bind a UNIX domain socket to a file system name, looked up relative to
  * the directory descriptor fd (uipc_bind() passes AT_FDCWD).
  *
  * A VSOCK vnode is created at the path carried in nam and recorded in the
  * unpcb together with a copy of the address.  UNP_BINDING is set while the
  * PCB lock is dropped for the file system work, so a concurrent bind on the
  * same socket fails with EALREADY.
  *
  * Returns 0 on success; EAFNOSUPPORT for a non-AF_UNIX address; EINVAL for
  * a bad address length or a socket that is already bound; EALREADY if a
  * bind is in progress; EADDRINUSE if the name already exists; otherwise the
  * error from namei(), the MAC check, vn_start_write() or VOP_CREATE().
  */
 static int
 uipc_bindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
 	struct vattr vattr;
 	int error, namelen;
 	struct nameidata nd;
 	struct unpcb *unp;
 	struct vnode *vp;
 	struct mount *mp;
 	cap_rights_t rights;
 	char *buf;
 
 	if (nam->sa_family != AF_UNIX)
 		return (EAFNOSUPPORT);
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_bind: unp == NULL"));
 
 	/* The path must be non-empty and fit inside a sockaddr_un. */
 	if (soun->sun_len > sizeof(struct sockaddr_un))
 		return (EINVAL);
 	namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
 	if (namelen <= 0)
 		return (EINVAL);
 
 	/*
 	 * We don't allow simultaneous bind() calls on a single UNIX domain
 	 * socket, so flag in-progress operations, and return an error if an
 	 * operation is already in progress.
 	 *
 	 * Historically, we have not allowed a socket to be rebound, so this
 	 * also returns an error.  Not allowing re-binding simplifies the
 	 * implementation and avoids a great many possible failure modes.
 	 */
 	UNP_PCB_LOCK(unp);
 	if (unp->unp_vnode != NULL) {
 		UNP_PCB_UNLOCK(unp);
 		return (EINVAL);
 	}
 	if (unp->unp_flags & UNP_BINDING) {
 		UNP_PCB_UNLOCK(unp);
 		return (EALREADY);
 	}
 	unp->unp_flags |= UNP_BINDING;
 	UNP_PCB_UNLOCK(unp);
 
 	/* Copy the path into a NUL-terminated scratch buffer for namei(). */
 	buf = malloc(namelen + 1, M_TEMP, M_WAITOK);
 	bcopy(soun->sun_path, buf, namelen);
 	buf[namelen] = 0;
 
 restart:
 	NDINIT_ATRIGHTS(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME | NOCACHE,
-	    UIO_SYSSPACE, buf, fd, cap_rights_init(&rights, CAP_BINDAT), td);
+	    UIO_SYSSPACE, buf, fd, cap_rights_init_one(&rights, CAP_BINDAT),
+	    td);
 /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
 	error = namei(&nd);
 	if (error)
 		goto error;
 	vp = nd.ni_vp;
 	/*
 	 * Either the name already exists (vp != NULL) or the non-blocking
 	 * vn_start_write() failed.  Release what the lookup returned, then
 	 * fail with EADDRINUSE in the first case, or wait in
 	 * vn_start_write() and redo the lookup in the second.
 	 */
 	if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		/* vrele() if parent and target are the same vnode, else vput(). */
 		if (nd.ni_dvp == vp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		if (vp != NULL) {
 			vrele(vp);
 			error = EADDRINUSE;
 			goto error;
 		}
 		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
 		if (error)
 			goto error;
 		goto restart;
 	}
 	/* Create the socket node; mode is ACCESSPERMS masked by pd_cmask. */
 	VATTR_NULL(&vattr);
 	vattr.va_type = VSOCK;
 	vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_pd->pd_cmask);
 #ifdef MAC
 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
 	    &vattr);
 #endif
 	if (error == 0)
 		error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_dvp);
 	if (error) {
 		vn_finished_write(mp);
 		/* ERELOOKUP restarts the whole lookup. */
 		if (error == ERELOOKUP)
 			goto restart;
 		goto error;
 	}
 	vp = nd.ni_vp;
 	ASSERT_VOP_ELOCKED(vp, "uipc_bind");
 	/* Duplicate the address before taking the PCB lock; M_WAITOK. */
 	soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);
 
 	/* Publish the binding and clear the in-progress flag. */
 	UNP_PCB_LOCK(unp);
 	VOP_UNP_BIND(vp, unp);
 	unp->unp_vnode = vp;
 	unp->unp_addr = soun;
 	unp->unp_flags &= ~UNP_BINDING;
 	UNP_PCB_UNLOCK(unp);
 	VOP_UNLOCK(vp);
 	vn_finished_write(mp);
 	free(buf, M_TEMP);
 	return (0);
 
 error:
 	/* Common failure exit: clear UNP_BINDING and free the path copy. */
 	UNP_PCB_LOCK(unp);
 	unp->unp_flags &= ~UNP_BINDING;
 	UNP_PCB_UNLOCK(unp);
 	free(buf, M_TEMP);
 	return (error);
 }
 
 static int
 uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 
 	return (uipc_bindat(AT_FDCWD, so, nam, td));
 }
 
 /*
  * pru_connect: connect so to the address nam on behalf of the calling
  * thread.  Returns whatever unp_connect() returns.
  */
 static int
 uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 
 	KASSERT(td == curthread, ("uipc_connect: td != curthread"));
 	return (unp_connect(so, nam, td));
 }
 
 /*
  * pru_connectat: like uipc_connect(), but the descriptor fd is passed
  * through to unp_connectat().  Returns whatever unp_connectat() returns.
  */
 static int
 uipc_connectat(int fd, struct socket *so, struct sockaddr *nam,
     struct thread *td)
 {
 
 	KASSERT(td == curthread, ("uipc_connectat: td != curthread"));
 	return (unp_connectat(fd, so, nam, td));
 }
 
 /*
  * pru_close: detach the socket from its bound vnode, if any, and disconnect
  * it from its peer, if any.
  *
  * The pool mutex for the vnode is taken before the PCB lock.  Because
  * unp_vnode is first sampled without the PCB lock, it is checked again once
  * the lock is held.
  */
 static void
 uipc_close(struct socket *so)
 {
 	struct unpcb *unp, *peer;
 	struct mtx *vplock;
 	struct vnode *vp;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_close: unp == NULL"));
 
 	vplock = NULL;
 	vp = unp->unp_vnode;
 	if (vp != NULL) {
 		vplock = mtx_pool_find(mtxpool_sleep, vp);
 		mtx_lock(vplock);
 	}
 	UNP_PCB_LOCK(unp);
 	if (vp != NULL && unp->unp_vnode == NULL) {
 		/* The binding went away before we got the PCB lock. */
 		mtx_unlock(vplock);
 		vp = NULL;
 	}
 	if (vp != NULL) {
 		VOP_UNP_DETACH(vp);
 		unp->unp_vnode = NULL;
 	}
 
 	peer = unp_pcb_lock_peer(unp);
 	if (peer == NULL)
 		UNP_PCB_UNLOCK(unp);
 	else
 		unp_disconnect(unp, peer);
 
 	if (vp != NULL) {
 		mtx_unlock(vplock);
 		vrele(vp);
 	}
 }
 
 /*
  * pru_connect2: connect two sockets to each other.  Both PCBs are locked
  * with unp_pcb_lock_pair() around the call to unp_connect2(), whose result
  * is returned.
  */
 static int
 uipc_connect2(struct socket *so1, struct socket *so2)
 {
 	struct unpcb *pcb1, *pcb2;
 	int rv;
 
 	pcb1 = so1->so_pcb;
 	KASSERT(pcb1 != NULL, ("uipc_connect2: unp == NULL"));
 	pcb2 = so2->so_pcb;
 	KASSERT(pcb2 != NULL, ("uipc_connect2: unp2 == NULL"));
 
 	unp_pcb_lock_pair(pcb1, pcb2);
 	rv = unp_connect2(so1, so2, PRU_CONNECT2);
 	unp_pcb_unlock_pair(pcb1, pcb2);
 	return (rv);
 }
 
 /*
  * pru_detach: final teardown of a socket's unpcb.
  *
  * In order: release the receive buffer (non-listening sockets only), unlink
  * the PCB from the global lists, detach it from its bound vnode, disconnect
  * it from its peer, drop every socket on its unp_refs list, sever the
  * socket <-> PCB pointers, free the bound address and release the PCB
  * reference.  If any rights were in flight, the garbage collector task is
  * scheduled.
  */
 static void
 uipc_detach(struct socket *so)
 {
 	struct unpcb *unp, *unp2;
 	struct mtx *vplock;
 	struct vnode *vp;
 	int local_unp_rights;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_detach: unp == NULL"));
 
 	vp = NULL;
 	vplock = NULL;
 
 	SOCK_LOCK(so);
 	if (!SOLISTENING(so)) {
 		/*
 		 * Once the socket is removed from the global lists,
 		 * uipc_ready() will not be able to locate its socket buffer, so
 		 * clear the buffer now.  At this point internalized rights have
 		 * already been disposed of.
 		 */
 		sbrelease(&so->so_rcv, so);
 	}
 	SOCK_UNLOCK(so);
 
 	/* Unlink from the per-type list and, if flagged, the dead list. */
 	UNP_LINK_WLOCK();
 	LIST_REMOVE(unp, unp_link);
 	if (unp->unp_gcflag & UNPGC_DEAD)
 		LIST_REMOVE(unp, unp_dead);
 	unp->unp_gencnt = ++unp_gencnt;
 	--unp_count;
 	UNP_LINK_WUNLOCK();
 
 	UNP_PCB_UNLOCK_ASSERT(unp);
  restart:
 	/*
 	 * The vnode's pool mutex is taken before the PCB lock, so unp_vnode
 	 * is sampled unlocked and rechecked below; start over if the socket
 	 * turned out to be bound to a different vnode.
 	 */
 	if ((vp = unp->unp_vnode) != NULL) {
 		vplock = mtx_pool_find(mtxpool_sleep, vp);
 		mtx_lock(vplock);
 	}
 	UNP_PCB_LOCK(unp);
 	if (unp->unp_vnode != vp && unp->unp_vnode != NULL) {
 		if (vplock)
 			mtx_unlock(vplock);
 		UNP_PCB_UNLOCK(unp);
 		goto restart;
 	}
 	if ((vp = unp->unp_vnode) != NULL) {
 		VOP_UNP_DETACH(vp);
 		unp->unp_vnode = NULL;
 	}
 	if ((unp2 = unp_pcb_lock_peer(unp)) != NULL)
 		unp_disconnect(unp, unp2);
 	else
 		UNP_PCB_UNLOCK(unp);
 
 	/*
 	 * Drop every socket still on our unp_refs list.  The list lock is
 	 * released around each unp_drop() call; the reference taken first
 	 * keeps the entry valid in the meantime.
 	 */
 	UNP_REF_LIST_LOCK();
 	while (!LIST_EMPTY(&unp->unp_refs)) {
 		struct unpcb *ref = LIST_FIRST(&unp->unp_refs);
 
 		unp_pcb_hold(ref);
 		UNP_REF_LIST_UNLOCK();
 
 		MPASS(ref != unp);
 		UNP_PCB_UNLOCK_ASSERT(ref);
 		unp_drop(ref);
 		UNP_REF_LIST_LOCK();
 	}
 	UNP_REF_LIST_UNLOCK();
 
 	UNP_PCB_LOCK(unp);
 	/* Snapshot the in-flight count; used below to schedule the GC. */
 	local_unp_rights = unp_rights;
 	unp->unp_socket->so_pcb = NULL;
 	unp->unp_socket = NULL;
 	free(unp->unp_addr, M_SONAME);
 	unp->unp_addr = NULL;
 	/* unp_pcb_rele() unlocks and frees the PCB if this was the last ref. */
 	if (!unp_pcb_rele(unp))
 		UNP_PCB_UNLOCK(unp);
 	if (vp) {
 		mtx_unlock(vplock);
 		vrele(vp);
 	}
 	if (local_unp_rights)
 		taskqueue_enqueue_timeout(taskqueue_thread, &unp_gc_task, -1);
 }
 
 /*
  * pru_disconnect: disconnect the socket from its peer, if it has one.
  * Always returns 0.
  */
 static int
 uipc_disconnect(struct socket *so)
 {
 	struct unpcb *unp, *peer;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL"));
 
 	UNP_PCB_LOCK(unp);
 	peer = unp_pcb_lock_peer(unp);
 	if (peer == NULL)
 		UNP_PCB_UNLOCK(unp);
 	else
 		unp_disconnect(unp, peer);
 	return (0);
 }
 
 static int
 uipc_listen(struct socket *so, int backlog, struct thread *td)
 {
 	struct unpcb *unp;
 	int error;
 
 	if (so->so_type != SOCK_STREAM && so->so_type != SOCK_SEQPACKET)
 		return (EOPNOTSUPP);
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_listen: unp == NULL"));
 
 	UNP_PCB_LOCK(unp);
 	if (unp->unp_vnode == NULL) {
 		/* Already connected or not bound to an address. */
 		error = unp->unp_conn != NULL ? EINVAL : EDESTADDRREQ;
 		UNP_PCB_UNLOCK(unp);
 		return (error);
 	}
 
 	SOCK_LOCK(so);
 	error = solisten_proto_check(so);
 	if (error == 0) {
 		cru2xt(td, &unp->unp_peercred);
 		solisten_proto(so, backlog);
 	}
 	SOCK_UNLOCK(so);
 	UNP_PCB_UNLOCK(unp);
 	return (error);
 }
 
 /*
  * pru_peeraddr: return the address of the connected peer in a freshly
  * allocated sockaddr.  The placeholder sun_noname is returned when there is
  * no peer or the peer has no bound address.  Always returns 0.
  */
 static int
 uipc_peeraddr(struct socket *so, struct sockaddr **nam)
 {
 	struct unpcb *unp, *peer;
 	const struct sockaddr *sa;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL"));
 
 	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
 	UNP_LINK_RLOCK();
 	/*
 	 * XXX: It seems that the peer test always fails even when a
 	 * connection is established, so the no-peer branch exists as a
 	 * workaround that returns a PF_LOCAL sockaddr.
 	 */
 	peer = unp->unp_conn;
 	if (peer == NULL) {
 		sa = &sun_noname;
 		bcopy(sa, *nam, sa->sa_len);
 	} else {
 		/* Copy under the peer's lock so unp_addr is read consistently. */
 		UNP_PCB_LOCK(peer);
 		sa = &sun_noname;
 		if (peer->unp_addr != NULL)
 			sa = (struct sockaddr *) peer->unp_addr;
 		bcopy(sa, *nam, sa->sa_len);
 		UNP_PCB_UNLOCK(peer);
 	}
 	UNP_LINK_RUNLOCK();
 	return (0);
 }
 
 /*
  * pru_rcvd: called for stream and seqpacket sockets to relieve backpressure
  * on the sender: if the receive buffer has dropped below the peer's send
  * limits, clear SB_STOP on the peer's send buffer, then wake any writers.
  * Always returns 0.
  */
 static int
 uipc_rcvd(struct socket *so, int flags)
 {
 	struct unpcb *unp, *peer;
 	struct socket *peer_so;
 	u_int rcv_mbcnt, rcv_cc;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("%s: unp == NULL", __func__));
 	KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET,
 	    ("%s: socktype %d", __func__, so->so_type));
 
 	/*
 	 * Cache the receive buffer's counters so that the so_rcv lock is
 	 * not held across the update of the remote so_snd.
 	 */
 	SOCKBUF_LOCK(&so->so_rcv);
 	rcv_mbcnt = so->so_rcv.sb_mbcnt;
 	rcv_cc = sbavail(&so->so_rcv);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	/*
 	 * There is a benign race here.  If uipc_send() runs on the peer
 	 * right now it may append data and set SB_STOP, which we would then
 	 * clear even though the buffer is full.  The only ill effect is that
 	 * the sockbuf may exceed its size limit, and the limits are not
 	 * strictly guaranteed anyway.
 	 *
 	 * Holding unp's lock keeps unp_conn valid.  No lock on the peer is
 	 * needed: its unp_socket stays put as long as it cannot disconnect
 	 * from unp, which our lock prevents.
 	 */
 	UNP_PCB_LOCK(unp);
 	peer = unp->unp_conn;
 	if (peer != NULL) {
 		peer_so = peer->unp_socket;
 		SOCKBUF_LOCK(&peer_so->so_snd);
 		if (rcv_cc < peer_so->so_snd.sb_hiwat &&
 		    rcv_mbcnt < peer_so->so_snd.sb_mbmax)
 			peer_so->so_snd.sb_flags &= ~SB_STOP;
 		/* Called with the so_snd lock held; no explicit unlock here. */
 		sowwakeup_locked(peer_so);
 	}
 	UNP_PCB_UNLOCK(unp);
 	return (0);
 }
 
 /*
  * pru_send handler shared by the datagram, stream and seqpacket request
  * switches: place the data chain m, plus any ancillary data in control,
  * directly into the peer socket's receive buffer.
  *
  * so      - sending socket.
  * flags   - PRUS_* request flags.  PRUS_OOB is rejected with EOPNOTSUPP,
  *           PRUS_EOF additionally shuts down the send side once the
  *           per-type switch has run, and PRUS_NOTREADY leaves m for
  *           uipc_ready() to release.
  * m       - data to send.
  * nam     - optional destination address.  Datagram sockets connect to it
  *           for the duration of the call and disconnect again afterwards;
  *           stream/seqpacket sockets connect to it only when they are not
  *           already connected.
  * control - optional control messages; internalized by unp_internalize()
  *           before use.
  * td      - calling thread.
  *
  * Returns 0 on success, otherwise an errno value: EOPNOTSUPP, ENOTCONN,
  * EPIPE, ENOBUFS, or whatever unp_internalize()/unp_connect() returned.
  */
 static int
 uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
     struct mbuf *control, struct thread *td)
 {
 	struct unpcb *unp, *unp2;
 	struct socket *so2;
 	u_int mbcnt, sbcc;
 	int error;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("%s: unp == NULL", __func__));
 	KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM ||
 	    so->so_type == SOCK_SEQPACKET,
 	    ("%s: socktype %d", __func__, so->so_type));
 
 	error = 0;
 	if (flags & PRUS_OOB) {
 		error = EOPNOTSUPP;
 		goto release;
 	}
 	if (control != NULL && (error = unp_internalize(&control, td)))
 		goto release;
 
 	unp2 = NULL;
 	switch (so->so_type) {
 	case SOCK_DGRAM:
 	{
 		const struct sockaddr *from;
 
 		if (nam != NULL) {
 			error = unp_connect(so, nam, td);
 			if (error != 0)
 				break;
 		}
 		UNP_PCB_LOCK(unp);
 
 		/*
 		 * Because connect() and send() are non-atomic in a sendto()
 		 * with a target address, it's possible that the socket will
 		 * have disconnected before the send() can run.  In that case
 		 * return the slightly counter-intuitive but otherwise
 		 * correct error that the socket is not connected.
 		 */
 		unp2 = unp_pcb_lock_peer(unp);
 		if (unp2 == NULL) {
 			UNP_PCB_UNLOCK(unp);
 			error = ENOTCONN;
 			break;
 		}
 
 		if (unp2->unp_flags & UNP_WANTCRED_MASK)
 			control = unp_addsockcred(td, control,
 			    unp2->unp_flags);
 		/* Report our bound address, or the anonymous one if unbound. */
 		if (unp->unp_addr != NULL)
 			from = (struct sockaddr *)unp->unp_addr;
 		else
 			from = &sun_noname;
 		so2 = unp2->unp_socket;
 		SOCKBUF_LOCK(&so2->so_rcv);
 		if (sbappendaddr_locked(&so2->so_rcv, from, m,
 		    control)) {
 			/* The receive buffer now owns m and control. */
 			sorwakeup_locked(so2);
 			m = NULL;
 			control = NULL;
 		} else {
 			SOCKBUF_UNLOCK(&so2->so_rcv);
 			error = ENOBUFS;
 		}
 		/* Undo the temporary connection made for an explicit address. */
 		if (nam != NULL)
 			unp_disconnect(unp, unp2);
 		else
 			unp_pcb_unlock_pair(unp, unp2);
 		break;
 	}
 
 	case SOCK_SEQPACKET:
 	case SOCK_STREAM:
 		if ((so->so_state & SS_ISCONNECTED) == 0) {
 			if (nam != NULL) {
 				error = unp_connect(so, nam, td);
 				if (error != 0)
 					break;
 			} else {
 				error = ENOTCONN;
 				break;
 			}
 		}
 
 		UNP_PCB_LOCK(unp);
 		if ((unp2 = unp_pcb_lock_peer(unp)) == NULL) {
 			UNP_PCB_UNLOCK(unp);
 			error = ENOTCONN;
 			break;
 		} else if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 			unp_pcb_unlock_pair(unp, unp2);
 			error = EPIPE;
 			break;
 		}
 		/* From here on only the peer's PCB lock is held. */
 		UNP_PCB_UNLOCK(unp);
 		if ((so2 = unp2->unp_socket) == NULL) {
 			UNP_PCB_UNLOCK(unp2);
 			error = ENOTCONN;
 			break;
 		}
 		SOCKBUF_LOCK(&so2->so_rcv);
 		if (unp2->unp_flags & UNP_WANTCRED_MASK) {
 			/*
 			 * Credentials are passed only once on SOCK_STREAM and
 			 * SOCK_SEQPACKET (LOCAL_CREDS => WANTCRED_ONESHOT), or
 			 * forever (LOCAL_CREDS_PERSISTENT => WANTCRED_ALWAYS).
 			 */
 			control = unp_addsockcred(td, control, unp2->unp_flags);
 			unp2->unp_flags &= ~UNP_WANTCRED_ONESHOT;
 		}
 
 		/*
 		 * Send to paired receive port and wake up readers.  Don't
 		 * check for space available in the receive buffer if we're
 		 * attaching ancillary data; Unix domain sockets only check
 		 * for space in the sending sockbuf, and that check is
 		 * performed one level up the stack.  At that level we cannot
 		 * precisely account for the amount of buffer space used
 		 * (e.g., because control messages are not yet internalized).
 		 */
 		switch (so->so_type) {
 		case SOCK_STREAM:
 			if (control != NULL) {
 				sbappendcontrol_locked(&so2->so_rcv, m,
 				    control, flags);
 				control = NULL;
 			} else
 				sbappend_locked(&so2->so_rcv, m, flags);
 			break;
 
 		case SOCK_SEQPACKET:
 			if (sbappendaddr_nospacecheck_locked(&so2->so_rcv,
 			    &sun_noname, m, control))
 				control = NULL;
 			break;
 		}
 
 		/* Snapshot the receiver's usage while its sockbuf is locked. */
 		mbcnt = so2->so_rcv.sb_mbcnt;
 		sbcc = sbavail(&so2->so_rcv);
 		if (sbcc)
 			sorwakeup_locked(so2);
 		else
 			SOCKBUF_UNLOCK(&so2->so_rcv);
 
 		/*
 		 * The PCB lock on unp2 protects the SB_STOP flag.  Without it,
 		 * it would be possible for uipc_rcvd to be called at this
 		 * point, drain the receiving sockbuf, clear SB_STOP, and then
 		 * we would set SB_STOP below.  That could lead to an empty
 		 * sockbuf having SB_STOP set.
 		 */
 		SOCKBUF_LOCK(&so->so_snd);
 		if (sbcc >= so->so_snd.sb_hiwat || mbcnt >= so->so_snd.sb_mbmax)
 			so->so_snd.sb_flags |= SB_STOP;
 		SOCKBUF_UNLOCK(&so->so_snd);
 		UNP_PCB_UNLOCK(unp2);
 		m = NULL;
 		break;
 	}
 
 	/*
 	 * PRUS_EOF is equivalent to pru_send followed by pru_shutdown.
 	 * This point is reached whether or not the switch above failed.
 	 */
 	if (flags & PRUS_EOF) {
 		UNP_PCB_LOCK(unp);
 		socantsendmore(so);
 		unp_shutdown(unp);
 		UNP_PCB_UNLOCK(unp);
 	}
 	/* Control that was internalized but never queued is disposed first. */
 	if (control != NULL && error != 0)
 		unp_dispose_mbuf(control);
 
 release:
 	if (control != NULL)
 		m_freem(control);
 	/*
 	 * In case of PRUS_NOTREADY, uipc_ready() is responsible
 	 * for freeing memory.
 	 */
 	if (m != NULL && (flags & PRUS_NOTREADY) == 0)
 		m_freem(m);
 	return (error);
 }
 
 static bool
 uipc_ready_scan(struct socket *so, struct mbuf *m, int count, int *errorp)
 {
 	struct mbuf *mb, *n;
 	struct sockbuf *sb;
 
 	SOCK_LOCK(so);
 	if (SOLISTENING(so)) {
 		SOCK_UNLOCK(so);
 		return (false);
 	}
 	mb = NULL;
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 	if (sb->sb_fnrdy != NULL) {
 		for (mb = sb->sb_mb, n = mb->m_nextpkt; mb != NULL;) {
 			if (mb == m) {
 				*errorp = sbready(sb, m, count);
 				break;
 			}
 			mb = mb->m_next;
 			if (mb == NULL) {
 				mb = n;
 				if (mb != NULL)
 					n = mb->m_nextpkt;
 			}
 		}
 	}
 	SOCKBUF_UNLOCK(sb);
 	SOCK_UNLOCK(so);
 	return (mb != NULL);
 }
 
 /*
  * pru_ready handler.  It is asserted to run on SOCK_STREAM sockets only.
  *
  * so    - sending socket whose earlier not-ready data is now ready.
  * m     - first of the mbufs being marked ready.
  * count - number of mbufs, starting at m, to hand to sbready().
  *
  * If the peer is still connected, sbready() is applied to the peer's
  * receive buffer and readers are woken when it returns 0.  Otherwise every
  * PCB on unp_shead is searched for a receive buffer that still holds m; if
  * none does, count mbufs starting at m are freed and ECONNRESET is returned.
  */
 static int
 uipc_ready(struct socket *so, struct mbuf *m, int count)
 {
 	struct unpcb *unp, *unp2;
 	struct socket *so2;
 	int error, i;
 
 	unp = sotounpcb(so);
 
 	KASSERT(so->so_type == SOCK_STREAM,
 	    ("%s: unexpected socket type for %p", __func__, so));
 
 	UNP_PCB_LOCK(unp);
 	if ((unp2 = unp_pcb_lock_peer(unp)) != NULL) {
 		/* Fast path: only the peer's PCB lock is needed from here. */
 		UNP_PCB_UNLOCK(unp);
 		so2 = unp2->unp_socket;
 		SOCKBUF_LOCK(&so2->so_rcv);
 		if ((error = sbready(&so2->so_rcv, m, count)) == 0)
 			sorwakeup_locked(so2);
 		else
 			SOCKBUF_UNLOCK(&so2->so_rcv);
 		UNP_PCB_UNLOCK(unp2);
 		return (error);
 	}
 	UNP_PCB_UNLOCK(unp);
 
 	/*
 	 * The receiving socket has been disconnected, but may still be valid.
 	 * In this case, the now-ready mbufs are still present in its socket
 	 * buffer, so perform an exhaustive search before giving up and freeing
 	 * the mbufs.
 	 */
 	UNP_LINK_RLOCK();
 	LIST_FOREACH(unp, &unp_shead, unp_link) {
 		/* On a hit, error has been set through the errorp argument. */
 		if (uipc_ready_scan(unp->unp_socket, m, count, &error))
 			break;
 	}
 	UNP_LINK_RUNLOCK();
 
 	/* unp is NULL only when the list was exhausted without a match. */
 	if (unp == NULL) {
 		for (i = 0; i < count; i++)
 			m = m_free(m);
 		error = ECONNRESET;
 	}
 	return (error);
 }
 
 /*
  * pru_sense handler: fill in the stat fields a local socket can report.
  * The block size is the send buffer's high-water mark, the device is
  * NODEV and the inode number comes from the PCB.  Always returns 0.
  */
 static int
 uipc_sense(struct socket *so, struct stat *sb)
 {
 	struct unpcb *pcb = sotounpcb(so);
 
 	KASSERT(pcb != NULL, ("uipc_sense: unp == NULL"));
 
 	sb->st_ino = pcb->unp_ino;
 	sb->st_dev = NODEV;
 	sb->st_blksize = so->so_snd.sb_hiwat;
 	return (0);
 }
 
 /*
  * pru_shutdown handler: with the PCB locked, mark the socket as unable to
  * send any more and let unp_shutdown() notify the peer.  Always returns 0.
  */
 static int
 uipc_shutdown(struct socket *so)
 {
 	struct unpcb *pcb = sotounpcb(so);
 
 	KASSERT(pcb != NULL, ("uipc_shutdown: unp == NULL"));
 
 	UNP_PCB_LOCK(pcb);
 	socantsendmore(so);
 	unp_shutdown(pcb);
 	UNP_PCB_UNLOCK(pcb);
 
 	return (0);
 }
 
 /*
  * pru_sockaddr handler: return a freshly allocated (M_SONAME) copy of the
  * socket's bound address in *nam, or a copy of sun_noname when the socket
  * has no address.  The allocation uses M_WAITOK and is made before the PCB
  * lock is taken.  Always returns 0.
  */
 static int
 uipc_sockaddr(struct socket *so, struct sockaddr **nam)
 {
 	struct unpcb *pcb = sotounpcb(so);
 	const struct sockaddr *src;
 
 	KASSERT(pcb != NULL, ("uipc_sockaddr: unp == NULL"));
 
 	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
 
 	UNP_PCB_LOCK(pcb);
 	src = (pcb->unp_addr != NULL) ?
 	    (struct sockaddr *)pcb->unp_addr : &sun_noname;
 	bcopy(src, *nam, src->sa_len);
 	UNP_PCB_UNLOCK(pcb);
 
 	return (0);
 }
 
 static struct pr_usrreqs uipc_usrreqs_dgram = {
 	.pru_abort = 		uipc_abort,
 	.pru_accept =		uipc_accept,
 	.pru_attach =		uipc_attach,
 	.pru_bind =		uipc_bind,
 	.pru_bindat =		uipc_bindat,
 	.pru_connect =		uipc_connect,
 	.pru_connectat =	uipc_connectat,
 	.pru_connect2 =		uipc_connect2,
 	.pru_detach =		uipc_detach,
 	.pru_disconnect =	uipc_disconnect,
 	.pru_listen =		uipc_listen,
 	.pru_peeraddr =		uipc_peeraddr,
 	.pru_rcvd =		uipc_rcvd,
 	.pru_send =		uipc_send,
 	.pru_sense =		uipc_sense,
 	.pru_shutdown =		uipc_shutdown,
 	.pru_sockaddr =		uipc_sockaddr,
 	.pru_soreceive =	soreceive_dgram,
 	.pru_close =		uipc_close,
 };
 
 static struct pr_usrreqs uipc_usrreqs_seqpacket = {
 	.pru_abort =		uipc_abort,
 	.pru_accept =		uipc_accept,
 	.pru_attach =		uipc_attach,
 	.pru_bind =		uipc_bind,
 	.pru_bindat =		uipc_bindat,
 	.pru_connect =		uipc_connect,
 	.pru_connectat =	uipc_connectat,
 	.pru_connect2 =		uipc_connect2,
 	.pru_detach =		uipc_detach,
 	.pru_disconnect =	uipc_disconnect,
 	.pru_listen =		uipc_listen,
 	.pru_peeraddr =		uipc_peeraddr,
 	.pru_rcvd =		uipc_rcvd,
 	.pru_send =		uipc_send,
 	.pru_sense =		uipc_sense,
 	.pru_shutdown =		uipc_shutdown,
 	.pru_sockaddr =		uipc_sockaddr,
 	.pru_soreceive =	soreceive_generic,	/* XXX: or...? */
 	.pru_close =		uipc_close,
 };
 
 static struct pr_usrreqs uipc_usrreqs_stream = {
 	.pru_abort = 		uipc_abort,
 	.pru_accept =		uipc_accept,
 	.pru_attach =		uipc_attach,
 	.pru_bind =		uipc_bind,
 	.pru_bindat =		uipc_bindat,
 	.pru_connect =		uipc_connect,
 	.pru_connectat =	uipc_connectat,
 	.pru_connect2 =		uipc_connect2,
 	.pru_detach =		uipc_detach,
 	.pru_disconnect =	uipc_disconnect,
 	.pru_listen =		uipc_listen,
 	.pru_peeraddr =		uipc_peeraddr,
 	.pru_rcvd =		uipc_rcvd,
 	.pru_send =		uipc_send,
 	.pru_ready =		uipc_ready,
 	.pru_sense =		uipc_sense,
 	.pru_shutdown =		uipc_shutdown,
 	.pru_sockaddr =		uipc_sockaddr,
 	.pru_soreceive =	soreceive_generic,
 	.pru_close =		uipc_close,
 };
 
 /*
  * Socket-option handler for local sockets.  Only SOL_LOCAL is accepted;
  * any other level yields EINVAL.
  *
  * SOPT_GET: LOCAL_PEERCRED copies out the cached peer credentials (ENOTCONN
  * for stream sockets, EINVAL for other types, when none are recorded);
  * LOCAL_CREDS, LOCAL_CREDS_PERSISTENT and LOCAL_CONNWAIT copy out 0 or 1
  * according to the matching unp_flags bit.  Unknown names give EOPNOTSUPP.
  *
  * SOPT_SET: LOCAL_CREDS, LOCAL_CREDS_PERSISTENT and LOCAL_CONNWAIT set or
  * clear the matching unp_flags bit from an int argument.  Unknown names
  * give ENOPROTOOPT.
  *
  * Any other direction gives EOPNOTSUPP.  Returns 0 or an errno value.
  */
 static int
 uipc_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	struct unpcb *unp;
 	struct xucred xu;
 	int error, optval;
 
 	if (sopt->sopt_level != SOL_LOCAL)
 		return (EINVAL);
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL"));
 	error = 0;
 	switch (sopt->sopt_dir) {
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 		case LOCAL_PEERCRED:
 			/* Copy under the lock, then copy out after dropping it. */
 			UNP_PCB_LOCK(unp);
 			if (unp->unp_flags & UNP_HAVEPC)
 				xu = unp->unp_peercred;
 			else {
 				if (so->so_type == SOCK_STREAM)
 					error = ENOTCONN;
 				else
 					error = EINVAL;
 			}
 			UNP_PCB_UNLOCK(unp);
 			if (error == 0)
 				error = sooptcopyout(sopt, &xu, sizeof(xu));
 			break;
 
 		case LOCAL_CREDS:
 			/* Unlocked read. */
 			optval = unp->unp_flags & UNP_WANTCRED_ONESHOT ? 1 : 0;
 			error = sooptcopyout(sopt, &optval, sizeof(optval));
 			break;
 
 		case LOCAL_CREDS_PERSISTENT:
 			/* Unlocked read. */
 			optval = unp->unp_flags & UNP_WANTCRED_ALWAYS ? 1 : 0;
 			error = sooptcopyout(sopt, &optval, sizeof(optval));
 			break;
 
 		case LOCAL_CONNWAIT:
 			/* Unlocked read. */
 			optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0;
 			error = sooptcopyout(sopt, &optval, sizeof(optval));
 			break;
 
 		default:
 			error = EOPNOTSUPP;
 			break;
 		}
 		break;
 
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		case LOCAL_CREDS:
 		case LOCAL_CREDS_PERSISTENT:
 		case LOCAL_CONNWAIT:
 			error = sooptcopyin(sopt, &optval, sizeof(optval),
 					    sizeof(optval));
 			if (error)
 				break;
 
 /*
  * OPTSET(bit, exclusive): under the PCB lock, set `bit` in unp_flags when
  * optval is non-zero, or clear it when optval is zero.  Setting is refused
  * with EINVAL if any flag in `exclusive` is already set.  The `break` in
  * the refusal path leaves only the macro's own do/while, after the lock
  * has already been released.
  */
 #define	OPTSET(bit, exclusive) do {					\
 	UNP_PCB_LOCK(unp);						\
 	if (optval) {							\
 		if ((unp->unp_flags & (exclusive)) != 0) {		\
 			UNP_PCB_UNLOCK(unp);				\
 			error = EINVAL;					\
 			break;						\
 		}							\
 		unp->unp_flags |= (bit);				\
 	} else								\
 		unp->unp_flags &= ~(bit);				\
 	UNP_PCB_UNLOCK(unp);						\
 } while (0)
 
 			switch (sopt->sopt_name) {
 			case LOCAL_CREDS:
 				/* One-shot and persistent modes exclude each other. */
 				OPTSET(UNP_WANTCRED_ONESHOT, UNP_WANTCRED_ALWAYS);
 				break;
 
 			case LOCAL_CREDS_PERSISTENT:
 				OPTSET(UNP_WANTCRED_ALWAYS, UNP_WANTCRED_ONESHOT);
 				break;
 
 			case LOCAL_CONNWAIT:
 				OPTSET(UNP_CONNWAIT, 0);
 				break;
 
 			default:
 				break;
 			}
 			break;
 #undef	OPTSET
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 	return (error);
 }
 
 static int
 unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 
 	return (unp_connectat(AT_FDCWD, so, nam, td));
 }
 
 /*
  * Connect socket so to the local socket bound at the path in nam, with the
  * lookup made relative to directory descriptor fd.
  *
  * fd  - directory descriptor handed to the name lookup.
  * so  - socket being connected.
  * nam - AF_UNIX address whose sun_path names the target.
  * td  - calling thread; supplies credentials for the access checks.
  *
  * Returns 0 or an errno value: EAFNOSUPPORT, EINVAL, EISCONN, EALREADY,
  * ENOTSOCK, ECONNREFUSED, EPROTOTYPE, or an error from namei(), the MAC
  * check, VOP_ACCESS() or unp_connect2().
  *
  * UNP_CONNECTING is set on the PCB for the duration of the attempt and is
  * cleared on every exit path that comes after it was set.
  */
 static int
 unp_connectat(int fd, struct socket *so, struct sockaddr *nam,
     struct thread *td)
 {
 	struct mtx *vplock;
 	struct sockaddr_un *soun;
 	struct vnode *vp;
 	struct socket *so2;
 	struct unpcb *unp, *unp2, *unp3;
 	struct nameidata nd;
 	char buf[SOCK_MAXADDRLEN];
 	struct sockaddr *sa;
 	cap_rights_t rights;
 	int error, len;
 	bool connreq;
 
 	if (nam->sa_family != AF_UNIX)
 		return (EAFNOSUPPORT);
 	if (nam->sa_len > sizeof(struct sockaddr_un))
 		return (EINVAL);
 	/* len is the number of path bytes that follow the sockaddr header. */
 	len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
 	if (len <= 0)
 		return (EINVAL);
 	/* Take a NUL-terminated private copy of the path for the lookup. */
 	soun = (struct sockaddr_un *)nam;
 	bcopy(soun->sun_path, buf, len);
 	buf[len] = 0;
 
 	unp = sotounpcb(so);
 	UNP_PCB_LOCK(unp);
 	for (;;) {
 		/*
 		 * Wait for connection state to stabilize.  If a connection
 		 * already exists, give up.  For datagram sockets, which permit
 		 * multiple consecutive connect(2) calls, upper layers are
 		 * responsible for disconnecting in advance of a subsequent
 		 * connect(2), but this is not synchronized with PCB connection
 		 * state.
 		 *
 		 * Also make sure that no threads are currently attempting to
 		 * lock the peer socket, to ensure that unp_conn cannot
 		 * transition between two valid sockets while locks are dropped.
 		 */
 		if (unp->unp_conn != NULL) {
 			UNP_PCB_UNLOCK(unp);
 			return (EISCONN);
 		}
 		if ((unp->unp_flags & UNP_CONNECTING) != 0) {
 			UNP_PCB_UNLOCK(unp);
 			return (EALREADY);
 		}
 		if (unp->unp_pairbusy > 0) {
 			unp->unp_flags |= UNP_WAITING;
 			mtx_sleep(unp, UNP_PCB_LOCKPTR(unp), 0, "unpeer", 0);
 			continue;
 		}
 		break;
 	}
 	unp->unp_flags |= UNP_CONNECTING;
 	UNP_PCB_UNLOCK(unp);
 
 	/*
 	 * For connection-required protocols, preallocate room for the
 	 * accepted socket's copy of the listener's address while no locks
 	 * are held.
 	 */
 	connreq = (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0;
 	if (connreq)
 		sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
 	else
 		sa = NULL;
 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF,
-	    UIO_SYSSPACE, buf, fd, cap_rights_init(&rights, CAP_CONNECTAT), td);
+	    UIO_SYSSPACE, buf, fd, cap_rights_init_one(&rights, CAP_CONNECTAT),
+	    td);
 	error = namei(&nd);
 	if (error)
 		vp = NULL;
 	else
 		vp = nd.ni_vp;
 	ASSERT_VOP_LOCKED(vp, "unp_connect");
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error)
 		goto bad;
 
 	if (vp->v_type != VSOCK) {
 		error = ENOTSOCK;
 		goto bad;
 	}
 #ifdef MAC
 	error = mac_vnode_check_open(td->td_ucred, vp, VWRITE | VREAD);
 	if (error)
 		goto bad;
 #endif
 	error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td);
 	if (error)
 		goto bad;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
 
 	/* The pool mutex keyed on vp is held across the rest of the attempt. */
 	vplock = mtx_pool_find(mtxpool_sleep, vp);
 	mtx_lock(vplock);
 	VOP_UNP_CONNECT(vp, &unp2);
 	if (unp2 == NULL) {
 		error = ECONNREFUSED;
 		goto bad2;
 	}
 	so2 = unp2->unp_socket;
 	if (so->so_type != so2->so_type) {
 		error = EPROTOTYPE;
 		goto bad2;
 	}
 	if (connreq) {
 		/* Only a listening socket can spawn the server-side socket. */
 		if (so2->so_options & SO_ACCEPTCONN) {
 			CURVNET_SET(so2->so_vnet);
 			so2 = sonewconn(so2, 0);
 			CURVNET_RESTORE();
 		} else
 			so2 = NULL;
 		if (so2 == NULL) {
 			error = ECONNREFUSED;
 			goto bad2;
 		}
 		/* unp2 is the listener, unp3 the newly created socket's PCB. */
 		unp3 = sotounpcb(so2);
 		unp_pcb_lock_pair(unp2, unp3);
 		if (unp2->unp_addr != NULL) {
 			bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len);
 			unp3->unp_addr = (struct sockaddr_un *) sa;
 			/* Ownership moved to unp3; keep free() below a no-op. */
 			sa = NULL;
 		}
 
 		unp_copy_peercred(td, unp3, unp, unp2);
 
 		UNP_PCB_UNLOCK(unp2);
 		unp2 = unp3;
 
 		/*
 		 * It is safe to block on the PCB lock here since unp2 is
 		 * nascent and cannot be connected to any other sockets.
 		 */
 		UNP_PCB_LOCK(unp);
 #ifdef MAC
 		mac_socketpeer_set_from_socket(so, so2);
 		mac_socketpeer_set_from_socket(so2, so);
 #endif
 	} else {
 		unp_pcb_lock_pair(unp, unp2);
 	}
 	KASSERT(unp2 != NULL && so2 != NULL && unp2->unp_socket == so2 &&
 	    sotounpcb(so2) == unp2,
 	    ("%s: unp2 %p so2 %p", __func__, unp2, so2));
 	error = unp_connect2(so, so2, PRU_CONNECT);
 	unp_pcb_unlock_pair(unp, unp2);
 bad2:
 	mtx_unlock(vplock);
 bad:
 	if (vp != NULL) {
 		vput(vp);
 	}
 	free(sa, M_SONAME);
 	/* Every path reaching this point set UNP_CONNECTING earlier. */
 	UNP_PCB_LOCK(unp);
 	KASSERT((unp->unp_flags & UNP_CONNECTING) != 0,
 	    ("%s: unp %p has UNP_CONNECTING clear", __func__, unp));
 	unp->unp_flags &= ~UNP_CONNECTING;
 	UNP_PCB_UNLOCK(unp);
 	return (error);
 }
 
 /*
  * Record peer credentials on both ends of a new connection.
  *
  * client_unp receives the credentials of the calling thread td (via
  * cru2xt()).  server_unp receives a copy of the credentials stored on
  * listen_unp, which the original comment says uipc_listen cached when
  * listen(2) was called.  Both PCBs are flagged UNP_HAVEPC, and client_unp
  * additionally inherits listen_unp's UNP_WANTCRED_MASK flags.
  */
 void
 unp_copy_peercred(struct thread *td, struct unpcb *client_unp,
     struct unpcb *server_unp, struct unpcb *listen_unp)
 {
 	/* Credentials taken from the connecting thread. */
 	cru2xt(td, &client_unp->unp_peercred);
 
 	/* Credentials taken from the listening socket's PCB. */
 	memcpy(&server_unp->unp_peercred, &listen_unp->unp_peercred,
 	    sizeof(server_unp->unp_peercred));
 	server_unp->unp_flags |= UNP_HAVEPC;
 
 	client_unp->unp_flags |= UNP_HAVEPC |
 	    (listen_unp->unp_flags & UNP_WANTCRED_MASK);
 }
 
 static int
 unp_connect2(struct socket *so, struct socket *so2, int req)
 {
 	struct unpcb *unp;
 	struct unpcb *unp2;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("unp_connect2: unp == NULL"));
 	unp2 = sotounpcb(so2);
 	KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL"));
 
 	UNP_PCB_LOCK_ASSERT(unp);
 	UNP_PCB_LOCK_ASSERT(unp2);
 	KASSERT(unp->unp_conn == NULL,
 	    ("%s: socket %p is already connected", __func__, unp));
 
 	if (so2->so_type != so->so_type)
 		return (EPROTOTYPE);
 	unp->unp_conn = unp2;
 	unp_pcb_hold(unp2);
 	unp_pcb_hold(unp);
 	switch (so->so_type) {
 	case SOCK_DGRAM:
 		UNP_REF_LIST_LOCK();
 		LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
 		UNP_REF_LIST_UNLOCK();
 		soisconnected(so);
 		break;
 
 	case SOCK_STREAM:
 	case SOCK_SEQPACKET:
 		KASSERT(unp2->unp_conn == NULL,
 		    ("%s: socket %p is already connected", __func__, unp2));
 		unp2->unp_conn = unp;
 		if (req == PRU_CONNECT &&
 		    ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT))
 			soisconnecting(so);
 		else
 			soisconnected(so);
 		soisconnected(so2);
 		break;
 
 	default:
 		panic("unp_connect2");
 	}
 	return (0);
 }
 
 /*
  * Break the connection from unp to unp2.
  *
  * Both PCBs must be locked on entry and unp must currently be connected to
  * unp2.  On return neither lock is held by the caller: each PCB is either
  * unlocked here or handed to unp_pcb_rele(), which reports (by returning
  * true) that no unlock is needed.
  *
  * Datagram sockets are removed from the peer's unp_refs list and lose
  * SS_ISCONNECTED.  Stream and seqpacket sockets have both directions of the
  * link cleared and both sockets marked disconnected.
  */
 static void
 unp_disconnect(struct unpcb *unp, struct unpcb *unp2)
 {
 	struct socket *so, *so2;
 #ifdef INVARIANTS
 	struct unpcb *unptmp;
 #endif
 
 	UNP_PCB_LOCK_ASSERT(unp);
 	UNP_PCB_LOCK_ASSERT(unp2);
 	KASSERT(unp->unp_conn == unp2,
 	    ("%s: unpcb %p is not connected to %p", __func__, unp, unp2));
 
 	unp->unp_conn = NULL;
 	so = unp->unp_socket;
 	so2 = unp2->unp_socket;
 	switch (unp->unp_socket->so_type) {
 	case SOCK_DGRAM:
 		UNP_REF_LIST_LOCK();
 #ifdef INVARIANTS
 		/* Debug check: unp must be on unp2's reference list. */
 		LIST_FOREACH(unptmp, &unp2->unp_refs, unp_reflink) {
 			if (unptmp == unp)
 				break;
 		}
 		KASSERT(unptmp != NULL,
 		    ("%s: %p not found in reflist of %p", __func__, unp, unp2));
 #endif
 		LIST_REMOVE(unp, unp_reflink);
 		UNP_REF_LIST_UNLOCK();
 		if (so) {
 			SOCK_LOCK(so);
 			so->so_state &= ~SS_ISCONNECTED;
 			SOCK_UNLOCK(so);
 		}
 		break;
 
 	case SOCK_STREAM:
 	case SOCK_SEQPACKET:
 		if (so)
 			soisdisconnected(so);
 		MPASS(unp2->unp_conn == unp);
 		unp2->unp_conn = NULL;
 		if (so2)
 			soisdisconnected(so2);
 		break;
 	}
 
 	/*
 	 * Drop the two references taken when the connection was made.  When
 	 * a PCB is connected to itself both references are on one object, so
 	 * the first release cannot be the last.
 	 */
 	if (unp == unp2) {
 		unp_pcb_rele_notlast(unp);
 		if (!unp_pcb_rele(unp))
 			UNP_PCB_UNLOCK(unp);
 	} else {
 		if (!unp_pcb_rele(unp))
 			UNP_PCB_UNLOCK(unp);
 		if (!unp_pcb_rele(unp2))
 			UNP_PCB_UNLOCK(unp2);
 	}
 }
 
 /*
  * unp_pcblist() walks the global list of struct unpcb's to generate a
  * pointer list, bumping the refcount on each unpcb.  It then copies them out
  * sequentially, validating the generation number on each to see if it has
  * been detached.  All of this is necessary because copyout() may sleep on
  * disk I/O.
  *
  * arg1 selects the list to export (SOCK_STREAM, SOCK_DGRAM or
  * SOCK_SEQPACKET); any other value panics.  The output is a struct xunpgen
  * header, one struct xunpcb per exported PCB, and a trailing struct xunpgen
  * with refreshed counters.  A size-only request (oldptr == NULL) gets an
  * estimate and returns 0; an attempt to write (newptr != NULL) gets EPERM.
  */
 static int
 unp_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	struct unpcb *unp, **unp_list;
 	unp_gen_t gencnt;
 	struct xunpgen *xug;
 	struct unp_head *head;
 	struct xunpcb *xu;
 	u_int i;
 	int error, n;
 
 	switch ((intptr_t)arg1) {
 	case SOCK_STREAM:
 		head = &unp_shead;
 		break;
 
 	case SOCK_DGRAM:
 		head = &unp_dhead;
 		break;
 
 	case SOCK_SEQPACKET:
 		head = &unp_sphead;
 		break;
 
 	default:
 		panic("unp_pcblist: arg1 %d", (int)(intptr_t)arg1);
 	}
 
 	/*
 	 * The process of preparing the PCB list is too time-consuming and
 	 * resource-intensive to repeat twice on every request.
 	 */
 	if (req->oldptr == NULL) {
 		/* Estimate: two headers plus the current count with 1/8 slack. */
 		n = unp_count;
 		req->oldidx = 2 * (sizeof *xug)
 			+ (n + n/8) * sizeof(struct xunpcb);
 		return (0);
 	}
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	/*
 	 * OK, now we're committed to doing something.
 	 */
 	xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK | M_ZERO);
 	UNP_LINK_RLOCK();
 	gencnt = unp_gencnt;
 	n = unp_count;
 	UNP_LINK_RUNLOCK();
 
 	xug->xug_len = sizeof *xug;
 	xug->xug_count = n;
 	xug->xug_gen = gencnt;
 	xug->xug_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, xug, sizeof *xug);
 	if (error) {
 		free(xug, M_TEMP);
 		return (error);
 	}
 
 	unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);
 
 	/*
 	 * Pass 1: under the link lock, collect and hold up to n PCBs that
 	 * are no newer than the snapshot generation.  A PCB is skipped when
 	 * cr_cansee() returns non-zero for the requester's credentials.
 	 */
 	UNP_LINK_RLOCK();
 	for (unp = LIST_FIRST(head), i = 0; unp && i < n;
 	     unp = LIST_NEXT(unp, unp_link)) {
 		UNP_PCB_LOCK(unp);
 		if (unp->unp_gencnt <= gencnt) {
 			if (cr_cansee(req->td->td_ucred,
 			    unp->unp_socket->so_cred)) {
 				UNP_PCB_UNLOCK(unp);
 				continue;
 			}
 			unp_list[i++] = unp;
 			unp_pcb_hold(unp);
 		}
 		UNP_PCB_UNLOCK(unp);
 	}
 	UNP_LINK_RUNLOCK();
 	n = i;			/* In case we lost some during malloc. */
 
 	/*
 	 * Pass 2: release each hold and copy the PCB out.  The loop always
 	 * visits every entry so that no reference is left behind.
 	 * NOTE(review): error is reassigned by each SYSCTL_OUT, so a later
 	 * success would overwrite an earlier failure; confirm this is
 	 * intended.
 	 */
 	error = 0;
 	xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO);
 	for (i = 0; i < n; i++) {
 		unp = unp_list[i];
 		UNP_PCB_LOCK(unp);
 		/* A true return means the lock must not be dropped here. */
 		if (unp_pcb_rele(unp))
 			continue;
 
 		if (unp->unp_gencnt <= gencnt) {
 			xu->xu_len = sizeof *xu;
 			xu->xu_unpp = (uintptr_t)unp;
 			/*
 			 * XXX - need more locking here to protect against
 			 * connect/disconnect races for SMP.
 			 */
 			if (unp->unp_addr != NULL)
 				bcopy(unp->unp_addr, &xu->xu_addr,
 				      unp->unp_addr->sun_len);
 			else
 				bzero(&xu->xu_addr, sizeof(xu->xu_addr));
 			if (unp->unp_conn != NULL &&
 			    unp->unp_conn->unp_addr != NULL)
 				bcopy(unp->unp_conn->unp_addr,
 				      &xu->xu_caddr,
 				      unp->unp_conn->unp_addr->sun_len);
 			else
 				bzero(&xu->xu_caddr, sizeof(xu->xu_caddr));
 			xu->unp_vnode = (uintptr_t)unp->unp_vnode;
 			xu->unp_conn = (uintptr_t)unp->unp_conn;
 			xu->xu_firstref = (uintptr_t)LIST_FIRST(&unp->unp_refs);
 			xu->xu_nextref = (uintptr_t)LIST_NEXT(unp, unp_reflink);
 			xu->unp_gencnt = unp->unp_gencnt;
 			sotoxsocket(unp->unp_socket, &xu->xu_socket);
 			/* Unlock before SYSCTL_OUT; see the header comment. */
 			UNP_PCB_UNLOCK(unp);
 			error = SYSCTL_OUT(req, xu, sizeof *xu);
 		} else {
 			UNP_PCB_UNLOCK(unp);
 		}
 	}
 	free(xu, M_TEMP);
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.  If the
 		 * generation differs from what we told her before, she knows
 		 * that something happened while we were processing this
 		 * request, and it might be necessary to retry.
 		 */
 		xug->xug_gen = unp_gencnt;
 		xug->xug_sogen = so_gencnt;
 		xug->xug_count = unp_count;
 		error = SYSCTL_OUT(req, xug, sizeof *xug);
 	}
 	free(unp_list, M_TEMP);
 	free(xug, M_TEMP);
 	return (error);
 }
 
 /*
  * Register one read-only, MP-safe, opaque "pcblist" sysctl under each of
  * _net_local_dgram, _net_local_stream and _net_local_seqpacket.  All three
  * are served by unp_pcblist(); the socket type passed as arg1 is what that
  * handler switches on to pick the PCB list.
  */
 SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist,
     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
     (void *)(intptr_t)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb",
     "List of active local datagram sockets");
 SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist,
     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
     (void *)(intptr_t)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb",
     "List of active local stream sockets");
 SYSCTL_PROC(_net_local_seqpacket, OID_AUTO, pcblist,
     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
     (void *)(intptr_t)SOCK_SEQPACKET, 0, unp_pcblist, "S,xunpcb",
     "List of active local seqpacket sockets");
 
 /*
  * Tell the connected peer that no more data will arrive.  This applies only
  * to stream and seqpacket sockets that have a peer whose socket pointer is
  * still set; in every other case it does nothing.  The caller must hold
  * unp's PCB lock.
  */
 static void
 unp_shutdown(struct unpcb *unp)
 {
 	struct unpcb *peer;
 	struct socket *peerso;
 
 	UNP_PCB_LOCK_ASSERT(unp);
 
 	switch (unp->unp_socket->so_type) {
 	case SOCK_STREAM:
 	case SOCK_SEQPACKET:
 		break;
 	default:
 		return;
 	}
 
 	peer = unp->unp_conn;
 	if (peer == NULL)
 		return;
 
 	peerso = peer->unp_socket;
 	if (peerso != NULL)
 		socantrcvmore(peerso);
 }
 
 /*
  * Drop unp's connection and release one reference on unp.
  *
  * The socket's error is set to ECONNRESET no matter how the connection
  * ended; the original comment attributes this to a POSIX requirement.
  * unp's PCB lock is taken here and is not held on return.
  */
 static void
 unp_drop(struct unpcb *unp)
 {
 	struct socket *sock;
 	struct unpcb *peer;
 
 	sock = unp->unp_socket;
 
 	UNP_PCB_LOCK(unp);
 	if (sock != NULL)
 		sock->so_error = ECONNRESET;
 
 	peer = unp_pcb_lock_peer(unp);
 	if (peer == NULL) {
 		/* No peer: just release our reference and unlock if needed. */
 		if (!unp_pcb_rele(unp))
 			UNP_PCB_UNLOCK(unp);
 		return;
 	}
 
 	/* Last reference dropped in unp_disconnect(). */
 	unp_pcb_rele_notlast(unp);
 	unp_disconnect(unp, peer);
 }
 
 /*
  * Release fdcount in-flight descriptors referenced by the pointer array
  * fdep.  Each entry's capability data is freed and its file is passed to
  * unp_discard().  Finally the storage that fdep[0] points at is freed,
  * which also covers the remaining entries, since unp_internalize()
  * allocates them as one M_FILECAPS array.  fdcount must be positive.
  */
 static void
 unp_freerights(struct filedescent **fdep, int fdcount)
 {
 	struct filedescent *fde;
 	struct file *fp;
 	int idx;
 
 	KASSERT(fdcount > 0, ("%s: fdcount %d", __func__, fdcount));
 
 	idx = 0;
 	while (idx < fdcount) {
 		fde = fdep[idx++];
 		fp = fde->fde_file;
 		filecaps_free(&fde->fde_caps);
 		unp_discard(fp);
 	}
 	free(fdep[0], M_FILECAPS);
 }
 
 /*
  * Convert an internalized control mbuf into the form delivered to the
  * receiving process.
  *
  * control  - mbuf holding one or more control messages; always freed before
  *            returning.
  * controlp - where to store the resulting chain of externalized messages.
  *            NULL means "discard": passed rights are released and nothing
  *            is produced.
  * flags    - MSG_CMSG_CLOEXEC makes installed descriptors close-on-exec.
  *
  * SCM_RIGHTS messages carry an array of struct filedescent pointers; each
  * is installed into the current thread's descriptor table and replaced by
  * the new descriptor number.  All other messages are copied unchanged.
  * Once an error has been recorded, the remaining SCM_RIGHTS messages are
  * released rather than installed and other messages are skipped.
  *
  * Returns 0 or an errno value (EINVAL, E2BIG, EMSGSIZE, ENOBUFS).
  */
 static int
 unp_externalize(struct mbuf *control, struct mbuf **controlp, int flags)
 {
 	struct thread *td = curthread;		/* XXX */
 	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
 	int i;
 	int *fdp;
 	struct filedesc *fdesc = td->td_proc->p_fd;
 	struct filedescent **fdep;
 	void *data;
 	socklen_t clen = control->m_len, datalen;
 	int error, newfds;
 	u_int newlen;
 
 	UNP_LINK_UNLOCK_ASSERT();
 
 	error = 0;
 	if (controlp != NULL) /* controlp == NULL => free control messages */
 		*controlp = NULL;
 	while (cm != NULL) {
 		/* Stop at a header that does not fit in what is left. */
 		if (sizeof(*cm) > clen || cm->cmsg_len > clen) {
 			error = EINVAL;
 			break;
 		}
 		data = CMSG_DATA(cm);
 		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
 		if (cm->cmsg_level == SOL_SOCKET
 		    && cm->cmsg_type == SCM_RIGHTS) {
 			newfds = datalen / sizeof(*fdep);
 			if (newfds == 0)
 				goto next;
 			fdep = data;
 
 			/* If we're not outputting the descriptors free them. */
 			if (error || controlp == NULL) {
 				unp_freerights(fdep, newfds);
 				goto next;
 			}
 			FILEDESC_XLOCK(fdesc);
 
 			/*
 			 * Now change each pointer to an fd in the global
 			 * table to an integer that is the index to the local
 			 * fd table entry that we set up to point to the
 			 * global one we are transferring.
 			 */
 			newlen = newfds * sizeof(int);
 			*controlp = sbcreatecontrol(NULL, newlen,
 			    SCM_RIGHTS, SOL_SOCKET);
 			if (*controlp == NULL) {
 				FILEDESC_XUNLOCK(fdesc);
 				error = E2BIG;
 				unp_freerights(fdep, newfds);
 				goto next;
 			}
 
 			/* Reserve all descriptor slots before installing any. */
 			fdp = (int *)
 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
 			if (fdallocn(td, 0, fdp, newfds) != 0) {
 				FILEDESC_XUNLOCK(fdesc);
 				error = EMSGSIZE;
 				unp_freerights(fdep, newfds);
 				m_freem(*controlp);
 				*controlp = NULL;
 				goto next;
 			}
 			for (i = 0; i < newfds; i++, fdp++) {
 				_finstall(fdesc, fdep[i]->fde_file, *fdp,
 				    (flags & MSG_CMSG_CLOEXEC) != 0 ? UF_EXCLOSE : 0,
 				    &fdep[i]->fde_caps);
 				unp_externalize_fp(fdep[i]->fde_file);
 			}
 
 			/*
 			 * The new type indicates that the mbuf data refers to
 			 * kernel resources that may need to be released before
 			 * the mbuf is freed.
 			 */
 			m_chtype(*controlp, MT_EXTCONTROL);
 			FILEDESC_XUNLOCK(fdesc);
 			/* Free only the filedescent array; the files live on. */
 			free(fdep[0], M_FILECAPS);
 		} else {
 			/* We can just copy anything else across. */
 			if (error || controlp == NULL)
 				goto next;
 			*controlp = sbcreatecontrol(NULL, datalen,
 			    cm->cmsg_type, cm->cmsg_level);
 			if (*controlp == NULL) {
 				error = ENOBUFS;
 				goto next;
 			}
 			bcopy(data,
 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *)),
 			    datalen);
 		}
 		/* Only reached when a message was produced: advance the tail. */
 		controlp = &(*controlp)->m_next;
 
 next:
 		/* Step to the next message, or finish when none remains. */
 		if (CMSG_SPACE(datalen) < clen) {
 			clen -= CMSG_SPACE(datalen);
 			cm = (struct cmsghdr *)
 			    ((caddr_t)cm + CMSG_SPACE(datalen));
 		} else {
 			clen = 0;
 			cm = NULL;
 		}
 	}
 
 	m_freem(control);
 	return (error);
 }
 
 /*
  * maxsockets_change event handler (registered in unp_init()): re-apply the
  * current maxsockets value as the unpcb zone's limit.  tag is not used.
  */
 static void
 unp_zone_change(void *tag)
 {
 	uma_zone_set_max(unp_zone, maxsockets);
 }
 
 #ifdef INVARIANTS
 /*
  * Zone destructor for unpcb items, compiled only under INVARIANTS.  It
  * asserts that the PCB being freed has an empty reference list and no
  * remaining socket, vnode, connection or address pointer.
  */
 static void
 unp_zdtor(void *mem, int size __unused, void *arg __unused)
 {
 	struct unpcb *pcb = mem;
 
 	KASSERT(LIST_EMPTY(&pcb->unp_refs),
 	    ("%s: unpcb %p has lingering refs", __func__, pcb));
 	KASSERT(pcb->unp_socket == NULL,
 	    ("%s: unpcb %p has socket backpointer", __func__, pcb));
 	KASSERT(pcb->unp_vnode == NULL,
 	    ("%s: unpcb %p has vnode references", __func__, pcb));
 	KASSERT(pcb->unp_conn == NULL,
 	    ("%s: unpcb %p is still connected", __func__, pcb));
 	KASSERT(pcb->unp_addr == NULL,
 	    ("%s: unpcb %p has leaked addr", __func__, pcb));
 }
 #endif
 
 /*
  * One-time setup for the local-socket code.  It creates the unpcb zone
  * (with the checking destructor under INVARIANTS), caps it at maxsockets
  * and follows later changes to that limit.  It then initializes the three
  * PCB lists, the deferred list, the GC and deferred tasks, and the link
  * and deferred locks.  With VIMAGE, only the default vnet does this work.
  */
 static void
 unp_init(void)
 {
 #ifdef INVARIANTS
 	uma_dtor zdtor = unp_zdtor;
 #else
 	uma_dtor zdtor = NULL;
 #endif
 
 #ifdef VIMAGE
 	if (!IS_DEFAULT_VNET(curvnet))
 		return;
 #endif
 
 	/* PCB allocator. */
 	unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb),
 	    NULL, zdtor, NULL, NULL,
 	    UMA_ALIGN_CACHE, 0);
 	uma_zone_set_max(unp_zone, maxsockets);
 	uma_zone_set_warning(unp_zone, "kern.ipc.maxsockets limit reached");
 	EVENTHANDLER_REGISTER(maxsockets_change, unp_zone_change, NULL,
 	    EVENTHANDLER_PRI_ANY);
 
 	/* Per-type PCB lists and the deferred list. */
 	LIST_INIT(&unp_dhead);
 	LIST_INIT(&unp_shead);
 	LIST_INIT(&unp_sphead);
 	SLIST_INIT(&unp_defers);
 
 	/* Background tasks. */
 	TIMEOUT_TASK_INIT(taskqueue_thread, &unp_gc_task, 0, unp_gc, NULL);
 	TASK_INIT(&unp_defer_task, 0, unp_process_defers, NULL);
 
 	/* Global locks. */
 	UNP_LINK_LOCK_INIT();
 	UNP_DEFERRED_LOCK_INIT();
 }
 
 /*
  * Release the file references held by every SCM_RIGHTS message in the mbuf
  * chain starting at control.
  *
  * Each mbuf is treated as holding exactly one control message; mbufs whose
  * message is not SOL_SOCKET/SCM_RIGHTS are skipped.  The payload of a
  * rights message is an array of struct filedescent pointers, which is
  * handed to unp_freerights().  The mbufs themselves are not freed here.
  */
 static void
 unp_internalize_cleanup_rights(struct mbuf *control)
 {
 	struct cmsghdr *cp;
 	struct mbuf *m;
 	void *data;
 	socklen_t datalen;
 
 	for (m = control; m != NULL; m = m->m_next) {
 		cp = mtod(m, struct cmsghdr *);
 		if (cp->cmsg_level != SOL_SOCKET ||
 		    cp->cmsg_type != SCM_RIGHTS)
 			continue;
 		data = CMSG_DATA(cp);
 		datalen = (caddr_t)cp + cp->cmsg_len - (caddr_t)data;
 		/* Element count: the payload is an array of filedescent pointers. */
 		unp_freerights(data, datalen / sizeof(struct filedescent *));
 	}
 }
 
 /*
  * Convert user-supplied control messages into their in-kernel form before
  * they are queued on a receiving socket.
  *
  * controlp - in: the mbuf holding the caller's control messages; out: the
  *            head of a new chain with one mbuf per internalized message
  *            (NULL if none was produced).  The input mbuf is always freed.
  * td       - sending thread; supplies credentials and the descriptor table.
  *
  * Only SOL_SOCKET messages are accepted.  SCM_CREDS is filled in from the
  * thread's credentials, SCM_RIGHTS descriptor numbers are replaced by held
  * file references plus copied capabilities, and the SCM_TIMESTAMP,
  * SCM_BINTIME, SCM_REALTIME and SCM_MONOTONIC payloads are generated here.
  * In no case is the caller-supplied payload of those messages copied.
  *
  * Returns 0 or an errno value (EINVAL, ENOBUFS, EBADF, EOPNOTSUPP, E2BIG).
  * On error, rights already internalized on the output chain are released,
  * but the output chain itself is left for the caller to free.
  */
 static int
 unp_internalize(struct mbuf **controlp, struct thread *td)
 {
 	struct mbuf *control, **initial_controlp;
 	struct proc *p;
 	struct filedesc *fdesc;
 	struct bintime *bt;
 	struct cmsghdr *cm;
 	struct cmsgcred *cmcred;
 	struct filedescent *fde, **fdep, *fdev;
 	struct file *fp;
 	struct timeval *tv;
 	struct timespec *ts;
 	void *data;
 	socklen_t clen, datalen;
 	int i, j, error, *fdp, oldfds;
 	u_int newlen;
 
 	UNP_LINK_UNLOCK_ASSERT();
 
 	p = td->td_proc;
 	fdesc = p->p_fd;
 	error = 0;
 	control = *controlp;
 	clen = control->m_len;
 	/* From here controlp tracks the tail of the output chain. */
 	*controlp = NULL;
 	initial_controlp = controlp;
 	for (cm = mtod(control, struct cmsghdr *); cm != NULL;) {
 		/* Reject truncated, oversized or non-SOL_SOCKET messages. */
 		if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET
 		    || cm->cmsg_len > clen || cm->cmsg_len < sizeof(*cm)) {
 			error = EINVAL;
 			goto out;
 		}
 		data = CMSG_DATA(cm);
 		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
 
 		switch (cm->cmsg_type) {
 		/*
 		 * Fill in credential information.
 		 */
 		case SCM_CREDS:
 			*controlp = sbcreatecontrol(NULL, sizeof(*cmcred),
 			    SCM_CREDS, SOL_SOCKET);
 			if (*controlp == NULL) {
 				error = ENOBUFS;
 				goto out;
 			}
 			cmcred = (struct cmsgcred *)
 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
 			cmcred->cmcred_pid = p->p_pid;
 			cmcred->cmcred_uid = td->td_ucred->cr_ruid;
 			cmcred->cmcred_gid = td->td_ucred->cr_rgid;
 			cmcred->cmcred_euid = td->td_ucred->cr_uid;
 			/* The group list is truncated to CMGROUP_MAX entries. */
 			cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups,
 			    CMGROUP_MAX);
 			for (i = 0; i < cmcred->cmcred_ngroups; i++)
 				cmcred->cmcred_groups[i] =
 				    td->td_ucred->cr_groups[i];
 			break;
 
 		case SCM_RIGHTS:
 			oldfds = datalen / sizeof (int);
 			/* An empty rights message produces no output mbuf. */
 			if (oldfds == 0)
 				break;
 			/*
 			 * Check that all the FDs passed in refer to legal
 			 * files.  If not, reject the entire operation.
 			 */
 			fdp = data;
 			FILEDESC_SLOCK(fdesc);
 			for (i = 0; i < oldfds; i++, fdp++) {
 				fp = fget_locked(fdesc, *fdp);
 				if (fp == NULL) {
 					FILEDESC_SUNLOCK(fdesc);
 					error = EBADF;
 					goto out;
 				}
 				if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) {
 					FILEDESC_SUNLOCK(fdesc);
 					error = EOPNOTSUPP;
 					goto out;
 				}
 			}
 
 			/*
 			 * Now replace the integer FDs with pointers to the
 			 * file structure and capability rights.
 			 */
 			newlen = oldfds * sizeof(fdep[0]);
 			*controlp = sbcreatecontrol(NULL, newlen,
 			    SCM_RIGHTS, SOL_SOCKET);
 			if (*controlp == NULL) {
 				FILEDESC_SUNLOCK(fdesc);
 				error = E2BIG;
 				goto out;
 			}
 			/*
 			 * Take a reference on every file; on failure drop the
 			 * ones already taken.
 			 * NOTE(review): on this failure path *controlp is
 			 * already linked into the output chain as an
 			 * SCM_RIGHTS mbuf whose pointer array has not been
 			 * filled in, and the cleanup at `out` walks every
 			 * SCM_RIGHTS mbuf on that chain.  Confirm that this
 			 * is safe.
 			 */
 			fdp = data;
 			for (i = 0; i < oldfds; i++, fdp++) {
 				if (!fhold(fdesc->fd_ofiles[*fdp].fde_file)) {
 					fdp = data;
 					for (j = 0; j < i; j++, fdp++) {
 						fdrop(fdesc->fd_ofiles[*fdp].
 						    fde_file, td);
 					}
 					FILEDESC_SUNLOCK(fdesc);
 					error = EBADF;
 					goto out;
 				}
 			}
 			/*
 			 * One allocation backs all the filedescent entries;
 			 * fdep[0] ends up pointing at its start.
 			 */
 			fdp = data;
 			fdep = (struct filedescent **)
 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
 			fdev = malloc(sizeof(*fdev) * oldfds, M_FILECAPS,
 			    M_WAITOK);
 			for (i = 0; i < oldfds; i++, fdev++, fdp++) {
 				fde = &fdesc->fd_ofiles[*fdp];
 				fdep[i] = fdev;
 				fdep[i]->fde_file = fde->fde_file;
 				filecaps_copy(&fde->fde_caps,
 				    &fdep[i]->fde_caps, true);
 				unp_internalize_fp(fdep[i]->fde_file);
 			}
 			FILEDESC_SUNLOCK(fdesc);
 			break;
 
 		case SCM_TIMESTAMP:
 			*controlp = sbcreatecontrol(NULL, sizeof(*tv),
 			    SCM_TIMESTAMP, SOL_SOCKET);
 			if (*controlp == NULL) {
 				error = ENOBUFS;
 				goto out;
 			}
 			tv = (struct timeval *)
 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
 			microtime(tv);
 			break;
 
 		case SCM_BINTIME:
 			*controlp = sbcreatecontrol(NULL, sizeof(*bt),
 			    SCM_BINTIME, SOL_SOCKET);
 			if (*controlp == NULL) {
 				error = ENOBUFS;
 				goto out;
 			}
 			bt = (struct bintime *)
 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
 			bintime(bt);
 			break;
 
 		case SCM_REALTIME:
 			*controlp = sbcreatecontrol(NULL, sizeof(*ts),
 			    SCM_REALTIME, SOL_SOCKET);
 			if (*controlp == NULL) {
 				error = ENOBUFS;
 				goto out;
 			}
 			ts = (struct timespec *)
 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
 			nanotime(ts);
 			break;
 
 		case SCM_MONOTONIC:
 			*controlp = sbcreatecontrol(NULL, sizeof(*ts),
 			    SCM_MONOTONIC, SOL_SOCKET);
 			if (*controlp == NULL) {
 				error = ENOBUFS;
 				goto out;
 			}
 			ts = (struct timespec *)
 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
 			nanouptime(ts);
 			break;
 
 		default:
 			error = EINVAL;
 			goto out;
 		}
 
 		/* Advance the output tail only if this message produced an mbuf. */
 		if (*controlp != NULL)
 			controlp = &(*controlp)->m_next;
 		/* Step to the next input message, or finish. */
 		if (CMSG_SPACE(datalen) < clen) {
 			clen -= CMSG_SPACE(datalen);
 			cm = (struct cmsghdr *)
 			    ((caddr_t)cm + CMSG_SPACE(datalen));
 		} else {
 			clen = 0;
 			cm = NULL;
 		}
 	}
 
 out:
 	/* On failure, release rights internalized so far. */
 	if (error != 0 && initial_controlp != NULL)
 		unp_internalize_cleanup_rights(*initial_controlp);
 	m_freem(control);
 	return (error);
 }
 
 /*
  * Build a credentials control message describing the sending thread and
  * place it at the head of the control chain.
  *
  * If UNP_WANTCRED_ALWAYS is set in mode a struct sockcred2 is sent as
  * SCM_CREDS2; otherwise a struct sockcred is sent as SCM_CREDS and any
  * SCM_CREDS messages already present in the chain are unlinked and freed.
  * When the new message cannot be allocated the chain is returned untouched.
  */
 static struct mbuf *
 unp_addsockcred(struct thread *td, struct mbuf *control, int mode)
 {
 	struct mbuf *credm, *cur, *prev;
 	const struct cmsghdr *hdr;
 	size_t credsz;
 	int credtype, idx, ngroups, want_v2;
 
 	want_v2 = (mode & UNP_WANTCRED_ALWAYS) != 0;
 	ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX);
 	if (want_v2) {
 		credsz = SOCKCRED2SIZE(ngroups);
 		credtype = SCM_CREDS2;
 	} else {
 		credsz = SOCKCREDSIZE(ngroups);
 		credtype = SCM_CREDS;
 	}
 
 	credm = sbcreatecontrol(NULL, credsz, credtype, SOL_SOCKET);
 	if (credm == NULL)
 		return (control);
 
 	/* Fill in the payload that matches the chosen message type. */
 	if (want_v2) {
 		struct sockcred2 *sc2;
 
 		sc2 = (void *)CMSG_DATA(mtod(credm, struct cmsghdr *));
 		sc2->sc_version = 0;
 		sc2->sc_pid = td->td_proc->p_pid;
 		sc2->sc_uid = td->td_ucred->cr_ruid;
 		sc2->sc_euid = td->td_ucred->cr_uid;
 		sc2->sc_gid = td->td_ucred->cr_rgid;
 		sc2->sc_egid = td->td_ucred->cr_gid;
 		sc2->sc_ngroups = ngroups;
 		for (idx = 0; idx < sc2->sc_ngroups; idx++)
 			sc2->sc_groups[idx] = td->td_ucred->cr_groups[idx];
 	} else {
 		struct sockcred *sc1;
 
 		sc1 = (void *)CMSG_DATA(mtod(credm, struct cmsghdr *));
 		sc1->sc_uid = td->td_ucred->cr_ruid;
 		sc1->sc_euid = td->td_ucred->cr_uid;
 		sc1->sc_gid = td->td_ucred->cr_rgid;
 		sc1->sc_egid = td->td_ucred->cr_gid;
 		sc1->sc_ngroups = ngroups;
 		for (idx = 0; idx < sc1->sc_ngroups; idx++)
 			sc1->sc_groups[idx] = td->td_ucred->cr_groups[idx];
 	}
 
 	/*
 	 * A caller-supplied SCM_CREDS message carries a struct cmsgcred,
 	 * which has a different layout from the struct sockcred created
 	 * above, so drop every such message from the chain.
 	 */
 	if (control != NULL && credtype == SCM_CREDS) {
 		prev = NULL;
 		cur = control;
 		while (cur != NULL) {
 			hdr = mtod(cur, struct cmsghdr *);
 			if (hdr->cmsg_level != SOL_SOCKET ||
 			    hdr->cmsg_type != SCM_CREDS) {
 				prev = cur;
 				cur = cur->m_next;
 				continue;
 			}
 			if (prev != NULL)
 				prev->m_next = cur->m_next;
 			else
 				control = cur->m_next;
 			/* m_free() hands back the successor mbuf. */
 			cur = m_free(cur);
 		}
 	}
 
 	/* The new message becomes the head of the chain. */
 	credm->m_next = control;
 	return (credm);
 }
 
 /*
  * Return the unpcb behind a file, or NULL when the file is not a socket,
  * has no socket attached, or the socket does not belong to localdomain.
  */
 static struct unpcb *
 fptounp(struct file *fp)
 {
 	struct socket *so;
 
 	if (fp->f_type != DTYPE_SOCKET)
 		return (NULL);
 	so = fp->f_data;
 	if (so == NULL || so->so_proto->pr_domain != &localdomain)
 		return (NULL);
 	return (sotounpcb(so));
 }
 
 /*
  * Drop an in-flight file reference.  Files that are not local-domain
  * sockets are closed right away; local-domain sockets are queued on
  * unp_defers and closed later by unp_defer_task.
  */
 static void
 unp_discard(struct file *fp)
 {
 	struct unp_defer *entry;
 
 	if (!unp_externalize_fp(fp)) {
 		(void) closef(fp, (struct thread *)NULL);
 		return;
 	}
 
 	entry = malloc(sizeof(*entry), M_TEMP, M_WAITOK);
 	entry->ud_fp = fp;
 	UNP_DEFERRED_LOCK();
 	SLIST_INSERT_HEAD(&unp_defers, entry, ud_link);
 	UNP_DEFERRED_UNLOCK();
 	atomic_add_int(&unp_defers_count, 1);
 	taskqueue_enqueue(taskqueue_thread, &unp_defer_task);
 }
 
 /*
  * Task handler that closes the files queued by unp_discard().  The shared
  * list is swapped into a private one under the deferred lock, the private
  * list is drained without the lock held, and the cycle repeats until the
  * shared list is found empty.
  */
 static void
 unp_process_defers(void *arg __unused, int pending)
 {
 	SLIST_HEAD(, unp_defer) batch;
 	struct unp_defer *entry;
 	int closed;
 
 	SLIST_INIT(&batch);
 	UNP_DEFERRED_LOCK();
 	while (!SLIST_EMPTY(&unp_defers)) {
 		SLIST_SWAP(&unp_defers, &batch, unp_defer);
 		UNP_DEFERRED_UNLOCK();
 
 		for (closed = 0; (entry = SLIST_FIRST(&batch)) != NULL;
 		    closed++) {
 			SLIST_REMOVE_HEAD(&batch, ud_link);
 			closef(entry->ud_fp, NULL);
 			free(entry, M_TEMP);
 		}
 		atomic_add_int(&unp_defers_count, -closed);
 
 		UNP_DEFERRED_LOCK();
 	}
 	UNP_DEFERRED_UNLOCK();
 }
 
 /*
  * Account for a file that is about to travel in an SCM_RIGHTS message:
  * bump the global unp_rights count and, for a local-domain socket, record
  * the file in its unpcb and bump the unpcb's in-flight message count.
  */
 static void
 unp_internalize_fp(struct file *fp)
 {
 	struct unpcb *unp;
 
 	UNP_LINK_WLOCK();
 	unp = fptounp(fp);
 	if (unp != NULL) {
 		unp->unp_file = fp;
 		unp->unp_msgcount++;
 	}
 	unp_rights++;
 	UNP_LINK_WUNLOCK();
 }
 
 /*
  * Undo the accounting done by unp_internalize_fp(): drop the global
  * unp_rights count and, for a local-domain socket, the unpcb's in-flight
  * message count.  Returns 1 if the file is a local-domain socket, else 0.
  */
 static int
 unp_externalize_fp(struct file *fp)
 {
 	struct unpcb *unp;
 	int is_local;
 
 	UNP_LINK_WLOCK();
 	unp = fptounp(fp);
 	is_local = (unp != NULL) ? 1 : 0;
 	if (is_local)
 		unp->unp_msgcount--;
 	unp_rights--;
 	UNP_LINK_WUNLOCK();
 	return (is_local);
 }
 
 /*
  * unp_marked indicates whether additional work has been deferred for a
  * future pass through the scan loop in unp_gc().  It is thread local and
  * does not require explicit synchronization.
  */
 static int	unp_marked;
 
 /*
  * unp_scan() callback: for every file in the array that is a local-domain
  * socket currently flagged UNPGC_DEAD, drop one gc reference.
  */
 static void
 unp_remove_dead_ref(struct filedescent **fdep, int fdcount)
 {
 	struct unpcb *unp;
 	int idx;
 
 	/* Only the gc task may call this. */
 	KASSERT(taskqueue_member(taskqueue_thread, curthread) != 0,
 	    ("%s: not on gc callout", __func__));
 	UNP_LINK_LOCK_ASSERT();
 
 	for (idx = 0; idx < fdcount; idx++) {
 		unp = fptounp(fdep[idx]->fde_file);
 		if (unp != NULL && (unp->unp_gcflag & UNPGC_DEAD) != 0)
 			unp->unp_gcrefs--;
 	}
 }
 
 /*
  * unp_scan() callback: for every file in the array that is a local-domain
  * socket still flagged UNPGC_DEAD, give back one gc reference and bump
  * unp_marked so that unp_gc() runs another pass.
  */
 static void
 unp_restore_undead_ref(struct filedescent **fdep, int fdcount)
 {
 	struct unpcb *unp;
 	int idx;
 
 	/* Only the gc task may call this. */
 	KASSERT(taskqueue_member(taskqueue_thread, curthread) != 0,
 	    ("%s: not on gc callout", __func__));
 	UNP_LINK_LOCK_ASSERT();
 
 	for (idx = 0; idx < fdcount; idx++) {
 		unp = fptounp(fdep[idx]->fde_file);
 		if (unp == NULL || (unp->unp_gcflag & UNPGC_DEAD) == 0)
 			continue;
 		unp->unp_gcrefs++;
 		unp_marked++;
 	}
 }
 
 /*
  * Apply op to the rights held in the receive buffer(s) reachable from unp.
  * A listening socket contributes the receive buffers of the sockets on its
  * completed-connection queue; any other socket contributes its own receive
  * buffer.  Sockets flagged UNPGC_IGNORE_RIGHTS are skipped.
  */
 static void
 unp_gc_scan(struct unpcb *unp, void (*op)(struct filedescent **, int))
 {
 	struct socket *so, *child;
 
 	so = unp->unp_socket;
 	SOCK_LOCK(so);
 	if (!SOLISTENING(so)) {
 		/* Scan the rights queued on this socket itself. */
 		if ((unp->unp_gcflag & UNPGC_IGNORE_RIGHTS) == 0) {
 			SOCKBUF_LOCK(&so->so_rcv);
 			unp_scan(so->so_rcv.sb_mb, op);
 			SOCKBUF_UNLOCK(&so->so_rcv);
 		}
 		SOCK_UNLOCK(so);
 		return;
 	}
 
 	/* Scan every socket waiting in the accept queue. */
 	TAILQ_FOREACH(child, &so->sol_comp, so_list) {
 		if ((sotounpcb(child)->unp_gcflag & UNPGC_IGNORE_RIGHTS) == 0) {
 			SOCKBUF_LOCK(&child->so_rcv);
 			unp_scan(child->so_rcv.sb_mb, op);
 			SOCKBUF_UNLOCK(&child->so_rcv);
 		}
 	}
 	SOCK_UNLOCK(so);
 }
 
 /*
  * Read-only (CTLFLAG_RD) counters exported under the _net_local sysctl
  * node.  unp_recycled and unp_taskcount are updated by unp_gc(); unp_count
  * is defined outside this part of the file.
  */
 static int unp_recycled;
 SYSCTL_INT(_net_local, OID_AUTO, recycled, CTLFLAG_RD, &unp_recycled, 0, 
     "Number of unreachable sockets claimed by the garbage collector.");
 
 static int unp_taskcount;
 SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0, 
     "Number of times the garbage collector has run.");
 
 SYSCTL_UINT(_net_local, OID_AUTO, sockcount, CTLFLAG_RD, &unp_count, 0, 
     "Number of active local sockets.");
 
 /*
  * Garbage collector task for local-domain sockets whose only remaining
  * file references are held by SCM_RIGHTS messages sitting in socket
  * receive buffers.
  *
  * The pass works in stages:
  *  1. Under the link read lock, walk unp_dhead, unp_shead and unp_sphead
  *     and put every socket whose file reference count equals its in-flight
  *     message count on a local candidate list, flagged UNPGC_DEAD.
  *  2. Subtract the references candidates hold on each other
  *     (unp_remove_dead_ref), then repeatedly take candidates that still
  *     have gc references off the list and restore the references they
  *     hold (unp_restore_undead_ref) until a pass marks nothing new.
  *  3. Take a hold on the files of the remaining candidates, flush their
  *     receive buffers with sorflush(), and drop the holds.
  *
  * Neither argument is used.
  */
 static void
 unp_gc(__unused void *arg, int pending)
 {
 	struct unp_head *heads[] = { &unp_dhead, &unp_shead, &unp_sphead,
 				    NULL };
 	struct unp_head **head;
 	struct unp_head unp_deadhead;	/* List of potentially-dead sockets. */
 	struct file *f, **unref;
 	struct unpcb *unp, *unptmp;
 	int i, total, unp_unreachable;
 
 	LIST_INIT(&unp_deadhead);
 	unp_taskcount++;
 	UNP_LINK_RLOCK();
 	/*
 	 * First determine which sockets may be in cycles.
 	 */
 	unp_unreachable = 0;
 
 	for (head = heads; *head != NULL; head++)
 		LIST_FOREACH(unp, *head, unp_link) {
 			KASSERT((unp->unp_gcflag & ~UNPGC_IGNORE_RIGHTS) == 0,
 			    ("%s: unp %p has unexpected gc flags 0x%x",
 			    __func__, unp, (unsigned int)unp->unp_gcflag));
 
 			f = unp->unp_file;
 
 			/*
 			 * Check for an unreachable socket potentially in a
 			 * cycle.  It must be in a queue as indicated by
 			 * msgcount, and this must equal the file reference
 			 * count.  Note that when msgcount is 0 the file is
 			 * NULL.
 			 */
 			if (f != NULL && unp->unp_msgcount != 0 &&
 			    refcount_load(&f->f_count) == unp->unp_msgcount) {
 				LIST_INSERT_HEAD(&unp_deadhead, unp, unp_dead);
 				unp->unp_gcflag |= UNPGC_DEAD;
 				/* Start with one gc ref per in-flight copy. */
 				unp->unp_gcrefs = unp->unp_msgcount;
 				unp_unreachable++;
 			}
 		}
 
 	/*
 	 * Scan all sockets previously marked as potentially being in a cycle
 	 * and remove the references each socket holds on any UNPGC_DEAD
 	 * sockets in its queue.  After this step, all remaining references on
 	 * sockets marked UNPGC_DEAD should not be part of any cycle.
 	 */
 	LIST_FOREACH(unp, &unp_deadhead, unp_dead)
 		unp_gc_scan(unp, unp_remove_dead_ref);
 
 	/*
 	 * If a socket still has a positive refcount, it cannot be in a
 	 * cycle.  In this case increment refcount of all children iteratively.
 	 * Stop the scan once we do a complete loop without discovering
 	 * a new reachable socket.
 	 */
 	do {
 		/* unp_restore_undead_ref() bumps this for each ref restored. */
 		unp_marked = 0;
 		LIST_FOREACH_SAFE(unp, &unp_deadhead, unp_dead, unptmp)
 			if (unp->unp_gcrefs > 0) {
 				unp->unp_gcflag &= ~UNPGC_DEAD;
 				LIST_REMOVE(unp, unp_dead);
 				KASSERT(unp_unreachable > 0,
 				    ("%s: unp_unreachable underflow.",
 				    __func__));
 				unp_unreachable--;
 				unp_gc_scan(unp, unp_restore_undead_ref);
 			}
 	} while (unp_marked);
 
 	UNP_LINK_RUNLOCK();
 
 	/* Nothing left on the candidate list: no cycles to break. */
 	if (unp_unreachable == 0)
 		return;
 
 	/*
 	 * Allocate space for a local array of dead unpcbs.
 	 * TODO: can this path be simplified by instead using the local
 	 * dead list at unp_deadhead, after taking out references
 	 * on the file object and/or unpcb and dropping the link lock?
 	 */
 	unref = malloc(unp_unreachable * sizeof(struct file *),
 	    M_TEMP, M_WAITOK);
 
 	/*
 	 * Iterate looking for sockets which have been specifically marked
 	 * as unreachable and store them locally.
 	 */
 	UNP_LINK_RLOCK();
 	total = 0;
 	LIST_FOREACH(unp, &unp_deadhead, unp_dead) {
 		KASSERT((unp->unp_gcflag & UNPGC_DEAD) != 0,
 		    ("%s: unp %p not marked UNPGC_DEAD", __func__, unp));
 		unp->unp_gcflag &= ~UNPGC_DEAD;
 		f = unp->unp_file;
 		/*
 		 * The link lock was dropped around malloc(), so re-check the
 		 * candidate condition and skip the socket if it no longer
 		 * holds or the file cannot be held.
 		 */
 		if (unp->unp_msgcount == 0 || f == NULL ||
 		    refcount_load(&f->f_count) != unp->unp_msgcount ||
 		    !fhold(f))
 			continue;
 		unref[total++] = f;
 		KASSERT(total <= unp_unreachable,
 		    ("%s: incorrect unreachable count.", __func__));
 	}
 	UNP_LINK_RUNLOCK();
 
 	/*
 	 * Now flush all sockets, free'ing rights.  This will free the
 	 * struct files associated with these sockets but leave each socket
 	 * with one remaining ref.
 	 */
 	for (i = 0; i < total; i++) {
 		struct socket *so;
 
 		so = unref[i]->f_data;
 		CURVNET_SET(so->so_vnet);
 		sorflush(so);
 		CURVNET_RESTORE();
 	}
 
 	/*
 	 * And finally release the sockets so they can be reclaimed.
 	 */
 	for (i = 0; i < total; i++)
 		fdrop(unref[i], NULL);
 	unp_recycled += total;
 	free(unref, M_TEMP);
 }
 
 static void
 unp_dispose_mbuf(struct mbuf *m)
 {
 
 	if (m)
 		unp_scan(m, unp_freerights);
 }
 
 /*
  * Release the rights queued on a socket's receive buffer.  The unpcb is
  * first flagged UNPGC_IGNORE_RIGHTS under the link write lock so that
  * unp_gc() does not walk the buffer while it is being freed.  Listening
  * sockets have no receive buffer of their own to dispose of.
  */
 static void
 unp_dispose(struct socket *so)
 {
 	struct unpcb *unp;
 
 	unp = sotounpcb(so);
 	UNP_LINK_WLOCK();
 	unp->unp_gcflag |= UNPGC_IGNORE_RIGHTS;
 	UNP_LINK_WUNLOCK();
 
 	if (SOLISTENING(so))
 		return;
 	unp_dispose_mbuf(so->so_rcv.sb_mb);
 }
 
 /*
  * Walk a queue of packets and invoke op on the payload of every
  * SOL_SOCKET/SCM_RIGHTS control message found.
  *
  * m0 is the first packet; packets are linked through m_nextpkt and the
  * mbufs of one packet through m_next.  Only MT_CONTROL mbufs are examined.
  * op receives the message payload, treated as an array of
  * struct filedescent pointers, and the number of pointers that fit in it.
  */
 static void
 unp_scan(struct mbuf *m0, void (*op)(struct filedescent **, int))
 {
 	struct mbuf *m;
 	struct cmsghdr *cm;
 	void *data;
 	socklen_t clen, datalen;
 
 	while (m0 != NULL) {
 		for (m = m0; m; m = m->m_next) {
 			if (m->m_type != MT_CONTROL)
 				continue;
 
 			cm = mtod(m, struct cmsghdr *);
 			clen = m->m_len;
 
 			/* One mbuf may hold several control messages. */
 			while (cm != NULL) {
 				/* Stop on a truncated header or message. */
 				if (sizeof(*cm) > clen || cm->cmsg_len > clen)
 					break;
 
 				data = CMSG_DATA(cm);
 				/* Payload bytes that follow the header. */
 				datalen = (caddr_t)cm + cm->cmsg_len
 				    - (caddr_t)data;
 
 				if (cm->cmsg_level == SOL_SOCKET &&
 				    cm->cmsg_type == SCM_RIGHTS) {
 					(*op)(data, datalen /
 					    sizeof(struct filedescent *));
 				}
 
 				/* Advance to the next message, if any. */
 				if (CMSG_SPACE(datalen) < clen) {
 					clen -= CMSG_SPACE(datalen);
 					cm = (struct cmsghdr *)
 					    ((caddr_t)cm + CMSG_SPACE(datalen));
 				} else {
 					clen = 0;
 					cm = NULL;
 				}
 			}
 		}
 		m0 = m0->m_nextpkt;
 	}
 }
 
 /*
  * Helper called by VFS before a socket-type vnode is reclaimed.  If a
  * unpcb is still bound to the vnode, the binding is detached, the unpcb's
  * unp_vnode pointer is cleared, and the vnode reference that the binding
  * held is released with vunref().
  */
 void
 vfs_unp_reclaim(struct vnode *vp)
 {
 	struct mtx *vplock;
 	struct unpcb *unp;
 	int detached;
 
 	ASSERT_VOP_ELOCKED(vp, "vfs_unp_reclaim");
 	KASSERT(vp->v_type == VSOCK,
 	    ("vfs_unp_reclaim: vp->v_type != VSOCK"));
 
 	detached = 0;
 	vplock = mtx_pool_find(mtxpool_sleep, vp);
 	mtx_lock(vplock);
 	VOP_UNP_CONNECT(vp, &unp);
 	if (unp != NULL) {
 		UNP_PCB_LOCK(unp);
 		if (unp->unp_vnode == vp) {
 			VOP_UNP_DETACH(vp);
 			unp->unp_vnode = NULL;
 			detached = 1;
 		}
 		UNP_PCB_UNLOCK(unp);
 	}
 	mtx_unlock(vplock);
 
 	/* Drop the vnode reference only after the pool mutex is released. */
 	if (detached)
 		vunref(vp);
 }
 
 #ifdef DDB
 /*
  * Emit indent spaces on the debugger console; a non-positive indent
  * prints nothing.
  */
 static void
 db_print_indent(int indent)
 {
 
 	while (indent-- > 0)
 		db_printf(" ");
 }
 
 /*
  * Print the names of the flags set in unp_flags as a comma-separated
  * list, without a trailing newline.
  */
 static void
 db_print_unpflags(int unp_flags)
 {
 	const char *sep;
 
 	/* Empty before the first name, ", " before every later one. */
 	sep = "";
 	if ((unp_flags & UNP_HAVEPC) != 0) {
 		db_printf("%sUNP_HAVEPC", sep);
 		sep = ", ";
 	}
 	if ((unp_flags & UNP_WANTCRED_ALWAYS) != 0) {
 		db_printf("%sUNP_WANTCRED_ALWAYS", sep);
 		sep = ", ";
 	}
 	if ((unp_flags & UNP_WANTCRED_ONESHOT) != 0) {
 		db_printf("%sUNP_WANTCRED_ONESHOT", sep);
 		sep = ", ";
 	}
 	if ((unp_flags & UNP_CONNWAIT) != 0) {
 		db_printf("%sUNP_CONNWAIT", sep);
 		sep = ", ";
 	}
 	if ((unp_flags & UNP_CONNECTING) != 0) {
 		db_printf("%sUNP_CONNECTING", sep);
 		sep = ", ";
 	}
 	if ((unp_flags & UNP_BINDING) != 0)
 		db_printf("%sUNP_BINDING", sep);
 }
 
 /*
  * Print a struct xucred on two indented lines: the scalar fields first,
  * then the group list separated by commas.
  */
 static void
 db_print_xucred(int indent, struct xucred *xu)
 {
 	const char *sep;
 	int g;
 
 	db_print_indent(indent);
 	db_printf("cr_version: %u   cr_uid: %u   cr_pid: %d   cr_ngroups: %d\n",
 	    xu->cr_version, xu->cr_uid, xu->cr_pid, xu->cr_ngroups);
 
 	db_print_indent(indent);
 	db_printf("cr_groups: ");
 	sep = "";
 	for (g = 0; g < xu->cr_ngroups; g++) {
 		db_printf("%s%u", sep, xu->cr_groups[g]);
 		sep = ", ";
 	}
 	db_printf("\n");
 }
 
 /*
  * Print the unpcb pointers linked on uh through unp_reflink, four per
  * indented line.
  */
 static void
 db_print_unprefs(int indent, struct unp_head *uh)
 {
 	struct unpcb *unp;
 	int column, printed;
 
 	printed = 0;
 	LIST_FOREACH(unp, uh, unp_reflink) {
 		column = printed % 4;
 		if (column == 0)
 			db_print_indent(indent);
 		db_printf("%p  ", unp);
 		if (column == 3)
 			db_printf("\n");
 		printed++;
 	}
 	/* Terminate a partially filled last line. */
 	if (printed % 4 != 0)
 		db_printf("\n");
 }
 
 /*
  * "show unpcb <addr>": dump the fields of the unpcb located at addr.
  */
 DB_SHOW_COMMAND(unpcb, db_show_unpcb)
 {
 	struct unpcb *unp;
 
 	if (!have_addr) {
 		db_printf("usage: show unpcb <addr>\n");
 		return;
 	}
 	unp = (struct unpcb *)addr;
 
 	db_printf("unp_socket: %p   unp_vnode: %p\n",
 	    unp->unp_socket, unp->unp_vnode);
 	db_printf("unp_ino: %ju   unp_conn: %p\n",
 	    (uintmax_t)unp->unp_ino, unp->unp_conn);
 
 	db_printf("unp_refs:\n");
 	db_print_unprefs(2, &unp->unp_refs);
 
 	/* XXXRW: only the pointer is shown, not the address it refers to. */
 	db_printf("unp_addr: %p\n", unp->unp_addr);
 	db_printf("unp_gencnt: %llu\n", (unsigned long long)unp->unp_gencnt);
 
 	db_printf("unp_flags: %x (", unp->unp_flags);
 	db_print_unpflags(unp->unp_flags);
 	db_printf(")\n");
 
 	db_printf("unp_peercred:\n");
 	db_print_xucred(2, &unp->unp_peercred);
 
 	db_printf("unp_refcount: %u\n", unp->unp_refcount);
 }
 #endif
diff --git a/sys/netinet/sctp_syscalls.c b/sys/netinet/sctp_syscalls.c
index 0dad76b0f8bc..2697d139300c 100644
--- a/sys/netinet/sctp_syscalls.c
+++ b/sys/netinet/sctp_syscalls.c
@@ -1,589 +1,589 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 #include "opt_sctp.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysproto.h>
 #include <sys/malloc.h>
 #include <sys/filedesc.h>
 #include <sys/event.h>
 #include <sys/proc.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filio.h>
 #include <sys/jail.h>
 #include <sys/mount.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/sf_buf.h>
 #include <sys/sysent.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/signalvar.h>
 #include <sys/syscall.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/uio.h>
 #include <sys/vnode.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 #ifdef COMPAT_FREEBSD32
 #include <compat/freebsd32/freebsd32.h>
 #include <compat/freebsd32/freebsd32_syscall.h>
 #include <compat/freebsd32/freebsd32_util.h>
 #endif
 
 #include <net/vnet.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <netinet/sctp.h>
 #include <netinet/sctp_os_bsd.h>
 #include <netinet/sctp_peeloff.h>
 
 /*
  * Table of the four SCTP system calls, passed to
  * syscall_helper_register()/syscall_helper_unregister() below.  Every
  * entry carries SYF_CAPENABLED.
  */
 static struct syscall_helper_data sctp_syscalls[] = {
 	SYSCALL_INIT_HELPER_F(sctp_peeloff, SYF_CAPENABLED),
 	SYSCALL_INIT_HELPER_F(sctp_generic_sendmsg, SYF_CAPENABLED),
 	SYSCALL_INIT_HELPER_F(sctp_generic_sendmsg_iov, SYF_CAPENABLED),
 	SYSCALL_INIT_HELPER_F(sctp_generic_recvmsg, SYF_CAPENABLED),
 	SYSCALL_INIT_LAST
 };
 
 #ifdef COMPAT_FREEBSD32
 /*
  * The same four system calls for the COMPAT_FREEBSD32 syscall table,
  * passed to syscall32_helper_register()/syscall32_helper_unregister().
  */
 static struct syscall_helper_data sctp32_syscalls[] = {
 	SYSCALL32_INIT_HELPER_COMPAT(sctp_peeloff),
 	SYSCALL32_INIT_HELPER_COMPAT(sctp_generic_sendmsg),
 	SYSCALL32_INIT_HELPER_COMPAT(sctp_generic_sendmsg_iov),
 	SYSCALL32_INIT_HELPER_COMPAT(sctp_generic_recvmsg),
 	SYSCALL_INIT_LAST
 };
 #endif
 
 /*
  * Register the SCTP system calls, native table first and then, when
  * COMPAT_FREEBSD32 is configured, the 32-bit table.  Returns 0 on success
  * or the error from the first registration that fails.
  */
 int
 sctp_syscalls_init(void)
 {
 	int error;
 
 	error = syscall_helper_register(sctp_syscalls, SY_THR_STATIC_KLD);
 #ifdef COMPAT_FREEBSD32
 	if (error == 0)
 		error = syscall32_helper_register(sctp32_syscalls,
 		    SY_THR_STATIC_KLD);
 #endif
 	return (error);
 }
 
 /*
  * When the SCTP option is defined, run sctp_syscalls_init() from the
  * SI_SUB_SYSCALLS SYSINIT stage.
  */
 #ifdef SCTP
 SYSINIT(sctp_syscalls, SI_SUB_SYSCALLS, SI_ORDER_ANY, sctp_syscalls_init, NULL);
 #endif
 
 /*
  * Unregister the SCTP system calls in the reverse order of
  * sctp_syscalls_init(): the 32-bit table first (when COMPAT_FREEBSD32 is
  * configured), then the native table.  Returns 0 on success or the error
  * from the first unregistration that fails.
  */
 int
 sctp_syscalls_uninit(void)
 {
 	int error;
 
 #ifdef COMPAT_FREEBSD32
 	error = syscall32_helper_unregister(sctp32_syscalls);
 	if (error != 0)
 		return (error);
 #endif
 	error = syscall_helper_unregister(sctp_syscalls);
 	return (error);
 }
 
 /*
  * SCTP syscalls.
  */
 /*
  * sctp_peeloff(sd, name): branch the association identified by name off
  * the SCTP socket sd into a new socket and return a new descriptor for it
  * in td_retval[0].
  *
  * Returns 0 on success; EOPNOTSUPP if sd is not an IPPROTO_SCTP socket;
  * ENOMEM if sopeeloff() cannot create the new socket; otherwise the error
  * from getsock_cap(), sctp_can_peel_off(), falloc() or sctp_do_peeloff().
  */
 int
 sys_sctp_peeloff(td, uap)
 	struct thread *td;
 	struct sctp_peeloff_args /* {
 		int	sd;
 		caddr_t	name;
 	} */ *uap;
 {
 	struct file *headfp, *nfp = NULL;
 	struct socket *head, *so;
 	cap_rights_t rights;
 	u_int fflag;
 	int error, fd;
 
 	AUDIT_ARG_FD(uap->sd);
-	error = getsock_cap(td, uap->sd, cap_rights_init(&rights, CAP_PEELOFF),
+	error = getsock_cap(td, uap->sd, cap_rights_init_one(&rights, CAP_PEELOFF),
 	    &headfp, &fflag, NULL);
 	if (error != 0)
 		goto done2;
 	head = headfp->f_data;
 	if (head->so_proto->pr_protocol != IPPROTO_SCTP) {
 		error = EOPNOTSUPP;
 		goto done;
 	}
 	error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name);
 	if (error != 0)
 		goto done;
 	/*
 	 * At this point we know there is an association to peel off, so
 	 * proceed to set up the new descriptor.  This may block, but that
 	 * is OK.
 	 */
 
 	error = falloc(td, &nfp, &fd, 0);
 	if (error != 0)
 		goto done;
 	/* Publish the descriptor; it is closed again below on failure. */
 	td->td_retval[0] = fd;
 
 	CURVNET_SET(head->so_vnet);
 	so = sopeeloff(head);
 	if (so == NULL) {
 		error = ENOMEM;
 		goto noconnection;
 	}
 	/* The new file inherits the file flags of the original socket. */
 	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
 	error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name);
 	if (error != 0)
 		goto noconnection;
 	/* Carry the SIGIO owner over to the new socket. */
 	if (head->so_sigio != NULL)
 		fsetown(fgetown(&head->so_sigio), &so->so_sigio);
 
 noconnection:
 	/*
 	 * close the new descriptor, assuming someone hasn't ripped it
 	 * out from under us.
 	 */
 	if (error != 0)
 		fdclose(td, nfp, fd);
 
 	/*
 	 * Release explicitly held references before returning.
 	 */
 	CURVNET_RESTORE();
 done:
 	if (nfp != NULL)
 		fdrop(nfp, td);
 	fdrop(headfp, td);
 done2:
 	return (error);
 }
 
 /*
  * sctp_generic_sendmsg(): send one user buffer (msg, mlen) on SCTP socket
  * sd, optionally to address (to, tolen) and with send parameters copied in
  * from sinfo.  On success td_retval[0] holds the number of bytes sent.
  *
  * Requires CAP_SEND on the descriptor, plus CAP_CONNECT when a destination
  * address is supplied.  Returns EOPNOTSUPP if sd is not an IPPROTO_SCTP
  * socket, otherwise the error from copyin(), getsockaddr(), getsock_cap(),
  * the MAC check or sctp_lower_sosend().
  */
 int
 sys_sctp_generic_sendmsg (td, uap)
 	struct thread *td;
 	struct sctp_generic_sendmsg_args /* {
 		int sd,
 		caddr_t msg,
 		int mlen,
 		caddr_t to,
 		__socklen_t tolen,
 		struct sctp_sndrcvinfo *sinfo,
 		int flags
 	} */ *uap;
 {
 	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
 	struct socket *so;
 	struct file *fp = NULL;
 	struct sockaddr *to = NULL;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
 	struct uio auio;
 	struct iovec iov[1];
 	cap_rights_t rights;
 	int error = 0, len;
 
 	/* u_sinfo stays NULL when the caller passes no sinfo. */
 	if (uap->sinfo != NULL) {
 		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
 		if (error != 0)
 			return (error);
 		u_sinfo = &sinfo;
 	}
 
 	cap_rights_init_one(&rights, CAP_SEND);
 	if (uap->tolen != 0) {
 		error = getsockaddr(&to, uap->to, uap->tolen);
 		if (error != 0) {
 			to = NULL;
 			goto sctp_bad2;
 		}
 		/* Sending to an explicit address also needs CAP_CONNECT. */
 		cap_rights_set_one(&rights, CAP_CONNECT);
 	}
 
 	AUDIT_ARG_FD(uap->sd);
 	error = getsock_cap(td, uap->sd, &rights, &fp, NULL, NULL);
 	if (error != 0)
 		goto sctp_bad;
 #ifdef KTRACE
 	if (to && (KTRPOINT(td, KTR_STRUCT)))
 		ktrsockaddr(to);
 #endif
 
 	iov[0].iov_base = uap->msg;
 	iov[0].iov_len = uap->mlen;
 
 	so = (struct socket *)fp->f_data;
 	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
 		error = EOPNOTSUPP;
 		goto sctp_bad;
 	}
 #ifdef MAC
 	error = mac_socket_check_send(td->td_ucred, so);
 	if (error != 0)
 		goto sctp_bad;
 #endif /* MAC */
 
 	auio.uio_iov =  iov;
 	auio.uio_iovcnt = 1;
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;			/* XXX */
 	auio.uio_resid = 0;
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_GENIO))
 		ktruio = cloneuio(&auio);
 #endif /* KTRACE */
 	len = auio.uio_resid = uap->mlen;
 	CURVNET_SET(so->so_vnet);
 	error = sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL,
 	    (struct mbuf *)NULL, uap->flags, u_sinfo, td);
 	CURVNET_RESTORE();
 	if (error != 0) {
 		/* An interrupted send that moved some data is a success. */
 		if (auio.uio_resid != len && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
 		/* Generation of SIGPIPE can be controlled per socket. */
 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
 		    !(uap->flags & MSG_NOSIGNAL)) {
 			PROC_LOCK(td->td_proc);
 			tdsignal(td, SIGPIPE);
 			PROC_UNLOCK(td->td_proc);
 		}
 	}
 	if (error == 0)
 		td->td_retval[0] = len - auio.uio_resid;
 #ifdef KTRACE
 	if (ktruio != NULL) {
 		/*
 		 * NOTE(review): td_retval[0] is only assigned above when
 		 * error == 0; on failure this records whatever value it
 		 * already held -- confirm that is intended.
 		 */
 		ktruio->uio_resid = td->td_retval[0];
 		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
 	}
 #endif /* KTRACE */
 sctp_bad:
 	if (fp != NULL)
 		fdrop(fp, td);
 sctp_bad2:
 	free(to, M_SONAME);
 	return (error);
 }
 
 /*
  * sctp_generic_sendmsg_iov(): scatter/gather variant of
  * sctp_generic_sendmsg().  Sends the iovlen buffers described by iov on
  * SCTP socket sd, optionally to address (to, tolen) and with send
  * parameters copied in from sinfo.  On success td_retval[0] holds the
  * number of bytes sent.
  *
  * Requires CAP_SEND on the descriptor, plus CAP_CONNECT when a destination
  * address is supplied.  Returns EINVAL if the summed iovec lengths go
  * negative, EOPNOTSUPP if sd is not an IPPROTO_SCTP socket, otherwise the
  * error from copyin(), getsockaddr(), getsock_cap(), the iovec copy-in,
  * the MAC check or sctp_lower_sosend().
  */
 int
 sys_sctp_generic_sendmsg_iov(td, uap)
 	struct thread *td;
 	struct sctp_generic_sendmsg_iov_args /* {
 		int sd,
 		struct iovec *iov,
 		int iovlen,
 		caddr_t to,
 		__socklen_t tolen,
 		struct sctp_sndrcvinfo *sinfo,
 		int flags
 	} */ *uap;
 {
 	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
 	struct socket *so;
 	struct file *fp = NULL;
 	struct sockaddr *to = NULL;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
 	struct uio auio;
 	struct iovec *iov, *tiov;
 	cap_rights_t rights;
 	ssize_t len;
 	int error, i;
 
 	/* u_sinfo stays NULL when the caller passes no sinfo. */
 	if (uap->sinfo != NULL) {
 		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
 		if (error != 0)
 			return (error);
 		u_sinfo = &sinfo;
 	}
 	cap_rights_init_one(&rights, CAP_SEND);
 	if (uap->tolen != 0) {
 		error = getsockaddr(&to, uap->to, uap->tolen);
 		if (error != 0) {
 			to = NULL;
 			goto sctp_bad2;
 		}
 		/* Sending to an explicit address also needs CAP_CONNECT. */
 		cap_rights_set_one(&rights, CAP_CONNECT);
 	}
 
 	AUDIT_ARG_FD(uap->sd);
 	error = getsock_cap(td, uap->sd, &rights, &fp, NULL, NULL);
 	if (error != 0)
 		goto sctp_bad1;
 
 	/* A 32-bit process passes struct iovec32 entries. */
 #ifdef COMPAT_FREEBSD32
 	if (SV_CURPROC_FLAG(SV_ILP32))
 		error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
 		    uap->iovlen, &iov, EMSGSIZE);
 	else
 #endif
 		error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
 	if (error != 0)
 		goto sctp_bad1;
 #ifdef KTRACE
 	if (to && (KTRPOINT(td, KTR_STRUCT)))
 		ktrsockaddr(to);
 #endif
 
 	so = (struct socket *)fp->f_data;
 	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
 		error = EOPNOTSUPP;
 		goto sctp_bad;
 	}
 #ifdef MAC
 	error = mac_socket_check_send(td->td_ucred, so);
 	if (error != 0)
 		goto sctp_bad;
 #endif /* MAC */
 
 	auio.uio_iov = iov;
 	auio.uio_iovcnt = uap->iovlen;
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;			/* XXX */
 	auio.uio_resid = 0;
 	tiov = iov;
 	/* Sum the segment lengths, rejecting a total that goes negative. */
 	for (i = 0; i <uap->iovlen; i++, tiov++) {
 		if ((auio.uio_resid += tiov->iov_len) < 0) {
 			error = EINVAL;
 			goto sctp_bad;
 		}
 	}
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_GENIO))
 		ktruio = cloneuio(&auio);
 #endif /* KTRACE */
 	len = auio.uio_resid;
 	CURVNET_SET(so->so_vnet);
 	error = sctp_lower_sosend(so, to, &auio,
 		    (struct mbuf *)NULL, (struct mbuf *)NULL,
 		    uap->flags, u_sinfo, td);
 	CURVNET_RESTORE();
 	if (error != 0) {
 		/* An interrupted send that moved some data is a success. */
 		if (auio.uio_resid != len && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
 		/* Generation of SIGPIPE can be controlled per socket */
 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
 		    !(uap->flags & MSG_NOSIGNAL)) {
 			PROC_LOCK(td->td_proc);
 			tdsignal(td, SIGPIPE);
 			PROC_UNLOCK(td->td_proc);
 		}
 	}
 	if (error == 0)
 		td->td_retval[0] = len - auio.uio_resid;
 #ifdef KTRACE
 	if (ktruio != NULL) {
 		/*
 		 * NOTE(review): td_retval[0] is only assigned above when
 		 * error == 0; on failure this records whatever value it
 		 * already held -- confirm that is intended.
 		 */
 		ktruio->uio_resid = td->td_retval[0];
 		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
 	}
 #endif /* KTRACE */
 sctp_bad:
 	free(iov, M_IOV);
 sctp_bad1:
 	if (fp != NULL)
 		fdrop(fp, td);
 sctp_bad2:
 	free(to, M_SONAME);
 	return (error);
 }
 
 int
 sys_sctp_generic_recvmsg(td, uap)
 	struct thread *td;
 	struct sctp_generic_recvmsg_args /* {
 		int sd,
 		struct iovec *iov,
 		int iovlen,
 		struct sockaddr *from,
 		__socklen_t *fromlenaddr,
 		struct sctp_sndrcvinfo *sinfo,
 		int *msg_flags
 	} */ *uap;
 {
 	uint8_t sockbufstore[256];
 	struct uio auio;
 	struct iovec *iov, *tiov;
 	struct sctp_sndrcvinfo sinfo;
 	struct socket *so;
 	struct file *fp = NULL;
 	struct sockaddr *fromsa;
 	cap_rights_t rights;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
 	ssize_t len;
 	int error, fromlen, i, msg_flags;
 
 	AUDIT_ARG_FD(uap->sd);
-	error = getsock_cap(td, uap->sd, cap_rights_init(&rights, CAP_RECV),
+	error = getsock_cap(td, uap->sd, cap_rights_init_one(&rights, CAP_RECV),
 	    &fp, NULL, NULL);
 	if (error != 0)
 		return (error);
 #ifdef COMPAT_FREEBSD32
 	if (SV_CURPROC_FLAG(SV_ILP32))
 		error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
 		    uap->iovlen, &iov, EMSGSIZE);
 	else
 #endif
 		error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
 	if (error != 0)
 		goto out1;
 
 	so = fp->f_data;
 	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
 		error = EOPNOTSUPP;
 		goto out;
 	}
 #ifdef MAC
 	error = mac_socket_check_receive(td->td_ucred, so);
 	if (error != 0)
 		goto out;
 #endif /* MAC */
 
 	if (uap->fromlenaddr != NULL) {
 		error = copyin(uap->fromlenaddr, &fromlen, sizeof (fromlen));
 		if (error != 0)
 			goto out;
 	} else {
 		fromlen = 0;
 	}
 	if (uap->msg_flags) {
 		error = copyin(uap->msg_flags, &msg_flags, sizeof (int));
 		if (error != 0)
 			goto out;
 	} else {
 		msg_flags = 0;
 	}
 	auio.uio_iov = iov;
 	auio.uio_iovcnt = uap->iovlen;
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_rw = UIO_READ;
 	auio.uio_td = td;
 	auio.uio_offset = 0;			/* XXX */
 	auio.uio_resid = 0;
 	tiov = iov;
 	for (i = 0; i <uap->iovlen; i++, tiov++) {
 		if ((auio.uio_resid += tiov->iov_len) < 0) {
 			error = EINVAL;
 			goto out;
 		}
 	}
 	len = auio.uio_resid;
 	fromsa = (struct sockaddr *)sockbufstore;
 
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_GENIO))
 		ktruio = cloneuio(&auio);
 #endif /* KTRACE */
 	memset(&sinfo, 0, sizeof(struct sctp_sndrcvinfo));
 	CURVNET_SET(so->so_vnet);
 	error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL,
 		    fromsa, fromlen, &msg_flags,
 		    (struct sctp_sndrcvinfo *)&sinfo, 1);
 	CURVNET_RESTORE();
 	if (error != 0) {
 		if (auio.uio_resid != len && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
 	} else {
 		if (uap->sinfo)
 			error = copyout(&sinfo, uap->sinfo, sizeof (sinfo));
 	}
 #ifdef KTRACE
 	if (ktruio != NULL) {
 		ktruio->uio_resid = len - auio.uio_resid;
 		ktrgenio(uap->sd, UIO_READ, ktruio, error);
 	}
 #endif /* KTRACE */
 	if (error != 0)
 		goto out;
 	td->td_retval[0] = len - auio.uio_resid;
 
 	if (fromlen && uap->from) {
 		len = fromlen;
 		if (len <= 0 || fromsa == NULL)
 			len = 0;
 		else {
 			len = MIN(len, fromsa->sa_len);
 			error = copyout(fromsa, uap->from, (size_t)len);
 			if (error != 0)
 				goto out;
 		}
 		error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t));
 		if (error != 0)
 			goto out;
 	}
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_STRUCT))
 		ktrsockaddr(fromsa);
 #endif
 	if (uap->msg_flags) {
 		error = copyout(&msg_flags, uap->msg_flags, sizeof (int));
 		if (error != 0)
 			goto out;
 	}
 out:
 	free(iov, M_IOV);
 out1:
 	if (fp != NULL)
 		fdrop(fp, td);
 
 	return (error);
 }
diff --git a/sys/security/mac/mac_syscalls.c b/sys/security/mac/mac_syscalls.c
index c28a5e99c656..e7d71b2e22da 100644
--- a/sys/security/mac/mac_syscalls.c
+++ b/sys/security/mac/mac_syscalls.c
@@ -1,661 +1,663 @@
 /*-
  * Copyright (c) 1999-2002, 2006, 2009 Robert N. M. Watson
  * Copyright (c) 2001 Ilmar S. Habibulin
  * Copyright (c) 2001-2005 Networks Associates Technology, Inc.
  * Copyright (c) 2005-2006 SPARTA, Inc.
  * Copyright (c) 2008 Apple Inc.
  * All rights reserved.
  *
  * This software was developed by Robert Watson and Ilmar Habibulin for the
  * TrustedBSD Project.
  *
  * This software was developed for the FreeBSD Project in part by Network
  * Associates Laboratories, the Security Research Division of Network
  * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
  * as part of the DARPA CHATS research program.
  *
  * This software was enhanced by SPARTA ISSO under SPAWAR contract 
  * N66001-04-C-6019 ("SEFOS").
  *
  * This software was developed at the University of Cambridge Computer
  * Laboratory with support from a grant from Google, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/fcntl.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/mac.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/sysent.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/file.h>
 #include <sys/namei.h>
 #include <sys/socket.h>
 #include <sys/pipe.h>
 #include <sys/socketvar.h>
 
 #include <security/mac/mac_framework.h>
 #include <security/mac/mac_internal.h>
 #include <security/mac/mac_policy.h>
 
 #ifdef MAC
 
 FEATURE(security_mac, "Mandatory Access Control Framework support");
 
 /*
  * Shared path-based implementations.  The *_file system calls below pass
  * FOLLOW and the *_link system calls pass NOFOLLOW as the 'follow' argument.
  */
 static int	kern___mac_get_path(struct thread *td, const char *path_p,
 		    struct mac *mac_p, int follow);
 static int	kern___mac_set_path(struct thread *td, const char *path_p,
 		    struct mac *mac_p, int follow);
 
 int
 sys___mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap)
 {
 	char *elements, *buffer;
 	struct mac mac;
 	struct proc *tproc;
 	struct ucred *tcred;
 	int error;
 
 	error = copyin(uap->mac_p, &mac, sizeof(mac));
 	if (error)
 		return (error);
 
 	error = mac_check_structmac_consistent(&mac);
 	if (error)
 		return (error);
 
 	tproc = pfind(uap->pid);
 	if (tproc == NULL)
 		return (ESRCH);
 
 	tcred = NULL;				/* Satisfy gcc. */
 	error = p_cansee(td, tproc);
 	if (error == 0)
 		tcred = crhold(tproc->p_ucred);
 	PROC_UNLOCK(tproc);
 	if (error)
 		return (error);
 
 	elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
 	error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
 	if (error) {
 		free(elements, M_MACTEMP);
 		crfree(tcred);
 		return (error);
 	}
 
 	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
 	error = mac_cred_externalize_label(tcred->cr_label, elements,
 	    buffer, mac.m_buflen);
 	if (error == 0)
 		error = copyout(buffer, mac.m_string, strlen(buffer)+1);
 
 	free(buffer, M_MACTEMP);
 	free(elements, M_MACTEMP);
 	crfree(tcred);
 	return (error);
 }
 
 int
 sys___mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap)
 {
 	char *elements, *buffer;
 	struct mac mac;
 	int error;
 
 	error = copyin(uap->mac_p, &mac, sizeof(mac));
 	if (error)
 		return (error);
 
 	error = mac_check_structmac_consistent(&mac);
 	if (error)
 		return (error);
 
 	elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
 	error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
 	if (error) {
 		free(elements, M_MACTEMP);
 		return (error);
 	}
 
 	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
 	error = mac_cred_externalize_label(td->td_ucred->cr_label,
 	    elements, buffer, mac.m_buflen);
 	if (error == 0)
 		error = copyout(buffer, mac.m_string, strlen(buffer)+1);
 
 	free(buffer, M_MACTEMP);
 	free(elements, M_MACTEMP);
 	return (error);
 }
 
 /*
  * Replace the MAC label on the calling process's credential with the label
  * described by the caller-supplied struct mac.
  *
  * Returns 0 on success, EINVAL when mac_labeled lacks MPC_OBJECT_CRED, or the
  * error from the copy, consistency, internalize or relabel-check step.
  */
 int
 sys___mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap)
 {
 	struct ucred *newcred, *oldcred;
 	struct label *intlabel;
 	struct proc *p;
 	struct mac mac;
 	char *buffer;
 	int error;
 
 	if (!(mac_labeled & MPC_OBJECT_CRED))
 		return (EINVAL);
 
 	error = copyin(uap->mac_p, &mac, sizeof(mac));
 	if (error)
 		return (error);
 
 	error = mac_check_structmac_consistent(&mac);
 	if (error)
 		return (error);
 
 	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
 	error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
 	if (error) {
 		free(buffer, M_MACTEMP);
 		return (error);
 	}
 
 	/* Parse the text form into a freshly allocated internal label. */
 	intlabel = mac_cred_label_alloc();
 	error = mac_cred_internalize_label(intlabel, buffer);
 	free(buffer, M_MACTEMP);
 	if (error)
 		goto out;
 
 	/* The replacement credential is allocated before taking the lock. */
 	newcred = crget();
 
 	p = td->td_proc;
 	PROC_LOCK(p);
 	oldcred = p->p_ucred;
 
 	/* On refusal, drop the lock and the unused credential, then bail. */
 	error = mac_cred_check_relabel(oldcred, intlabel);
 	if (error) {
 		PROC_UNLOCK(p);
 		crfree(newcred);
 		goto out;
 	}
 
 	/* Build the new credential from the old one, relabel it, install it. */
 	setsugid(p);
 	crcopy(newcred, oldcred);
 	mac_cred_relabel(newcred, intlabel);
 	proc_set_cred(p, newcred);
 
 	/* The old credential is released only after the lock is dropped. */
 	PROC_UNLOCK(p);
 	crfree(oldcred);
 	mac_proc_vm_revoke(td);
 
 out:
 	/* Reached on success as well: the temporary label is always freed. */
 	mac_cred_label_free(intlabel);
 	return (error);
 }
 
 /*
  * Copy out the MAC label of the object behind file descriptor uap->fd,
  * limited to the label elements listed in the caller-supplied struct mac.
  *
  * Vnode/FIFO, pipe and socket descriptors are handled; any other descriptor
  * type yields EINVAL, as does an object class whose bit is not set in
  * mac_labeled.
  */
 int
 sys___mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap)
 {
 	char *elements, *buffer;
 	struct label *intlabel;
 	struct file *fp;
 	struct mac mac;
 	struct vnode *vp;
 	struct pipe *pipe;
 	struct socket *so;
 	cap_rights_t rights;
 	int error;
 
 	error = copyin(uap->mac_p, &mac, sizeof(mac));
 	if (error)
 		return (error);
 
 	error = mac_check_structmac_consistent(&mac);
 	if (error)
 		return (error);
 
 	elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
 	error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
 	if (error) {
 		free(elements, M_MACTEMP);
 		return (error);
 	}
 
 	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
 	/* The descriptor must carry the CAP_MAC_GET capability right. */
-	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_MAC_GET), &fp);
+	error = fget(td, uap->fd, cap_rights_init_one(&rights, CAP_MAC_GET),
+	    &fp);
 	if (error)
 		goto out;
 
 	/*
 	 * In each case the object's label is copied into a temporary label
 	 * while the object is locked, and externalized from that copy after
 	 * the lock has been dropped.
 	 */
 	switch (fp->f_type) {
 	case DTYPE_FIFO:
 	case DTYPE_VNODE:
 		if (!(mac_labeled & MPC_OBJECT_VNODE)) {
 			error = EINVAL;
 			goto out_fdrop;
 		}
 		vp = fp->f_vnode;
 		intlabel = mac_vnode_label_alloc();
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		mac_vnode_copy_label(vp->v_label, intlabel);
 		VOP_UNLOCK(vp);
 		error = mac_vnode_externalize_label(intlabel, elements,
 		    buffer, mac.m_buflen);
 		mac_vnode_label_free(intlabel);
 		break;
 
 	case DTYPE_PIPE:
 		if (!(mac_labeled & MPC_OBJECT_PIPE)) {
 			error = EINVAL;
 			goto out_fdrop;
 		}
 		pipe = fp->f_data;
 		intlabel = mac_pipe_label_alloc();
 		PIPE_LOCK(pipe);
 		mac_pipe_copy_label(pipe->pipe_pair->pp_label, intlabel);
 		PIPE_UNLOCK(pipe);
 		error = mac_pipe_externalize_label(intlabel, elements,
 		    buffer, mac.m_buflen);
 		mac_pipe_label_free(intlabel);
 		break;
 
 	case DTYPE_SOCKET:
 		if (!(mac_labeled & MPC_OBJECT_SOCKET)) {
 			error = EINVAL;
 			goto out_fdrop;
 		}
 		so = fp->f_data;
 		intlabel = mac_socket_label_alloc(M_WAITOK);
 		SOCK_LOCK(so);
 		mac_socket_copy_label(so->so_label, intlabel);
 		SOCK_UNLOCK(so);
 		error = mac_socket_externalize_label(intlabel, elements,
 		    buffer, mac.m_buflen);
 		mac_socket_label_free(intlabel);
 		break;
 
 	default:
 		error = EINVAL;
 	}
 	if (error == 0)
 		error = copyout(buffer, mac.m_string, strlen(buffer)+1);
 out_fdrop:
 	fdrop(fp, td);
 out:
 	free(buffer, M_MACTEMP);
 	free(elements, M_MACTEMP);
 	return (error);
 }
 
 /*
  * Path-based label fetch: forwards to kern___mac_get_path() with FOLLOW.
  */
 int
 sys___mac_get_file(struct thread *td, struct __mac_get_file_args *uap)
 {
 
 	return (kern___mac_get_path(td, uap->path_p, uap->mac_p, FOLLOW));
 }
 
 /*
  * Path-based label fetch: forwards to kern___mac_get_path() with NOFOLLOW.
  */
 int
 sys___mac_get_link(struct thread *td, struct __mac_get_link_args *uap)
 {
 
 	return (kern___mac_get_path(td, uap->path_p, uap->mac_p, NOFOLLOW));
 }
 
 static int
 kern___mac_get_path(struct thread *td, const char *path_p, struct mac *mac_p,
    int follow)
 {
 	char *elements, *buffer;
 	struct nameidata nd;
 	struct label *intlabel;
 	struct mac mac;
 	int error;
 
 	if (!(mac_labeled & MPC_OBJECT_VNODE))
 		return (EINVAL);
 
 	error = copyin(mac_p, &mac, sizeof(mac));
 	if (error)
 		return (error);
 
 	error = mac_check_structmac_consistent(&mac);
 	if (error)
 		return (error);
 
 	elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
 	error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
 	if (error) {
 		free(elements, M_MACTEMP);
 		return (error);
 	}
 
 	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
 	NDINIT(&nd, LOOKUP, LOCKLEAF | follow, UIO_USERSPACE, path_p, td);
 	error = namei(&nd);
 	if (error)
 		goto out;
 
 	intlabel = mac_vnode_label_alloc();
 	mac_vnode_copy_label(nd.ni_vp->v_label, intlabel);
 	error = mac_vnode_externalize_label(intlabel, elements, buffer,
 	    mac.m_buflen);
 	NDFREE(&nd, 0);
 	mac_vnode_label_free(intlabel);
 
 	if (error == 0)
 		error = copyout(buffer, mac.m_string, strlen(buffer)+1);
 
 out:
 	free(buffer, M_MACTEMP);
 	free(elements, M_MACTEMP);
 
 	return (error);
 }
 
 /*
  * Set the MAC label of the object behind file descriptor uap->fd from the
  * text label in the caller-supplied struct mac.
  *
  * Vnode/FIFO, pipe and socket descriptors are handled; any other descriptor
  * type yields EINVAL, as does an object class whose bit is not set in
  * mac_labeled.
  */
 int
 sys___mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap)
 {
 	struct label *intlabel;
 	struct pipe *pipe;
 	struct socket *so;
 	struct file *fp;
 	struct mount *mp;
 	struct vnode *vp;
 	struct mac mac;
 	cap_rights_t rights;
 	char *buffer;
 	int error;
 
 	error = copyin(uap->mac_p, &mac, sizeof(mac));
 	if (error)
 		return (error);
 
 	error = mac_check_structmac_consistent(&mac);
 	if (error)
 		return (error);
 
 	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
 	error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
 	if (error) {
 		free(buffer, M_MACTEMP);
 		return (error);
 	}
 
 	/* The descriptor must carry the CAP_MAC_SET capability right. */
-	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_MAC_SET), &fp);
+	error = fget(td, uap->fd, cap_rights_init_one(&rights, CAP_MAC_SET),
+	    &fp);
 	if (error)
 		goto out;
 
 	/*
 	 * Each case parses the text into a temporary label of the matching
 	 * object class, applies it, and frees the temporary label on every
 	 * path out of the case.
 	 */
 	switch (fp->f_type) {
 	case DTYPE_FIFO:
 	case DTYPE_VNODE:
 		if (!(mac_labeled & MPC_OBJECT_VNODE)) {
 			error = EINVAL;
 			goto out_fdrop;
 		}
 		intlabel = mac_vnode_label_alloc();
 		error = mac_vnode_internalize_label(intlabel, buffer);
 		if (error) {
 			mac_vnode_label_free(intlabel);
 			break;
 		}
 		vp = fp->f_vnode;
 		error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 		if (error != 0) {
 			mac_vnode_label_free(intlabel);
 			break;
 		}
 		/* The label is set with the vnode exclusively locked. */
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		error = vn_setlabel(vp, intlabel, td->td_ucred);
 		VOP_UNLOCK(vp);
 		vn_finished_write(mp);
 		mac_vnode_label_free(intlabel);
 		break;
 
 	case DTYPE_PIPE:
 		if (!(mac_labeled & MPC_OBJECT_PIPE)) {
 			error = EINVAL;
 			goto out_fdrop;
 		}
 		intlabel = mac_pipe_label_alloc();
 		error = mac_pipe_internalize_label(intlabel, buffer);
 		if (error == 0) {
 			pipe = fp->f_data;
 			PIPE_LOCK(pipe);
 			error = mac_pipe_label_set(td->td_ucred,
 			    pipe->pipe_pair, intlabel);
 			PIPE_UNLOCK(pipe);
 		}
 		mac_pipe_label_free(intlabel);
 		break;
 
 	case DTYPE_SOCKET:
 		if (!(mac_labeled & MPC_OBJECT_SOCKET)) {
 			error = EINVAL;
 			goto out_fdrop;
 		}
 		intlabel = mac_socket_label_alloc(M_WAITOK);
 		error = mac_socket_internalize_label(intlabel, buffer);
 		if (error == 0) {
 			so = fp->f_data;
 			error = mac_socket_label_set(td->td_ucred, so,
 			    intlabel);
 		}
 		mac_socket_label_free(intlabel);
 		break;
 
 	default:
 		error = EINVAL;
 	}
 out_fdrop:
 	fdrop(fp, td);
 out:
 	free(buffer, M_MACTEMP);
 	return (error);
 }
 
 /*
  * Path-based label set: forwards to kern___mac_set_path() with FOLLOW.
  */
 int
 sys___mac_set_file(struct thread *td, struct __mac_set_file_args *uap)
 {
 
 	return (kern___mac_set_path(td, uap->path_p, uap->mac_p, FOLLOW));
 }
 
 /*
  * Path-based label set: forwards to kern___mac_set_path() with NOFOLLOW.
  */
 int
 sys___mac_set_link(struct thread *td, struct __mac_set_link_args *uap)
 {
 
 	return (kern___mac_set_path(td, uap->path_p, uap->mac_p, NOFOLLOW));
 }
 
 static int
 kern___mac_set_path(struct thread *td, const char *path_p, struct mac *mac_p,
     int follow)
 {
 	struct label *intlabel;
 	struct nameidata nd;
 	struct mount *mp;
 	struct mac mac;
 	char *buffer;
 	int error;
 
 	if (!(mac_labeled & MPC_OBJECT_VNODE))
 		return (EINVAL);
 
 	error = copyin(mac_p, &mac, sizeof(mac));
 	if (error)
 		return (error);
 
 	error = mac_check_structmac_consistent(&mac);
 	if (error)
 		return (error);
 
 	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
 	error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
 	if (error) {
 		free(buffer, M_MACTEMP);
 		return (error);
 	}
 
 	intlabel = mac_vnode_label_alloc();
 	error = mac_vnode_internalize_label(intlabel, buffer);
 	free(buffer, M_MACTEMP);
 	if (error)
 		goto out;
 
 	NDINIT(&nd, LOOKUP, LOCKLEAF | follow, UIO_USERSPACE, path_p, td);
 	error = namei(&nd);
 	if (error == 0) {
 		error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH);
 		if (error == 0) {
 			error = vn_setlabel(nd.ni_vp, intlabel,
 			    td->td_ucred);
 			vn_finished_write(mp);
 		}
 	}
 
 	NDFREE(&nd, 0);
 out:
 	mac_vnode_label_free(intlabel);
 	return (error);
 }
 
 int
 sys_mac_syscall(struct thread *td, struct mac_syscall_args *uap)
 {
 	struct mac_policy_conf *mpc;
 	char target[MAC_MAX_POLICY_NAME];
 	int error;
 
 	error = copyinstr(uap->policy, target, sizeof(target), NULL);
 	if (error)
 		return (error);
 
 	error = ENOSYS;
 	LIST_FOREACH(mpc, &mac_static_policy_list, mpc_list) {
 		if (strcmp(mpc->mpc_name, target) == 0 &&
 		    mpc->mpc_ops->mpo_syscall != NULL) {
 			error = mpc->mpc_ops->mpo_syscall(td,
 			    uap->call, uap->arg);
 			goto out;
 		}
 	}
 
 	if (!LIST_EMPTY(&mac_policy_list)) {
 		mac_policy_slock_sleep();
 		LIST_FOREACH(mpc, &mac_policy_list, mpc_list) {
 			if (strcmp(mpc->mpc_name, target) == 0 &&
 			    mpc->mpc_ops->mpo_syscall != NULL) {
 				error = mpc->mpc_ops->mpo_syscall(td,
 				    uap->call, uap->arg);
 				break;
 			}
 		}
 		mac_policy_sunlock_sleep();
 	}
 out:
 	return (error);
 }
 
 #else /* !MAC */
 
 /* Variant compiled when MAC is not defined: always returns ENOSYS. */
 int
 sys___mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap)
 {
 
 	return (ENOSYS);
 }
 
 /* Variant compiled when MAC is not defined: always returns ENOSYS. */
 int
 sys___mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap)
 {
 
 	return (ENOSYS);
 }
 
 /* Variant compiled when MAC is not defined: always returns ENOSYS. */
 int
 sys___mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap)
 {
 
 	return (ENOSYS);
 }
 
 /* Variant compiled when MAC is not defined: always returns ENOSYS. */
 int
 sys___mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap)
 {
 
 	return (ENOSYS);
 }
 
 /* Variant compiled when MAC is not defined: always returns ENOSYS. */
 int
 sys___mac_get_file(struct thread *td, struct __mac_get_file_args *uap)
 {
 
 	return (ENOSYS);
 }
 
 /* Variant compiled when MAC is not defined: always returns ENOSYS. */
 int
 sys___mac_get_link(struct thread *td, struct __mac_get_link_args *uap)
 {
 
 	return (ENOSYS);
 }
 
 /* Variant compiled when MAC is not defined: always returns ENOSYS. */
 int
 sys___mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap)
 {
 
 	return (ENOSYS);
 }
 
 /* Variant compiled when MAC is not defined: always returns ENOSYS. */
 int
 sys___mac_set_file(struct thread *td, struct __mac_set_file_args *uap)
 {
 
 	return (ENOSYS);
 }
 
 /* Variant compiled when MAC is not defined: always returns ENOSYS. */
 int
 sys___mac_set_link(struct thread *td, struct __mac_set_link_args *uap)
 {
 
 	return (ENOSYS);
 }
 
 /* Variant compiled when MAC is not defined: always returns ENOSYS. */
 int
 sys_mac_syscall(struct thread *td, struct mac_syscall_args *uap)
 {
 
 	return (ENOSYS);
 }
 
 #endif /* !MAC */
diff --git a/sys/security/mac_veriexec/mac_veriexec.c b/sys/security/mac_veriexec/mac_veriexec.c
index 8d43d59c9a3c..ecaa8cc35e09 100644
--- a/sys/security/mac_veriexec/mac_veriexec.c
+++ b/sys/security/mac_veriexec/mac_veriexec.c
@@ -1,882 +1,882 @@
 /*
  * $FreeBSD$
  *
  * Copyright (c) 2011, 2012, 2013, 2015, 2016, 2019 Juniper Networks, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 
 #include "opt_capsicum.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/eventhandler.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/imgact.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/mac.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 #include <fs/nullfs/null.h>
 #include <security/mac/mac_policy.h>
 
 #include "mac_veriexec.h"
 #include "mac_veriexec_internal.h"
 
 /* Read the value stored for this policy (mac_veriexec_slot) in label l. */
 #define	SLOT(l) \
 	mac_label_get((l), mac_veriexec_slot)
 /* Store v for this policy (mac_veriexec_slot) in label l. */
 #define	SLOT_SET(l, v) \
 	mac_label_set((l), mac_veriexec_slot, (v))
 
 /*
  * Debug logging wrapper around VERIEXEC_DEBUG.  It prefixes the message with
  * MAC_VERIEXEC_FULLNAME ": " and appends "\n" to _fmt, so callers do not need
  * a trailing newline of their own.  Expands to nothing unless MAC_DEBUG is
  * defined.
  */
 #ifdef MAC_DEBUG
 #define	MAC_VERIEXEC_DBG(_lvl, _fmt, ...)				\
 	do {								\
 		VERIEXEC_DEBUG((_lvl), (MAC_VERIEXEC_FULLNAME ": " _fmt	\
 		     "\n", ##__VA_ARGS__));				\
 	} while(0)
 #else
 #define	MAC_VERIEXEC_DBG(_lvl, _fmt, ...)
 #endif
 
 /* Handlers for the "state" and "db" sysctl nodes declared below. */
 static int sysctl_mac_veriexec_state(SYSCTL_HANDLER_ARGS);
 static int sysctl_mac_veriexec_db(SYSCTL_HANDLER_ARGS);
 
 SYSCTL_DECL(_security_mac);
 
 /* Parent node: security.mac.veriexec */
 SYSCTL_NODE(_security_mac, OID_AUTO, veriexec, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "MAC/veriexec policy controls");
 
 /* Read-write integer exported as security.mac.veriexec.debug. */
 int	mac_veriexec_debug;
 SYSCTL_INT(_security_mac_veriexec, OID_AUTO, debug, CTLFLAG_RW,
     &mac_veriexec_debug, 0, "Debug level");
 
 /*
  * Bit mask of VERIEXEC_STATE_* flags; exported read-only as a string through
  * sysctl_mac_veriexec_state().
  */
 static int	mac_veriexec_state;
 SYSCTL_PROC(_security_mac_veriexec, OID_AUTO, state,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
     0, 0, sysctl_mac_veriexec_state, "A",
     "Verified execution subsystem state");
 
 /* Read-only string node produced by sysctl_mac_veriexec_db(). */
 SYSCTL_PROC(_security_mac_veriexec, OID_AUTO, db,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT,
     0, 0, sysctl_mac_veriexec_db,
     "A", "Verified execution fingerprint database");
 
 /* Label slot index used by the SLOT() and SLOT_SET() macros. */
 static int mac_veriexec_slot;
 
 MALLOC_DEFINE(M_VERIEXEC, "veriexec", "Verified execution data");
 
 /**
  * @internal
  * @brief Handler for security.mac.veriexec.db sysctl
  *
  * Display a human-readable form of the current fingerprint database.
  */
 static int
 sysctl_mac_veriexec_db(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sb;
 	int error;
 
 	/* Nothing is printed unless the old buffer could be wired. */
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error == 0) {
 		sbuf_new_for_sysctl(&sb, NULL, 1024, req);
 		mac_veriexec_metadata_print_db(&sb);
 		error = sbuf_finish(&sb);
 		sbuf_delete(&sb);
 	}
 
 	return (error);
 }
 
 /**
  * @internal
  * @brief Generate human-readable output about the current verified execution
  *        state.
  *
  * @param sbp		sbuf to write output to
  */
 static void
 mac_veriexec_print_state(struct sbuf *sbp)
 {
 
 	/* One word, followed by a space, per state bit that is set. */
 	if ((mac_veriexec_state & VERIEXEC_STATE_INACTIVE) != 0)
 		sbuf_printf(sbp, "inactive ");
 	if ((mac_veriexec_state & VERIEXEC_STATE_LOADED) != 0)
 		sbuf_printf(sbp, "loaded ");
 	if ((mac_veriexec_state & VERIEXEC_STATE_ACTIVE) != 0)
 		sbuf_printf(sbp, "active ");
 	if ((mac_veriexec_state & VERIEXEC_STATE_ENFORCE) != 0)
 		sbuf_printf(sbp, "enforce ");
 	if ((mac_veriexec_state & VERIEXEC_STATE_LOCKED) != 0)
 		sbuf_printf(sbp, "locked ");
 
 	/* Trim the sbuf whenever any state bit at all is set. */
 	if (mac_veriexec_state != 0)
 		sbuf_trim(sbp);
 }
 
 /**
  * @internal
  * @brief Handler for security.mac.veriexec.state sysctl
  *
  * Display a human-readable form of the current verified execution subsystem
  * state.
  */
 static int
 sysctl_mac_veriexec_state(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sb;
 	int error;
 
 	/* Render the state words into an auto-extending sbuf. */
 	sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND);
 	mac_veriexec_print_state(&sb);
 	(void)sbuf_finish(&sb);
 
 	/* Hand out exactly sbuf_len() bytes of the rendered text. */
 	error = SYSCTL_OUT(req, sbuf_data(&sb), sbuf_len(&sb));
 
 	sbuf_delete(&sb);
 	return (error);
 }
 
 /**
  * @internal
  * @brief Event handler called when a virtual file system is mounted.
  *
  * We need to record the file system identifier in the MAC per-policy slot
  * assigned to veriexec, so we have a key to use in order to reference the
  * mount point in the meta-data store.
  *
  * @param arg		unused argument
  * @param mp		mount point that is being mounted
  * @param fsrootvp	vnode of the file system root
  * @param td		calling thread
  */
 static void
 mac_veriexec_vfs_mounted(void *arg __unused, struct mount *mp,
     struct vnode *fsrootvp, struct thread *td)
 {
 	struct vattr va;
 
 	/* Without attributes there is no fsid to record; change nothing. */
 	if (VOP_GETATTR(fsrootvp, &va, td->td_ucred) != 0)
 		return;
 
 	/* Remember the root vnode's fsid in the mount label slot. */
 	SLOT_SET(mp->mnt_label, va.va_fsid);
 #ifdef MAC_DEBUG
 	MAC_VERIEXEC_DBG(3, "set fsid to %ju for mount %p",
 	    (uintmax_t)va.va_fsid, mp);
 #endif
 }
 
 /**
  * @internal
  * @brief Event handler called when a virtual file system is unmounted.
  *
  * If we recorded a file system identifier in the MAC per-policy slot assigned
  * to veriexec, then we need to tell the meta-data store to clean up.
  *
  * @param arg		unused argument
  * @param mp		mount point that is being unmounted
  * @param td		calling thread
  */
 static void
 mac_veriexec_vfs_unmounted(void *arg __unused, struct mount *mp,
     struct thread *td)
 {
 	dev_t fsid;
 
 	/* A zero slot value means there is nothing to clean up. */
 	fsid = SLOT(mp->mnt_label);
 	if (fsid == 0)
 		return;
 
 	MAC_VERIEXEC_DBG(3, "fsid %ju, cleaning up mount",
 	    (uintmax_t)fsid);
 	mac_veriexec_metadata_unmounted(fsid, td);
 }
 
 /**
  * @internal
  * @brief The mount point is being initialized, set the value in the MAC
  *     per-policy slot for veriexec to zero.
  *
  * @note A value of zero in this slot indicates no file system identifier
  *     is assigned.
  *
  * @param label the label that is being initialized
  */
 static void 
 mac_veriexec_mount_init_label(struct label *label) 
 {
 
 	SLOT_SET(label, 0);
 }
 
 /**
  * @internal
  * @brief The mount-point is being destroyed, reset the value in the MAC
  *     per-policy slot for veriexec back to zero.
  *
  * @note A value of zero in this slot indicates no file system identifier
  *     is assigned.
  *
  * @param label the label that is being destroyed
  */
 static void 
 mac_veriexec_mount_destroy_label(struct label *label) 
 {
 
 	SLOT_SET(label, 0);
 }
 
 /**
  * @internal
  * @brief The vnode label is being initialized, set the value in the MAC
  *     per-policy slot for veriexec to @c FINGERPRINT_INVALID
  *
  * @note @c FINGERPRINT_INVALID indicates the fingerprint is invalid.
  *
  * @param label		the label that is being initialized
  */
 static void
 mac_veriexec_vnode_init_label(struct label *label)
 {
 
 	/* New vnode labels carry FINGERPRINT_INVALID in this policy's slot. */
 	SLOT_SET(label, FINGERPRINT_INVALID);
 }
 
 /**
  * @internal
  * @brief The vnode label is being destroyed, reset the value in the MAC
  *        per-policy slot for veriexec back to @c FINGERPRINT_INVALID
  *
  * @note @c FINGERPRINT_INVALID indicates the fingerprint is invalid.
  *
  * @param label		the label that is being destroyed
  */
 static void
 mac_veriexec_vnode_destroy_label(struct label *label)
 {
 
 	/* Put the slot back to FINGERPRINT_INVALID on destruction. */
 	SLOT_SET(label, FINGERPRINT_INVALID);
 }
 
 /**
  * @internal
  * @brief Copy the value in the MAC per-policy slot assigned to veriexec from
  *        the @p src label to the @p dest label
  *
  * @param src		label to read the slot value from
  * @param dest		label to store the slot value into
  */
 static void 
 mac_veriexec_copy_label(struct label *src, struct label *dest)
 {
 
 	SLOT_SET(dest, SLOT(src));
 }
 
 /**
  * @internal
  * @brief Check if the requested process can be debugged
  *
  * @param cred		credentials to use
  * @param p		process to debug
  *
  * @return 0 if debugging is allowed, otherwise an error code.
  */
 static int
 mac_veriexec_proc_check_debug(struct ucred *cred, struct proc *p)
 {
 	int flags;
 
 	/* Nothing to check while veriexec is not being enforced. */
 	if ((mac_veriexec_state & VERIEXEC_STATE_ENFORCE) == 0)
 		return (0);
 
 	/* If the executable's flags cannot be obtained, debugging is allowed. */
 	if (mac_veriexec_metadata_get_executable_flags(cred, p, &flags, 0) != 0)
 		return (0);
 
 	/* Only executables flagged VERIEXEC_NOTRACE are refused. */
 	if ((flags & VERIEXEC_NOTRACE) != 0)
 		return (EACCES);
 	return (0);
 }
 
 /**
  * @internal
  * @brief A KLD load has been requested and needs to be validated.
  *
  * @param cred		credentials to use
  * @param vp		vnode of the KLD that has been requested
  * @param vlabel	vnode label assigned to the vnode
  *
  * @return 0 if the KLD load is allowed, otherwise an error code.
  */
 static int
 mac_veriexec_kld_check_load(struct ucred *cred, struct vnode *vp,
     struct label *vlabel)
 {
 	struct vattr va;
 	struct thread *td = curthread;
 	fingerprint_status_t status;
 	int error;
 
 	/*
 	 * If we are not actively enforcing, allow it
 	 */
 	if ((mac_veriexec_state & VERIEXEC_STATE_ENFORCE) == 0)
 		return (0);
 
 	/* Get vnode attributes */
 	error = VOP_GETATTR(vp, &va, cred);
 	if (error)
 		return (error);
 
 	/*
 	 * Fetch the fingerprint status for the vnode
 	 * (starting with files first).  EAUTH is not returned right away;
 	 * it is acted on below once the status is known.
 	 */
 	error = mac_veriexec_metadata_fetch_fingerprint_status(vp, &va, td,
 	    VERIEXEC_FILES_FIRST);
 	if (error && error != EAUTH)
 		return (error);
 
 	/*
 	 * By now we should have status...
 	 */
 	status = mac_veriexec_get_fingerprint_status(vp);
 	switch (status) {
 	case FINGERPRINT_FILE:
 	case FINGERPRINT_VALID:
 	case FINGERPRINT_INDIRECT:
 		/* Only EAUTH can still be pending at this point. */
 		if (error)
 			return (error);
 		break;
 	default:
 		/*
 		 * kldload should fail unless there is a valid fingerprint
 		 * registered.  MAC_VERIEXEC_DBG appends the newline itself,
 		 * so the format string does not end in one.
 		 */
 		MAC_VERIEXEC_DBG(2, "fingerprint status is %d for dev %ju, "
 		    "file %ju.%ju", status, (uintmax_t)va.va_fsid,
 		    (uintmax_t)va.va_fileid, (uintmax_t)va.va_gen);
 		return (EAUTH);
 	}
 
 	/* Everything is good, allow the KLD to be loaded */
 	return (0);
 }
 
 /**
  * @internal
  * @brief Check privileges that veriexec needs to be concerned about.
  *
  * The following privileges are checked by this function:
  *  - PRIV_KMEM_WRITE\n
  *    Check if writes to /dev/mem and /dev/kmem are allowed\n
  *    (Only trusted processes are allowed)
  *
  * @param cred		credentials to use
  * @param priv		privilege to check
  *
  * @return 0 if the privilege is allowed, error code otherwise.
  */
 static int
 mac_veriexec_priv_check(struct ucred *cred, int priv)
 {
 
 	/* Nothing to check while veriexec is not being enforced. */
 	if ((mac_veriexec_state & VERIEXEC_STATE_ENFORCE) == 0)
 		return (0);
 
 	/* PRIV_KMEM_WRITE is the only privilege restricted here. */
 	if (priv == PRIV_KMEM_WRITE &&
 	    !mac_veriexec_proc_is_trusted(cred, curproc))
 		return (EPERM);
 
 	return (0);
 }
 
 /*
  * While veriexec is enforcing, refuse with EPERM any sysctl whose oid_kind
  * has CTLFLAG_SECURE set; every other request is allowed.
  */
 static int
 mac_veriexec_sysctl_check(struct ucred *cred, struct sysctl_oid *oidp,
     void *arg1, int arg2, struct sysctl_req *req)
 {
 
 	/* Nothing to check while veriexec is not being enforced. */
 	if ((mac_veriexec_state & VERIEXEC_STATE_ENFORCE) == 0)
 		return (0);
 
 	/* XXX call mac_veriexec_priv_check? */
 	if ((oidp->oid_kind & CTLFLAG_SECURE) != 0)
 		return (EPERM);
 
 	return (0);
 }
 
 /**
  * @internal
  * @brief A program is being executed and needs to be validated.
  *
  * @param cred		credentials to use
  * @param vp		vnode of the program that is being executed
  * @param label		vnode label assigned to the vnode
  * @param imgp		parameters for the image to be executed
  * @param execlabel	optional exec label
  *
  * @return 0 if the program should be allowed to execute, otherwise an error
  *     code.
  */
 static int
 mac_veriexec_vnode_check_exec(struct ucred *cred __unused,
     struct vnode *vp __unused, struct label *label __unused,
     struct image_params *imgp, struct label *execlabel __unused)
 {
 
 	/* The decision is the image fingerprint check for the current thread. */
 	return (mac_veriexec_fingerprint_check_image(imgp, 0, curthread));
 }
 
 /**
  * @brief Check fingerprint for the specified vnode and validate it
  *
  * @param cred		credentials to use
  * @param vp		vnode of the file
  * @param accmode	access mode to check (read, write, append, create,
  *			verify, etc.)
  *
  * @return 0 if the file validated, otherwise an error code.
  */
 static int
 mac_veriexec_check_vp(struct ucred *cred, struct vnode *vp, accmode_t accmode)
 {
 	struct vattr va;
 	struct thread *td = curthread;
 	fingerprint_status_t status;
 	int error;
 
 	/* Get vnode attributes */
 	error = VOP_GETATTR(vp, &va, cred);
 	if (error)
 		return (error);
 
 	/*
 	 * Get the fingerprint status for the file.  EAUTH is not returned
 	 * right away; it is acted on below for VVERIFY requests.
 	 */
 	error = mac_veriexec_metadata_fetch_fingerprint_status(vp, &va, td,
 	    VERIEXEC_FILES_FIRST);
 	if (error && error != EAUTH)
 		return (error);
 
 	/*
 	 * By now we should have status...
 	 */
 	status = mac_veriexec_get_fingerprint_status(vp);
 	if (accmode & VWRITE) {
 		/*
 		 * If the file has a fingerprint then deny the write request;
 		 * files with any other status pass this check unchanged.
 		 * MAC_VERIEXEC_DBG appends the newline itself, so the format
 		 * string does not end in one.
 		 */
 		switch (status) {
 		case FINGERPRINT_FILE:
 		case FINGERPRINT_VALID:
 		case FINGERPRINT_INDIRECT:
 			MAC_VERIEXEC_DBG(2,
 			    "attempted write to fingerprinted file for dev "
 			    "%ju, file %ju.%ju", (uintmax_t)va.va_fsid,
 			    (uintmax_t)va.va_fileid, (uintmax_t)va.va_gen);
 			return (EPERM);
 		default:
 			break;
 		}
 	}
 	if (accmode & VVERIFY) {
 		switch (status) {
 		case FINGERPRINT_FILE:
 		case FINGERPRINT_VALID:
 		case FINGERPRINT_INDIRECT:
 			/* Only EAUTH can still be pending at this point. */
 			if (error)
 				return (error);
 			break;
 		default:
 			/*
 			 * Caller wants open to fail unless there is a valid
 			 * fingerprint registered.
 			 */
 			MAC_VERIEXEC_DBG(2, "fingerprint status is %d for dev "
 			    "%ju, file %ju.%ju", status,
 			    (uintmax_t)va.va_fsid, (uintmax_t)va.va_fileid,
 			    (uintmax_t)va.va_gen);
 			return (EAUTH);
 		}
 	}
 	return (0);
 }
 
 /**
  * @brief Opening a file has been requested and may need to be validated.
  *
  * @param cred		credentials to use
  * @param vp		vnode of the file to open
  * @param label		vnode label assigned to the vnode
  * @param accmode	access mode to use for opening the file (read, write,
  * 			append, create, verify, etc.)
  *
  * @return 0 if opening the file should be allowed, otherwise an error code.
  */
 static int
 mac_veriexec_vnode_check_open(struct ucred *cred, struct vnode *vp,
 	struct label *label __unused, accmode_t accmode)
 {
 
 	/*
 	 * The file is only checked against the fingerprint data while
 	 * veriexec is enforcing; otherwise the open is always allowed.
 	 */
 	if ((mac_veriexec_state & VERIEXEC_STATE_ENFORCE) != 0)
 		return (mac_veriexec_check_vp(cred, vp, accmode));
 
 	return (0);
 }
 
 /**
  * @brief Decide whether a mode change on a vnode may proceed.
  *
  * Setting the SUID or SGID bit on a verified file is refused.  A file for
  * which the verification check fails for any reason is treated as not
  * verified and the mode change is allowed.
  *
  * @param cred		credentials to use
  * @param vp		vnode whose mode is being changed
  * @param label		vnode label assigned to the vnode (unused)
  * @param mode		mode flags to set
  *
  * @return 0 if the mode change should be allowed, EAUTH otherwise.
  */
 static int
 mac_veriexec_vnode_check_setmode(struct ucred *cred, struct vnode *vp,
     struct label *label __unused, mode_t mode)
 {
 	int rc;
 
 	/* Nothing is restricted until enforcement has been enabled. */
 	if ((mac_veriexec_state & VERIEXEC_STATE_ENFORCE) == 0)
 		return (0);
 
 	/*
 	 * Only a successful verification makes the file "verified"; EAUTH
 	 * (not verified) and every other failure leave the change allowed.
 	 */
 	rc = mac_veriexec_check_vp(cred, vp, VVERIFY);
 	if (rc != 0)
 		return (0);
 
 	/* Verified file: refuse set-[gu]id, permit anything else. */
 	return ((mode & (S_ISUID | S_ISGID)) != 0 ? EAUTH : 0);
 }
 
 /**
  * @internal
  * @brief Initialize the mac_veriexec MAC policy
  *
  * Installed as the policy's @c mpo_init hook.  Resets the subsystem state,
  * brings up the meta-data store and the fingerprint operations, and then
  * hooks the mount and unmount events.
  *
  * @param mpc		MAC policy configuration (unused)
  */
 static void
 mac_veriexec_init(struct mac_policy_conf *mpc __unused)
 {
 	/* Initialize state */
 	mac_veriexec_state = VERIEXEC_STATE_INACTIVE;
 
 	/* Initialize meta-data storage */
 	mac_veriexec_metadata_init();
 
 	/* Initialize fingerprint ops */
 	mac_veriexec_fingerprint_init();
 
 	/*
 	 * Register event handlers: the mount handler is registered at the
 	 * first priority and the unmount handler at the last priority.
 	 */
 	EVENTHANDLER_REGISTER(vfs_mounted, mac_veriexec_vfs_mounted, NULL,
 	    EVENTHANDLER_PRI_FIRST);
 	EVENTHANDLER_REGISTER(vfs_unmounted, mac_veriexec_vfs_unmounted, NULL,
 	    EVENTHANDLER_PRI_LAST);
 }
 
 /**
  * @internal
  * @brief MAC policy-specific syscall for mac_veriexec
  *
  * The following syscalls are implemented:
  *   - @c MAC_VERIEXEC_CHECK_FD_SYSCALL
  *        Check if the file referenced by a file descriptor has a fingerprint
  *        registered in the meta-data store.
  *   - @c MAC_VERIEXEC_CHECK_PATH_SYSCALL
  *        Look up a path and check the fingerprint status of the vnode it
  *        resolves to.
  *
  * Any other call number yields EOPNOTSUPP.
  *
  * @param td		calling thread
  * @param call		system call number
  * @param arg		arguments to the syscall: the descriptor number itself
  *			(cast from the pointer value) for the FD variant, a
  *			userspace path pointer for the PATH variant
  *
  * @return 0 on success, otherwise an error code.
  */
 static int
 mac_veriexec_syscall(struct thread *td, int call, void *arg)
 {
 	struct image_params img;
 	struct nameidata nd;
 	cap_rights_t rights;
 	struct vattr va;
 	struct file *fp;
 	int error;
 
 	switch (call) {
 	case MAC_VERIEXEC_CHECK_FD_SYSCALL:
 		/* Get the vnode associated with the file descriptor passed */
-		error = getvnode(td, (uintptr_t) arg, cap_rights_init(&rights,
-		    CAP_READ), &fp);
+		error = getvnode(td, (uintptr_t) arg,
+		    cap_rights_init_one(&rights, CAP_READ), &fp);
 		if (error)
 			return (error);
 		/* From here on fp holds a reference; leave via cleanup_file. */
 		if (fp->f_type != DTYPE_VNODE) {
 			MAC_VERIEXEC_DBG(3, "MAC_VERIEXEC_CHECK_SYSCALL: "
 			    "file is not vnode type (type=0x%x)",
 			    fp->f_type);
 			error = EINVAL;
 			goto cleanup_file;
 		}
 
 		/*
 		 * setup the bits of image_params that are used by
 		 * mac_veriexec_check_fingerprint().
 		 */
 		bzero(&img, sizeof(img));
 		img.proc = td->td_proc;
 		img.vp = fp->f_vnode;
 		img.attr = &va;
 
 		/*
 		 * Get vnode attributes
 		 * (need to obtain a lock on the vnode first)
 		 */
 		vn_lock(img.vp, LK_EXCLUSIVE | LK_RETRY);
 		error = VOP_GETATTR(fp->f_vnode, &va,  td->td_ucred);
 		if (error)
 			goto check_done;
 		       
 		/*
 		 * The second argument (logged as check_files) is non-zero
 		 * exactly when the file has no execute permission bit set.
 		 */
 		MAC_VERIEXEC_DBG(2, "mac_veriexec_fingerprint_check_image: "
 		    "va_mode=%o, check_files=%d\n", va.va_mode,
 		    ((va.va_mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0));
 		error = mac_veriexec_fingerprint_check_image(&img,
 		    ((va.va_mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0), td);
 check_done:
 		/* Release the lock we obtained earlier */
 		VOP_UNLOCK(img.vp);
 cleanup_file:
 		/* Drop the file reference taken by getvnode() */
 		fdrop(fp, td);
 		break;
 	case MAC_VERIEXEC_CHECK_PATH_SYSCALL:
 		/* Look up the path to get the vnode */
 		NDINIT(&nd, LOOKUP,
 		    FOLLOW | LOCKLEAF | LOCKSHARED | AUDITVNODE1,
 		    UIO_USERSPACE, arg, td);
 		error = namei(&nd);
 		if (error != 0)
 			break;
 		/* Only the path name buffer is freed; nd.ni_vp is still used */
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 
 		/* Check the fingerprint status of the vnode */
 		error = mac_veriexec_check_vp(td->td_ucred, nd.ni_vp, VVERIFY);
 		/* Done with the vnode returned by the lookup */
 		vput(nd.ni_vp);
 		break;
 	default:
 		error = EOPNOTSUPP;
 	}
 	return (error);
 }
 
 /*
  * MAC framework entry points implemented by this policy.  Any hook that is
  * not named in this designated initializer is left zero-initialized.
  */
 static struct mac_policy_ops mac_veriexec_ops =
 {
 	.mpo_init = mac_veriexec_init,
 	.mpo_kld_check_load = mac_veriexec_kld_check_load,
 	.mpo_mount_destroy_label = mac_veriexec_mount_destroy_label,
 	.mpo_mount_init_label = mac_veriexec_mount_init_label,
 	.mpo_priv_check = mac_veriexec_priv_check,
 	.mpo_proc_check_debug = mac_veriexec_proc_check_debug,
 	.mpo_syscall = mac_veriexec_syscall,
 	.mpo_system_check_sysctl = mac_veriexec_sysctl_check,
 	.mpo_vnode_check_exec = mac_veriexec_vnode_check_exec,
 	.mpo_vnode_check_open = mac_veriexec_vnode_check_open,
 	.mpo_vnode_check_setmode = mac_veriexec_vnode_check_setmode,
 	.mpo_vnode_copy_label = mac_veriexec_copy_label,
 	.mpo_vnode_destroy_label = mac_veriexec_vnode_destroy_label,
 	.mpo_vnode_init_label = mac_veriexec_vnode_init_label,
 };
 
 /*
  * Declare the policy, handing over the ops table above and the variable
  * that receives the policy's label slot.
  * NOTE(review): MPC_LOADTIME_FLAG_NOTLATE presumably forbids loading the
  * policy after boot -- confirm against the MAC framework documentation.
  */
 MAC_POLICY_SET(&mac_veriexec_ops, mac_veriexec, MAC_VERIEXEC_FULLNAME,
     MPC_LOADTIME_FLAG_NOTLATE, &mac_veriexec_slot);
 MODULE_VERSION(mac_veriexec, 1);
 
 /*
  * Return the vnode directly underneath vp when vp lives on a nullfs mount,
  * or NULL when vp has no mount or is on any other filesystem type.
  */
 static struct vnode *
 mac_veriexec_bottom_vnode(struct vnode *vp)
 {
 
 	/*
 	 * XXX This code is bogus. nullfs is not the only stacking
 	 * filesystem. Less bogus code would add a VOP to reach bottom
 	 * vnode and would not make assumptions how to get there.
 	 */
 	if (vp->v_mount == NULL)
 		return (NULL);
 	if (strcmp(vp->v_mount->mnt_vfc->vfc_name, "nullfs") != 0)
 		return (NULL);
 	return (NULLVPTOLOWERVP(vp));
 }
 
 /**
  * @brief Get the fingerprint status set on a vnode.
  *
  * If the vnode itself does not carry one of the "has a fingerprint"
  * statuses, the vnodes stacked underneath it are consulted in turn.  When
  * no lower vnode is left, the status of the last vnode examined is returned.
  *
  * @param vp		vnode to obtain fingerprint status from
  *
  * @return Fingerprint status assigned to the vnode.
  */
 fingerprint_status_t
 mac_veriexec_get_fingerprint_status(struct vnode *vp)
 {
 	fingerprint_status_t status;
 	struct vnode *lower;
 
 	for (;;) {
 		status = SLOT(vp->v_label);
 		if (status == FINGERPRINT_VALID ||
 		    status == FINGERPRINT_INDIRECT ||
 		    status == FINGERPRINT_FILE)
 			return (status);
 
 		/* Nothing conclusive here; descend one layer if we can. */
 		lower = mac_veriexec_bottom_vnode(vp);
 		if (lower == NULL)
 			return (status);
 		vp = lower;
 	}
 }
 
 /**
  * @brief Report the verified execution subsystem state.
  *
  * @return The complete set of state flags currently in effect.
  */
 int
 mac_veriexec_get_state(void)
 {
 	int current;
 
 	current = mac_veriexec_state;
 	return (current);
 }
 
 /**
  * @brief Test the verified execution subsystem state against a flag mask.
  *
  * @param state		mask of flags to check
  *
  * @return The subset of @p state that is currently set (0 if none is).
  */
 int
 mac_veriexec_in_state(int state)
 {
 	int matched;
 
 	matched = mac_veriexec_state & state;
 	return (matched);
 }
 
 /**
  * @brief Set the fingerprint status for a vnode
  *
  * Fingerprint status is stored in the MAC per-policy slot assigned to
  * mac_veriexec.  For a stacked vnode the status is recorded on the lowest
  * vnode of the stack, not on @p vp itself.
  *
  * @param vp		vnode to store the fingerprint status on
  * @param fp_status	fingerprint status to store
  */
 void
 mac_veriexec_set_fingerprint_status(struct vnode *vp,
     fingerprint_status_t fp_status)
 {
 	struct vnode *lower;
 
 	/* Walk down until there is no lower vnode: that is the real storage. */
 	while ((lower = mac_veriexec_bottom_vnode(vp)) != NULL)
 		vp = lower;
 
 	SLOT_SET(vp->v_label, fp_status);
 }
 
 /**
  * @brief Add flags to the verified execution subsystem state
  *
  * @note The new flags are OR-ed in; this routine never clears a flag.
  *
  * @param state		state flags to add to the current state
  */
 void
 mac_veriexec_set_state(int state)
 {
 
 	mac_veriexec_state = mac_veriexec_state | state;
 }
 
 /**
  * @brief Determine if the process is trusted
  *
  * @param cred		credentials to use
  * @param p		the process in question
  *
  * @return 1 if the process is trusted, otherwise 0.
  */
 int
 mac_veriexec_proc_is_trusted(struct ucred *cred, struct proc *p)
 {
 	int already_locked, error, flags;
 
 	/* Make sure we lock the process if we do not already have the lock */
 	already_locked = PROC_LOCKED(p);
 	if (!already_locked)
 		PROC_LOCK(p);
 
 	error = mac_veriexec_metadata_get_executable_flags(cred, p, &flags, 0);
 
 	/* Unlock the process if we locked it previously */
 	if (!already_locked)
 		PROC_UNLOCK(p);
 
 	/* Any errors, deny access */
 	if (error != 0)
 		return (0);
 
 	/* Check that the trusted flag is set */
 	return ((flags & VERIEXEC_TRUSTED) == VERIEXEC_TRUSTED);
 }
diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c
index be93dd16cd0a..3b652ea14303 100644
--- a/sys/ufs/ffs/ffs_alloc.c
+++ b/sys/ufs/ffs/ffs_alloc.c
@@ -1,3517 +1,3517 @@
 /*-
  * SPDX-License-Identifier: (BSD-2-Clause-FreeBSD AND BSD-3-Clause)
  *
  * Copyright (c) 2002 Networks Associates Technology, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Marshall
  * Kirk McKusick and Network Associates Laboratories, the Security
  * Research Division of Network Associates, Inc. under DARPA/SPAWAR
  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
  * research program
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ffs_alloc.c	8.18 (Berkeley) 5/26/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_quota.h"
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/gsb_crc32.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/kernel.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/taskqueue.h>
 
 #include <security/audit/audit.h>
 
 #include <geom/geom.h>
 #include <geom/geom_vfs.h>
 
 #include <ufs/ufs/dir.h>
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/ufs_extern.h>
 #include <ufs/ufs/ufsmount.h>
 
 #include <ufs/ffs/fs.h>
 #include <ufs/ffs/ffs_extern.h>
 #include <ufs/ffs/softdep.h>
 
 /*
  * Signature of the per-cylinder-group allocation routines that are handed
  * to ffs_hashalloc() as its last argument.
  */
 typedef ufs2_daddr_t allocfcn_t(struct inode *ip, u_int cg, ufs2_daddr_t bpref,
 				  int size, int rsize);
 
 /* Forward declarations of the file-local helpers. */
 static ufs2_daddr_t ffs_alloccg(struct inode *, u_int, ufs2_daddr_t, int, int);
 static ufs2_daddr_t
 	      ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int);
 static void	ffs_blkfree_cg(struct ufsmount *, struct fs *,
 		    struct vnode *, ufs2_daddr_t, long, ino_t,
 		    struct workhead *);
 #ifdef INVARIANTS
 static int	ffs_checkblk(struct inode *, ufs2_daddr_t, long);
 #endif
 static ufs2_daddr_t ffs_clusteralloc(struct inode *, u_int, ufs2_daddr_t, int);
 static ino_t	ffs_dirpref(struct inode *);
 static ufs2_daddr_t ffs_fragextend(struct inode *, u_int, ufs2_daddr_t,
 		    int, int);
 static ufs2_daddr_t	ffs_hashalloc
 		(struct inode *, u_int, ufs2_daddr_t, int, int, allocfcn_t *);
 static ufs2_daddr_t ffs_nodealloccg(struct inode *, u_int, ufs2_daddr_t, int,
 		    int);
 static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int);
 static int	ffs_reallocblks_ufs1(struct vop_reallocblks_args *);
 static int	ffs_reallocblks_ufs2(struct vop_reallocblks_args *);
 static void	ffs_ckhash_cg(struct buf *);
 
 /*
  * Allocate a block in the filesystem.
  *
  * The size of the requested block is given, which must be some
  * multiple of fs_fsize and <= fs_bsize.
  * A preference may be optionally specified. If a preference is given
  * the following hierarchy is used to allocate a block:
  *   1) allocate the requested block.
  *   2) allocate a rotationally optimal block in the same cylinder.
  *   3) allocate a block in the same cylinder group.
  *   4) quadratically rehash into other cylinder groups, until an
  *      available block is located.
  * If no block preference is given the following hierarchy is used
  * to allocate a block:
  *   1) allocate a block in the cylinder group that contains the
  *      inode for the file.
  *   2) quadratically rehash into other cylinder groups, until an
  *      available block is located.
  *
  * Arguments:
  *   ip     inode the block is allocated for
  *   lbn    logical block number (not referenced by this routine)
  *   bpref  preferred block, or 0 for none; values >= fs_size are ignored
  *   size   number of bytes to allocate
  *   flags  IO_EXT and IO_BUFLOCKED are examined here
  *   cred   credential used for the quota and reserve checks
  *   bnp    out: allocated block number; set to 0 unless allocation succeeds
  *
  * The UFS mutex must be held on entry.  Every failure return visible here
  * is made with the mutex released.
  * NOTE(review): the success return has no explicit UFS_UNLOCK in this
  * routine; presumably the allocator called via ffs_hashalloc() drops it --
  * confirm.
  *
  * Returns 0 on success, the chkdq() error (QUOTA kernels), ENXIO when
  * ffs_fsfail_cleanup_locked() reports a failure, or ENOSPC.
  */
 int
 ffs_alloc(ip, lbn, bpref, size, flags, cred, bnp)
 	struct inode *ip;
 	ufs2_daddr_t lbn, bpref;
 	int size, flags;
 	struct ucred *cred;
 	ufs2_daddr_t *bnp;
 {
 	struct fs *fs;
 	struct ufsmount *ump;
 	ufs2_daddr_t bno;
 	u_int cg, reclaimed;
 	int64_t delta;
 #ifdef QUOTA
 	int error;
 #endif
 
 	*bnp = 0;
 	ump = ITOUMP(ip);
 	fs = ump->um_fs;
 	mtx_assert(UFS_MTX(ump), MA_OWNED);
 #ifdef INVARIANTS
 	if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
 		printf("dev = %s, bsize = %ld, size = %d, fs = %s\n",
 		    devtoname(ump->um_dev), (long)fs->fs_bsize, size,
 		    fs->fs_fsmnt);
 		panic("ffs_alloc: bad size");
 	}
 	if (cred == NOCRED)
 		panic("ffs_alloc: missing credential");
 #endif /* INVARIANTS */
 	/* Set once the single cleanup-and-retry pass has been requested. */
 	reclaimed = 0;
 retry:
 #ifdef QUOTA
 	/* The quota is charged with the UFS mutex dropped. */
 	UFS_UNLOCK(ump);
 	error = chkdq(ip, btodb(size), cred, 0);
 	if (error)
 		return (error);
 	UFS_LOCK(ump);
 #endif
 	/* A full-block request cannot succeed when no whole block is free. */
 	if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
 		goto nospace;
 	/*
 	 * When the PRIV_VFS_BLOCKRESERVE check fails (non-zero), the
 	 * request may not eat into the fs_minfree reserve.
 	 */
 	if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE) &&
 	    freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0)
 		goto nospace;
 	/* An out-of-range preference is treated as no preference. */
 	if (bpref >= fs->fs_size)
 		bpref = 0;
 	/* Start in the inode's cylinder group, or in the preferred one. */
 	if (bpref == 0)
 		cg = ino_to_cg(fs, ip->i_number);
 	else
 		cg = dtog(fs, bpref);
 	bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg);
 	if (bno > 0) {
 		/* Account for the new space in the inode's block count. */
 		delta = btodb(size);
 		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
 		/* With IO_EXT only IN_CHANGE is set, not IN_UPDATE. */
 		if (flags & IO_EXT)
 			UFS_INODE_SET_FLAG(ip, IN_CHANGE);
 		else
 			UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
 		*bnp = bno;
 		return (0);
 	}
 nospace:
 #ifdef QUOTA
 	UFS_UNLOCK(ump);
 	/*
 	 * Restore user's disk quota because allocation failed.
 	 */
 	(void) chkdq(ip, -btodb(size), cred, FORCE);
 	UFS_LOCK(ump);
 #endif
 	/*
 	 * On the first failure, unless IO_BUFLOCKED is set, ask soft
 	 * updates to clean up and try the whole allocation once more.
 	 */
 	if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) {
 		reclaimed = 1;
 		softdep_request_cleanup(fs, ITOV(ip), cred, FLUSH_BLOCKS_WAIT);
 		goto retry;
 	}
 	if (ffs_fsfail_cleanup_locked(ump, 0)) {
 		UFS_UNLOCK(ump);
 		return (ENXIO);
 	}
 	/*
 	 * The "filesystem full" report is only issued after a retry and is
 	 * rate limited through ppsratecheck().
 	 */
 	if (reclaimed > 0 &&
 	    ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) {
 		UFS_UNLOCK(ump);
 		ffs_fserr(fs, ip->i_number, "filesystem full");
 		uprintf("\n%s: write failed, filesystem is full\n",
 		    fs->fs_fsmnt);
 	} else {
 		UFS_UNLOCK(ump);
 	}
 	return (ENOSPC);
 }
 
 /*
  * Reallocate a fragment to a bigger size
  *
  * The number and size of the old block is given, and a preference
  * and new size is also specified. The allocator attempts to extend
  * the original block. Failing that, the regular block allocator is
  * invoked to get an appropriate block.
  *
  * Arguments:
  *   ip      inode owning the fragment
  *   lbprev  logical block number of the existing fragment
  *   bprev   current block number of the fragment; must not be 0
  *   bpref   preferred new location; values >= fs_size are ignored
  *   osize   current size in bytes
  *   nsize   requested size in bytes
  *   flags   BA_UNMAPPED, IO_EXT and IO_BUFLOCKED are examined here
  *   cred    credential used for the reserve and quota checks
  *   bpp     out: buffer for the grown fragment on success
  *
  * The UFS mutex must be held on entry.  Every failure return visible here
  * is made with the mutex released.
  * NOTE(review): the two success returns contain no explicit UFS_UNLOCK;
  * presumably ffs_fragextend() / the allocator called via ffs_hashalloc()
  * drop the mutex on success -- confirm.
  *
  * Returns 0 on success, the bread_gb() or chkdq() error, ENXIO when
  * ffs_fsfail_cleanup_locked() reports a failure, or ENOSPC.
  */
 int
 ffs_realloccg(ip, lbprev, bprev, bpref, osize, nsize, flags, cred, bpp)
 	struct inode *ip;
 	ufs2_daddr_t lbprev;
 	ufs2_daddr_t bprev;
 	ufs2_daddr_t bpref;
 	int osize, nsize, flags;
 	struct ucred *cred;
 	struct buf **bpp;
 {
 	struct vnode *vp;
 	struct fs *fs;
 	struct buf *bp;
 	struct ufsmount *ump;
 	u_int cg, request, reclaimed;
 	int error, gbflags;
 	ufs2_daddr_t bno;
 	int64_t delta;
 
 	vp = ITOV(ip);
 	ump = ITOUMP(ip);
 	fs = ump->um_fs;
 	bp = NULL;
 	/* BA_UNMAPPED from the caller becomes GB_UNMAPPED for bread_gb(). */
 	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
 
 	mtx_assert(UFS_MTX(ump), MA_OWNED);
 #ifdef INVARIANTS
 	if (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED)
 		panic("ffs_realloccg: allocation on suspended filesystem");
 	if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
 	    (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
 		printf(
 		"dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n",
 		    devtoname(ump->um_dev), (long)fs->fs_bsize, osize,
 		    nsize, fs->fs_fsmnt);
 		panic("ffs_realloccg: bad size");
 	}
 	if (cred == NOCRED)
 		panic("ffs_realloccg: missing credential");
 #endif /* INVARIANTS */
 	/* Set once the single cleanup-and-retry pass has been requested. */
 	reclaimed = 0;
 retry:
 	/*
 	 * When the PRIV_VFS_BLOCKRESERVE check fails (non-zero), the extra
 	 * nsize - osize bytes may not eat into the fs_minfree reserve.
 	 */
 	if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE) &&
 	    freespace(fs, fs->fs_minfree) -  numfrags(fs, nsize - osize) < 0) {
 		goto nospace;
 	}
 	/* This sanity check is unconditional, not limited to INVARIANTS. */
 	if (bprev == 0) {
 		printf("dev = %s, bsize = %ld, bprev = %jd, fs = %s\n",
 		    devtoname(ump->um_dev), (long)fs->fs_bsize, (intmax_t)bprev,
 		    fs->fs_fsmnt);
 		panic("ffs_realloccg: bad bprev");
 	}
 	UFS_UNLOCK(ump);
 	/*
 	 * Allocate the extra space in the buffer.
 	 */
 	error = bread_gb(vp, lbprev, osize, NOCRED, gbflags, &bp);
 	if (error) {
 		return (error);
 	}
 
 	/*
 	 * NOTE(review): b_blkno == b_lblkno looks like the marker for a
 	 * buffer whose physical address has not been filled in yet; it is
 	 * then set from bprev -- confirm.
 	 */
 	if (bp->b_blkno == bp->b_lblkno) {
 		if (lbprev >= UFS_NDADDR)
 			panic("ffs_realloccg: lbprev out of range");
 		bp->b_blkno = fsbtodb(fs, bprev);
 	}
 
 #ifdef QUOTA
 	/* Charge only the growth; the buffer is released on refusal. */
 	error = chkdq(ip, btodb(nsize - osize), cred, 0);
 	if (error) {
 		brelse(bp);
 		return (error);
 	}
 #endif
 	/*
 	 * Check for extension in the existing location.
 	 */
 	*bpp = NULL;
 	cg = dtog(fs, bprev);
 	UFS_LOCK(ump);
 	bno = ffs_fragextend(ip, cg, bprev, osize, nsize);
 	if (bno) {
 		if (bp->b_blkno != fsbtodb(fs, bno))
 			panic("ffs_realloccg: bad blockno");
 		delta = btodb(nsize - osize);
 		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
 		/* With IO_EXT only IN_CHANGE is set, not IN_UPDATE. */
 		if (flags & IO_EXT)
 			UFS_INODE_SET_FLAG(ip, IN_CHANGE);
 		else
 			UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
 		/* Grow the buffer and zero the newly added tail. */
 		allocbuf(bp, nsize);
 		bp->b_flags |= B_DONE;
 		vfs_bio_bzero_buf(bp, osize, nsize - osize);
 		if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO)
 			vfs_bio_set_valid(bp, osize, nsize - osize);
 		*bpp = bp;
 		return (0);
 	}
 	/*
 	 * Allocate a new disk location.
 	 */
 	if (bpref >= fs->fs_size)
 		bpref = 0;
 	/*
 	 * Choose how much to ask for according to the current optimization
 	 * policy, possibly switching the policy for later calls.
 	 */
 	switch ((int)fs->fs_optim) {
 	case FS_OPTSPACE:
 		/*
 		 * Allocate an exact sized fragment. Although this makes
 		 * best use of space, we will waste time relocating it if
 		 * the file continues to grow. If the fragmentation is
 		 * less than half of the minimum free reserve, we choose
 		 * to begin optimizing for time.
 		 */
 		request = nsize;
 		if (fs->fs_minfree <= 5 ||
 		    fs->fs_cstotal.cs_nffree >
 		    (off_t)fs->fs_dsize * fs->fs_minfree / (2 * 100))
 			break;
 		log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n",
 			fs->fs_fsmnt);
 		fs->fs_optim = FS_OPTTIME;
 		break;
 	case FS_OPTTIME:
 		/*
 		 * At this point we have discovered a file that is trying to
 		 * grow a small fragment to a larger fragment. To save time,
 		 * we allocate a full sized block, then free the unused portion.
 		 * If the file continues to grow, the `ffs_fragextend' call
 		 * above will be able to grow it in place without further
 		 * copying. If aberrant programs cause disk fragmentation to
 		 * grow within 2% of the free reserve, we choose to begin
 		 * optimizing for space.
 		 */
 		request = fs->fs_bsize;
 		if (fs->fs_cstotal.cs_nffree <
 		    (off_t)fs->fs_dsize * (fs->fs_minfree - 2) / 100)
 			break;
 		log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n",
 			fs->fs_fsmnt);
 		fs->fs_optim = FS_OPTSPACE;
 		break;
 	default:
 		printf("dev = %s, optim = %ld, fs = %s\n",
 		    devtoname(ump->um_dev), (long)fs->fs_optim, fs->fs_fsmnt);
 		panic("ffs_realloccg: bad optim");
 		/* NOTREACHED */
 	}
 	/* Ask for `request' bytes of which nsize are actually needed. */
 	bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg);
 	if (bno > 0) {
 		/* Point the buffer at the new location. */
 		bp->b_blkno = fsbtodb(fs, bno);
 		if (!DOINGSOFTDEP(vp))
 			/*
 			 * The usual case is that a smaller fragment that
 			 * was just allocated has been replaced with a bigger
 			 * fragment or a full-size block. If it is marked as
 			 * B_DELWRI, the current contents have not been written
 			 * to disk. It is possible that the block was written
 			 * earlier, but very uncommon. If the block has never
 			 * been written, there is no need to send a BIO_DELETE
 			 * for it when it is freed. The gain from avoiding the
 			 * TRIMs for the common case of unwritten blocks far
 			 * exceeds the cost of the write amplification for the
 			 * uncommon case of failing to send a TRIM for a block
 			 * that had been written.
 			 */
 			ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize,
 			    ip->i_number, vp->v_type, NULL,
 			    (bp->b_flags & B_DELWRI) != 0 ?
 			    NOTRIM_KEY : SINGLETON_KEY);
 		delta = btodb(nsize - osize);
 		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
 		if (flags & IO_EXT)
 			UFS_INODE_SET_FLAG(ip, IN_CHANGE);
 		else
 			UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
 		/* Grow the buffer and zero the newly added tail. */
 		allocbuf(bp, nsize);
 		bp->b_flags |= B_DONE;
 		vfs_bio_bzero_buf(bp, osize, nsize - osize);
 		if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO)
 			vfs_bio_set_valid(bp, osize, nsize - osize);
 		*bpp = bp;
 		return (0);
 	}
 #ifdef QUOTA
 	UFS_UNLOCK(ump);
 	/*
 	 * Restore user's disk quota because allocation failed.
 	 */
 	(void) chkdq(ip, -btodb(nsize - osize), cred, FORCE);
 	UFS_LOCK(ump);
 #endif
 	/*
 	 * The quota refund above sits before the label on purpose: the
 	 * early jump from the reserve check happens before any charge.
 	 */
 nospace:
 	/*
 	 * no space available
 	 */
 	if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) {
 		reclaimed = 1;
 		/* Release the buffer with the UFS mutex dropped. */
 		UFS_UNLOCK(ump);
 		if (bp) {
 			brelse(bp);
 			bp = NULL;
 		}
 		UFS_LOCK(ump);
 		softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT);
 		goto retry;
 	}
 	if (bp)
 		brelse(bp);
 	if (ffs_fsfail_cleanup_locked(ump, 0)) {
 		UFS_UNLOCK(ump);
 		return (ENXIO);
 	}
 	/*
 	 * The "filesystem full" report is only issued after a retry and is
 	 * rate limited through ppsratecheck().
 	 */
 	if (reclaimed > 0 &&
 	    ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) {
 		UFS_UNLOCK(ump);
 		ffs_fserr(fs, ip->i_number, "filesystem full");
 		uprintf("\n%s: write failed, filesystem is full\n",
 		    fs->fs_fsmnt);
 	} else {
 		UFS_UNLOCK(ump);
 	}
 	return (ENOSPC);
 }
 
 /*
  * Reallocate a sequence of blocks into a contiguous sequence of blocks.
  *
  * The vnode and an array of buffer pointers for a range of sequential
  * logical blocks to be made contiguous is given. The allocator attempts
  * to find a range of sequential blocks starting as close as possible
  * from the end of the allocation for the logical block immediately
  * preceding the current range. If successful, the physical block numbers
  * in the buffer pointers and in the inode are changed to reflect the new
  * allocation. If unsuccessful, the allocation is left unchanged. The
  * success in doing the reallocation is returned. Note that the error
  * return is not reflected back to the user. Rather the previous block
  * allocation will be used.
  */
 
 /* Parent node for the vfs.ffs.* tunables declared below. */
 SYSCTL_NODE(_vfs, OID_AUTO, ffs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "FFS filesystem");
 
 /*
  * When non-zero, block maps modified by a reallocation are written with
  * bdwrite(); when zero they are written with bwrite().
  */
 static int doasyncfree = 1;
 SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0,
 "do not force synchronous writes when blocks are reallocated");
 
 /* When zero, ffs_reallocblks() fails immediately with ENOSPC. */
 static int doreallocblks = 1;
 SYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0,
 "enable block reallocation");
 
 /*
  * When zero, ffs_reallocblks() also refuses to run on mounts that have
  * UM_CANDELETE set.
  */
 static int dotrimcons = 1;
 SYSCTL_INT(_vfs_ffs, OID_AUTO, dotrimcons, CTLFLAG_RWTUN, &dotrimcons, 0,
 "enable BIO_DELETE / TRIM consolidation");
 
 /* Upper bound on the cylinder groups tried per cluster reallocation. */
 static int maxclustersearch = 10;
 SYSCTL_INT(_vfs_ffs, OID_AUTO, maxclustersearch, CTLFLAG_RW, &maxclustersearch,
 0, "max number of cylinder group to search for contigous blocks");
 
 #ifdef DIAGNOSTIC
 /* When non-zero, block reallocations are reported with printf(). */
 static int prtrealloc = 0;
 SYSCTL_INT(_debug, OID_AUTO, ffs_prtrealloc, CTLFLAG_RW, &prtrealloc, 0,
 	"print out FFS filesystem block reallocation operations");
 #endif
 
 int
 ffs_reallocblks(ap)
 	struct vop_reallocblks_args /* {
 		struct vnode *a_vp;
 		struct cluster_save *a_buflist;
 	} */ *ap;
 {
 	struct ufsmount *ump;
 
 	/*
 	 * We used to skip reallocating the blocks of a file into a
 	 * contiguous sequence if the underlying flash device requested
 	 * BIO_DELETE notifications, because devices that benefit from
 	 * BIO_DELETE also benefit from not moving the data. However,
 	 * the destination for the data is usually moved before the data
 	 * is written to the initially allocated location, so we rarely
 	 * suffer the penalty of extra writes. With the addition of the
 	 * consolidation of contiguous blocks into single BIO_DELETE
 	 * operations, having fewer but larger contiguous blocks reduces
 	 * the number of (slow and expensive) BIO_DELETE operations. So
 	 * when doing BIO_DELETE consolidation, we do block reallocation.
 	 *
 	 * Skip if reallocblks has been disabled globally.
 	 */
 	ump = ap->a_vp->v_mount->mnt_data;
 	if ((((ump->um_flags) & UM_CANDELETE) != 0 && dotrimcons == 0) ||
 	    doreallocblks == 0)
 		return (ENOSPC);
 
 	/*
 	 * We can't wait in softdep prealloc as it may fsync and recurse
 	 * here.  Instead we simply fail to reallocate blocks if this
 	 * rare condition arises.
 	 */
 	if (DOINGSOFTDEP(ap->a_vp))
 		if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0)
 			return (ENOSPC);
 	if (ump->um_fstype == UFS1)
 		return (ffs_reallocblks_ufs1(ap));
 	return (ffs_reallocblks_ufs2(ap));
 }
 
 static int
 ffs_reallocblks_ufs1(ap)
 	struct vop_reallocblks_args /* {
 		struct vnode *a_vp;
 		struct cluster_save *a_buflist;
 	} */ *ap;
 {
 	struct fs *fs;
 	struct inode *ip;
 	struct vnode *vp;
 	struct buf *sbp, *ebp, *bp;
 	ufs1_daddr_t *bap, *sbap, *ebap;
 	struct cluster_save *buflist;
 	struct ufsmount *ump;
 	ufs_lbn_t start_lbn, end_lbn;
 	ufs1_daddr_t soff, newblk, blkno;
 	ufs2_daddr_t pref;
 	struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp;
 	int i, cg, len, start_lvl, end_lvl, ssize;
 
 	vp = ap->a_vp;
 	ip = VTOI(vp);
 	ump = ITOUMP(ip);
 	fs = ump->um_fs;
 	/*
 	 * If we are not tracking block clusters or if we have less than 4%
 	 * free blocks left, then do not attempt to cluster. Running with
 	 * less than 5% free block reserve is not recommended and those that
 	 * choose to do so do not expect to have good file layout.
 	 */
 	if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0)
 		return (ENOSPC);
 	buflist = ap->a_buflist;
 	len = buflist->bs_nchildren;
 	start_lbn = buflist->bs_children[0]->b_lblkno;
 	end_lbn = start_lbn + len - 1;
 #ifdef INVARIANTS
 	for (i = 0; i < len; i++)
 		if (!ffs_checkblk(ip,
 		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
 			panic("ffs_reallocblks: unallocated block 1");
 	for (i = 1; i < len; i++)
 		if (buflist->bs_children[i]->b_lblkno != start_lbn + i)
 			panic("ffs_reallocblks: non-logical cluster");
 	blkno = buflist->bs_children[0]->b_blkno;
 	ssize = fsbtodb(fs, fs->fs_frag);
 	for (i = 1; i < len - 1; i++)
 		if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize))
 			panic("ffs_reallocblks: non-physical cluster %d", i);
 #endif
 	/*
 	 * If the cluster crosses the boundary for the first indirect
 	 * block, leave space for the indirect block. Indirect blocks
 	 * are initially laid out in a position after the last direct
 	 * block. Block reallocation would usually destroy locality by
 	 * moving the indirect block out of the way to make room for
 	 * data blocks if we didn't compensate here. We should also do
 	 * this for other indirect block boundaries, but it is only
 	 * important for the first one.
 	 */
 	if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR)
 		return (ENOSPC);
 	/*
 	 * If the latest allocation is in a new cylinder group, assume that
 	 * the filesystem has decided to move and do not force it back to
 	 * the previous cylinder group.
 	 */
 	if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) !=
 	    dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno)))
 		return (ENOSPC);
 	if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) ||
 	    ufs_getlbns(vp, end_lbn, end_ap, &end_lvl))
 		return (ENOSPC);
 	/*
 	 * Get the starting offset and block map for the first block.
 	 */
 	if (start_lvl == 0) {
 		sbap = &ip->i_din1->di_db[0];
 		soff = start_lbn;
 	} else {
 		idp = &start_ap[start_lvl - 1];
 		if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) {
 			brelse(sbp);
 			return (ENOSPC);
 		}
 		sbap = (ufs1_daddr_t *)sbp->b_data;
 		soff = idp->in_off;
 	}
 	/*
 	 * If the block range spans two block maps, get the second map.
 	 */
 	ebap = NULL;
 	if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) {
 		ssize = len;
 	} else {
 #ifdef INVARIANTS
 		if (start_lvl > 0 &&
 		    start_ap[start_lvl - 1].in_lbn == idp->in_lbn)
 			panic("ffs_reallocblk: start == end");
 #endif
 		ssize = len - (idp->in_off + 1);
 		if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp))
 			goto fail;
 		ebap = (ufs1_daddr_t *)ebp->b_data;
 	}
 	/*
 	 * Find the preferred location for the cluster. If we have not
 	 * previously failed at this endeavor, then follow our standard
 	 * preference calculation. If we have failed at it, then pick up
 	 * where we last ended our search.
 	 */
 	UFS_LOCK(ump);
 	if (ip->i_nextclustercg == -1)
 		pref = ffs_blkpref_ufs1(ip, start_lbn, soff, sbap);
 	else
 		pref = cgdata(fs, ip->i_nextclustercg);
 	/*
 	 * Search the block map looking for an allocation of the desired size.
 	 * To avoid wasting too much time, we limit the number of cylinder
 	 * groups that we will search.
 	 */
 	cg = dtog(fs, pref);
 	for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) {
 		if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0)
 			break;
 		cg += 1;
 		if (cg >= fs->fs_ncg)
 			cg = 0;
 	}
 	/*
 	 * If we have failed in our search, record where we gave up for
 	 * next time. Otherwise, fall back to our usual search criterion.
 	 */
 	if (newblk == 0) {
 		ip->i_nextclustercg = cg;
 		UFS_UNLOCK(ump);
 		goto fail;
 	}
 	ip->i_nextclustercg = -1;
 	/*
 	 * We have found a new contiguous block.
 	 *
 	 * First we have to replace the old block pointers with the new
 	 * block pointers in the inode and indirect blocks associated
 	 * with the file.
 	 */
 #ifdef DIAGNOSTIC
 	if (prtrealloc)
 		printf("realloc: ino %ju, lbns %jd-%jd\n\told:",
 		    (uintmax_t)ip->i_number,
 		    (intmax_t)start_lbn, (intmax_t)end_lbn);
 #endif
 	blkno = newblk;
 	for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) {
 		if (i == ssize) {
 			bap = ebap;
 			soff = -i;
 		}
 #ifdef INVARIANTS
 		if (!ffs_checkblk(ip,
 		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
 			panic("ffs_reallocblks: unallocated block 2");
 		if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap)
 			panic("ffs_reallocblks: alloc mismatch");
 #endif
 #ifdef DIAGNOSTIC
 		if (prtrealloc)
 			printf(" %d,", *bap);
 #endif
 		if (DOINGSOFTDEP(vp)) {
 			if (sbap == &ip->i_din1->di_db[0] && i < ssize)
 				softdep_setup_allocdirect(ip, start_lbn + i,
 				    blkno, *bap, fs->fs_bsize, fs->fs_bsize,
 				    buflist->bs_children[i]);
 			else
 				softdep_setup_allocindir_page(ip, start_lbn + i,
 				    i < ssize ? sbp : ebp, soff + i, blkno,
 				    *bap, buflist->bs_children[i]);
 		}
 		*bap++ = blkno;
 	}
 	/*
 	 * Next we must write out the modified inode and indirect blocks.
 	 * For strict correctness, the writes should be synchronous since
 	 * the old block values may have been written to disk. In practise
 	 * they are almost never written, but if we are concerned about
 	 * strict correctness, the `doasyncfree' flag should be set to zero.
 	 *
 	 * The test on `doasyncfree' should be changed to test a flag
 	 * that shows whether the associated buffers and inodes have
 	 * been written. The flag should be set when the cluster is
 	 * started and cleared whenever the buffer or inode is flushed.
 	 * We can then check below to see if it is set, and do the
 	 * synchronous write only when it has been cleared.
 	 */
 	if (sbap != &ip->i_din1->di_db[0]) {
 		if (doasyncfree)
 			bdwrite(sbp);
 		else
 			bwrite(sbp);
 	} else {
 		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
 		if (!doasyncfree)
 			ffs_update(vp, 1);
 	}
 	if (ssize < len) {
 		if (doasyncfree)
 			bdwrite(ebp);
 		else
 			bwrite(ebp);
 	}
 	/*
 	 * Last, free the old blocks and assign the new blocks to the buffers.
 	 */
 #ifdef DIAGNOSTIC
 	if (prtrealloc)
 		printf("\n\tnew:");
 #endif
 	for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
 		bp = buflist->bs_children[i];
 		if (!DOINGSOFTDEP(vp))
 			/*
 			 * The usual case is that a set of N-contiguous blocks
 			 * that was just allocated has been replaced with a
 			 * set of N+1-contiguous blocks. If they are marked as
 			 * B_DELWRI, the current contents have not been written
 			 * to disk. It is possible that the blocks were written
 			 * earlier, but very uncommon. If the blocks have never
 			 * been written, there is no need to send a BIO_DELETE
 			 * for them when they are freed. The gain from avoiding
 			 * the TRIMs for the common case of unwritten blocks
 			 * far exceeds the cost of the write amplification for
 			 * the uncommon case of failing to send a TRIM for the
 			 * blocks that had been written.
 			 */
 			ffs_blkfree(ump, fs, ump->um_devvp,
 			    dbtofsb(fs, bp->b_blkno),
 			    fs->fs_bsize, ip->i_number, vp->v_type, NULL,
 			    (bp->b_flags & B_DELWRI) != 0 ?
 			    NOTRIM_KEY : SINGLETON_KEY);
 		bp->b_blkno = fsbtodb(fs, blkno);
 #ifdef INVARIANTS
 		if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize))
 			panic("ffs_reallocblks: unallocated block 3");
 #endif
 #ifdef DIAGNOSTIC
 		if (prtrealloc)
 			printf(" %d,", blkno);
 #endif
 	}
 #ifdef DIAGNOSTIC
 	if (prtrealloc) {
 		prtrealloc--;
 		printf("\n");
 	}
 #endif
 	return (0);
 
 fail:
 	if (ssize < len)
 		brelse(ebp);
 	if (sbap != &ip->i_din1->di_db[0])
 		brelse(sbp);
 	return (ENOSPC);
 }
 
 /*
  * UFS2 variant of block reallocation.
  *
  * Given the cluster of buffers in ap->a_buflist (logically consecutive
  * blocks of the file behind ap->a_vp), try to obtain one physically
  * contiguous run of blocks from ffs_clusteralloc() and move the cluster
  * onto it.  On success the block pointers in the inode's direct array
  * and/or the covering indirect block(s) are rewritten, the old blocks are
  * freed (directly when soft updates are not in use), and each buffer's
  * b_blkno is pointed at its new location.
  *
  * Returns 0 on success.  Every refusal or failure is reported as ENOSPC.
  */
 static int
 ffs_reallocblks_ufs2(ap)
 	struct vop_reallocblks_args /* {
 		struct vnode *a_vp;
 		struct cluster_save *a_buflist;
 	} */ *ap;
 {
 	struct fs *fs;
 	struct inode *ip;
 	struct vnode *vp;
 	struct buf *sbp, *ebp, *bp;
 	ufs2_daddr_t *bap, *sbap, *ebap;
 	struct cluster_save *buflist;
 	struct ufsmount *ump;
 	ufs_lbn_t start_lbn, end_lbn;
 	ufs2_daddr_t soff, newblk, blkno, pref;
 	struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp;
 	int i, cg, len, start_lvl, end_lvl, ssize;
 
 	vp = ap->a_vp;
 	ip = VTOI(vp);
 	ump = ITOUMP(ip);
 	fs = ump->um_fs;
 	/*
 	 * If we are not tracking block clusters or if we have less than 4%
 	 * free blocks left, then do not attempt to cluster. Running with
 	 * less than 5% free block reserve is not recommended and those that
 	 * choose to do so do not expect to have good file layout.
 	 */
 	if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0)
 		return (ENOSPC);
 	buflist = ap->a_buflist;
 	len = buflist->bs_nchildren;
 	start_lbn = buflist->bs_children[0]->b_lblkno;
 	end_lbn = start_lbn + len - 1;
 #ifdef INVARIANTS
 	/*
 	 * Sanity-check the cluster: every block must be allocated, the
 	 * logical block numbers must be consecutive, and all but the last
 	 * block must already be physically consecutive.  Note that ssize is
 	 * borrowed here as a scratch "device blocks per fs block" value; it
 	 * is assigned its real meaning further down.
 	 */
 	for (i = 0; i < len; i++)
 		if (!ffs_checkblk(ip,
 		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
 			panic("ffs_reallocblks: unallocated block 1");
 	for (i = 1; i < len; i++)
 		if (buflist->bs_children[i]->b_lblkno != start_lbn + i)
 			panic("ffs_reallocblks: non-logical cluster");
 	blkno = buflist->bs_children[0]->b_blkno;
 	ssize = fsbtodb(fs, fs->fs_frag);
 	for (i = 1; i < len - 1; i++)
 		if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize))
 			panic("ffs_reallocblks: non-physical cluster %d", i);
 #endif
 	/*
 	 * If the cluster crosses the boundary for the first indirect
 	 * block, do not move anything in it. Indirect blocks are
 	 * usually initially laid out in a position between the data
 	 * blocks. Block reallocation would usually destroy locality by
 	 * moving the indirect block out of the way to make room for
 	 * data blocks if we didn't compensate here. We should also do
 	 * this for other indirect block boundaries, but it is only
 	 * important for the first one.
 	 */
 	if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR)
 		return (ENOSPC);
 	/*
 	 * If the latest allocation is in a new cylinder group, assume that
 	 * the filesystem has decided to move and do not force it back to
 	 * the previous cylinder group.
 	 */
 	if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) !=
 	    dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno)))
 		return (ENOSPC);
 	/*
 	 * Resolve the indirection paths for the first and last logical
 	 * block; start_lvl/end_lvl == 0 means the block is addressed
 	 * directly from the inode.
 	 */
 	if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) ||
 	    ufs_getlbns(vp, end_lbn, end_ap, &end_lvl))
 		return (ENOSPC);
 	/*
 	 * Get the starting offset and block map for the first block.
 	 */
 	if (start_lvl == 0) {
 		sbap = &ip->i_din2->di_db[0];
 		soff = start_lbn;
 	} else {
 		idp = &start_ap[start_lvl - 1];
 		/*
 		 * NOTE(review): this releases sbp after a failed bread(),
 		 * which presumes bread() still hands back a buffer on
 		 * error -- confirm against the bread() contract in use.
 		 */
 		if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) {
 			brelse(sbp);
 			return (ENOSPC);
 		}
 		sbap = (ufs2_daddr_t *)sbp->b_data;
 		soff = idp->in_off;
 	}
 	/*
 	 * If the block range spans two block maps, get the second map.
 	 * ssize is the number of cluster entries that live in the first
 	 * map; ssize == len means a single map covers the whole cluster.
 	 */
 	ebap = NULL;
 	if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) {
 		ssize = len;
 	} else {
 #ifdef INVARIANTS
 		if (start_lvl > 0 &&
 		    start_ap[start_lvl - 1].in_lbn == idp->in_lbn)
 			panic("ffs_reallocblk: start == end");
 #endif
 		ssize = len - (idp->in_off + 1);
 		/*
 		 * NOTE(review): on failure the fail: path below calls
 		 * brelse(ebp) because ssize < len; same bread() contract
 		 * assumption as for sbp above -- confirm.
 		 */
 		if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp))
 			goto fail;
 		ebap = (ufs2_daddr_t *)ebp->b_data;
 	}
 	/*
 	 * Find the preferred location for the cluster. If we have not
 	 * previously failed at this endeavor, then follow our standard
 	 * preference calculation. If we have failed at it, then pick up
 	 * where we last ended our search.
 	 */
 	UFS_LOCK(ump);
 	if (ip->i_nextclustercg == -1)
 		pref = ffs_blkpref_ufs2(ip, start_lbn, soff, sbap);
 	else
 		pref = cgdata(fs, ip->i_nextclustercg);
 	/*
 	 * Search the block map looking for an allocation of the desired size.
 	 * To avoid wasting too much time, we limit the number of cylinder
 	 * groups that we will search.
 	 */
 	cg = dtog(fs, pref);
 	for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) {
 		if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0)
 			break;
 		cg += 1;
 		if (cg >= fs->fs_ncg)
 			cg = 0;
 	}
 	/*
 	 * If we have failed in our search, record where we gave up for
 	 * next time. Otherwise, fall back to our usual search criterion.
 	 */
 	if (newblk == 0) {
 		ip->i_nextclustercg = cg;
 		UFS_UNLOCK(ump);
 		goto fail;
 	}
 	/*
 	 * NOTE(review): no UFS_UNLOCK() appears on the success path in this
 	 * function; presumably ffs_clusteralloc() returns with the lock
 	 * dropped when it succeeds -- confirm against its definition.
 	 */
 	ip->i_nextclustercg = -1;
 	/*
 	 * We have found a new contiguous block.
 	 *
 	 * First we have to replace the old block pointers with the new
 	 * block pointers in the inode and indirect blocks associated
 	 * with the file.
 	 */
 #ifdef DIAGNOSTIC
 	if (prtrealloc)
 		printf("realloc: ino %ju, lbns %jd-%jd\n\told:", (uintmax_t)ip->i_number,
 		    (intmax_t)start_lbn, (intmax_t)end_lbn);
 #endif
 	blkno = newblk;
 	for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) {
 		/*
 		 * Crossing into the second map: restart at its first entry
 		 * and rebase soff so that soff + i is the index within the
 		 * second map for the softdep call below.
 		 */
 		if (i == ssize) {
 			bap = ebap;
 			soff = -i;
 		}
 #ifdef INVARIANTS
 		if (!ffs_checkblk(ip,
 		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
 			panic("ffs_reallocblks: unallocated block 2");
 		if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap)
 			panic("ffs_reallocblks: alloc mismatch");
 #endif
 #ifdef DIAGNOSTIC
 		if (prtrealloc)
 			printf(" %jd,", (intmax_t)*bap);
 #endif
 		if (DOINGSOFTDEP(vp)) {
 			/*
 			 * Direct pointers in the inode and pointers held in
 			 * an indirect block use different softdep hooks.
 			 */
 			if (sbap == &ip->i_din2->di_db[0] && i < ssize)
 				softdep_setup_allocdirect(ip, start_lbn + i,
 				    blkno, *bap, fs->fs_bsize, fs->fs_bsize,
 				    buflist->bs_children[i]);
 			else
 				softdep_setup_allocindir_page(ip, start_lbn + i,
 				    i < ssize ? sbp : ebp, soff + i, blkno,
 				    *bap, buflist->bs_children[i]);
 		}
 		*bap++ = blkno;
 	}
 	/*
 	 * Next we must write out the modified inode and indirect blocks.
 	 * For strict correctness, the writes should be synchronous since
 	 * the old block values may have been written to disk. In practise
 	 * they are almost never written, but if we are concerned about
 	 * strict correctness, the `doasyncfree' flag should be set to zero.
 	 *
 	 * The test on `doasyncfree' should be changed to test a flag
 	 * that shows whether the associated buffers and inodes have
 	 * been written. The flag should be set when the cluster is
 	 * started and cleared whenever the buffer or inode is flushed.
 	 * We can then check below to see if it is set, and do the
 	 * synchronous write only when it has been cleared.
 	 */
 	if (sbap != &ip->i_din2->di_db[0]) {
 		if (doasyncfree)
 			bdwrite(sbp);
 		else
 			bwrite(sbp);
 	} else {
 		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
 		if (!doasyncfree)
 			ffs_update(vp, 1);
 	}
 	if (ssize < len) {
 		if (doasyncfree)
 			bdwrite(ebp);
 		else
 			bwrite(ebp);
 	}
 	/*
 	 * Last, free the old blocks and assign the new blocks to the buffers.
 	 */
 #ifdef DIAGNOSTIC
 	if (prtrealloc)
 		printf("\n\tnew:");
 #endif
 	for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
 		bp = buflist->bs_children[i];
 		if (!DOINGSOFTDEP(vp))
 			/*
 			 * The usual case is that a set of N-contiguous blocks
 			 * that was just allocated has been replaced with a
 			 * set of N+1-contiguous blocks. If they are marked as
 			 * B_DELWRI, the current contents have not been written
 			 * to disk. It is possible that the blocks were written
 			 * earlier, but very uncommon. If the blocks have never
 			 * been written, there is no need to send a BIO_DELETE
 			 * for them when they are freed. The gain from avoiding
 			 * the TRIMs for the common case of unwritten blocks
 			 * far exceeds the cost of the write amplification for
 			 * the uncommon case of failing to send a TRIM for the
 			 * blocks that had been written.
 			 */
 			ffs_blkfree(ump, fs, ump->um_devvp,
 			    dbtofsb(fs, bp->b_blkno),
 			    fs->fs_bsize, ip->i_number, vp->v_type, NULL,
 			    (bp->b_flags & B_DELWRI) != 0 ?
 			    NOTRIM_KEY : SINGLETON_KEY);
 		/* Retarget the buffer at its slot in the new run. */
 		bp->b_blkno = fsbtodb(fs, blkno);
 #ifdef INVARIANTS
 		if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize))
 			panic("ffs_reallocblks: unallocated block 3");
 #endif
 #ifdef DIAGNOSTIC
 		if (prtrealloc)
 			printf(" %jd,", (intmax_t)blkno);
 #endif
 	}
 #ifdef DIAGNOSTIC
 	if (prtrealloc) {
 		prtrealloc--;
 		printf("\n");
 	}
 #endif
 	return (0);
 
 fail:
 	/*
 	 * Release whichever map buffers were read: ebp only exists when the
 	 * cluster spans two maps, sbp only when the first map is an
 	 * indirect block rather than the inode's direct array.
 	 */
 	if (ssize < len)
 		brelse(ebp);
 	if (sbap != &ip->i_din2->di_db[0])
 		brelse(sbp);
 	return (ENOSPC);
 }
 
 /*
  * Allocate an inode in the filesystem.
  *
  * If allocating a directory, use ffs_dirpref to select the inode.
  * If allocating in a directory, the following hierarchy is followed:
  *   1) allocate the preferred inode.
  *   2) allocate an inode in the same cylinder group.
  *   3) quadratically rehash into other cylinder groups, until an
  *      available inode is located.
  * If no inode preference is given the following hierarchy is used
  * to allocate an inode:
  *   1) allocate an inode in cylinder group 0.
  *   2) quadratically rehash into other cylinder groups, until an
  *      available inode is located.
  */
 int
 ffs_valloc(pvp, mode, cred, vpp)
 	struct vnode *pvp;
 	int mode;
 	struct ucred *cred;
 	struct vnode **vpp;
 {
 	struct inode *pip;
 	struct fs *fs;
 	struct inode *ip;
 	struct timespec ts;
 	struct ufsmount *ump;
 	ino_t ino, ipref;
 	u_int cg;
 	int error, reclaimed;
 
 	/*
 	 * pvp  - vnode of the parent directory; supplies the filesystem and
 	 *        the placement hint.
 	 * mode - file mode of the inode being created; only the IFMT bits
 	 *        are examined here (directory versus everything else).
 	 * cred - passed through to softdep_request_cleanup() on shortage.
 	 * vpp  - receives the new vnode; set to NULL up front so it is
 	 *        well defined on every error return.
 	 *
 	 * Returns 0 on success, the ffs_vgetf() error if the vnode cannot
 	 * be obtained, ENXIO if ffs_fsfail_cleanup_locked() reports true,
 	 * and ENOSPC when no inode could be found.
 	 */
 	*vpp = NULL;
 	pip = VTOI(pvp);
 	ump = ITOUMP(pip);
 	fs = ump->um_fs;
 
 	UFS_LOCK(ump);
 	reclaimed = 0;
 retry:
 	if (fs->fs_cstotal.cs_nifree == 0)
 		goto noinodes;
 
 	if ((mode & IFMT) == IFDIR)
 		ipref = ffs_dirpref(pip);
 	else
 		ipref = pip->i_number;
 	/* Discard a preference that lies beyond the last inode. */
 	if (ipref >= fs->fs_ncg * fs->fs_ipg)
 		ipref = 0;
 	cg = ino_to_cg(fs, ipref);
 	/*
 	 * Track number of dirs created one after another
 	 * in a same cg without intervening by files.
 	 */
 	if ((mode & IFMT) == IFDIR) {
 		if (fs->fs_contigdirs[cg] < 255)
 			fs->fs_contigdirs[cg]++;
 	} else {
 		if (fs->fs_contigdirs[cg] > 0)
 			fs->fs_contigdirs[cg]--;
 	}
 	/*
 	 * Per the ffs_hashalloc() contract the UFS lock has been released
 	 * when it succeeds and is still held when it returns 0, which is
 	 * what the noinodes path relies on.
 	 */
 	ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0,
 					(allocfcn_t *)ffs_nodealloccg);
 	if (ino == 0)
 		goto noinodes;
 	/*
 	 * Get rid of the cached old vnode, force allocation of a new vnode
 	 * for this inode. If this fails, release the allocated ino and
 	 * return the error.
 	 */
 	if ((error = ffs_vgetf(pvp->v_mount, ino, LK_EXCLUSIVE, vpp,
 	    FFSV_FORCEINSMQ | FFSV_REPLACE)) != 0) {
 		ffs_vfree(pvp, ino, mode);
 		return (error);
 	}
 	/*
 	 * We got an inode, so check mode and panic if it is already allocated.
 	 */
 	ip = VTOI(*vpp);
 	if (ip->i_mode) {
 		printf("mode = 0%o, inum = %ju, fs = %s\n",
 		    ip->i_mode, (uintmax_t)ip->i_number, fs->fs_fsmnt);
 		panic("ffs_valloc: dup alloc");
 	}
 	if (DIP(ip, i_blocks) && (fs->fs_flags & FS_UNCLEAN) == 0) {  /* XXX */
 		/*
 		 * Print through the widest integer types so that neither the
 		 * inode number nor the block count is truncated where long
 		 * is narrower than those fields.
 		 */
 		printf("free inode %s/%ju had %jd blocks\n",
 		    fs->fs_fsmnt, (uintmax_t)ino,
 		    (intmax_t)DIP(ip, i_blocks));
 		DIP_SET(ip, i_blocks, 0);
 	}
 	ip->i_flags = 0;
 	DIP_SET(ip, i_flags, 0);
 	/*
 	 * Set up a new generation number for this inode.  Zero is never
 	 * used: a zero (or wrapped-to-zero) value is replaced by a random
 	 * one until it is non-zero.
 	 */
 	while (ip->i_gen == 0 || ++ip->i_gen == 0)
 		ip->i_gen = arc4random();
 	DIP_SET(ip, i_gen, ip->i_gen);
 	if (fs->fs_magic == FS_UFS2_MAGIC) {
 		vfs_timestamp(&ts);
 		ip->i_din2->di_birthtime = ts.tv_sec;
 		ip->i_din2->di_birthnsec = ts.tv_nsec;
 	}
 	ip->i_flag = 0;
 	(*vpp)->v_vflag = 0;
 	(*vpp)->v_type = VNON;
 	if (fs->fs_magic == FS_UFS2_MAGIC) {
 		(*vpp)->v_op = &ffs_vnodeops2;
 		UFS_INODE_SET_FLAG(ip, IN_UFS2);
 	} else {
 		(*vpp)->v_op = &ffs_vnodeops1;
 	}
 	return (0);
 noinodes:
 	/*
 	 * Reached with the UFS lock held.  Ask soft updates to flush
 	 * pending inode frees once and retry before giving up.
 	 */
 	if (reclaimed == 0) {
 		reclaimed = 1;
 		softdep_request_cleanup(fs, pvp, cred, FLUSH_INODES_WAIT);
 		goto retry;
 	}
 	if (ffs_fsfail_cleanup_locked(ump, 0)) {
 		UFS_UNLOCK(ump);
 		return (ENXIO);
 	}
 	/* Rate-limit the "out of inodes" complaint. */
 	if (ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) {
 		UFS_UNLOCK(ump);
 		ffs_fserr(fs, pip->i_number, "out of inodes");
 		uprintf("\n%s: create/symlink failed, no inodes free\n",
 		    fs->fs_fsmnt);
 	} else {
 		UFS_UNLOCK(ump);
 	}
 	return (ENOSPC);
 }
 
 /*
  * Find a cylinder group to place a directory.
  *
  * The policy implemented by this algorithm is to allocate a
  * directory inode in the same cylinder group as its parent
  * directory, but also to reserve space for its files inodes
  * and data. Restrict the number of directories which may be
  * allocated one after another in the same cylinder group
  * without intervening allocation of files.
  *
  * If we allocate a first level directory then force allocation
  * in another cylinder group.
  */
 static ino_t
 ffs_dirpref(pip)
 	struct inode *pip;
 {
 	struct fs *fs;
 	int cg, prefcg, dirsize, cgsize;
 	u_int avgifree, avgbfree, avgndir, curdirsize;
 	u_int minifree, minbfree, maxndir;
 	u_int mincg, minndir;
 	u_int maxcontigdirs;
 
 	mtx_assert(UFS_MTX(ITOUMP(pip)), MA_OWNED);
 	fs = ITOFS(pip);
 
 	avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
 	avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
 	avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg;
 
 	/*
 	 * Force allocation in another cg if creating a first level dir.
 	 */
 	ASSERT_VOP_LOCKED(ITOV(pip), "ffs_dirpref");
 	if (ITOV(pip)->v_vflag & VV_ROOT) {
 		prefcg = arc4random() % fs->fs_ncg;
 		mincg = prefcg;
 		minndir = fs->fs_ipg;
 		for (cg = prefcg; cg < fs->fs_ncg; cg++)
 			if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
 			    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
 			    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
 				mincg = cg;
 				minndir = fs->fs_cs(fs, cg).cs_ndir;
 			}
 		for (cg = 0; cg < prefcg; cg++)
 			if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
 			    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
 			    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
 				mincg = cg;
 				minndir = fs->fs_cs(fs, cg).cs_ndir;
 			}
 		return ((ino_t)(fs->fs_ipg * mincg));
 	}
 
 	/*
 	 * Count various limits which used for
 	 * optimal allocation of a directory inode.
 	 */
 	maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg);
 	minifree = avgifree - avgifree / 4;
 	if (minifree < 1)
 		minifree = 1;
 	minbfree = avgbfree - avgbfree / 4;
 	if (minbfree < 1)
 		minbfree = 1;
 	cgsize = fs->fs_fsize * fs->fs_fpg;
 	dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir;
 	curdirsize = avgndir ? (cgsize - avgbfree * fs->fs_bsize) / avgndir : 0;
 	if (dirsize < curdirsize)
 		dirsize = curdirsize;
 	if (dirsize <= 0)
 		maxcontigdirs = 0;		/* dirsize overflowed */
 	else
 		maxcontigdirs = min((avgbfree * fs->fs_bsize) / dirsize, 255);
 	if (fs->fs_avgfpdir > 0)
 		maxcontigdirs = min(maxcontigdirs,
 				    fs->fs_ipg / fs->fs_avgfpdir);
 	if (maxcontigdirs == 0)
 		maxcontigdirs = 1;
 
 	/*
 	 * Limit number of dirs in one cg and reserve space for 
 	 * regular files, but only if we have no deficit in
 	 * inodes or space.
 	 *
 	 * We are trying to find a suitable cylinder group nearby
 	 * our preferred cylinder group to place a new directory.
 	 * We scan from our preferred cylinder group forward looking
 	 * for a cylinder group that meets our criterion. If we get
 	 * to the final cylinder group and do not find anything,
 	 * we start scanning forwards from the beginning of the
 	 * filesystem. While it might seem sensible to start scanning
 	 * backwards or even to alternate looking forward and backward,
 	 * this approach fails badly when the filesystem is nearly full.
 	 * Specifically, we first search all the areas that have no space
 	 * and finally try the one preceding that. We repeat this on
 	 * every request and in the case of the final block end up
 	 * searching the entire filesystem. By jumping to the front
 	 * of the filesystem, our future forward searches always look
 	 * in new cylinder groups so finds every possible block after
 	 * one pass over the filesystem.
 	 */
 	prefcg = ino_to_cg(fs, pip->i_number);
 	for (cg = prefcg; cg < fs->fs_ncg; cg++)
 		if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
 		    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
 		    fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
 			if (fs->fs_contigdirs[cg] < maxcontigdirs)
 				return ((ino_t)(fs->fs_ipg * cg));
 		}
 	for (cg = 0; cg < prefcg; cg++)
 		if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
 		    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
 		    fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
 			if (fs->fs_contigdirs[cg] < maxcontigdirs)
 				return ((ino_t)(fs->fs_ipg * cg));
 		}
 	/*
 	 * This is a backstop when we have deficit in space.
 	 */
 	for (cg = prefcg; cg < fs->fs_ncg; cg++)
 		if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
 			return ((ino_t)(fs->fs_ipg * cg));
 	for (cg = 0; cg < prefcg; cg++)
 		if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
 			break;
 	return ((ino_t)(fs->fs_ipg * cg));
 }
 
 /*
  * Select the desired position for the next block in a file.  The file is
  * logically divided into sections. The first section is composed of the
  * direct blocks and the next fs_maxbpg blocks. Each additional section
  * contains fs_maxbpg blocks.
  *
  * If no blocks have been allocated in the first section, the policy is to
  * request a block in the same cylinder group as the inode that describes
  * the file. The first indirect is allocated immediately following the last
  * direct block and the data blocks for the first indirect immediately
  * follow it.
  *
  * If no blocks have been allocated in any other section, the indirect 
  * block(s) are allocated in the same cylinder group as its inode in an
  * area reserved immediately following the inode blocks. The policy for
  * the data blocks is to place them in a cylinder group with a greater than
  * average number of free blocks. An appropriate cylinder group is found
  * by using a rotor that sweeps the cylinder groups. When a new group of
  * blocks is needed, the sweep begins in the cylinder group following the
  * cylinder group from which the previous allocation was made. The sweep
  * continues until a cylinder group with greater than the average number
  * of free blocks is found. If the allocation is for the first block in an
  * indirect block or the previous block is a hole, then the information on
  * the previous allocation is unavailable; here a best guess is made based
  * on the logical block number being allocated.
  *
  * If a section is already partially allocated, the policy is to
  * allocate blocks contiguously within the section if possible.
  */
 ufs2_daddr_t
 ffs_blkpref_ufs1(ip, lbn, indx, bap)
 	struct inode *ip;
 	ufs_lbn_t lbn;
 	int indx;
 	ufs1_daddr_t *bap;
 {
 	struct fs *fs;
 	u_int scan, homecg;
 	u_int meanfree, firstcg;
 	ufs2_daddr_t cand, prior;
 
 	KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap"));
 	mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED);
 	fs = ITOFS(ip);
 	homecg = ino_to_cg(fs, ip->i_number);
 
 	/*
 	 * A negative indx asks for an indirect block (-1 single, -2
 	 * double, -3 triple).  The first single indirect is placed right
 	 * behind the last direct block when that block exists; every
 	 * other indirect goes to the metadata zone at the front of the
 	 * inode's cylinder group.
 	 */
 	if (indx < 0) {
 		if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) &&
 		    ip->i_din1->di_db[UFS_NDADDR - 1] != 0)
 			return (ip->i_din1->di_db[UFS_NDADDR - 1] +
 			    fs->fs_frag);
 		return (cgmeta(fs, homecg));
 	}
 
 	/*
 	 * First data block behind the first indirect: if that indirect
 	 * sits in the data area of the inode's group, follow it directly.
 	 */
 	if (lbn == UFS_NDADDR) {
 		cand = ip->i_din1->di_ib[0];
 		if (cand != 0 && cand >= cgdata(fs, homecg) &&
 		    cand < cgbase(fs, homecg + 1))
 			return (cand + fs->fs_frag);
 	}
 
 	/*
 	 * Fetch the block number preceding this slot, treating a missing
 	 * or out-of-range predecessor as "none".
 	 */
 	prior = 0;
 	if (indx != 0) {
 		prior = bap[indx - 1];
 		if (UFS_CHECK_BLKNO(ITOVFS(ip), ip->i_number, prior,
 		    fs->fs_bsize) != 0)
 			prior = 0;
 	}
 
 	/*
 	 * Common case: a valid predecessor exists and we are not at a
 	 * fs_maxbpg boundary, so simply continue contiguously.
 	 */
 	if (indx % fs->fs_maxbpg != 0 && prior != 0)
 		return (prior + fs->fs_frag);
 
 	/* Directory data belongs in the metadata area. */
 	if ((ip->i_mode & IFMT) == IFDIR)
 		return (cgmeta(fs, homecg));
 
 	/*
 	 * While still within the direct blocks and the first indirect's
 	 * range, stay in the data area of the inode's own group.
 	 */
 	if (lbn < UFS_NDADDR + NINDIR(fs))
 		return (cgdata(fs, homecg));
 
 	/*
 	 * Otherwise pick a starting group -- the one after the previous
 	 * block's group when known, else a guess derived from lbn -- and
 	 * sweep forward with wrap-around for a group holding at least the
 	 * average number of free blocks.
 	 */
 	if (indx != 0 && prior != 0)
 		firstcg = dtog(fs, prior) + 1;
 	else
 		firstcg = homecg + lbn / fs->fs_maxbpg;
 	firstcg %= fs->fs_ncg;
 	meanfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
 	for (scan = firstcg; scan < fs->fs_ncg; scan++) {
 		if (fs->fs_cs(fs, scan).cs_nbfree < meanfree)
 			continue;
 		fs->fs_cgrotor = scan;
 		return (cgdata(fs, scan));
 	}
 	for (scan = 0; scan <= firstcg; scan++) {
 		if (fs->fs_cs(fs, scan).cs_nbfree < meanfree)
 			continue;
 		fs->fs_cgrotor = scan;
 		return (cgdata(fs, scan));
 	}
 	/* No group qualified; express no preference. */
 	return (0);
 }
 
 /*
  * Same as above, but for UFS2
  */
 ufs2_daddr_t
 ffs_blkpref_ufs2(ip, lbn, indx, bap)
 	struct inode *ip;
 	ufs_lbn_t lbn;
 	int indx;
 	ufs2_daddr_t *bap;
 {
 	struct fs *fs;
 	u_int scan, homecg;
 	u_int meanfree, firstcg;
 	ufs2_daddr_t cand, prior;
 
 	KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap"));
 	mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED);
 	fs = ITOFS(ip);
 	homecg = ino_to_cg(fs, ip->i_number);
 
 	/*
 	 * A negative indx asks for an indirect block (-1 single, -2
 	 * double, -3 triple).  The first single indirect is placed right
 	 * behind the last direct block when that block exists; every
 	 * other indirect goes to the metadata zone at the front of the
 	 * inode's cylinder group.
 	 */
 	if (indx < 0) {
 		if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) &&
 		    ip->i_din2->di_db[UFS_NDADDR - 1] != 0)
 			return (ip->i_din2->di_db[UFS_NDADDR - 1] +
 			    fs->fs_frag);
 		return (cgmeta(fs, homecg));
 	}
 
 	/*
 	 * First data block behind the first indirect: if that indirect
 	 * sits in the data area of the inode's group, follow it directly.
 	 */
 	if (lbn == UFS_NDADDR) {
 		cand = ip->i_din2->di_ib[0];
 		if (cand != 0 && cand >= cgdata(fs, homecg) &&
 		    cand < cgbase(fs, homecg + 1))
 			return (cand + fs->fs_frag);
 	}
 
 	/*
 	 * Fetch the block number preceding this slot, treating a missing
 	 * or out-of-range predecessor as "none".
 	 */
 	prior = 0;
 	if (indx != 0) {
 		prior = bap[indx - 1];
 		if (UFS_CHECK_BLKNO(ITOVFS(ip), ip->i_number, prior,
 		    fs->fs_bsize) != 0)
 			prior = 0;
 	}
 
 	/*
 	 * Common case: a valid predecessor exists and we are not at a
 	 * fs_maxbpg boundary, so simply continue contiguously.
 	 */
 	if (indx % fs->fs_maxbpg != 0 && prior != 0)
 		return (prior + fs->fs_frag);
 
 	/* Directory data belongs in the metadata area. */
 	if ((ip->i_mode & IFMT) == IFDIR)
 		return (cgmeta(fs, homecg));
 
 	/*
 	 * While still within the direct blocks and the first indirect's
 	 * range, stay in the data area of the inode's own group.
 	 */
 	if (lbn < UFS_NDADDR + NINDIR(fs))
 		return (cgdata(fs, homecg));
 
 	/*
 	 * Otherwise pick a starting group -- the one after the previous
 	 * block's group when known, else a guess derived from lbn -- and
 	 * sweep forward with wrap-around for a group holding at least the
 	 * average number of free blocks.
 	 */
 	if (indx != 0 && prior != 0)
 		firstcg = dtog(fs, prior) + 1;
 	else
 		firstcg = homecg + lbn / fs->fs_maxbpg;
 	firstcg %= fs->fs_ncg;
 	meanfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
 	for (scan = firstcg; scan < fs->fs_ncg; scan++) {
 		if (fs->fs_cs(fs, scan).cs_nbfree < meanfree)
 			continue;
 		fs->fs_cgrotor = scan;
 		return (cgdata(fs, scan));
 	}
 	for (scan = 0; scan <= firstcg; scan++) {
 		if (fs->fs_cs(fs, scan).cs_nbfree < meanfree)
 			continue;
 		fs->fs_cgrotor = scan;
 		return (cgdata(fs, scan));
 	}
 	/* No group qualified; express no preference. */
 	return (0);
 }
 
 /*
  * Implement the cylinder overflow algorithm.
  *
  * The policy implemented by this algorithm is:
  *   1) allocate the block in its requested cylinder group.
  *   2) quadratically rehash on the cylinder group number.
  *   3) brute force search for a free block.
  *
  * Must be called with the UFS lock held.  Will release the lock on success
  * and return with it held on failure.
  */
 /*VARARGS5*/
 static ufs2_daddr_t
 ffs_hashalloc(ip, cg, pref, size, rsize, allocator)
 	struct inode *ip;
 	u_int cg;
 	ufs2_daddr_t pref;
 	int size;	/* Search size for data blocks, mode for inodes */
 	int rsize;	/* Real allocated size. */
 	allocfcn_t *allocator;
 {
 	struct fs *fs;
 	ufs2_daddr_t got;
 	u_int step, tries, origcg;
 
 	mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED);
 #ifdef INVARIANTS
 	if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED)
 		panic("ffs_hashalloc: allocation on suspended filesystem");
 #endif
 	fs = ITOFS(ip);
 	origcg = cg;
 
 	/*
 	 * Pass 1: the requested cylinder group, honouring the caller's
 	 * preferred position within it.
 	 */
 	if ((got = (*allocator)(ip, cg, pref, size, rsize)) != 0)
 		return (got);
 
 	/*
 	 * Pass 2: hop away from the requested group by 1, 2, 4, ...
 	 * groups (cumulatively, wrapping at fs_ncg), with no position
 	 * preference.
 	 */
 	step = 1;
 	while (step < fs->fs_ncg) {
 		cg += step;
 		if (cg >= fs->fs_ncg)
 			cg -= fs->fs_ncg;
 		if ((got = (*allocator)(ip, cg, 0, size, rsize)) != 0)
 			return (got);
 		step <<= 1;
 	}
 
 	/*
 	 * Pass 3: walk the remaining groups linearly.  The walk begins two
 	 * groups past the original one because offset 0 was tried in pass
 	 * 1 and offset 1 is always the first hop of pass 2.
 	 */
 	cg = (origcg + 2) % fs->fs_ncg;
 	tries = 2;
 	while (tries < fs->fs_ncg) {
 		if ((got = (*allocator)(ip, cg, 0, size, rsize)) != 0)
 			return (got);
 		if (++cg == fs->fs_ncg)
 			cg = 0;
 		tries++;
 	}
 	return (0);
 }
 
 /*
  * Determine whether a fragment can be extended.
  *
  * Check to see if the necessary fragments are available, and
  * if they are, allocate them.
  */
 static ufs2_daddr_t
 ffs_fragextend(ip, cg, bprev, osize, nsize)
 	struct inode *ip;
 	u_int cg;
 	ufs2_daddr_t bprev;
 	int osize, nsize;
 {
 	struct fs *fs;
 	struct cg *cgp;
 	struct buf *bp;
 	struct ufsmount *ump;
 	int nffree;
 	long bno;
 	int frags, bbase;
 	int i, error;
 	u_int8_t *blksfree;
 
 	ump = ITOUMP(ip);
 	fs = ump->um_fs;
 	if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
 		return (0);
 	frags = numfrags(fs, nsize);
 	bbase = fragnum(fs, bprev);
 	if (bbase > fragnum(fs, (bprev + frags - 1))) {
 		/* cannot extend across a block boundary */
 		return (0);
 	}
 	UFS_UNLOCK(ump);
 	if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0)
 		goto fail;
 	bno = dtogd(fs, bprev);
 	blksfree = cg_blksfree(cgp);
 	for (i = numfrags(fs, osize); i < frags; i++)
 		if (isclr(blksfree, bno + i))
 			goto fail;
 	/*
 	 * the current fragment can be extended
 	 * deduct the count on fragment being extended into
 	 * increase the count on the remaining fragment (if any)
 	 * allocate the extended piece
 	 */
 	for (i = frags; i < fs->fs_frag - bbase; i++)
 		if (isclr(blksfree, bno + i))
 			break;
 	cgp->cg_frsum[i - numfrags(fs, osize)]--;
 	if (i != frags)
 		cgp->cg_frsum[i - frags]++;
 	for (i = numfrags(fs, osize), nffree = 0; i < frags; i++) {
 		clrbit(blksfree, bno + i);
 		cgp->cg_cs.cs_nffree--;
 		nffree++;
 	}
 	UFS_LOCK(ump);
 	fs->fs_cstotal.cs_nffree -= nffree;
 	fs->fs_cs(fs, cg).cs_nffree -= nffree;
 	fs->fs_fmod = 1;
 	ACTIVECLEAR(fs, cg);
 	UFS_UNLOCK(ump);
 	if (DOINGSOFTDEP(ITOV(ip)))
 		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev,
 		    frags, numfrags(fs, osize));
 	bdwrite(bp);
 	return (bprev);
 
 fail:
 	brelse(bp);
 	UFS_LOCK(ump);
 	return (0);
 
 }
 
 /*
  * Determine whether a block can be allocated.
  *
  * Check to see if a block of the appropriate size is available,
  * and if it is, allocate it.
  */
 static ufs2_daddr_t
 ffs_alloccg(ip, cg, bpref, size, rsize)
 	struct inode *ip;
 	u_int cg;
 	ufs2_daddr_t bpref;
 	int size;
 	int rsize;
 {
 	struct fs *fs;
 	struct cg *cgp;
 	struct buf *bp;
 	struct ufsmount *ump;
 	ufs1_daddr_t bno;
 	ufs2_daddr_t blkno;
 	int i, allocsiz, error, frags;
 	u_int8_t *blksfree;
 
 	ump = ITOUMP(ip);
 	fs = ump->um_fs;
 	if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
 		return (0);
 	UFS_UNLOCK(ump);
 	if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0 ||
 	   (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize))
 		goto fail;
 	if (size == fs->fs_bsize) {
 		UFS_LOCK(ump);
 		blkno = ffs_alloccgblk(ip, bp, bpref, rsize);
 		ACTIVECLEAR(fs, cg);
 		UFS_UNLOCK(ump);
 		bdwrite(bp);
 		return (blkno);
 	}
 	/*
 	 * check to see if any fragments are already available
 	 * allocsiz is the size which will be allocated, hacking
 	 * it down to a smaller size if necessary
 	 */
 	blksfree = cg_blksfree(cgp);
 	frags = numfrags(fs, size);
 	for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
 		if (cgp->cg_frsum[allocsiz] != 0)
 			break;
 	if (allocsiz == fs->fs_frag) {
 		/*
 		 * no fragments were available, so a block will be
 		 * allocated, and hacked up
 		 */
 		if (cgp->cg_cs.cs_nbfree == 0)
 			goto fail;
 		UFS_LOCK(ump);
 		blkno = ffs_alloccgblk(ip, bp, bpref, rsize);
 		ACTIVECLEAR(fs, cg);
 		UFS_UNLOCK(ump);
 		bdwrite(bp);
 		return (blkno);
 	}
 	KASSERT(size == rsize,
 	    ("ffs_alloccg: size(%d) != rsize(%d)", size, rsize));
 	bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
 	if (bno < 0)
 		goto fail;
 	for (i = 0; i < frags; i++)
 		clrbit(blksfree, bno + i);
 	cgp->cg_cs.cs_nffree -= frags;
 	cgp->cg_frsum[allocsiz]--;
 	if (frags != allocsiz)
 		cgp->cg_frsum[allocsiz - frags]++;
 	UFS_LOCK(ump);
 	fs->fs_cstotal.cs_nffree -= frags;
 	fs->fs_cs(fs, cg).cs_nffree -= frags;
 	fs->fs_fmod = 1;
 	blkno = cgbase(fs, cg) + bno;
 	ACTIVECLEAR(fs, cg);
 	UFS_UNLOCK(ump);
 	if (DOINGSOFTDEP(ITOV(ip)))
 		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0);
 	bdwrite(bp);
 	return (blkno);
 
 fail:
 	brelse(bp);
 	UFS_LOCK(ump);
 	return (0);
 }
 
 /*
  * Allocate a full block from the cylinder group held in bp.
  *
  * The block is chosen as follows:
  *   1) the block containing bpref, if it is free;
  *   2) otherwise the next available block found by ffs_mapsearch()
  *      starting from bpref.
  * When bpref is 0 the search starts one block past the group's rotor;
  * when bpref lies in another cylinder group it is remapped to the
  * matching (metadata or data) zone of this group.
  *
  * Only fs_bsize blocks are allocated here.  If size is smaller than a
  * block, the unused trailing fragments are marked free again.
  * Must be entered with the UFS mutex held; it is held again on return.
  * Returns the allocated address, or 0 if no block was found.
  */
 static ufs2_daddr_t
 ffs_alloccgblk(struct inode *ip, struct buf *bp, ufs2_daddr_t bpref, int size)
 {
 	struct ufsmount *ump;
 	struct fs *fs;
 	struct cg *cgp;
 	u_int8_t *freemap;
 	ufs2_daddr_t blkno;
 	ufs1_daddr_t cgoff;
 	int prefcg, nfrags, spare, frag;
 
 	ump = ITOUMP(ip);
 	fs = ump->um_fs;
 	mtx_assert(UFS_MTX(ump), MA_OWNED);
 	cgp = (struct cg *)bp->b_data;
 	freemap = cg_blksfree(cgp);
 
 	if (bpref == 0) {
 		bpref = cgbase(fs, cgp->cg_cgx) + cgp->cg_rotor + fs->fs_frag;
 	} else {
 		prefcg = dtog(fs, bpref);
 		if (prefcg != cgp->cg_cgx) {
 			/* map bpref to correct zone in this cg */
 			if (bpref < cgdata(fs, prefcg))
 				bpref = cgmeta(fs, cgp->cg_cgx);
 			else
 				bpref = cgdata(fs, cgp->cg_cgx);
 		}
 	}
 
 	/*
 	 * Use the preferred block when it is free; otherwise fall back
 	 * to the next available block in this cylinder group.
 	 */
 	cgoff = dtogd(fs, blknum(fs, bpref));
 	if (!ffs_isblock(fs, freemap, fragstoblks(fs, cgoff))) {
 		cgoff = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag);
 		if (cgoff < 0)
 			return (0);
 		/* Update cg_rotor only if allocated from the data zone */
 		if (cgoff >= dtogd(fs, cgdata(fs, cgp->cg_cgx)))
 			cgp->cg_rotor = cgoff;
 	}
 
 	/* Take the block out of the free map and the summaries. */
 	blkno = fragstoblks(fs, cgoff);
 	ffs_clrblock(fs, freemap, (long)blkno);
 	ffs_clusteracct(fs, cgp, blkno, -1);
 	cgp->cg_cs.cs_nbfree--;
 	fs->fs_cstotal.cs_nbfree--;
 	fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
 	fs->fs_fmod = 1;
 	blkno = cgbase(fs, cgp->cg_cgx) + cgoff;
 
 	/*
 	 * If the caller asked for less than a whole block, hand the
 	 * trailing fragments back to the free map.
 	 */
 	nfrags = numfrags(fs, size);
 	if (nfrags != fs->fs_frag) {
 		cgoff = dtogd(fs, blkno);
 		for (frag = nfrags; frag < fs->fs_frag; frag++)
 			setbit(freemap, cgoff + frag);
 		spare = fs->fs_frag - nfrags;
 		cgp->cg_cs.cs_nffree += spare;
 		fs->fs_cstotal.cs_nffree += spare;
 		fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += spare;
 		fs->fs_fmod = 1;
 		cgp->cg_frsum[spare]++;
 	}
 
 	/* XXX Fixme. */
 	UFS_UNLOCK(ump);
 	if (DOINGSOFTDEP(ITOV(ip)))
 		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, nfrags, 0);
 	UFS_LOCK(ump);
 	return (blkno);
 }
 
 /*
  * Determine whether a cluster can be allocated.
  *
  * Try to allocate len contiguous full blocks in cylinder group cg,
  * starting the search at bpref (or at the start of the group's data
  * area when bpref is in another group).
  *
  * We do not currently check for optimal rotational layout if there
  * are multiple choices in the same cylinder group. Instead we just
  * take the first one that we find following bpref.
  *
  * The UFS mutex is dropped around the cylinder group read.  On success
  * the address of the first block of the cluster is returned with the
  * mutex released; on failure 0 is returned with the mutex held.
  */
 static ufs2_daddr_t
 ffs_clusteralloc(ip, cg, bpref, len)
 	struct inode *ip;
 	u_int cg;
 	ufs2_daddr_t bpref;
 	int len;
 {
 	struct fs *fs;
 	struct cg *cgp;
 	struct buf *bp;
 	struct ufsmount *ump;
 	int i, run, bit, map, got, error;
 	ufs2_daddr_t bno;
 	u_char *mapp;
 	int32_t *lp;
 	u_int8_t *blksfree;
 
 	ump = ITOUMP(ip);
 	fs = ump->um_fs;
 	/* Cheap in-core check: the cached maximum rules this group out. */
 	if (fs->fs_maxcluster[cg] < len)
 		return (0);
 	UFS_UNLOCK(ump);
 	if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) {
 		UFS_LOCK(ump);
 		return (0);
 	}
 	/*
 	 * Check to see if a cluster of the needed size (or bigger) is
 	 * available in this cylinder group.
 	 */
 	lp = &cg_clustersum(cgp)[len];
 	for (i = len; i <= fs->fs_contigsumsize; i++)
 		if (*lp++ > 0)
 			break;
 	if (i > fs->fs_contigsumsize) {
 		/*
 		 * This is the first time looking for a cluster in this
 		 * cylinder group. Update the cluster summary information
 		 * to reflect the true maximum sized cluster so that
 		 * future cluster allocation requests can avoid reading
 		 * the cylinder group map only to find no clusters.
 		 */
 		lp = &cg_clustersum(cgp)[len - 1];
 		for (i = len - 1; i > 0; i--)
 			if (*lp-- > 0)
 				break;
 		UFS_LOCK(ump);
 		fs->fs_maxcluster[cg] = i;
 		brelse(bp);
 		return (0);
 	}
 	/*
 	 * Search the cluster map to find a big enough cluster.
 	 * We take the first one that we find, even if it is larger
 	 * than we need as we prefer to get one close to the previous
 	 * block allocation. We do not search before the current
 	 * preference point as we do not want to allocate a block
 	 * that is allocated before the previous one (as we will
 	 * then have to wait for another pass of the elevator
 	 * algorithm before it will be read). We prefer to fail and
 	 * be recalled to try an allocation in the next cylinder group.
 	 */
 	if (dtog(fs, bpref) != cg)
 		bpref = cgdata(fs, cg);
 	else
 		bpref = blknum(fs, bpref);
 	/* From here on bpref is a block index within the cylinder group. */
 	bpref = fragstoblks(fs, dtogd(fs, bpref));
 	mapp = &cg_clustersfree(cgp)[bpref / NBBY];
 	map = *mapp++;
 	bit = 1 << (bpref % NBBY);
 	/*
 	 * Walk the map one bit per block: run counts the consecutive set
 	 * bits seen so far and got is the index of the block being tested.
 	 */
 	for (run = 0, got = bpref; got < cgp->cg_nclusterblks; got++) {
 		if ((map & bit) == 0) {
 			run = 0;
 		} else {
 			run++;
 			if (run == len)
 				break;
 		}
 		/* Advance to the next bit, loading the next map byte as needed. */
 		if ((got & (NBBY - 1)) != (NBBY - 1)) {
 			bit <<= 1;
 		} else {
 			map = *mapp++;
 			bit = 1;
 		}
 	}
 	if (got >= cgp->cg_nclusterblks) {
 		UFS_LOCK(ump);
 		brelse(bp);
 		return (0);
 	}
 	/*
 	 * Allocate the cluster that we have found.
 	 */
 	blksfree = cg_blksfree(cgp);
 	/* The run ends at got, so it covers blocks got-run+1 .. got. */
 	for (i = 1; i <= len; i++)
 		if (!ffs_isblock(fs, blksfree, got - run + i))
 			panic("ffs_clusteralloc: map mismatch");
 	bno = cgbase(fs, cg) + blkstofrags(fs, got - run + 1);
 	if (dtog(fs, bno) != cg)
 		panic("ffs_clusteralloc: allocated out of group");
 	/* len is converted from blocks to fragments for the loop below. */
 	len = blkstofrags(fs, len);
 	UFS_LOCK(ump);
 	for (i = 0; i < len; i += fs->fs_frag)
 		if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i)
 			panic("ffs_clusteralloc: lost block");
 	ACTIVECLEAR(fs, cg);
 	UFS_UNLOCK(ump);
 	bdwrite(bp);
 	return (bno);
 }
 
 /*
  * Get the fs_bsize buffer on the inode's device vnode for the inode
  * block that holds inode index cginoblk of cylinder group cg.  gbflags
  * is passed through to getblk().
  */
 static inline struct buf *
 getinobuf(struct inode *ip, u_int cg, u_int32_t cginoblk, int gbflags)
 {
 	struct fs *fs = ITOFS(ip);
 	struct vnode *devvp = ITODEVVP(ip);
 
 	return (getblk(devvp,
 	    fsbtodb(fs, ino_to_fsba(fs, cg * fs->fs_ipg + cginoblk)),
 	    (int)fs->fs_bsize, 0, 0, gbflags));
 }
 
 /*
  * Selects how ffs_nodealloccg() writes a freshly initialized inode
  * block: non-zero issues it with babarrierwrite(), zero uses bwrite().
  *
  * Synchronous inode initialization is needed only when barrier writes do not
  * work as advertised, and will impose a heavy cost on file creation in a newly
  * created filesystem.
  */
 static int doasyncinodeinit = 1;
 SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncinodeinit, CTLFLAG_RWTUN,
     &doasyncinodeinit, 0,
     "Perform inode block initialization using asynchronous writes");
 
 /*
  * Determine whether an inode can be allocated.
  *
  * Check to see if an inode is available, and if it is,
  * allocate it using the following policy:
  *   1) allocate the requested inode.
  *   2) allocate the next available inode after the requested
  *      inode in the specified cylinder group.
  *
  * For UFS2 the inode block covering the chosen inode is initialized
  * on first use, which requires dropping the cylinder group buffer and
  * retrying.
  *
  * The UFS mutex is dropped around the cylinder group read.  On success
  * the inode number is returned with the mutex released; on failure 0
  * is returned with the mutex held.  The last argument is unused.
  */
 static ufs2_daddr_t
 ffs_nodealloccg(ip, cg, ipref, mode, unused)
 	struct inode *ip;
 	u_int cg;
 	ufs2_daddr_t ipref;
 	int mode;
 	int unused;
 {
 	struct fs *fs;
 	struct cg *cgp;
 	struct buf *bp, *ibp;
 	struct ufsmount *ump;
 	u_int8_t *inosused, *loc;
 	struct ufs2_dinode *dp2;
 	int error, start, len, i;
 	u_int32_t old_initediblk;
 
 	ump = ITOUMP(ip);
 	fs = ump->um_fs;
 check_nifree:
 	if (fs->fs_cs(fs, cg).cs_nifree == 0)
 		return (0);
 	UFS_UNLOCK(ump);
 	if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) {
 		UFS_LOCK(ump);
 		return (0);
 	}
 restart:
 	if (cgp->cg_cs.cs_nifree == 0) {
 		brelse(bp);
 		UFS_LOCK(ump);
 		return (0);
 	}
 	inosused = cg_inosused(cgp);
 	if (ipref) {
 		ipref %= fs->fs_ipg;
 		if (isclr(inosused, ipref))
 			goto gotit;
 	}
 	/*
 	 * Scan the in-use map for a byte with a clear bit, first from the
 	 * rotor to the end of the map and then from the start of the map
 	 * through the rotor's byte.
 	 */
 	start = cgp->cg_irotor / NBBY;
 	len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY);
 	loc = memcchr(&inosused[start], 0xff, len);
 	if (loc == NULL) {
 		len = start + 1;
 		start = 0;
 		loc = memcchr(&inosused[start], 0xff, len);
 		if (loc == NULL) {
 			printf("cg = %d, irotor = %ld, fs = %s\n",
 			    cg, (long)cgp->cg_irotor, fs->fs_fsmnt);
 			panic("ffs_nodealloccg: map corrupted");
 			/* NOTREACHED */
 		}
 	}
 	ipref = (loc - inosused) * NBBY + ffs(~*loc) - 1;
 gotit:
 	/*
 	 * Check to see if we need to initialize more inodes.
 	 */
 	if (fs->fs_magic == FS_UFS2_MAGIC &&
 	    ipref + INOPB(fs) > cgp->cg_initediblk &&
 	    cgp->cg_initediblk < cgp->cg_niblk) {
 		old_initediblk = cgp->cg_initediblk;
 
 		/*
 		 * Free the cylinder group lock before writing the
 		 * initialized inode block.  Entering the
 		 * babarrierwrite() with the cylinder group lock
 		 * causes lock order violation between the lock and
 		 * snaplk.
 		 *
 		 * Another thread can decide to initialize the same
 		 * inode block, but whichever thread first gets the
 		 * cylinder group lock after writing the newly
 		 * allocated inode block will update it and the other
 		 * will realize that it has lost and leave the
 		 * cylinder group unchanged.
 		 */
 		ibp = getinobuf(ip, cg, old_initediblk, GB_LOCK_NOWAIT);
 		brelse(bp);
 		if (ibp == NULL) {
 			/*
 			 * The inode block buffer is already owned by
 			 * another thread, which must initialize it.
 			 * Wait on the buffer to allow another thread
 			 * to finish the updates, with dropped cg
 			 * buffer lock, then retry.
 			 */
 			ibp = getinobuf(ip, cg, old_initediblk, 0);
 			brelse(ibp);
 			UFS_LOCK(ump);
 			goto check_nifree;
 		}
 		bzero(ibp->b_data, (int)fs->fs_bsize);
 		dp2 = (struct ufs2_dinode *)(ibp->b_data);
 		/* Give every new inode a non-zero generation number. */
 		for (i = 0; i < INOPB(fs); i++) {
 			while (dp2->di_gen == 0)
 				dp2->di_gen = arc4random();
 			dp2++;
 		}
 
 		/*
 		 * Rather than adding a soft updates dependency to ensure
 		 * that the new inode block is written before it is claimed
 		 * by the cylinder group map, we just do a barrier write
 		 * here. The barrier write will ensure that the inode block
 		 * gets written before the updated cylinder group map can be
 		 * written. The barrier write should only slow down bulk
 		 * loading of newly created filesystems.
 		 */
 		if (doasyncinodeinit)
 			babarrierwrite(ibp);
 		else
 			bwrite(ibp);
 
 		/*
 		 * After the inode block is written, try to update the
 		 * cg initediblk pointer.  If another thread beat us
 		 * to it, then leave it unchanged as the other thread
 		 * has already set it correctly.
 		 */
 		error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp);
 		UFS_LOCK(ump);
 		ACTIVECLEAR(fs, cg);
 		if (error != 0) {
 			/*
 			 * Report failure the same way as every other
 			 * failure path: 0 with the UFS mutex held.
 			 * Returning the errno value would be taken by
 			 * ffs_hashalloc() as an allocated inode number.
 			 */
 			return (0);
 		}
 		UFS_UNLOCK(ump);
 		if (cgp->cg_initediblk == old_initediblk)
 			cgp->cg_initediblk += INOPB(fs);
 		goto restart;
 	}
 	cgp->cg_irotor = ipref;
 	UFS_LOCK(ump);
 	ACTIVECLEAR(fs, cg);
 	setbit(inosused, ipref);
 	cgp->cg_cs.cs_nifree--;
 	fs->fs_cstotal.cs_nifree--;
 	fs->fs_cs(fs, cg).cs_nifree--;
 	fs->fs_fmod = 1;
 	if ((mode & IFMT) == IFDIR) {
 		cgp->cg_cs.cs_ndir++;
 		fs->fs_cstotal.cs_ndir++;
 		fs->fs_cs(fs, cg).cs_ndir++;
 	}
 	UFS_UNLOCK(ump);
 	if (DOINGSOFTDEP(ITOV(ip)))
 		softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref, mode);
 	bdwrite(bp);
 	return ((ino_t)(cg * fs->fs_ipg + ipref));
 }
 
 /*
  * Free a block or fragment.
  *
  * The specified block or fragment is placed back in the
  * free map. If a fragment is deallocated, a possible
  * block reassembly is checked.
  */
 static void
 ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd)
 	struct ufsmount *ump;
 	struct fs *fs;
 	struct vnode *devvp;
 	ufs2_daddr_t bno;
 	long size;
 	ino_t inum;
 	struct workhead *dephd;
 {
 	struct mount *mp;
 	struct cg *cgp;
 	struct buf *bp;
 	daddr_t dbn;
 	ufs1_daddr_t fragno, cgbno;
 	int i, blk, frags, bbase, error;
 	u_int cg;
 	u_int8_t *blksfree;
 	struct cdev *dev;
 
 	cg = dtog(fs, bno);
 	if (devvp->v_type == VREG) {
 		/* devvp is a snapshot */
 		MPASS(devvp->v_mount->mnt_data == ump);
 		dev = ump->um_devvp->v_rdev;
 	} else if (devvp->v_type == VCHR) {
 		/* devvp is a normal disk device */
 		dev = devvp->v_rdev;
 		ASSERT_VOP_LOCKED(devvp, "ffs_blkfree_cg");
 	} else
 		return;
 #ifdef INVARIANTS
 	if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 ||
 	    fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) {
 		printf("dev=%s, bno = %jd, bsize = %ld, size = %ld, fs = %s\n",
 		    devtoname(dev), (intmax_t)bno, (long)fs->fs_bsize,
 		    size, fs->fs_fsmnt);
 		panic("ffs_blkfree_cg: bad size");
 	}
 #endif
 	if ((u_int)bno >= fs->fs_size) {
 		printf("bad block %jd, ino %lu\n", (intmax_t)bno,
 		    (u_long)inum);
 		ffs_fserr(fs, inum, "bad block");
 		return;
 	}
 	if ((error = ffs_getcg(fs, devvp, cg, GB_CVTENXIO, &bp, &cgp)) != 0) {
 		if (!ffs_fsfail_cleanup(ump, error) ||
 		    !MOUNTEDSOFTDEP(UFSTOVFS(ump)) || devvp->v_type != VCHR)
 			return;
 		if (devvp->v_type == VREG)
 			dbn = fragstoblks(fs, cgtod(fs, cg));
 		else
 			dbn = fsbtodb(fs, cgtod(fs, cg));
 		error = getblkx(devvp, dbn, dbn, fs->fs_cgsize, 0, 0, 0, &bp);
 		KASSERT(error == 0, ("getblkx failed"));
 		softdep_setup_blkfree(UFSTOVFS(ump), bp, bno,
 		    numfrags(fs, size), dephd);
 		bp->b_flags |= B_RELBUF | B_NOCACHE;
 		bp->b_flags &= ~B_CACHE;
 		bawrite(bp);
 		return;
 	}
 	cgbno = dtogd(fs, bno);
 	blksfree = cg_blksfree(cgp);
 	UFS_LOCK(ump);
 	if (size == fs->fs_bsize) {
 		fragno = fragstoblks(fs, cgbno);
 		if (!ffs_isfreeblock(fs, blksfree, fragno)) {
 			if (devvp->v_type == VREG) {
 				UFS_UNLOCK(ump);
 				/* devvp is a snapshot */
 				brelse(bp);
 				return;
 			}
 			printf("dev = %s, block = %jd, fs = %s\n",
 			    devtoname(dev), (intmax_t)bno, fs->fs_fsmnt);
 			panic("ffs_blkfree_cg: freeing free block");
 		}
 		ffs_setblock(fs, blksfree, fragno);
 		ffs_clusteracct(fs, cgp, fragno, 1);
 		cgp->cg_cs.cs_nbfree++;
 		fs->fs_cstotal.cs_nbfree++;
 		fs->fs_cs(fs, cg).cs_nbfree++;
 	} else {
 		bbase = cgbno - fragnum(fs, cgbno);
 		/*
 		 * decrement the counts associated with the old frags
 		 */
 		blk = blkmap(fs, blksfree, bbase);
 		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
 		/*
 		 * deallocate the fragment
 		 */
 		frags = numfrags(fs, size);
 		for (i = 0; i < frags; i++) {
 			if (isset(blksfree, cgbno + i)) {
 				printf("dev = %s, block = %jd, fs = %s\n",
 				    devtoname(dev), (intmax_t)(bno + i),
 				    fs->fs_fsmnt);
 				panic("ffs_blkfree_cg: freeing free frag");
 			}
 			setbit(blksfree, cgbno + i);
 		}
 		cgp->cg_cs.cs_nffree += i;
 		fs->fs_cstotal.cs_nffree += i;
 		fs->fs_cs(fs, cg).cs_nffree += i;
 		/*
 		 * add back in counts associated with the new frags
 		 */
 		blk = blkmap(fs, blksfree, bbase);
 		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
 		/*
 		 * if a complete block has been reassembled, account for it
 		 */
 		fragno = fragstoblks(fs, bbase);
 		if (ffs_isblock(fs, blksfree, fragno)) {
 			cgp->cg_cs.cs_nffree -= fs->fs_frag;
 			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
 			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
 			ffs_clusteracct(fs, cgp, fragno, 1);
 			cgp->cg_cs.cs_nbfree++;
 			fs->fs_cstotal.cs_nbfree++;
 			fs->fs_cs(fs, cg).cs_nbfree++;
 		}
 	}
 	fs->fs_fmod = 1;
 	ACTIVECLEAR(fs, cg);
 	UFS_UNLOCK(ump);
 	mp = UFSTOVFS(ump);
 	if (MOUNTEDSOFTDEP(mp) && devvp->v_type == VCHR)
 		softdep_setup_blkfree(UFSTOVFS(ump), bp, bno,
 		    numfrags(fs, size), dephd);
 	bdwrite(bp);
 }
 
 /*
  * Structures and routines associated with trim management.
  *
  * The following requests are passed to trim_lookup() as its alloctype
  * argument to indicate the action that should be taken.
  */
 #define	NEW	1	/* if found, error else allocate and hash it */
 #define	OLD	2	/* if not found, error, else return it */
 #define	REPLACE	3	/* if not found, error else unhash and reallocate it */
 #define	DONE	4	/* if not found, error else unhash and return it */
 #define	SINGLE	5	/* don't look up, just allocate it and don't hash it */
 
 /*
  * Malloc type used for trim requests, their block requests and the
  * buffer headers that carry the BIO_DELETE.
  */
 MALLOC_DEFINE(M_TRIM, "ufs_trim", "UFS trim structures");
 
 /*
  * Select the hash chain for key in the mount's trim hash table.  The
  * key is masked with um_trimlisthashsize, so that field is presumably
  * a size-minus-one mask -- confirm where the table is created.
  */
 #define	TRIMLIST_HASH(ump, key) \
 	(&(ump)->um_trimhash[(key) & (ump)->um_trimlisthashsize])
 
 /*
  * These structures describe each of the block free requests aggregated
  * together to make up a trim request.  One is created per ffs_blkfree()
  * call that is deferred behind a trim and is consumed by
  * ffs_blkfree_trim_task().
  */
 struct trim_blkreq {
 	/* Linkage on the owning trim request's blklist. */
 	TAILQ_ENTRY(trim_blkreq) blkreqlist;
 	/* Address of the block or fragment to free. */
 	ufs2_daddr_t bno;
 	/* Number of bytes to free. */
 	long size;
 	/* NULL when the caller passed no dependency list, else &dephd. */
 	struct workhead *pdephd;
 	/* Dependency list swapped over from the caller's list. */
 	struct workhead dephd;
 };
 
 /*
  * Description of a trim request: one contiguous range of blocks that
  * is sent to the device as a single BIO_DELETE, plus the individual
  * block free requests that are carried out once it completes.
  */
 struct ffs_blkfree_trim_params {
 	/* Block free requests that make up this range. */
 	TAILQ_HEAD(, trim_blkreq) blklist;
 	/* Linkage on the mount's trim hash chain (unused for SINGLE). */
 	LIST_ENTRY(ffs_blkfree_trim_params) hashlist;
 	/* Task queued on trim completion to run ffs_blkfree_trim_task(). */
 	struct task task;
 	struct ufsmount *ump;
 	struct vnode *devvp;
 	ino_t inum;
 	/* Start of the aggregated range. */
 	ufs2_daddr_t bno;
 	/* Length of the aggregated range in bytes; 0 while still empty. */
 	long size;
 	/* Key that identifies the range being collected. */
 	long key;
 };
 
 static void	ffs_blkfree_trim_completed(struct buf *);
 static void	ffs_blkfree_trim_task(void *ctx, int pending __unused);
 static struct	ffs_blkfree_trim_params *trim_lookup(struct ufsmount *,
 		    struct vnode *, ufs2_daddr_t, long, ino_t, u_long, int);
 static void	ffs_blkfree_sendtrim(struct ffs_blkfree_trim_params *);
 
 /*
  * I/O completion handler for the buffer issued by ffs_blkfree_sendtrim().
  * Releases the buffer header and queues a task on the mount's trim
  * taskqueue to free the associated block(s).
  */
 static void
 ffs_blkfree_trim_completed(struct buf *bp)
 {
 	struct ffs_blkfree_trim_params *req;
 
 	/* Fetch the request before the buffer header is released. */
 	req = bp->b_fsprivate1;
 	free(bp, M_TRIM);
 
 	TASK_INIT(&req->task, 0, ffs_blkfree_trim_task, req);
 	taskqueue_enqueue(req->ump->um_trim_tq, &req->task);
 }
 
 /*
  * Trim completion task that free associated block(s).
  */
 static void
 ffs_blkfree_trim_task(ctx, pending)
 	void *ctx;
 	int pending;
 {
 	struct ffs_blkfree_trim_params *tp;
 	struct trim_blkreq *blkelm;
 	struct ufsmount *ump;
 
 	tp = ctx;
 	ump = tp->ump;
 	while ((blkelm = TAILQ_FIRST(&tp->blklist)) != NULL) {
 		ffs_blkfree_cg(ump, ump->um_fs, tp->devvp, blkelm->bno,
 		    blkelm->size, tp->inum, blkelm->pdephd);
 		TAILQ_REMOVE(&tp->blklist, blkelm, blkreqlist);
 		free(blkelm, M_TRIM);
 	}
 	vn_finished_secondary_write(UFSTOVFS(ump));
 	UFS_LOCK(ump);
 	ump->um_trim_inflight -= 1;
 	ump->um_trim_inflight_blks -= numfrags(ump->um_fs, tp->size);
 	UFS_UNLOCK(ump);
 	free(tp, M_TRIM);
 }
 
 /*
  * Lookup a trim request by key.
  * Allocate if requested (NEW, REPLACE, SINGLE).
  *
  * alloctype selects the action:
  *   NEW      no entry may exist; a new one is allocated and hashed.
  *   OLD      the entry must exist and is returned, still hashed.
  *   REPLACE  the entry must exist; it is unhashed (the caller still
  *            holds it and disposes of it) and a new one is allocated,
  *            hashed and returned.
  *   DONE     the entry must exist; it is unhashed and returned.
  *   SINGLE   no lookup; a new, unhashed entry is returned.
  *
  * A newly allocated entry has an empty block list and takes ump,
  * devvp, bno, size, inum and key from the arguments.
  */
 static struct ffs_blkfree_trim_params *
 trim_lookup(ump, devvp, bno, size, inum, key, alloctype)
 	struct ufsmount *ump;
 	struct vnode *devvp;
 	ufs2_daddr_t bno;
 	long size;
 	ino_t inum;
 	u_long key;
 	int alloctype;
 {
 	struct trimlist_hashhead *tphashhead;
 	struct ffs_blkfree_trim_params *tp, *ntp;
 
 	tp = NULL;
 	ntp = NULL;
 	tphashhead = NULL;
 	/*
 	 * OLD and DONE only hand back an existing entry, so they need no
 	 * new one; skipping the allocation avoids a malloc/free pair on
 	 * every such lookup.  For the other types the allocation is done
 	 * before the UFS mutex is taken (an M_WAITOK allocation may
 	 * sleep; see malloc(9)).
 	 */
 	if (alloctype != OLD && alloctype != DONE)
 		ntp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TRIM,
 		    M_WAITOK);
 	if (alloctype != SINGLE) {
 		KASSERT(key >= FIRST_VALID_KEY, ("trim_lookup: invalid key"));
 		UFS_LOCK(ump);
 		tphashhead = TRIMLIST_HASH(ump, key);
 		LIST_FOREACH(tp, tphashhead, hashlist)
 			if (key == tp->key)
 				break;
 	}
 	switch (alloctype) {
 	case NEW:
 		KASSERT(tp == NULL, ("trim_lookup: found trim"));
 		break;
 	case OLD:
 		KASSERT(tp != NULL,
 		    ("trim_lookup: missing call to ffs_blkrelease_start()"));
 		UFS_UNLOCK(ump);
 		return (tp);
 	case REPLACE:
 		KASSERT(tp != NULL, ("trim_lookup: missing REPLACE trim"));
 		LIST_REMOVE(tp, hashlist);
 		/* tp will be freed by caller */
 		break;
 	case DONE:
 		KASSERT(tp != NULL, ("trim_lookup: missing DONE trim"));
 		LIST_REMOVE(tp, hashlist);
 		UFS_UNLOCK(ump);
 		return (tp);
 	}
 	TAILQ_INIT(&ntp->blklist);
 	ntp->ump = ump;
 	ntp->devvp = devvp;
 	ntp->bno = bno;
 	ntp->size = size;
 	ntp->inum = inum;
 	ntp->key = key;
 	if (alloctype != SINGLE) {
 		LIST_INSERT_HEAD(tphashhead, ntp, hashlist);
 		UFS_UNLOCK(ump);
 	}
 	return (ntp);
 }
 
 /*
  * Dispatch a trim request.
  *
  * A BIO_DELETE covering the range described by tp is issued to the
  * device.  The blocks themselves are freed later, from
  * ffs_blkfree_trim_completed() and the task it queues.
  */
 static void
 ffs_blkfree_sendtrim(struct ffs_blkfree_trim_params *tp)
 {
 	struct ufsmount *ump = tp->ump;
 	struct mount *mp;
 	struct buf *delbp;
 
 	/*
 	 * Postpone the set of the free bit in the cg bitmap until the
 	 * BIO_DELETE is completed.  Otherwise, due to disk queue
 	 * reordering, TRIM might be issued after we reuse the block
 	 * and write some new data into it.
 	 */
 	delbp = malloc(sizeof(*delbp), M_TRIM, M_WAITOK | M_ZERO);
 	delbp->b_fsprivate1 = tp;
 	delbp->b_iodone = ffs_blkfree_trim_completed;
 	delbp->b_iocmd = BIO_DELETE;
 	delbp->b_iooffset = dbtob(fsbtodb(ump->um_fs, tp->bno));
 	delbp->b_bcount = tp->size;
 
 	/* Account for the trim before it is issued. */
 	UFS_LOCK(ump);
 	ump->um_trim_total += 1;
 	ump->um_trim_inflight += 1;
 	ump->um_trim_inflight_blks += numfrags(ump->um_fs, tp->size);
 	ump->um_trim_total_blks += numfrags(ump->um_fs, tp->size);
 	UFS_UNLOCK(ump);
 
 	mp = UFSTOVFS(ump);
 	vn_start_secondary_write(NULL, &mp, 0);
 	g_vfs_strategy(ump->um_bo, delbp);
 }
 
 /*
  * Allocate a new key to use to identify a range of blocks.
  *
  * When the mount cannot delete or dotrimcons is off, SINGLETON_KEY is
  * returned and nothing is registered.  Otherwise a fresh key is drawn
  * from a shared counter, an empty trim request is hashed under it, and
  * the key is returned.
  */
 u_long
 ffs_blkrelease_start(struct ufsmount *ump, struct vnode *devvp, ino_t inum)
 {
 	static u_long masterkey;
 	u_long newkey;
 
 	if ((ump->um_flags & UM_CANDELETE) == 0 || dotrimcons == 0)
 		return (SINGLETON_KEY);
 
 	/* Skip counter values below FIRST_VALID_KEY. */
 	for (;;) {
 		newkey = atomic_fetchadd_long(&masterkey, 1);
 		if (newkey >= FIRST_VALID_KEY)
 			break;
 	}
 	(void) trim_lookup(ump, devvp, 0, 0, inum, newkey, NEW);
 	return (newkey);
 }
 
 /*
  * Deallocate a key that has been used to identify a range of blocks.
  *
  * The trim request hashed under key is unhashed.  If blocks were
  * collected in it, it is issued; otherwise it is simply freed.
  */
 void
 ffs_blkrelease_finish(struct ufsmount *ump, u_long key)
 {
 	struct ffs_blkfree_trim_params *range;
 
 	if ((ump->um_flags & UM_CANDELETE) == 0 || dotrimcons == 0)
 		return;
 	/*
 	 * The vfs.ffs.dotrimcons sysctl may be switched on while a file
 	 * deletion is in progress, that is, between the call to
 	 * ffs_blkrelease_start() and this call.  In that case
 	 * ffs_blkrelease_start() handed out SINGLETON_KEY and started
 	 * no collection sequence, so there is nothing to finish.
 	 */
 	if (key == SINGLETON_KEY) {
 #ifdef INVARIANTS
 		printf("%s: vfs.ffs.dotrimcons enabled on active filesystem\n",
 		    ump->um_mountp->mnt_stat.f_mntonname);
 #endif
 		return;
 	}
 	/*
 	 * No more blocks will be sent under this key.  The DONE lookup
 	 * unhashes the request.  A size of zero means the key was never
 	 * used, so the request is just freed; otherwise
 	 * ffs_blkfree_sendtrim() issues the collected range (and the
 	 * request is freed once the trim has been processed).
 	 */
 	range = trim_lookup(ump, NULL, 0, 0, 0, key, DONE);
 	if (range->size != 0) {
 		ffs_blkfree_sendtrim(range);
 		return;
 	}
 	free(range, M_TRIM);
 }
 
 /*
  * Setup to free a block or fragment.
  *
  * Check for snapshots that might want to claim the block.
  * If trims are requested, prepare a trim request. Attempt to
  * aggregate consecutive blocks into a single trim request.
  */
 void
 ffs_blkfree(struct ufsmount *ump, struct fs *fs, struct vnode *devvp,
     ufs2_daddr_t bno, long size, ino_t inum, enum vtype vtype,
     struct workhead *dephd, u_long key)
 {
 	struct ffs_blkfree_trim_params *cur, *fresh;
 	struct trim_blkreq *req;
 
 	/*
 	 * Give a snapshot the chance to claim the block.  This applies
 	 * only when devvp is the disk device (not a snapshot) and it
 	 * has snapshots associated with it.
 	 */
 	if (devvp->v_type == VCHR &&
 	    (devvp->v_vflag & VV_COPYONWRITE) != 0 &&
 	    ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, dephd))
 		return;
 
 	/*
 	 * Free immediately when no TRIM is wanted for this block, TRIM
 	 * is disabled, or the operation is performed on a snapshot.
 	 */
 	if (key == NOTRIM_KEY || (ump->um_flags & UM_CANDELETE) == 0 ||
 	    devvp->v_type == VREG) {
 		ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd);
 		return;
 	}
 
 	/* Record this free so it can be carried out after the trim. */
 	req = malloc(sizeof(struct trim_blkreq), M_TRIM, M_WAITOK);
 	req->bno = bno;
 	req->size = size;
 	if (dephd != NULL) {
 		LIST_INIT(&req->dephd);
 		LIST_SWAP(dephd, &req->dephd, worklist, wk_list);
 		req->pdephd = &req->dephd;
 	} else {
 		req->pdephd = NULL;
 	}
 
 	if (key == SINGLETON_KEY) {
 		/*
 		 * Just a single non-contiguous piece.  The SINGLE
 		 * alloctype returns a trim request that is not hashed
 		 * for future lookup; send it right away.
 		 */
 		cur = trim_lookup(ump, devvp, bno, size, inum, key, SINGLE);
 		TAILQ_INSERT_HEAD(&cur->blklist, req, blkreqlist);
 		ffs_blkfree_sendtrim(cur);
 		return;
 	}
 
 	/*
 	 * Callers do not track whether the blocks they free are
 	 * contiguous; they only say that they are freeing a set of
 	 * blocks.  The contiguous pieces are worked out here, using
 	 * the entry that ffs_blkrelease_start() created for this key.
 	 */
 	cur = trim_lookup(ump, devvp, bno, size, inum, key, OLD);
 	if (cur->size == 0) {
 		/* First block of a potential range. */
 		cur->bno = bno;
 		cur->size = size;
 		TAILQ_INSERT_HEAD(&cur->blklist, req, blkreqlist);
 		return;
 	}
 
 	if (bno + numfrags(fs, size) == cur->bno) {
 		/* The block immediately precedes the range: prepend it. */
 		TAILQ_INSERT_HEAD(&cur->blklist, req, blkreqlist);
 		cur->bno = bno;
 		cur->size += size;
 	} else if (bno == cur->bno + numfrags(fs, cur->size)) {
 		/* The block immediately follows the range: append it. */
 		TAILQ_INSERT_TAIL(&cur->blklist, req, blkreqlist);
 		cur->size += size;
 	} else {
 		/*
 		 * Not a continuation of the range being built.  The
 		 * REPLACE alloctype unhashes the old request (still in
 		 * cur) and starts a new range (in fresh) with this
 		 * block; the old range is then issued by
 		 * ffs_blkfree_sendtrim() (and cur freed afterwards).
 		 */
 		fresh = trim_lookup(ump, devvp, bno, size, inum, key, REPLACE);
 		TAILQ_INSERT_HEAD(&fresh->blklist, req, blkreqlist);
 		ffs_blkfree_sendtrim(cur);
 	}
 }
 
 #ifdef INVARIANTS
 /*
  * Verify allocation of a block or fragment. Returns true if block or
  * fragment is allocated, false if it is free.
  *
  * Panics on a bad size, an out-of-range block number, a cylinder group
  * read failure, or a fragment range that is only partially free.
  */
 static int
 ffs_checkblk(ip, bno, size)
 	struct inode *ip;
 	ufs2_daddr_t bno;
 	long size;
 {
 	struct fs *fs;
 	struct cg *cgp;
 	struct buf *bp;
 	ufs1_daddr_t cgbno;
 	int i, error, frags, free;
 	u_int8_t *blksfree;
 
 	fs = ITOFS(ip);
 	/*
 	 * Check size and bno at their full width; casting them to u_int
 	 * first would let an out-of-range value wrap into the valid
 	 * range.
 	 */
 	if (size < 0 || size > fs->fs_bsize || fragoff(fs, size) != 0) {
 		printf("bsize = %ld, size = %ld, fs = %s\n",
 		    (long)fs->fs_bsize, size, fs->fs_fsmnt);
 		panic("ffs_checkblk: bad size");
 	}
 	if (bno < 0 || bno >= fs->fs_size)
 		panic("ffs_checkblk: bad block %jd", (intmax_t)bno);
 	error = ffs_getcg(fs, ITODEVVP(ip), dtog(fs, bno), 0, &bp, &cgp);
 	if (error)
 		panic("ffs_checkblk: cylinder group read failed");
 	blksfree = cg_blksfree(cgp);
 	cgbno = dtogd(fs, bno);
 	if (size == fs->fs_bsize) {
 		free = ffs_isblock(fs, blksfree, fragstoblks(fs, cgbno));
 	} else {
 		/* Count the free fragments; they must be all or none. */
 		frags = numfrags(fs, size);
 		for (free = 0, i = 0; i < frags; i++)
 			if (isset(blksfree, cgbno + i))
 				free++;
 		if (free != 0 && free != frags)
 			panic("ffs_checkblk: partially free fragment");
 	}
 	brelse(bp);
 	return (!free);
 }
 #endif /* INVARIANTS */
 
 /*
  * Free an inode.
  *
  * With soft updates active on pvp the release is handed to
  * softdep_freefile(); otherwise the inode is freed directly through
  * ffs_freefile() on the mount's device vnode.
  */
 int
 ffs_vfree(struct vnode *pvp, ino_t ino, int mode)
 {
 	struct ufsmount *ump;
 
 	if (!DOINGSOFTDEP(pvp)) {
 		ump = VFSTOUFS(pvp->v_mount);
 		return (ffs_freefile(ump, ump->um_fs, ump->um_devvp, ino,
 		    mode, NULL));
 	}
 	softdep_freefile(pvp, ino, mode);
 	return (0);
 }
 
 /*
  * Do the actual free operation.
  * The specified inode is placed back in the free map.
  *
  * devvp is either the disk device (VCHR) or a snapshot file (VREG);
  * for any other vnode type nothing is done and 0 is returned.  mode is
  * used only to tell whether a directory is being freed.  wkhd is
  * handed to softdep_setup_inofree() when soft updates are active and
  * devvp is the disk device.
  *
  * Returns 0 on success or the error from reading the cylinder group.
  */
 int
 ffs_freefile(ump, fs, devvp, ino, mode, wkhd)
 	struct ufsmount *ump;
 	struct fs *fs;
 	struct vnode *devvp;
 	ino_t ino;
 	int mode;
 	struct workhead *wkhd;
 {
 	struct cg *cgp;
 	struct buf *bp;
 	daddr_t dbn;
 	int error;
 	u_int cg;
 	u_int8_t *inosused;
 	struct cdev *dev;
 	ino_t cgino;
 
 	cg = ino_to_cg(fs, ino);
 	if (devvp->v_type == VREG) {
 		/* devvp is a snapshot */
 		MPASS(devvp->v_mount->mnt_data == ump);
 		dev = ump->um_devvp->v_rdev;
 	} else if (devvp->v_type == VCHR) {
 		/* devvp is a normal disk device */
 		dev = devvp->v_rdev;
 	} else {
 		return (0);
 	}
 	if (ino >= fs->fs_ipg * fs->fs_ncg)
 		panic("ffs_freefile: range: dev = %s, ino = %ju, fs = %s",
 		    devtoname(dev), (uintmax_t)ino, fs->fs_fsmnt);
 	if ((error = ffs_getcg(fs, devvp, cg, GB_CVTENXIO, &bp, &cgp)) != 0) {
 		if (!ffs_fsfail_cleanup(ump, error) ||
 		    !MOUNTEDSOFTDEP(UFSTOVFS(ump)) || devvp->v_type != VCHR)
 			return (error);
 		/*
 		 * Only the disk device (VCHR) gets this far, so the block
 		 * number is always a device block number.  A buffer is
 		 * set up so the soft updates work can be attached and
 		 * processed, then it is written and discarded.
 		 */
 		dbn = fsbtodb(fs, cgtod(fs, cg));
 		error = getblkx(devvp, dbn, dbn, fs->fs_cgsize, 0, 0, 0, &bp);
 		KASSERT(error == 0, ("getblkx failed"));
 		softdep_setup_inofree(UFSTOVFS(ump), bp, ino, wkhd);
 		bp->b_flags |= B_RELBUF | B_NOCACHE;
 		bp->b_flags &= ~B_CACHE;
 		bawrite(bp);
 		return (error);
 	}
 	inosused = cg_inosused(cgp);
 	cgino = ino % fs->fs_ipg;
 	if (isclr(inosused, cgino)) {
 		printf("dev = %s, ino = %ju, fs = %s\n", devtoname(dev),
 		    (uintmax_t)ino, fs->fs_fsmnt);
 		if (fs->fs_ronly == 0)
 			panic("ffs_freefile: freeing free inode");
 	}
 	clrbit(inosused, cgino);
 	/* Pull the rotor back so the freed inode is found again. */
 	if (cgino < cgp->cg_irotor)
 		cgp->cg_irotor = cgino;
 	cgp->cg_cs.cs_nifree++;
 	UFS_LOCK(ump);
 	fs->fs_cstotal.cs_nifree++;
 	fs->fs_cs(fs, cg).cs_nifree++;
 	if ((mode & IFMT) == IFDIR) {
 		cgp->cg_cs.cs_ndir--;
 		fs->fs_cstotal.cs_ndir--;
 		fs->fs_cs(fs, cg).cs_ndir--;
 	}
 	fs->fs_fmod = 1;
 	ACTIVECLEAR(fs, cg);
 	UFS_UNLOCK(ump);
 	if (MOUNTEDSOFTDEP(UFSTOVFS(ump)) && devvp->v_type == VCHR)
 		softdep_setup_inofree(UFSTOVFS(ump), bp, ino, wkhd);
 	bdwrite(bp);
 	return (0);
 }
 
 /*
  * Check to see if a file is free.
  * Used to check for allocated files in snapshots.
  *
  * Returns nonzero when the inode's bit is clear in the cylinder group's
  * inode-used map.  Also returns 1 when devvp is neither VREG nor VCHR,
  * when ino is out of range, or when the cylinder group cannot be fetched.
  */
 int
 ffs_checkfreefile(fs, devvp, ino)
 	struct fs *fs;
 	struct vnode *devvp;
 	ino_t ino;
 {
 	struct buf *bp;
 	struct cg *cgp;
 	u_int8_t *inomap;
 	ino_t cgino;
 	u_int cg;
 	int isfree;
 
 	cg = ino_to_cg(fs, ino);
 	switch (devvp->v_type) {
 	case VREG:
 	case VCHR:
 		break;
 	default:
 		return (1);
 	}
 	if (ino >= fs->fs_ipg * fs->fs_ncg)
 		return (1);
 	if (ffs_getcg(fs, devvp, cg, 0, &bp, &cgp) != 0)
 		return (1);
 	inomap = cg_inosused(cgp);
 	/* Index of the inode within its cylinder group. */
 	cgino = ino % fs->fs_ipg;
 	isfree = isclr(inomap, cgino);
 	brelse(bp);
 	return (isfree);
 }
 
 /*
  * Find a block of the specified size in the specified cylinder group.
  *
  * It is a panic if a request is made to find a block if none are
  * available.
  */
 static ufs1_daddr_t
 ffs_mapsearch(fs, cgp, bpref, allocsiz)
 	struct fs *fs;
 	struct cg *cgp;
 	ufs2_daddr_t bpref;
 	int allocsiz;
 {
 	ufs1_daddr_t bno;
 	int start, len, loc, i;
 	int blk, field, subfield, pos;
 	u_int8_t *blksfree;
 
 	/*
 	 * find the fragment by searching through the free block
 	 * map for an appropriate bit pattern
 	 */
 	if (bpref)
 		start = dtogd(fs, bpref) / NBBY;
 	else
 		start = cgp->cg_frotor / NBBY;
 	blksfree = cg_blksfree(cgp);
 	len = howmany(fs->fs_fpg, NBBY) - start;
 	loc = scanc((u_int)len, (u_char *)&blksfree[start],
 		fragtbl[fs->fs_frag],
 		(u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
 	if (loc == 0) {
 		len = start + 1;
 		start = 0;
 		loc = scanc((u_int)len, (u_char *)&blksfree[0],
 			fragtbl[fs->fs_frag],
 			(u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
 		if (loc == 0) {
 			printf("start = %d, len = %d, fs = %s\n",
 			    start, len, fs->fs_fsmnt);
 			panic("ffs_alloccg: map corrupted");
 			/* NOTREACHED */
 		}
 	}
 	bno = (start + len - loc) * NBBY;
 	cgp->cg_frotor = bno;
 	/*
 	 * found the byte in the map
 	 * sift through the bits to find the selected frag
 	 */
 	for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
 		blk = blkmap(fs, blksfree, bno);
 		blk <<= 1;
 		field = around[allocsiz];
 		subfield = inside[allocsiz];
 		for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
 			if ((blk & field) == subfield)
 				return (bno + pos);
 			field <<= 1;
 			subfield <<= 1;
 		}
 	}
 	printf("bno = %lu, fs = %s\n", (u_long)bno, fs->fs_fsmnt);
 	panic("ffs_alloccg: block not in map");
 	return (-1);
 }
 
 /*
  * Return the statfs of the filesystem mounted from a device vnode.
  * A vnode that is not VCHR is replaced by the device vnode of the UFS
  * mount it lives on until a VCHR vnode is reached.
  */
 static const struct statfs *
 ffs_getmntstat(struct vnode *devvp)
 {
 
 	while (devvp->v_type != VCHR)
 		devvp = VFSTOUFS(devvp->v_mount)->um_devvp;
 	return (&devvp->v_rdev->si_mountpt->mnt_stat);
 }
 
 /*
  * Fetch and verify a cylinder group.
  *
  * On success returns 0 with *bpp set to the buffer and *cgpp to the
  * cylinder group inside it.  On failure *bpp and *cgpp are NULL and
  * the return value is the read error, or EIO when the check hash,
  * the magic number or the cylinder group index does not match.  A
  * buffer that fails verification is invalidated and released.
  */
 int
 ffs_getcg(fs, devvp, cg, flags, bpp, cgpp)
 	struct fs *fs;
 	struct vnode *devvp;
 	u_int cg;
 	int flags;
 	struct buf **bpp;
 	struct cg **cgpp;
 {
 	const struct statfs *sfs;
 	struct cg *cgp;
 	struct buf *bp;
 	daddr_t blkno;
 	int error;
 
 	*bpp = NULL;
 	*cgpp = NULL;
 	if ((fs->fs_metackhash & CK_CYLGRP) != 0)
 		flags |= GB_CKHASH;
 	/* A VREG devvp is addressed differently from the device itself. */
 	if (devvp->v_type != VREG)
 		blkno = fsbtodb(fs, cgtod(fs, cg));
 	else
 		blkno = fragstoblks(fs, cgtod(fs, cg));
 	error = breadn_flags(devvp, blkno, blkno, (int)fs->fs_cgsize, NULL,
 	    NULL, 0, NOCRED, flags, ffs_ckhash_cg, &bp);
 	if (error != 0)
 		return (error);
 	cgp = (struct cg *)bp->b_data;
 	if ((fs->fs_metackhash & CK_CYLGRP) != 0 &&
 	    (bp->b_flags & B_CKHASH) != 0 &&
 	    cgp->cg_ckhash != bp->b_ckhash) {
 		sfs = ffs_getmntstat(devvp);
 		printf("UFS %s%s (%s) cylinder checksum failed: cg %u, cgp: "
 		    "0x%x != bp: 0x%jx\n",
 		    devvp->v_type == VCHR ? "" : "snapshot of ",
 		    sfs->f_mntfromname, sfs->f_mntonname,
 		    cg, cgp->cg_ckhash, (uintmax_t)bp->b_ckhash);
 		goto invalid;
 	}
 	if (!cg_chkmagic(cgp) || cgp->cg_cgx != cg) {
 		sfs = ffs_getmntstat(devvp);
 		printf("UFS %s%s (%s)",
 		    devvp->v_type == VCHR ? "" : "snapshot of ",
 		    sfs->f_mntfromname, sfs->f_mntonname);
 		if (cg_chkmagic(cgp))
 			printf(": wrong cylinder group cg %u != cgx %u\n", cg,
 			    cgp->cg_cgx);
 		else
 			printf(" cg %u: bad magic number 0x%x should be 0x%x\n",
 			    cg, cgp->cg_magic, CG_MAGIC);
 		goto invalid;
 	}
 	bp->b_flags &= ~B_CKHASH;
 	bp->b_xflags |= BX_BKGRDWRITE;
 	/*
 	 * With cylinder group check hashes enabled the group time is only
 	 * changed when the buffer is actually about to be written, so the
 	 * in-memory check hash stays correct; in that case the buffer is
 	 * tagged BX_CYLGRP and ffs_bufwrite() updates the time as the
 	 * buffer is queued for writing.  Without check hashes the time is
 	 * updated here, as has been done historically.
 	 */
 	if ((fs->fs_metackhash & CK_CYLGRP) == 0)
 		cgp->cg_old_time = cgp->cg_time = time_second;
 	else
 		bp->b_xflags |= BX_CYLGRP;
 	*bpp = bp;
 	*cgpp = cgp;
 	return (0);
 
 invalid:
 	/* Verification failed: throw the buffer away. */
 	bp->b_flags &= ~B_CKHASH;
 	bp->b_flags |= B_INVAL | B_NOCACHE;
 	brelse(bp);
 	return (EIO);
 }
 
 /*
  * Compute the check hash of a cylinder group buffer into bp->b_ckhash.
  * The CRC32C covers b_bcount bytes of the buffer with the stored
  * cg_ckhash field zeroed; the stored value is put back afterwards.
  */
 static void
 ffs_ckhash_cg(bp)
 	struct buf *bp;
 {
 	struct cg *cgp = (struct cg *)bp->b_data;
 	uint32_t saved = cgp->cg_ckhash;
 
 	cgp->cg_ckhash = 0;
 	bp->b_ckhash = calculate_crc32c(~0L, bp->b_data, bp->b_bcount);
 	cgp->cg_ckhash = saved;
 }
 
 /*
  * Fserr logs a filesystem error diagnostic at LOG_ERR.
  *
  * The form of the error message is:
  *	pid P (command), uid U inumber I on fs: error message
  * where the pid, command and uid are those of the current thread,
  * fs is fs->fs_fsmnt and the error message is cp.
  */
 void
 ffs_fserr(fs, inum, cp)
 	struct fs *fs;
 	ino_t inum;
 	char *cp;
 {
 	struct thread *td;
 	struct proc *p;
 
 	td = curthread;		/* XXX */
 	p = td->td_proc;
 	log(LOG_ERR, "pid %d (%s), uid %d inumber %ju on %s: %s\n",
 	    p->p_pid, p->p_comm, td->td_ucred->cr_uid, (uintmax_t)inum,
 	    fs->fs_fsmnt, cp);
 }
 
 /*
  * This function provides the capability for the fsck program to
  * update an active filesystem. Fifteen operations are provided:
  *
  * adjrefcnt(inode, amt) - adjusts the reference count on the
  *	specified inode by the specified amount. Under normal
  *	operation the count should always go down. Decrementing
  *	the count to zero will cause the inode to be freed.
  * adjblkcnt(inode, amt) - adjust the number of blocks used by the
  *	inode by the specified amount.
  * setsize(inode, size) - set the size of the inode to the
  *	specified size.
  * adjndir, adjnbfree, adjnifree, adjnffree, adjnumclusters(amt) -
  *	adjust the superblock summary.
  * freedirs(inode, count) - directory inodes [inode..inode + count - 1]
  *	are marked as free. Inodes should never have to be marked
  *	as in use.
  * freefiles(inode, count) - file inodes [inode..inode + count - 1]
  *	are marked as free. Inodes should never have to be marked
  *	as in use.
  * freeblks(blockno, size) - blocks [blockno..blockno + size - 1]
  *	are marked as free. Blocks should never have to be marked
  *	as in use.
  * setflags(flags, set/clear) - the fs_flags field has the specified
  *	flags set (second parameter +1) or cleared (second parameter -1).
  * setcwd(dirinode) - set the current directory to dirinode in the
  *	filesystem associated with the snapshot.
  * setdotdot(oldvalue, newvalue) - Verify that the inode number for ".."
  *	in the current directory is oldvalue then change it to newvalue.
  * unlink(nameptr, oldvalue) - Verify that the inode number associated
  *	with nameptr in the current directory is oldvalue then unlink it.
  */
 
 /*
  * Sysctl registrations for the fsck operations.  Every OID below is
  * declared under _vfs_ffs, is flagged CTLFLAG_WR | CTLFLAG_NEEDGIANT
  * and names sysctl_ffs_fsck() as its handler; the handler picks the
  * operation from the OID number it was invoked for.
  */
 static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS);
 
 SYSCTL_PROC(_vfs_ffs, FFS_ADJ_REFCNT, adjrefcnt,
     CTLFLAG_WR | CTLTYPE_STRUCT | CTLFLAG_NEEDGIANT,
     0, 0, sysctl_ffs_fsck, "S,fsck",
     "Adjust Inode Reference Count");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_BLKCNT, adjblkcnt,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Adjust Inode Used Blocks Count");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_SET_SIZE, setsize,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Set the inode size");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NDIR, adjndir,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Adjust number of directories");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NBFREE, adjnbfree,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Adjust number of free blocks");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NIFREE, adjnifree,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Adjust number of free inodes");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NFFREE, adjnffree,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Adjust number of free frags");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NUMCLUSTERS, adjnumclusters,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Adjust number of free clusters");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_DIR_FREE, freedirs,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Free Range of Directory Inodes");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_FILE_FREE, freefiles,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Free Range of File Inodes");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_BLK_FREE, freeblks,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Free Range of Blocks");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_SET_FLAGS, setflags,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Change Filesystem Flags");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_SET_CWD, setcwd,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Set Current Working Directory");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_SET_DOTDOT, setdotdot,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Change Value of .. Entry");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_UNLINK, unlink,
     CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
     "Unlink a Duplicate Name");
 
 #ifdef DIAGNOSTIC
 /*
  * When nonzero, sysctl_ffs_fsck() prints a line describing each
  * request it handles.  Only present in DIAGNOSTIC kernels.
  */
 static int fsckcmds = 0;
 SYSCTL_INT(_debug, OID_AUTO, ffs_fsckcmds, CTLFLAG_RW, &fsckcmds, 0,
 	"print out fsck_ffs-based filesystem update commands");
 #endif /* DIAGNOSTIC */
 
 /*
  * Sysctl handler shared by all of the fsck update OIDs.
  *
  * Copies in a struct fsck_cmd, checks its version, resolves cmd.handle
  * to a vnode (requesting CAP_FSCK rights) and requires that vnode to be
  * a regular file or directory on a "ufs" mount.  A read-only mount is
  * accepted only when um_fsckpid matches the calling process.  The
  * operation is then selected by oidp->oid_number; cmd.value and
  * cmd.size carry its arguments.
  *
  * Returns 0 on success or an errno value.
  */
 static int
 sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
 {
 	struct thread *td = curthread;
 	struct fsck_cmd cmd;
 	struct ufsmount *ump;
 	struct vnode *vp, *dvp, *fdvp;
 	struct inode *ip, *dp;
 	struct mount *mp;
 	struct fs *fs;
 	struct pwd *pwd;
 	ufs2_daddr_t blkno;
 	long blkcnt, blksize;
 	u_long key;
 	struct file *fp;
 	cap_rights_t rights;
 	int filetype, error;
 
 	/* Fetch and validate the request structure. */
 	if (req->newlen > sizeof cmd)
 		return (EBADRPC);
 	if ((error = SYSCTL_IN(req, &cmd, sizeof cmd)) != 0)
 		return (error);
 	if (cmd.version != FFS_CMD_VERSION)
 		return (ERPCMISMATCH);
 	if ((error = getvnode(td, cmd.handle,
-	    cap_rights_init(&rights, CAP_FSCK), &fp)) != 0)
+	    cap_rights_init_one(&rights, CAP_FSCK), &fp)) != 0)
 		return (error);
 	vp = fp->f_vnode;
 	if (vp->v_type != VREG && vp->v_type != VDIR) {
 		fdrop(fp, td);
 		return (EINVAL);
 	}
 	vn_start_write(vp, &mp, V_WAIT);
 	if (mp == NULL ||
 	    strncmp(mp->mnt_stat.f_fstypename, "ufs", MFSNAMELEN)) {
 		vn_finished_write(mp);
 		fdrop(fp, td);
 		return (EINVAL);
 	}
 	ump = VFSTOUFS(mp);
 	if ((mp->mnt_flag & MNT_RDONLY) &&
 	    ump->um_fsckpid != td->td_proc->p_pid) {
 		vn_finished_write(mp);
 		fdrop(fp, td);
 		return (EROFS);
 	}
 	fs = ump->um_fs;
 	/* Inode type for the free-inode cases; FFS_DIR_FREE overrides it. */
 	filetype = IFREG;
 
 	switch (oidp->oid_number) {
 	case FFS_SET_FLAGS:
 #ifdef DIAGNOSTIC
 		if (fsckcmds)
 			printf("%s: %s flags\n", mp->mnt_stat.f_mntonname,
 			    cmd.size > 0 ? "set" : "clear");
 #endif /* DIAGNOSTIC */
 		/* cmd.size > 0 sets the bits in cmd.value, else clears them. */
 		if (cmd.size > 0)
 			fs->fs_flags |= (long)cmd.value;
 		else
 			fs->fs_flags &= ~(long)cmd.value;
 		break;
 
 	case FFS_ADJ_REFCNT:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			printf("%s: adjust inode %jd link count by %jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
 			    (intmax_t)cmd.size);
 		}
 #endif /* DIAGNOSTIC */
 		/* cmd.value is the inode number, cmd.size the link delta. */
 		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
 			break;
 		ip = VTOI(vp);
 		ip->i_nlink += cmd.size;
 		DIP_SET(ip, i_nlink, ip->i_nlink);
 		ip->i_effnlink += cmd.size;
 		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED);
 		error = ffs_update(vp, 1);
 		if (DOINGSOFTDEP(vp))
 			softdep_change_linkcnt(ip);
 		vput(vp);
 		break;
 
 	case FFS_ADJ_BLKCNT:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			printf("%s: adjust inode %jd block count by %jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
 			    (intmax_t)cmd.size);
 		}
 #endif /* DIAGNOSTIC */
 		/* cmd.value is the inode number, cmd.size the block delta. */
 		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
 			break;
 		ip = VTOI(vp);
 		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + cmd.size);
 		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED);
 		error = ffs_update(vp, 1);
 		vput(vp);
 		break;
 
 	case FFS_SET_SIZE:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			printf("%s: set inode %jd size to %jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
 			    (intmax_t)cmd.size);
 		}
 #endif /* DIAGNOSTIC */
 		/* cmd.value is the inode number, cmd.size the new size. */
 		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
 			break;
 		ip = VTOI(vp);
 		DIP_SET(ip, i_size, cmd.size);
 		UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_MODIFIED);
 		error = ffs_update(vp, 1);
 		vput(vp);
 		break;
 
 	case FFS_DIR_FREE:
 		filetype = IFDIR;
 		/* fall through */
 
 	case FFS_FILE_FREE:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			if (cmd.size == 1)
 				printf("%s: free %s inode %ju\n",
 				    mp->mnt_stat.f_mntonname,
 				    filetype == IFDIR ? "directory" : "file",
 				    (uintmax_t)cmd.value);
 			else
 				printf("%s: free %s inodes %ju-%ju\n",
 				    mp->mnt_stat.f_mntonname,
 				    filetype == IFDIR ? "directory" : "file",
 				    (uintmax_t)cmd.value,
 				    (uintmax_t)(cmd.value + cmd.size - 1));
 		}
 #endif /* DIAGNOSTIC */
 		/*
 		 * Free cmd.size consecutive inodes starting at cmd.value,
 		 * stopping at the first failure.
 		 */
 		while (cmd.size > 0) {
 			if ((error = ffs_freefile(ump, fs, ump->um_devvp,
 			    cmd.value, filetype, NULL)))
 				break;
 			cmd.size -= 1;
 			cmd.value += 1;
 		}
 		break;
 
 	case FFS_BLK_FREE:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			if (cmd.size == 1)
 				printf("%s: free block %jd\n",
 				    mp->mnt_stat.f_mntonname,
 				    (intmax_t)cmd.value);
 			else
 				printf("%s: free blocks %jd-%jd\n",
 				    mp->mnt_stat.f_mntonname, 
 				    (intmax_t)cmd.value,
 				    (intmax_t)cmd.value + cmd.size - 1);
 		}
 #endif /* DIAGNOSTIC */
 		/*
 		 * Free cmd.size fragments starting at cmd.value.  The first
 		 * piece runs only to the end of the block containing blkno;
 		 * each later piece is a whole block (fs_frag fragments),
 		 * trimmed to whatever count remains.
 		 */
 		blkno = cmd.value;
 		blkcnt = cmd.size;
 		blksize = fs->fs_frag - (blkno % fs->fs_frag);
 		key = ffs_blkrelease_start(ump, ump->um_devvp, UFS_ROOTINO);
 		while (blkcnt > 0) {
 			if (blkcnt < blksize)
 				blksize = blkcnt;
 			ffs_blkfree(ump, fs, ump->um_devvp, blkno,
 			    blksize * fs->fs_fsize, UFS_ROOTINO, 
 			    VDIR, NULL, key);
 			blkno += blksize;
 			blkcnt -= blksize;
 			blksize = fs->fs_frag;
 		}
 		ffs_blkrelease_finish(ump, key);
 		break;
 
 	/*
 	 * Adjust superblock summaries.  fsck(8) is expected to
 	 * submit deltas when necessary.
 	 */
 	case FFS_ADJ_NDIR:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			printf("%s: adjust number of directories by %jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
 		}
 #endif /* DIAGNOSTIC */
 		fs->fs_cstotal.cs_ndir += cmd.value;
 		break;
 
 	case FFS_ADJ_NBFREE:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			printf("%s: adjust number of free blocks by %+jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
 		}
 #endif /* DIAGNOSTIC */
 		fs->fs_cstotal.cs_nbfree += cmd.value;
 		break;
 
 	case FFS_ADJ_NIFREE:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			printf("%s: adjust number of free inodes by %+jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
 		}
 #endif /* DIAGNOSTIC */
 		fs->fs_cstotal.cs_nifree += cmd.value;
 		break;
 
 	case FFS_ADJ_NFFREE:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			printf("%s: adjust number of free frags by %+jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
 		}
 #endif /* DIAGNOSTIC */
 		fs->fs_cstotal.cs_nffree += cmd.value;
 		break;
 
 	case FFS_ADJ_NUMCLUSTERS:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			printf("%s: adjust number of free clusters by %+jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
 		}
 #endif /* DIAGNOSTIC */
 		fs->fs_cstotal.cs_numclusters += cmd.value;
 		break;
 
 	case FFS_SET_CWD:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			printf("%s: set current directory to inode %jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
 		}
 #endif /* DIAGNOSTIC */
 		/* cmd.value is the inode number of the new directory. */
 		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_SHARED, &vp)))
 			break;
 		AUDIT_ARG_VNODE1(vp);
 		if ((error = change_dir(vp, td)) != 0) {
 			vput(vp);
 			break;
 		}
 		/* Unlocked but not released: vp is handed to pwd_chdir(). */
 		VOP_UNLOCK(vp);
 		pwd_chdir(td, vp);
 		break;
 
 	case FFS_SET_DOTDOT:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			printf("%s: change .. in cwd from %jd to %jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
 			    (intmax_t)cmd.size);
 		}
 #endif /* DIAGNOSTIC */
 		/*
 		 * First we have to get and lock the parent directory
 		 * to which ".." points.
 		 */
 		error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &fdvp);
 		if (error)
 			break;
 		/*
 		 * Now we get and lock the child directory containing "..".
 		 */
 		pwd = pwd_hold(td);
 		dvp = pwd->pwd_cdir;
 		if ((error = vget(dvp, LK_EXCLUSIVE)) != 0) {
 			vput(fdvp);
 			pwd_drop(pwd);
 			break;
 		}
 		dp = VTOI(dvp);
 		SET_I_OFFSET(dp, 12);	/* XXX mastertemplate.dot_reclen */
 		/* Rewrite the entry at that offset to inode cmd.size. */
 		error = ufs_dirrewrite(dp, VTOI(fdvp), (ino_t)cmd.size,
 		    DT_DIR, 0);
 		cache_purge(fdvp);
 		cache_purge(dvp);
 		vput(dvp);
 		vput(fdvp);
 		pwd_drop(pwd);
 		break;
 
 	case FFS_UNLINK:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			char buf[32];
 
 			if (copyinstr((char *)(intptr_t)cmd.value, buf,32,NULL))
 				strncpy(buf, "Name_too_long", 32);
 			printf("%s: unlink %s (inode %jd)\n",
 			    mp->mnt_stat.f_mntonname, buf, (intmax_t)cmd.size);
 		}
 #endif /* DIAGNOSTIC */
 		/*
 		 * kern_funlinkat will do its own start/finish writes and
 		 * they do not nest, so drop ours here. Setting mp == NULL
 		 * indicates that vn_finished_write is not needed down below.
 		 */
 		vn_finished_write(mp);
 		mp = NULL;
 		/* cmd.value is a user-space name pointer, cmd.size the inode. */
 		error = kern_funlinkat(td, AT_FDCWD,
 		    (char *)(intptr_t)cmd.value, FD_NONE, UIO_USERSPACE,
 		    0, (ino_t)cmd.size);
 		break;
 
 	default:
 #ifdef DIAGNOSTIC
 		if (fsckcmds) {
 			printf("Invalid request %d from fsck\n",
 			    oidp->oid_number);
 		}
 #endif /* DIAGNOSTIC */
 		error = EINVAL;
 		break;
 	}
 	/* Common exit: drop the file reference and end the write. */
 	fdrop(fp, td);
 	vn_finished_write(mp);
 	return (error);
 }