diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index 67350f4ad71e..7a43fbb2eb80 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -1,5079 +1,5079 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 #include "opt_ddb.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 
 #include <sys/capsicum.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/selinfo.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/sbuf.h>
 #include <sys/signalvar.h>
 #include <sys/kdb.h>
 #include <sys/smr.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/unistd.h>
 #include <sys/user.h>
 #include <sys/vnode.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <net/vnet.h>
 
 #include <security/audit/audit.h>
 
 #include <vm/uma.h>
 #include <vm/vm.h>
 
 #include <ddb/ddb.h>
 
 static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
 static MALLOC_DEFINE(M_PWD, "pwd", "Descriptor table vnodes");
 static MALLOC_DEFINE(M_PWDDESC, "pwddesc", "Pwd descriptors");
 static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
     "file desc to leader structures");
 static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
 MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities");
 
 MALLOC_DECLARE(M_FADVISE);
 
 static __read_mostly uma_zone_t file_zone;
 static __read_mostly uma_zone_t filedesc0_zone;
 __read_mostly uma_zone_t pwd_zone;
 VFS_SMR_DECLARE;
 
 static int	closefp(struct filedesc *fdp, int fd, struct file *fp,
 		    struct thread *td, bool holdleaders, bool audit);
 static int	fd_first_free(struct filedesc *fdp, int low, int size);
 static void	fdgrowtable(struct filedesc *fdp, int nfd);
 static void	fdgrowtable_exp(struct filedesc *fdp, int nfd);
 static void	fdunused(struct filedesc *fdp, int fd);
 static void	fdused(struct filedesc *fdp, int fd);
 static int	getmaxfd(struct thread *td);
 static u_long	*filecaps_copy_prep(const struct filecaps *src);
 static void	filecaps_copy_finish(const struct filecaps *src,
 		    struct filecaps *dst, u_long *ioctls);
 static u_long 	*filecaps_free_prep(struct filecaps *fcaps);
 static void	filecaps_free_finish(u_long *ioctls);
 
 static struct pwd *pwd_alloc(void);
 
 /*
  * Each process has:
  *
  * - An array of open file descriptors (fd_ofiles)
  * - An array of file flags (fd_ofileflags)
  * - A bitmap recording which descriptors are in use (fd_map)
  *
  * A process starts out with NDFILE descriptors.  The value of NDFILE has
  * been selected based the historical limit of 20 open files, and an
  * assumption that the majority of processes, especially short-lived
  * processes like shells, will never need more.
  *
  * If this initial allocation is exhausted, a larger descriptor table and
  * map are allocated dynamically, and the pointers in the process's struct
  * filedesc are updated to point to those.  This is repeated every time
  * the process runs out of file descriptors (provided it hasn't hit its
  * resource limit).
  *
  * Since threads may hold references to individual descriptor table
  * entries, the tables are never freed.  Instead, they are placed on a
  * linked list and freed only when the struct filedesc is released.
  */
 #define NDFILE		20
 #define NDSLOTSIZE	sizeof(NDSLOTTYPE)
 #define	NDENTRIES	(NDSLOTSIZE * __CHAR_BIT)
 #define NDSLOT(x)	((x) / NDENTRIES)
 #define NDBIT(x)	((NDSLOTTYPE)1 << ((x) % NDENTRIES))
 #define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)
 
 /*
  * SLIST entry used to keep track of ofiles which must be reclaimed when
  * the process exits.
  */
 struct freetable {
 	struct fdescenttbl *ft_table;
 	SLIST_ENTRY(freetable) ft_next;
 };
 
 /*
  * Initial allocation: a filedesc structure + the head of SLIST used to
  * keep track of old ofiles + enough space for NDFILE descriptors.
  */
 
 struct fdescenttbl0 {
 	int	fdt_nfiles;
 	struct	filedescent fdt_ofiles[NDFILE];
 };
 
 struct filedesc0 {
 	struct filedesc fd_fd;
 	SLIST_HEAD(, freetable) fd_free;
 	struct	fdescenttbl0 fd_dfiles;
 	NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
 };
 
 /*
  * Descriptor management.
  */
 static int __exclusive_cache_line openfiles; /* actual number of open files */
 struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
 void __read_mostly (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
 
 /*
  * If low >= size, just return low. Otherwise find the first zero bit in the
  * given bitmap, starting at low and not exceeding size - 1. Return size if
  * not found.
  */
 static int
 fd_first_free(struct filedesc *fdp, int low, int size)
 {
 	NDSLOTTYPE *map = fdp->fd_map;
 	NDSLOTTYPE mask;
 	int off, maxoff;
 
 	if (low >= size)
 		return (low);
 
 	off = NDSLOT(low);
 	if (low % NDENTRIES) {
 		mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
 		if ((mask &= ~map[off]) != 0UL)
 			return (off * NDENTRIES + ffsl(mask) - 1);
 		++off;
 	}
 	for (maxoff = NDSLOTS(size); off < maxoff; ++off)
 		if (map[off] != ~0UL)
 			return (off * NDENTRIES + ffsl(~map[off]) - 1);
 	return (size);
 }
 
 /*
  * Find the last used fd.
  *
  * Call this variant if fdp can't be modified by anyone else (e.g, during exec).
  * Otherwise use fdlastfile.
  */
 int
 fdlastfile_single(struct filedesc *fdp)
 {
 	NDSLOTTYPE *map = fdp->fd_map;
 	int off, minoff;
 
 	off = NDSLOT(fdp->fd_nfiles - 1);
 	for (minoff = NDSLOT(0); off >= minoff; --off)
 		if (map[off] != 0)
 			return (off * NDENTRIES + flsl(map[off]) - 1);
 	return (-1);
 }
 
 int
 fdlastfile(struct filedesc *fdp)
 {
 
 	FILEDESC_LOCK_ASSERT(fdp);
 	return (fdlastfile_single(fdp));
 }
 
 static int
 fdisused(struct filedesc *fdp, int fd)
 {
 
 	KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
 	    ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
 
 	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
 }
 
 /*
  * Mark a file descriptor as used.
  */
 static void
 fdused_init(struct filedesc *fdp, int fd)
 {
 
 	KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));
 
 	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
 }
 
 static void
 fdused(struct filedesc *fdp, int fd)
 {
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	fdused_init(fdp, fd);
 	if (fd == fdp->fd_freefile)
 		fdp->fd_freefile++;
 }
 
 /*
  * Mark a file descriptor as unused.
  */
 static void
 fdunused(struct filedesc *fdp, int fd)
 {
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
 	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
 	    ("fd=%d is still in use", fd));
 
 	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
 	if (fd < fdp->fd_freefile)
 		fdp->fd_freefile = fd;
 }
 
 /*
  * Free a file descriptor.
  *
  * Avoid some work if fdp is about to be destroyed.
  */
 static inline void
 fdefree_last(struct filedescent *fde)
 {
 
 	filecaps_free(&fde->fde_caps);
 }
 
 static inline void
 fdfree(struct filedesc *fdp, int fd)
 {
 	struct filedescent *fde;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 	fde = &fdp->fd_ofiles[fd];
 #ifdef CAPABILITIES
 	seqc_write_begin(&fde->fde_seqc);
 #endif
 	fde->fde_file = NULL;
 #ifdef CAPABILITIES
 	seqc_write_end(&fde->fde_seqc);
 #endif
 	fdefree_last(fde);
 	fdunused(fdp, fd);
 }
 
 /*
  * System calls on descriptors.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getdtablesize_args {
 	int	dummy;
 };
 #endif
 /* ARGSUSED */
 int
 sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
 {
 #ifdef	RACCT
 	uint64_t lim;
 #endif
 
 	td->td_retval[0] = getmaxfd(td);
 #ifdef	RACCT
 	PROC_LOCK(td->td_proc);
 	lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
 	PROC_UNLOCK(td->td_proc);
 	if (lim < td->td_retval[0])
 		td->td_retval[0] = lim;
 #endif
 	return (0);
 }
 
 /*
  * Duplicate a file descriptor to a particular value.
  *
  * Note: keep in mind that a potential race condition exists when closing
  * descriptors from a shared descriptor table (via rfork).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct dup2_args {
 	u_int	from;
 	u_int	to;
 };
 #endif
 /* ARGSUSED */
 int
 sys_dup2(struct thread *td, struct dup2_args *uap)
 {
 
 	return (kern_dup(td, FDDUP_FIXED, 0, (int)uap->from, (int)uap->to));
 }
 
 /*
  * Duplicate a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct dup_args {
 	u_int	fd;
 };
 #endif
 /* ARGSUSED */
 int
 sys_dup(struct thread *td, struct dup_args *uap)
 {
 
 	return (kern_dup(td, FDDUP_NORMAL, 0, (int)uap->fd, 0));
 }
 
 /*
  * The file control system call.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fcntl_args {
 	int	fd;
 	int	cmd;
 	long	arg;
 };
 #endif
 /* ARGSUSED */
 int
 sys_fcntl(struct thread *td, struct fcntl_args *uap)
 {
 
 	return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg));
 }
 
 int
 kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg)
 {
 	struct flock fl;
 	struct __oflock ofl;
 	intptr_t arg1;
 	int error, newcmd;
 
 	error = 0;
 	newcmd = cmd;
 	switch (cmd) {
 	case F_OGETLK:
 	case F_OSETLK:
 	case F_OSETLKW:
 		/*
 		 * Convert old flock structure to new.
 		 */
 		error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl));
 		fl.l_start = ofl.l_start;
 		fl.l_len = ofl.l_len;
 		fl.l_pid = ofl.l_pid;
 		fl.l_type = ofl.l_type;
 		fl.l_whence = ofl.l_whence;
 		fl.l_sysid = 0;
 
 		switch (cmd) {
 		case F_OGETLK:
 			newcmd = F_GETLK;
 			break;
 		case F_OSETLK:
 			newcmd = F_SETLK;
 			break;
 		case F_OSETLKW:
 			newcmd = F_SETLKW;
 			break;
 		}
 		arg1 = (intptr_t)&fl;
 		break;
 	case F_GETLK:
 	case F_SETLK:
 	case F_SETLKW:
 	case F_SETLK_REMOTE:
 		error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl));
 		arg1 = (intptr_t)&fl;
 		break;
 	default:
 		arg1 = arg;
 		break;
 	}
 	if (error)
 		return (error);
 	error = kern_fcntl(td, fd, newcmd, arg1);
 	if (error)
 		return (error);
 	if (cmd == F_OGETLK) {
 		ofl.l_start = fl.l_start;
 		ofl.l_len = fl.l_len;
 		ofl.l_pid = fl.l_pid;
 		ofl.l_type = fl.l_type;
 		ofl.l_whence = fl.l_whence;
 		error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl));
 	} else if (cmd == F_GETLK) {
 		error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl));
 	}
 	return (error);
 }
 
 int
 kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
 {
 	struct filedesc *fdp;
 	struct flock *flp;
 	struct file *fp, *fp2;
 	struct filedescent *fde;
 	struct proc *p;
 	struct vnode *vp;
 	struct mount *mp;
 	int error, flg, seals, tmp;
 	uint64_t bsize;
 	off_t foffset;
 
 	error = 0;
 	flg = F_POSIX;
 	p = td->td_proc;
 	fdp = p->p_fd;
 
 	AUDIT_ARG_FD(cmd);
 	AUDIT_ARG_CMD(cmd);
 	switch (cmd) {
 	case F_DUPFD:
 		tmp = arg;
 		error = kern_dup(td, FDDUP_FCNTL, 0, fd, tmp);
 		break;
 
 	case F_DUPFD_CLOEXEC:
 		tmp = arg;
 		error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp);
 		break;
 
 	case F_DUP2FD:
 		tmp = arg;
 		error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp);
 		break;
 
 	case F_DUP2FD_CLOEXEC:
 		tmp = arg;
 		error = kern_dup(td, FDDUP_FIXED, FDDUP_FLAG_CLOEXEC, fd, tmp);
 		break;
 
 	case F_GETFD:
 		error = EBADF;
 		FILEDESC_SLOCK(fdp);
 		fde = fdeget_locked(fdp, fd);
 		if (fde != NULL) {
 			td->td_retval[0] =
 			    (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0;
 			error = 0;
 		}
 		FILEDESC_SUNLOCK(fdp);
 		break;
 
 	case F_SETFD:
 		error = EBADF;
 		FILEDESC_XLOCK(fdp);
 		fde = fdeget_locked(fdp, fd);
 		if (fde != NULL) {
 			fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
 			    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
 			error = 0;
 		}
 		FILEDESC_XUNLOCK(fdp);
 		break;
 
 	case F_GETFL:
 		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETFL, &fp);
 		if (error != 0)
 			break;
 		td->td_retval[0] = OFLAGS(fp->f_flag);
 		fdrop(fp, td);
 		break;
 
 	case F_SETFL:
 		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETFL, &fp);
 		if (error != 0)
 			break;
 		do {
 			tmp = flg = fp->f_flag;
 			tmp &= ~FCNTLFLAGS;
 			tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
-		} while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
+		} while (atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
 		tmp = fp->f_flag & FNONBLOCK;
 		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
 		if (error != 0) {
 			fdrop(fp, td);
 			break;
 		}
 		tmp = fp->f_flag & FASYNC;
 		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
 		if (error == 0) {
 			fdrop(fp, td);
 			break;
 		}
 		atomic_clear_int(&fp->f_flag, FNONBLOCK);
 		tmp = 0;
 		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
 		fdrop(fp, td);
 		break;
 
 	case F_GETOWN:
 		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETOWN, &fp);
 		if (error != 0)
 			break;
 		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
 		if (error == 0)
 			td->td_retval[0] = tmp;
 		fdrop(fp, td);
 		break;
 
 	case F_SETOWN:
 		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETOWN, &fp);
 		if (error != 0)
 			break;
 		tmp = arg;
 		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
 		fdrop(fp, td);
 		break;
 
 	case F_SETLK_REMOTE:
 		error = priv_check(td, PRIV_NFS_LOCKD);
 		if (error != 0)
 			return (error);
 		flg = F_REMOTE;
 		goto do_setlk;
 
 	case F_SETLKW:
 		flg |= F_WAIT;
 		/* FALLTHROUGH F_SETLK */
 
 	case F_SETLK:
 	do_setlk:
 		flp = (struct flock *)arg;
 		if ((flg & F_REMOTE) != 0 && flp->l_sysid == 0) {
 			error = EINVAL;
 			break;
 		}
 
 		error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp);
 		if (error != 0)
 			break;
 		if (fp->f_type != DTYPE_VNODE) {
 			error = EBADF;
 			fdrop(fp, td);
 			break;
 		}
 
 		if (flp->l_whence == SEEK_CUR) {
 			foffset = foffset_get(fp);
 			if (foffset < 0 ||
 			    (flp->l_start > 0 &&
 			     foffset > OFF_MAX - flp->l_start)) {
 				error = EOVERFLOW;
 				fdrop(fp, td);
 				break;
 			}
 			flp->l_start += foffset;
 		}
 
 		vp = fp->f_vnode;
 		switch (flp->l_type) {
 		case F_RDLCK:
 			if ((fp->f_flag & FREAD) == 0) {
 				error = EBADF;
 				break;
 			}
 			if ((p->p_leader->p_flag & P_ADVLOCK) == 0) {
 				PROC_LOCK(p->p_leader);
 				p->p_leader->p_flag |= P_ADVLOCK;
 				PROC_UNLOCK(p->p_leader);
 			}
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
 			    flp, flg);
 			break;
 		case F_WRLCK:
 			if ((fp->f_flag & FWRITE) == 0) {
 				error = EBADF;
 				break;
 			}
 			if ((p->p_leader->p_flag & P_ADVLOCK) == 0) {
 				PROC_LOCK(p->p_leader);
 				p->p_leader->p_flag |= P_ADVLOCK;
 				PROC_UNLOCK(p->p_leader);
 			}
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
 			    flp, flg);
 			break;
 		case F_UNLCK:
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
 			    flp, flg);
 			break;
 		case F_UNLCKSYS:
 			if (flg != F_REMOTE) {
 				error = EINVAL;
 				break;
 			}
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
 			    F_UNLCKSYS, flp, flg);
 			break;
 		default:
 			error = EINVAL;
 			break;
 		}
 		if (error != 0 || flp->l_type == F_UNLCK ||
 		    flp->l_type == F_UNLCKSYS) {
 			fdrop(fp, td);
 			break;
 		}
 
 		/*
 		 * Check for a race with close.
 		 *
 		 * The vnode is now advisory locked (or unlocked, but this case
 		 * is not really important) as the caller requested.
 		 * We had to drop the filedesc lock, so we need to recheck if
 		 * the descriptor is still valid, because if it was closed
 		 * in the meantime we need to remove advisory lock from the
 		 * vnode - close on any descriptor leading to an advisory
 		 * locked vnode, removes that lock.
 		 * We will return 0 on purpose in that case, as the result of
 		 * successful advisory lock might have been externally visible
 		 * already. This is fine - effectively we pretend to the caller
 		 * that the closing thread was a bit slower and that the
 		 * advisory lock succeeded before the close.
 		 */
 		error = fget_unlocked(fdp, fd, &cap_no_rights, &fp2);
 		if (error != 0) {
 			fdrop(fp, td);
 			break;
 		}
 		if (fp != fp2) {
 			flp->l_whence = SEEK_SET;
 			flp->l_start = 0;
 			flp->l_len = 0;
 			flp->l_type = F_UNLCK;
 			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
 			    F_UNLCK, flp, F_POSIX);
 		}
 		fdrop(fp, td);
 		fdrop(fp2, td);
 		break;
 
 	case F_GETLK:
 		error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp);
 		if (error != 0)
 			break;
 		if (fp->f_type != DTYPE_VNODE) {
 			error = EBADF;
 			fdrop(fp, td);
 			break;
 		}
 		flp = (struct flock *)arg;
 		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
 		    flp->l_type != F_UNLCK) {
 			error = EINVAL;
 			fdrop(fp, td);
 			break;
 		}
 		if (flp->l_whence == SEEK_CUR) {
 			foffset = foffset_get(fp);
 			if ((flp->l_start > 0 &&
 			    foffset > OFF_MAX - flp->l_start) ||
 			    (flp->l_start < 0 &&
 			    foffset < OFF_MIN - flp->l_start)) {
 				error = EOVERFLOW;
 				fdrop(fp, td);
 				break;
 			}
 			flp->l_start += foffset;
 		}
 		vp = fp->f_vnode;
 		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
 		    F_POSIX);
 		fdrop(fp, td);
 		break;
 
 	case F_ADD_SEALS:
 		error = fget_unlocked(fdp, fd, &cap_no_rights, &fp);
 		if (error != 0)
 			break;
 		error = fo_add_seals(fp, arg);
 		fdrop(fp, td);
 		break;
 
 	case F_GET_SEALS:
 		error = fget_unlocked(fdp, fd, &cap_no_rights, &fp);
 		if (error != 0)
 			break;
 		if (fo_get_seals(fp, &seals) == 0)
 			td->td_retval[0] = seals;
 		else
 			error = EINVAL;
 		fdrop(fp, td);
 		break;
 
 	case F_RDAHEAD:
 		arg = arg ? 128 * 1024: 0;
 		/* FALLTHROUGH */
 	case F_READAHEAD:
 		error = fget_unlocked(fdp, fd, &cap_no_rights, &fp);
 		if (error != 0)
 			break;
 		if (fp->f_type != DTYPE_VNODE) {
 			fdrop(fp, td);
 			error = EBADF;
 			break;
 		}
 		vp = fp->f_vnode;
 		if (vp->v_type != VREG) {
 			fdrop(fp, td);
 			error = ENOTTY;
 			break;
 		}
 
 		/*
 		 * Exclusive lock synchronizes against f_seqcount reads and
 		 * writes in sequential_heuristic().
 		 */
 		error = vn_lock(vp, LK_EXCLUSIVE);
 		if (error != 0) {
 			fdrop(fp, td);
 			break;
 		}
 		if (arg >= 0) {
 			bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
 			arg = MIN(arg, INT_MAX - bsize + 1);
 			fp->f_seqcount[UIO_READ] = MIN(IO_SEQMAX,
 			    (arg + bsize - 1) / bsize);
 			atomic_set_int(&fp->f_flag, FRDAHEAD);
 		} else {
 			atomic_clear_int(&fp->f_flag, FRDAHEAD);
 		}
 		VOP_UNLOCK(vp);
 		fdrop(fp, td);
 		break;
 
 	case F_ISUNIONSTACK:
 		/*
 		 * Check if the vnode is part of a union stack (either the
 		 * "union" flag from mount(2) or unionfs).
 		 *
 		 * Prior to introduction of this op libc's readdir would call
 		 * fstatfs(2), in effect unnecessarily copying kilobytes of
 		 * data just to check fs name and a mount flag.
 		 *
 		 * Fixing the code to handle everything in the kernel instead
 		 * is a non-trivial endeavor and has low priority, thus this
 		 * horrible kludge facilitates the current behavior in a much
 		 * cheaper manner until someone(tm) sorts this out.
 		 */
 		error = fget_unlocked(fdp, fd, &cap_no_rights, &fp);
 		if (error != 0)
 			break;
 		if (fp->f_type != DTYPE_VNODE) {
 			fdrop(fp, td);
 			error = EBADF;
 			break;
 		}
 		vp = fp->f_vnode;
 		/*
 		 * Since we don't prevent dooming the vnode even non-null mp
 		 * found can become immediately stale. This is tolerable since
 		 * mount points are type-stable (providing safe memory access)
 		 * and any vfs op on this vnode going forward will return an
 		 * error (meaning return value in this case is meaningless).
 		 */
 		mp = atomic_load_ptr(&vp->v_mount);
 		if (__predict_false(mp == NULL)) {
 			fdrop(fp, td);
 			error = EBADF;
 			break;
 		}
 		td->td_retval[0] = 0;
 		if (mp->mnt_kern_flag & MNTK_UNIONFS ||
 		    mp->mnt_flag & MNT_UNION)
 			td->td_retval[0] = 1;
 		fdrop(fp, td);
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 	return (error);
 }
 
 static int
 getmaxfd(struct thread *td)
 {
 
 	return (min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc));
 }
 
 /*
  * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
  */
 int
 kern_dup(struct thread *td, u_int mode, int flags, int old, int new)
 {
 	struct filedesc *fdp;
 	struct filedescent *oldfde, *newfde;
 	struct proc *p;
 	struct file *delfp, *oldfp;
 	u_long *oioctls, *nioctls;
 	int error, maxfd;
 
 	p = td->td_proc;
 	fdp = p->p_fd;
 	oioctls = NULL;
 
 	MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0);
 	MPASS(mode < FDDUP_LASTMODE);
 
 	AUDIT_ARG_FD(old);
 	/* XXXRW: if (flags & FDDUP_FIXED) AUDIT_ARG_FD2(new); */
 
 	/*
 	 * Verify we have a valid descriptor to dup from and possibly to
 	 * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
 	 * return EINVAL when the new descriptor is out of bounds.
 	 */
 	if (old < 0)
 		return (EBADF);
 	if (new < 0)
 		return (mode == FDDUP_FCNTL ? EINVAL : EBADF);
 	maxfd = getmaxfd(td);
 	if (new >= maxfd)
 		return (mode == FDDUP_FCNTL ? EINVAL : EBADF);
 
 	error = EBADF;
 	FILEDESC_XLOCK(fdp);
 	if (fget_locked(fdp, old) == NULL)
 		goto unlock;
 	if ((mode == FDDUP_FIXED || mode == FDDUP_MUSTREPLACE) && old == new) {
 		td->td_retval[0] = new;
 		if (flags & FDDUP_FLAG_CLOEXEC)
 			fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE;
 		error = 0;
 		goto unlock;
 	}
 
 	oldfde = &fdp->fd_ofiles[old];
 	oldfp = oldfde->fde_file;
 	if (!fhold(oldfp))
 		goto unlock;
 
 	/*
 	 * If the caller specified a file descriptor, make sure the file
 	 * table is large enough to hold it, and grab it.  Otherwise, just
 	 * allocate a new descriptor the usual way.
 	 */
 	switch (mode) {
 	case FDDUP_NORMAL:
 	case FDDUP_FCNTL:
 		if ((error = fdalloc(td, new, &new)) != 0) {
 			fdrop(oldfp, td);
 			goto unlock;
 		}
 		break;
 	case FDDUP_MUSTREPLACE:
 		/* Target file descriptor must exist. */
 		if (fget_locked(fdp, new) == NULL) {
 			fdrop(oldfp, td);
 			goto unlock;
 		}
 		break;
 	case FDDUP_FIXED:
 		if (new >= fdp->fd_nfiles) {
 			/*
 			 * The resource limits are here instead of e.g.
 			 * fdalloc(), because the file descriptor table may be
 			 * shared between processes, so we can't really use
 			 * racct_add()/racct_sub().  Instead of counting the
 			 * number of actually allocated descriptors, just put
 			 * the limit on the size of the file descriptor table.
 			 */
 #ifdef RACCT
 			if (RACCT_ENABLED()) {
 				error = racct_set_unlocked(p, RACCT_NOFILE, new + 1);
 				if (error != 0) {
 					error = EMFILE;
 					fdrop(oldfp, td);
 					goto unlock;
 				}
 			}
 #endif
 			fdgrowtable_exp(fdp, new + 1);
 		}
 		if (!fdisused(fdp, new))
 			fdused(fdp, new);
 		break;
 	default:
 		KASSERT(0, ("%s unsupported mode %d", __func__, mode));
 	}
 
 	KASSERT(old != new, ("new fd is same as old"));
 
 	/* Refetch oldfde because the table may have grown and old one freed. */
 	oldfde = &fdp->fd_ofiles[old];
 	KASSERT(oldfp == oldfde->fde_file,
 	    ("fdt_ofiles shift from growth observed at fd %d",
 	    old));
 
 	newfde = &fdp->fd_ofiles[new];
 	delfp = newfde->fde_file;
 
 	nioctls = filecaps_copy_prep(&oldfde->fde_caps);
 
 	/*
 	 * Duplicate the source descriptor.
 	 */
 #ifdef CAPABILITIES
 	seqc_write_begin(&newfde->fde_seqc);
 #endif
 	oioctls = filecaps_free_prep(&newfde->fde_caps);
 	memcpy(newfde, oldfde, fde_change_size);
 	filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps,
 	    nioctls);
 	if ((flags & FDDUP_FLAG_CLOEXEC) != 0)
 		newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
 	else
 		newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
 #ifdef CAPABILITIES
 	seqc_write_end(&newfde->fde_seqc);
 #endif
 	td->td_retval[0] = new;
 
 	error = 0;
 
 	if (delfp != NULL) {
 		(void) closefp(fdp, new, delfp, td, true, false);
 		FILEDESC_UNLOCK_ASSERT(fdp);
 	} else {
 unlock:
 		FILEDESC_XUNLOCK(fdp);
 	}
 
 	filecaps_free_finish(oioctls);
 	return (error);
 }
 
 static void
 sigiofree(struct sigio *sigio)
 {
 	crfree(sigio->sio_ucred);
 	free(sigio, M_SIGIO);
 }
 
 static struct sigio *
 funsetown_locked(struct sigio *sigio)
 {
 	struct proc *p;
 	struct pgrp *pg;
 
 	SIGIO_ASSERT_LOCKED();
 
 	if (sigio == NULL)
 		return (NULL);
 	*(sigio->sio_myref) = NULL;
 	if (sigio->sio_pgid < 0) {
 		pg = sigio->sio_pgrp;
 		PGRP_LOCK(pg);
 		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
 		    sigio, sio_pgsigio);
 		PGRP_UNLOCK(pg);
 	} else {
 		p = sigio->sio_proc;
 		PROC_LOCK(p);
 		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
 		    sigio, sio_pgsigio);
 		PROC_UNLOCK(p);
 	}
 	return (sigio);
 }
 
 /*
  * If sigio is on the list associated with a process or process group,
  * disable signalling from the device, remove sigio from the list and
  * free sigio.
  */
 void
 funsetown(struct sigio **sigiop)
 {
 	struct sigio *sigio;
 
 	/* Racy check, consumers must provide synchronization. */
 	if (*sigiop == NULL)
 		return;
 
 	SIGIO_LOCK();
 	sigio = funsetown_locked(*sigiop);
 	SIGIO_UNLOCK();
 	if (sigio != NULL)
 		sigiofree(sigio);
 }
 
 /*
  * Free a list of sigio structures.  The caller must ensure that new sigio
  * structures cannot be added after this point.  For process groups this is
  * guaranteed using the proctree lock; for processes, the P_WEXIT flag serves
  * as an interlock.
  */
 void
 funsetownlst(struct sigiolst *sigiolst)
 {
 	struct proc *p;
 	struct pgrp *pg;
 	struct sigio *sigio, *tmp;
 
 	/* Racy check. */
 	sigio = SLIST_FIRST(sigiolst);
 	if (sigio == NULL)
 		return;
 
 	p = NULL;
 	pg = NULL;
 
 	SIGIO_LOCK();
 	sigio = SLIST_FIRST(sigiolst);
 	if (sigio == NULL) {
 		SIGIO_UNLOCK();
 		return;
 	}
 
 	/*
 	 * Every entry of the list should belong to a single proc or pgrp.
 	 */
 	if (sigio->sio_pgid < 0) {
 		pg = sigio->sio_pgrp;
 		sx_assert(&proctree_lock, SX_XLOCKED);
 		PGRP_LOCK(pg);
 	} else /* if (sigio->sio_pgid > 0) */ {
 		p = sigio->sio_proc;
 		PROC_LOCK(p);
 		KASSERT((p->p_flag & P_WEXIT) != 0,
 		    ("%s: process %p is not exiting", __func__, p));
 	}
 
 	SLIST_FOREACH(sigio, sigiolst, sio_pgsigio) {
 		*sigio->sio_myref = NULL;
 		if (pg != NULL) {
 			KASSERT(sigio->sio_pgid < 0,
 			    ("Proc sigio in pgrp sigio list"));
 			KASSERT(sigio->sio_pgrp == pg,
 			    ("Bogus pgrp in sigio list"));
 		} else /* if (p != NULL) */ {
 			KASSERT(sigio->sio_pgid > 0,
 			    ("Pgrp sigio in proc sigio list"));
 			KASSERT(sigio->sio_proc == p,
 			    ("Bogus proc in sigio list"));
 		}
 	}
 
 	if (pg != NULL)
 		PGRP_UNLOCK(pg);
 	else
 		PROC_UNLOCK(p);
 	SIGIO_UNLOCK();
 
 	SLIST_FOREACH_SAFE(sigio, sigiolst, sio_pgsigio, tmp)
 		sigiofree(sigio);
 }
 
 /*
  * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
  *
  * After permission checking, add a sigio structure to the sigio list for
  * the process or process group.
  */
 int
 fsetown(pid_t pgid, struct sigio **sigiop)
 {
 	struct proc *proc;
 	struct pgrp *pgrp;
 	struct sigio *osigio, *sigio;
 	int ret;
 
 	if (pgid == 0) {
 		funsetown(sigiop);
 		return (0);
 	}
 
 	ret = 0;
 
 	sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
 	sigio->sio_pgid = pgid;
 	sigio->sio_ucred = crhold(curthread->td_ucred);
 	sigio->sio_myref = sigiop;
 
 	sx_slock(&proctree_lock);
 	SIGIO_LOCK();
 	osigio = funsetown_locked(*sigiop);
 	if (pgid > 0) {
 		proc = pfind(pgid);
 		if (proc == NULL) {
 			ret = ESRCH;
 			goto fail;
 		}
 
 		/*
 		 * Policy - Don't allow a process to FSETOWN a process
 		 * in another session.
 		 *
 		 * Remove this test to allow maximum flexibility or
 		 * restrict FSETOWN to the current process or process
 		 * group for maximum safety.
 		 */
 		if (proc->p_session != curthread->td_proc->p_session) {
 			PROC_UNLOCK(proc);
 			ret = EPERM;
 			goto fail;
 		}
 
 		sigio->sio_proc = proc;
 		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
 		PROC_UNLOCK(proc);
 	} else /* if (pgid < 0) */ {
 		pgrp = pgfind(-pgid);
 		if (pgrp == NULL) {
 			ret = ESRCH;
 			goto fail;
 		}
 
 		/*
 		 * Policy - Don't allow a process to FSETOWN a process
 		 * in another session.
 		 *
 		 * Remove this test to allow maximum flexibility or
 		 * restrict FSETOWN to the current process or process
 		 * group for maximum safety.
 		 */
 		if (pgrp->pg_session != curthread->td_proc->p_session) {
 			PGRP_UNLOCK(pgrp);
 			ret = EPERM;
 			goto fail;
 		}
 
 		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
 		sigio->sio_pgrp = pgrp;
 		PGRP_UNLOCK(pgrp);
 	}
 	sx_sunlock(&proctree_lock);
 	*sigiop = sigio;
 	SIGIO_UNLOCK();
 	if (osigio != NULL)
 		sigiofree(osigio);
 	return (0);
 
 fail:
 	SIGIO_UNLOCK();
 	sx_sunlock(&proctree_lock);
 	sigiofree(sigio);
 	if (osigio != NULL)
 		sigiofree(osigio);
 	return (ret);
 }
 
 /*
  * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
  */
 pid_t
 fgetown(struct sigio **sigiop)
 {
 	pid_t pgid;
 
 	SIGIO_LOCK();
 	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
 	SIGIO_UNLOCK();
 	return (pgid);
 }
 
 static int
 closefp_impl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
     bool audit)
 {
 	int error;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	/*
 	 * We now hold the fp reference that used to be owned by the
 	 * descriptor array.  We have to unlock the FILEDESC *AFTER*
 	 * knote_fdclose to prevent a race of the fd getting opened, a knote
 	 * added, and deleteing a knote for the new fd.
 	 */
 	if (__predict_false(!TAILQ_EMPTY(&fdp->fd_kqlist)))
 		knote_fdclose(td, fd);
 
 	/*
 	 * We need to notify mqueue if the object is of type mqueue.
 	 */
 	if (__predict_false(fp->f_type == DTYPE_MQUEUE))
 		mq_fdclose(td, fd, fp);
 	FILEDESC_XUNLOCK(fdp);
 
 #ifdef AUDIT
 	if (AUDITING_TD(td) && audit)
 		audit_sysclose(td, fd, fp);
 #endif
 	error = closef(fp, td);
 
 	/*
 	 * All paths leading up to closefp() will have already removed or
 	 * replaced the fd in the filedesc table, so a restart would not
 	 * operate on the same file.
 	 */
 	if (error == ERESTART)
 		error = EINTR;
 
 	return (error);
 }
 
 static int
 closefp_hl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
     bool holdleaders, bool audit)
 {
 	int error;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	if (holdleaders) {
 		if (td->td_proc->p_fdtol != NULL) {
 			/*
 			 * Ask fdfree() to sleep to ensure that all relevant
 			 * process leaders can be traversed in closef().
 			 */
 			fdp->fd_holdleaderscount++;
 		} else {
 			holdleaders = false;
 		}
 	}
 
 	error = closefp_impl(fdp, fd, fp, td, audit);
 	if (holdleaders) {
 		FILEDESC_XLOCK(fdp);
 		fdp->fd_holdleaderscount--;
 		if (fdp->fd_holdleaderscount == 0 &&
 		    fdp->fd_holdleaderswakeup != 0) {
 			fdp->fd_holdleaderswakeup = 0;
 			wakeup(&fdp->fd_holdleaderscount);
 		}
 		FILEDESC_XUNLOCK(fdp);
 	}
 	return (error);
 }
 
 static int
 closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
     bool holdleaders, bool audit)
 {
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	if (__predict_false(td->td_proc->p_fdtol != NULL)) {
 		return (closefp_hl(fdp, fd, fp, td, holdleaders, audit));
 	} else {
 		return (closefp_impl(fdp, fd, fp, td, audit));
 	}
 }
 
 /*
  * Close a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct close_args {
 	int     fd;
 };
 #endif
 /* ARGSUSED */
 int
 sys_close(struct thread *td, struct close_args *uap)
 {
 
 	return (kern_close(td, uap->fd));
 }
 
 int
 kern_close(struct thread *td, int fd)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 
 	fdp = td->td_proc->p_fd;
 
 	FILEDESC_XLOCK(fdp);
 	if ((fp = fget_locked(fdp, fd)) == NULL) {
 		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
 	fdfree(fdp, fd);
 
 	/* closefp() drops the FILEDESC lock for us. */
 	return (closefp(fdp, fd, fp, td, true, true));
 }
 
 int
 kern_close_range(struct thread *td, u_int lowfd, u_int highfd)
 {
 	struct filedesc *fdp;
 	const struct fdescenttbl *fdt;
 	struct file *fp;
 	int fd;
 
 	/*
 	 * Check this prior to clamping; closefrom(3) with only fd 0, 1, and 2
 	 * open should not be a usage error.  From a close_range() perspective,
 	 * close_range(3, ~0U, 0) in the same scenario should also likely not
 	 * be a usage error as all fd above 3 are in-fact already closed.
 	 */
 	if (highfd < lowfd) {
 		return (EINVAL);
 	}
 
 	fdp = td->td_proc->p_fd;
 	FILEDESC_XLOCK(fdp);
 	fdt = atomic_load_ptr(&fdp->fd_files);
 	highfd = MIN(highfd, fdt->fdt_nfiles - 1);
 	fd = lowfd;
 	if (__predict_false(fd > highfd)) {
 		goto out_locked;
 	}
 	for (;;) {
 		fp = fdt->fdt_ofiles[fd].fde_file;
 		if (fp == NULL) {
 			if (fd == highfd)
 				goto out_locked;
 		} else {
 			fdfree(fdp, fd);
 			(void) closefp(fdp, fd, fp, td, true, true);
 			if (fd == highfd)
 				goto out_unlocked;
 			FILEDESC_XLOCK(fdp);
 			fdt = atomic_load_ptr(&fdp->fd_files);
 		}
 		fd++;
 	}
 out_locked:
 	FILEDESC_XUNLOCK(fdp);
 out_unlocked:
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct close_range_args {
 	u_int	lowfd;
 	u_int	highfd;
 	int	flags;
 };
 #endif
 int
 sys_close_range(struct thread *td, struct close_range_args *uap)
 {
 
 	AUDIT_ARG_FD(uap->lowfd);
 	AUDIT_ARG_CMD(uap->highfd);
 	AUDIT_ARG_FFLAGS(uap->flags);
 
 	/* No flags currently defined */
 	if (uap->flags != 0)
 		return (EINVAL);
 	return (kern_close_range(td, uap->lowfd, uap->highfd));
 }
 
 #ifdef COMPAT_FREEBSD12
 /*
  * Close open file descriptors.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd12_closefrom_args {
 	int	lowfd;
 };
 #endif
 /* ARGSUSED */
 int
 freebsd12_closefrom(struct thread *td, struct freebsd12_closefrom_args *uap)
 {
 	u_int lowfd;
 
 	AUDIT_ARG_FD(uap->lowfd);
 
 	/*
 	 * Treat negative starting file descriptor values identical to
 	 * closefrom(0) which closes all files.
 	 */
 	lowfd = MAX(0, uap->lowfd);
 	return (kern_close_range(td, lowfd, ~0U));
 }
 #endif	/* COMPAT_FREEBSD12 */
 
 #if defined(COMPAT_43)
 /*
  * Return status information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ofstat_args {
 	int	fd;
 	struct	ostat *sb;
 };
 #endif
 /* ARGSUSED */
 int
 ofstat(struct thread *td, struct ofstat_args *uap)
 {
 	struct ostat oub;
 	struct stat ub;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error == 0) {
 		cvtstat(&ub, &oub);
 		error = copyout(&oub, uap->sb, sizeof(oub));
 	}
 	return (error);
 }
 #endif /* COMPAT_43 */
 
 #if defined(COMPAT_FREEBSD11)
 int
 freebsd11_fstat(struct thread *td, struct freebsd11_fstat_args *uap)
 {
 	struct stat sb;
 	struct freebsd11_stat osb;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &sb);
 	if (error != 0)
 		return (error);
 	error = freebsd11_cvtstat(&sb, &osb);
 	if (error == 0)
 		error = copyout(&osb, uap->sb, sizeof(osb));
 	return (error);
 }
 #endif	/* COMPAT_FREEBSD11 */
 
 /*
  * Return status information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fstat_args {
 	int	fd;
 	struct	stat *sb;
 };
 #endif
 /* ARGSUSED */
 int
 sys_fstat(struct thread *td, struct fstat_args *uap)
 {
 	struct stat ub;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error == 0)
 		error = copyout(&ub, uap->sb, sizeof(ub));
 	return (error);
 }
 
 int
 kern_fstat(struct thread *td, int fd, struct stat *sbp)
 {
 	struct file *fp;
 	int error;
 
 	AUDIT_ARG_FD(fd);
 
 	error = fget(td, fd, &cap_fstat_rights, &fp);
 	if (__predict_false(error != 0))
 		return (error);
 
 	AUDIT_ARG_FILE(td->td_proc, fp);
 
 	error = fo_stat(fp, sbp, td->td_ucred, td);
 	fdrop(fp, td);
 #ifdef __STAT_TIME_T_EXT
 	sbp->st_atim_ext = 0;
 	sbp->st_mtim_ext = 0;
 	sbp->st_ctim_ext = 0;
 	sbp->st_btim_ext = 0;
 #endif
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_STRUCT))
 		ktrstat_error(sbp, error);
 #endif
 	return (error);
 }
 
 #if defined(COMPAT_FREEBSD11)
 /*
  * Return status information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd11_nfstat_args {
 	int	fd;
 	struct	nstat *sb;
 };
 #endif
 /* ARGSUSED */
 int
 freebsd11_nfstat(struct thread *td, struct freebsd11_nfstat_args *uap)
 {
 	struct nstat nub;
 	struct stat ub;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error == 0) {
 		freebsd11_cvtnstat(&ub, &nub);
 		error = copyout(&nub, uap->sb, sizeof(nub));
 	}
 	return (error);
 }
 #endif /* COMPAT_FREEBSD11 */
 
 /*
  * Return pathconf information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fpathconf_args {
 	int	fd;
 	int	name;
 };
 #endif
 /* ARGSUSED */
 int
 sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
 {
 	long value;
 	int error;
 
 	error = kern_fpathconf(td, uap->fd, uap->name, &value);
 	if (error == 0)
 		td->td_retval[0] = value;
 	return (error);
 }
 
 int
 kern_fpathconf(struct thread *td, int fd, int name, long *valuep)
 {
 	struct file *fp;
 	struct vnode *vp;
 	int error;
 
 	error = fget(td, fd, &cap_fpathconf_rights, &fp);
 	if (error != 0)
 		return (error);
 
 	if (name == _PC_ASYNC_IO) {
 		*valuep = _POSIX_ASYNCHRONOUS_IO;
 		goto out;
 	}
 	vp = fp->f_vnode;
 	if (vp != NULL) {
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		error = VOP_PATHCONF(vp, name, valuep);
 		VOP_UNLOCK(vp);
 	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
 		if (name != _PC_PIPE_BUF) {
 			error = EINVAL;
 		} else {
 			*valuep = PIPE_BUF;
 			error = 0;
 		}
 	} else {
 		error = EOPNOTSUPP;
 	}
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Copy filecaps structure allocating memory for ioctls array if needed.
  *
  * The last parameter indicates whether the fdtable is locked. If it is not and
  * ioctls are encountered, copying fails and the caller must lock the table.
  *
  * Note that if the table was not locked, the caller has to check the relevant
  * sequence counter to determine whether the operation was successful.
  */
 bool
 filecaps_copy(const struct filecaps *src, struct filecaps *dst, bool locked)
 {
 	size_t size;
 
 	if (src->fc_ioctls != NULL && !locked)
 		return (false);
 	memcpy(dst, src, sizeof(*src));
 	if (src->fc_ioctls == NULL)
 		return (true);
 
 	KASSERT(src->fc_nioctls > 0,
 	    ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
 
 	size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
 	dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK);
 	memcpy(dst->fc_ioctls, src->fc_ioctls, size);
 	return (true);
 }
 
 static u_long *
 filecaps_copy_prep(const struct filecaps *src)
 {
 	u_long *ioctls;
 	size_t size;
 
 	if (__predict_true(src->fc_ioctls == NULL))
 		return (NULL);
 
 	KASSERT(src->fc_nioctls > 0,
 	    ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
 
 	size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
 	ioctls = malloc(size, M_FILECAPS, M_WAITOK);
 	return (ioctls);
 }
 
 static void
 filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst,
     u_long *ioctls)
 {
 	size_t size;
 
 	*dst = *src;
 	if (__predict_true(src->fc_ioctls == NULL)) {
 		MPASS(ioctls == NULL);
 		return;
 	}
 
 	size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
 	dst->fc_ioctls = ioctls;
 	bcopy(src->fc_ioctls, dst->fc_ioctls, size);
 }
 
 /*
  * Move filecaps structure to the new place and clear the old place.
  */
 void
 filecaps_move(struct filecaps *src, struct filecaps *dst)
 {
 
 	*dst = *src;
 	bzero(src, sizeof(*src));
 }
 
 /*
  * Fill the given filecaps structure with full rights.
  */
 static void
 filecaps_fill(struct filecaps *fcaps)
 {
 
 	CAP_ALL(&fcaps->fc_rights);
 	fcaps->fc_ioctls = NULL;
 	fcaps->fc_nioctls = -1;
 	fcaps->fc_fcntls = CAP_FCNTL_ALL;
 }
 
 /*
  * Free memory allocated within filecaps structure.
  */
 void
 filecaps_free(struct filecaps *fcaps)
 {
 
 	free(fcaps->fc_ioctls, M_FILECAPS);
 	bzero(fcaps, sizeof(*fcaps));
 }
 
 static u_long *
 filecaps_free_prep(struct filecaps *fcaps)
 {
 	u_long *ioctls;
 
 	ioctls = fcaps->fc_ioctls;
 	bzero(fcaps, sizeof(*fcaps));
 	return (ioctls);
 }
 
 static void
 filecaps_free_finish(u_long *ioctls)
 {
 
 	free(ioctls, M_FILECAPS);
 }
 
 /*
  * Validate the given filecaps structure.
  */
 static void
 filecaps_validate(const struct filecaps *fcaps, const char *func)
 {
 
 	KASSERT(cap_rights_is_valid(&fcaps->fc_rights),
 	    ("%s: invalid rights", func));
 	KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0,
 	    ("%s: invalid fcntls", func));
 	KASSERT(fcaps->fc_fcntls == 0 ||
 	    cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL),
 	    ("%s: fcntls without CAP_FCNTL", func));
 	KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 :
 	    (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0),
 	    ("%s: invalid ioctls", func));
 	KASSERT(fcaps->fc_nioctls == 0 ||
 	    cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL),
 	    ("%s: ioctls without CAP_IOCTL", func));
 }
 
 static void
 fdgrowtable_exp(struct filedesc *fdp, int nfd)
 {
 	int nfd1;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	nfd1 = fdp->fd_nfiles * 2;
 	if (nfd1 < nfd)
 		nfd1 = nfd;
 	fdgrowtable(fdp, nfd1);
 }
 
 /*
  * Grow the file table to accommodate (at least) nfd descriptors.
  */
 static void
 fdgrowtable(struct filedesc *fdp, int nfd)
 {
 	struct filedesc0 *fdp0;
 	struct freetable *ft;
 	struct fdescenttbl *ntable;
 	struct fdescenttbl *otable;
 	int nnfiles, onfiles;
 	NDSLOTTYPE *nmap, *omap;
 
 	KASSERT(fdp->fd_nfiles > 0, ("zero-length file table"));
 
 	/* save old values */
 	onfiles = fdp->fd_nfiles;
 	otable = fdp->fd_files;
 	omap = fdp->fd_map;
 
 	/* compute the size of the new table */
 	nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
 	if (nnfiles <= onfiles)
 		/* the table is already large enough */
 		return;
 
 	/*
 	 * Allocate a new table.  We need enough space for the number of
 	 * entries, file entries themselves and the struct freetable we will use
 	 * when we decommission the table and place it on the freelist.
 	 * We place the struct freetable in the middle so we don't have
 	 * to worry about padding.
 	 */
 	ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) +
 	    nnfiles * sizeof(ntable->fdt_ofiles[0]) +
 	    sizeof(struct freetable),
 	    M_FILEDESC, M_ZERO | M_WAITOK);
 	/* copy the old data */
 	ntable->fdt_nfiles = nnfiles;
 	memcpy(ntable->fdt_ofiles, otable->fdt_ofiles,
 	    onfiles * sizeof(ntable->fdt_ofiles[0]));
 
 	/*
 	 * Allocate a new map only if the old is not large enough.  It will
 	 * grow at a slower rate than the table as it can map more
 	 * entries than the table can hold.
 	 */
 	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
 		nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC,
 		    M_ZERO | M_WAITOK);
 		/* copy over the old data and update the pointer */
 		memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap));
 		fdp->fd_map = nmap;
 	}
 
 	/*
 	 * Make sure that ntable is correctly initialized before we replace
 	 * fd_files poiner. Otherwise fget_unlocked() may see inconsistent
 	 * data.
 	 */
 	atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable);
 
 	/*
 	 * Free the old file table when not shared by other threads or processes.
 	 * The old file table is considered to be shared when either are true:
 	 * - The process has more than one thread.
 	 * - The file descriptor table has been shared via fdshare().
 	 *
 	 * When shared, the old file table will be placed on a freelist
 	 * which will be processed when the struct filedesc is released.
 	 *
 	 * Note that if onfiles == NDFILE, we're dealing with the original
 	 * static allocation contained within (struct filedesc0 *)fdp,
 	 * which must not be freed.
 	 */
 	if (onfiles > NDFILE) {
 		/*
 		 * Note we may be called here from fdinit while allocating a
 		 * table for a new process in which case ->p_fd points
 		 * elsewhere.
 		 */
 		if (curproc->p_fd != fdp || FILEDESC_IS_ONLY_USER(fdp)) {
 			free(otable, M_FILEDESC);
 		} else {
 			ft = (struct freetable *)&otable->fdt_ofiles[onfiles];
 			fdp0 = (struct filedesc0 *)fdp;
 			ft->ft_table = otable;
 			SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next);
 		}
 	}
 	/*
 	 * The map does not have the same possibility of threads still
 	 * holding references to it.  So always free it as long as it
 	 * does not reference the original static allocation.
 	 */
 	if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
 		free(omap, M_FILEDESC);
 }
 
 /*
  * Allocate a file descriptor for the process.
  */
 int
 fdalloc(struct thread *td, int minfd, int *result)
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp = p->p_fd;
 	int fd, maxfd, allocfd;
 #ifdef RACCT
 	int error;
 #endif
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	if (fdp->fd_freefile > minfd)
 		minfd = fdp->fd_freefile;
 
 	maxfd = getmaxfd(td);
 
 	/*
 	 * Search the bitmap for a free descriptor starting at minfd.
 	 * If none is found, grow the file table.
 	 */
 	fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
 	if (__predict_false(fd >= maxfd))
 		return (EMFILE);
 	if (__predict_false(fd >= fdp->fd_nfiles)) {
 		allocfd = min(fd * 2, maxfd);
 #ifdef RACCT
 		if (RACCT_ENABLED()) {
 			error = racct_set_unlocked(p, RACCT_NOFILE, allocfd);
 			if (error != 0)
 				return (EMFILE);
 		}
 #endif
 		/*
 		 * fd is already equal to first free descriptor >= minfd, so
 		 * we only need to grow the table and we are done.
 		 */
 		fdgrowtable_exp(fdp, allocfd);
 	}
 
 	/*
 	 * Perform some sanity checks, then mark the file descriptor as
 	 * used and return it to the caller.
 	 */
 	KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
 	    ("invalid descriptor %d", fd));
 	KASSERT(!fdisused(fdp, fd),
 	    ("fd_first_free() returned non-free descriptor"));
 	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
 	    ("file descriptor isn't free"));
 	fdused(fdp, fd);
 	*result = fd;
 	return (0);
 }
 
 /*
  * Allocate n file descriptors for the process.
  */
 int
 fdallocn(struct thread *td, int minfd, int *fds, int n)
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp = p->p_fd;
 	int i;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	for (i = 0; i < n; i++)
 		if (fdalloc(td, 0, &fds[i]) != 0)
 			break;
 
 	if (i < n) {
 		for (i--; i >= 0; i--)
 			fdunused(fdp, fds[i]);
 		return (EMFILE);
 	}
 
 	return (0);
 }
 
 /*
  * Create a new open file structure and allocate a file descriptor for the
  * process that refers to it.  We add one reference to the file for the
  * descriptor table and one reference for resultfp. This is to prevent us
  * being preempted and the entry in the descriptor table closed after we
  * release the FILEDESC lock.
  */
 int
 falloc_caps(struct thread *td, struct file **resultfp, int *resultfd, int flags,
     struct filecaps *fcaps)
 {
 	struct file *fp;
 	int error, fd;
 
 	MPASS(resultfp != NULL);
 	MPASS(resultfd != NULL);
 
 	error = _falloc_noinstall(td, &fp, 2);
 	if (__predict_false(error != 0)) {
 		return (error);
 	}
 
 	error = finstall_refed(td, fp, &fd, flags, fcaps);
 	if (__predict_false(error != 0)) {
 		falloc_abort(td, fp);
 		return (error);
 	}
 
 	*resultfp = fp;
 	*resultfd = fd;
 
 	return (0);
 }
 
 /*
  * Create a new open file structure without allocating a file descriptor.
  */
 int
 _falloc_noinstall(struct thread *td, struct file **resultfp, u_int n)
 {
 	struct file *fp;
 	int maxuserfiles = maxfiles - (maxfiles / 20);
 	int openfiles_new;
 	static struct timeval lastfail;
 	static int curfail;
 
 	KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
 	MPASS(n > 0);
 
 	openfiles_new = atomic_fetchadd_int(&openfiles, 1) + 1;
 	if ((openfiles_new >= maxuserfiles &&
 	    priv_check(td, PRIV_MAXFILES) != 0) ||
 	    openfiles_new >= maxfiles) {
 		atomic_subtract_int(&openfiles, 1);
 		if (ppsratecheck(&lastfail, &curfail, 1)) {
 			printf("kern.maxfiles limit exceeded by uid %i, (%s) "
 			    "please see tuning(7).\n", td->td_ucred->cr_ruid, td->td_proc->p_comm);
 		}
 		return (ENFILE);
 	}
 	fp = uma_zalloc(file_zone, M_WAITOK);
 	bzero(fp, sizeof(*fp));
 	refcount_init(&fp->f_count, n);
 	fp->f_cred = crhold(td->td_ucred);
 	fp->f_ops = &badfileops;
 	*resultfp = fp;
 	return (0);
 }
 
 void
 falloc_abort(struct thread *td, struct file *fp)
 {
 
 	/*
 	 * For assertion purposes.
 	 */
 	refcount_init(&fp->f_count, 0);
 	_fdrop(fp, td);
 }
 
 /*
  * Install a file in a file descriptor table.
  */
 void
 _finstall(struct filedesc *fdp, struct file *fp, int fd, int flags,
     struct filecaps *fcaps)
 {
 	struct filedescent *fde;
 
 	MPASS(fp != NULL);
 	if (fcaps != NULL)
 		filecaps_validate(fcaps, __func__);
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	fde = &fdp->fd_ofiles[fd];
 #ifdef CAPABILITIES
 	seqc_write_begin(&fde->fde_seqc);
 #endif
 	fde->fde_file = fp;
 	fde->fde_flags = (flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0;
 	if (fcaps != NULL)
 		filecaps_move(fcaps, &fde->fde_caps);
 	else
 		filecaps_fill(&fde->fde_caps);
 #ifdef CAPABILITIES
 	seqc_write_end(&fde->fde_seqc);
 #endif
 }
 
 int
 finstall_refed(struct thread *td, struct file *fp, int *fd, int flags,
     struct filecaps *fcaps)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 	int error;
 
 	MPASS(fd != NULL);
 
 	FILEDESC_XLOCK(fdp);
 	error = fdalloc(td, 0, fd);
 	if (__predict_true(error == 0)) {
 		_finstall(fdp, fp, *fd, flags, fcaps);
 	}
 	FILEDESC_XUNLOCK(fdp);
 	return (error);
 }
 
 int
 finstall(struct thread *td, struct file *fp, int *fd, int flags,
     struct filecaps *fcaps)
 {
 	int error;
 
 	MPASS(fd != NULL);
 
 	if (!fhold(fp))
 		return (EBADF);
 	error = finstall_refed(td, fp, fd, flags, fcaps);
 	if (__predict_false(error != 0)) {
 		fdrop(fp, td);
 	}
 	return (error);
 }
 
 /*
  * Build a new filedesc structure from another.
  *
  * If fdp is not NULL, return with it shared locked.
  */
 struct filedesc *
 fdinit(struct filedesc *fdp, bool prepfiles, int *lastfile)
 {
 	struct filedesc0 *newfdp0;
 	struct filedesc *newfdp;
 
 	if (prepfiles)
 		MPASS(lastfile != NULL);
 	else
 		MPASS(lastfile == NULL);
 
 	newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO);
 	newfdp = &newfdp0->fd_fd;
 
 	/* Create the file descriptor table. */
 	FILEDESC_LOCK_INIT(newfdp);
 	refcount_init(&newfdp->fd_refcnt, 1);
 	refcount_init(&newfdp->fd_holdcnt, 1);
 	newfdp->fd_map = newfdp0->fd_dmap;
 	newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles;
 	newfdp->fd_files->fdt_nfiles = NDFILE;
 
 	if (fdp == NULL)
 		return (newfdp);
 
 	FILEDESC_SLOCK(fdp);
 	if (!prepfiles) {
 		FILEDESC_SUNLOCK(fdp);
 		return (newfdp);
 	}
 
 	for (;;) {
 		*lastfile = fdlastfile(fdp);
 		if (*lastfile < newfdp->fd_nfiles)
 			break;
 		FILEDESC_SUNLOCK(fdp);
 		fdgrowtable(newfdp, *lastfile + 1);
 		FILEDESC_SLOCK(fdp);
 	}
 
 	return (newfdp);
 }
 
 /*
  * Build a pwddesc structure from another.
  * Copy the current, root, and jail root vnode references.
  *
  * If pdp is not NULL, return with it shared locked.
  */
 struct pwddesc *
 pdinit(struct pwddesc *pdp, bool keeplock)
 {
 	struct pwddesc *newpdp;
 	struct pwd *newpwd;
 
 	newpdp = malloc(sizeof(*newpdp), M_PWDDESC, M_WAITOK | M_ZERO);
 
 	PWDDESC_LOCK_INIT(newpdp);
 	refcount_init(&newpdp->pd_refcount, 1);
 	newpdp->pd_cmask = CMASK;
 
 	if (pdp == NULL) {
 		newpwd = pwd_alloc();
 		smr_serialized_store(&newpdp->pd_pwd, newpwd, true);
 		return (newpdp);
 	}
 
 	PWDDESC_XLOCK(pdp);
 	newpwd = pwd_hold_pwddesc(pdp);
 	smr_serialized_store(&newpdp->pd_pwd, newpwd, true);
 	if (!keeplock)
 		PWDDESC_XUNLOCK(pdp);
 	return (newpdp);
 }
 
 static struct filedesc *
 fdhold(struct proc *p)
 {
 	struct filedesc *fdp;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	fdp = p->p_fd;
 	if (fdp != NULL)
 		refcount_acquire(&fdp->fd_holdcnt);
 	return (fdp);
 }
 
 static struct pwddesc *
 pdhold(struct proc *p)
 {
 	struct pwddesc *pdp;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	pdp = p->p_pd;
 	if (pdp != NULL)
 		refcount_acquire(&pdp->pd_refcount);
 	return (pdp);
 }
 
 static void
 fddrop(struct filedesc *fdp)
 {
 
 	if (refcount_load(&fdp->fd_holdcnt) > 1) {
 		if (refcount_release(&fdp->fd_holdcnt) == 0)
 			return;
 	}
 
 	FILEDESC_LOCK_DESTROY(fdp);
 	uma_zfree(filedesc0_zone, fdp);
 }
 
 static void
 pddrop(struct pwddesc *pdp)
 {
 	struct pwd *pwd;
 
 	if (refcount_release_if_not_last(&pdp->pd_refcount))
 		return;
 
 	PWDDESC_XLOCK(pdp);
 	if (refcount_release(&pdp->pd_refcount) == 0) {
 		PWDDESC_XUNLOCK(pdp);
 		return;
 	}
 	pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 	pwd_set(pdp, NULL);
 	PWDDESC_XUNLOCK(pdp);
 	pwd_drop(pwd);
 
 	PWDDESC_LOCK_DESTROY(pdp);
 	free(pdp, M_PWDDESC);
 }
 
 /*
  * Share a filedesc structure.
  */
 struct filedesc *
 fdshare(struct filedesc *fdp)
 {
 
 	refcount_acquire(&fdp->fd_refcnt);
 	return (fdp);
 }
 
 /*
  * Share a pwddesc structure.
  */
 struct pwddesc *
 pdshare(struct pwddesc *pdp)
 {
 	refcount_acquire(&pdp->pd_refcount);
 	return (pdp);
 }
 
 /*
  * Unshare a filedesc structure, if necessary by making a copy
  */
 void
 fdunshare(struct thread *td)
 {
 	struct filedesc *tmp;
 	struct proc *p = td->td_proc;
 
 	if (refcount_load(&p->p_fd->fd_refcnt) == 1)
 		return;
 
 	tmp = fdcopy(p->p_fd);
 	fdescfree(td);
 	p->p_fd = tmp;
 }
 
 /*
  * Unshare a pwddesc structure.
  */
 void
 pdunshare(struct thread *td)
 {
 	struct pwddesc *pdp;
 	struct proc *p;
 
 	p = td->td_proc;
 	/* Not shared. */
 	if (p->p_pd->pd_refcount == 1)
 		return;
 
 	pdp = pdcopy(p->p_pd);
 	pdescfree(td);
 	p->p_pd = pdp;
 }
 
 void
 fdinstall_remapped(struct thread *td, struct filedesc *fdp)
 {
 
 	fdescfree(td);
 	td->td_proc->p_fd = fdp;
 }
 
 /*
  * Copy a filedesc structure.  A NULL pointer in returns a NULL reference,
  * this is to ease callers, not catch errors.
  */
 struct filedesc *
 fdcopy(struct filedesc *fdp)
 {
 	struct filedesc *newfdp;
 	struct filedescent *nfde, *ofde;
 	int i, lastfile;
 
 	MPASS(fdp != NULL);
 
 	newfdp = fdinit(fdp, true, &lastfile);
 	/* copy all passable descriptors (i.e. not kqueue) */
 	newfdp->fd_freefile = -1;
 	for (i = 0; i <= lastfile; ++i) {
 		ofde = &fdp->fd_ofiles[i];
 		if (ofde->fde_file == NULL ||
 		    (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0 ||
 		    !fhold(ofde->fde_file)) {
 			if (newfdp->fd_freefile == -1)
 				newfdp->fd_freefile = i;
 			continue;
 		}
 		nfde = &newfdp->fd_ofiles[i];
 		*nfde = *ofde;
 		filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true);
 		fdused_init(newfdp, i);
 	}
 	if (newfdp->fd_freefile == -1)
 		newfdp->fd_freefile = i;
 	FILEDESC_SUNLOCK(fdp);
 	return (newfdp);
 }
 
 /*
  * Copy a pwddesc structure.
  */
 struct pwddesc *
 pdcopy(struct pwddesc *pdp)
 {
 	struct pwddesc *newpdp;
 
 	MPASS(pdp != NULL);
 
 	newpdp = pdinit(pdp, true);
 	newpdp->pd_cmask = pdp->pd_cmask;
 	PWDDESC_XUNLOCK(pdp);
 	return (newpdp);
 }
 
 /*
  * Copies a filedesc structure, while remapping all file descriptors
  * stored inside using a translation table.
  *
  * File descriptors are copied over to the new file descriptor table,
  * regardless of whether the close-on-exec flag is set.
  */
 int
 fdcopy_remapped(struct filedesc *fdp, const int *fds, size_t nfds,
     struct filedesc **ret)
 {
 	struct filedesc *newfdp;
 	struct filedescent *nfde, *ofde;
 	int error, i, lastfile;
 
 	MPASS(fdp != NULL);
 
 	newfdp = fdinit(fdp, true, &lastfile);
 	if (nfds > lastfile + 1) {
 		/* New table cannot be larger than the old one. */
 		error = E2BIG;
 		goto bad;
 	}
 	/* Copy all passable descriptors (i.e. not kqueue). */
 	newfdp->fd_freefile = nfds;
 	for (i = 0; i < nfds; ++i) {
 		if (fds[i] < 0 || fds[i] > lastfile) {
 			/* File descriptor out of bounds. */
 			error = EBADF;
 			goto bad;
 		}
 		ofde = &fdp->fd_ofiles[fds[i]];
 		if (ofde->fde_file == NULL) {
 			/* Unused file descriptor. */
 			error = EBADF;
 			goto bad;
 		}
 		if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0) {
 			/* File descriptor cannot be passed. */
 			error = EINVAL;
 			goto bad;
 		}
 		if (!fhold(ofde->fde_file)) {
 			error = EBADF;
 			goto bad;
 		}
 		nfde = &newfdp->fd_ofiles[i];
 		*nfde = *ofde;
 		filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true);
 		fdused_init(newfdp, i);
 	}
 	FILEDESC_SUNLOCK(fdp);
 	*ret = newfdp;
 	return (0);
 bad:
 	FILEDESC_SUNLOCK(fdp);
 	fdescfree_remapped(newfdp);
 	return (error);
 }
 
 /*
  * Clear POSIX style locks. This is only used when fdp looses a reference (i.e.
  * one of processes using it exits) and the table used to be shared.
  */
 static void
 fdclearlocks(struct thread *td)
 {
 	struct filedesc *fdp;
 	struct filedesc_to_leader *fdtol;
 	struct flock lf;
 	struct file *fp;
 	struct proc *p;
 	struct vnode *vp;
 	int i, lastfile;
 
 	p = td->td_proc;
 	fdp = p->p_fd;
 	fdtol = p->p_fdtol;
 	MPASS(fdtol != NULL);
 
 	FILEDESC_XLOCK(fdp);
 	KASSERT(fdtol->fdl_refcount > 0,
 	    ("filedesc_to_refcount botch: fdl_refcount=%d",
 	    fdtol->fdl_refcount));
 	if (fdtol->fdl_refcount == 1 &&
 	    (p->p_leader->p_flag & P_ADVLOCK) != 0) {
 		lastfile = fdlastfile(fdp);
 		for (i = 0; i <= lastfile; i++) {
 			fp = fdp->fd_ofiles[i].fde_file;
 			if (fp == NULL || fp->f_type != DTYPE_VNODE ||
 			    !fhold(fp))
 				continue;
 			FILEDESC_XUNLOCK(fdp);
 			lf.l_whence = SEEK_SET;
 			lf.l_start = 0;
 			lf.l_len = 0;
 			lf.l_type = F_UNLCK;
 			vp = fp->f_vnode;
 			(void) VOP_ADVLOCK(vp,
 			    (caddr_t)p->p_leader, F_UNLCK,
 			    &lf, F_POSIX);
 			FILEDESC_XLOCK(fdp);
 			fdrop(fp, td);
 		}
 	}
 retry:
 	if (fdtol->fdl_refcount == 1) {
 		if (fdp->fd_holdleaderscount > 0 &&
 		    (p->p_leader->p_flag & P_ADVLOCK) != 0) {
 			/*
 			 * close() or kern_dup() has cleared a reference
 			 * in a shared file descriptor table.
 			 */
 			fdp->fd_holdleaderswakeup = 1;
 			sx_sleep(&fdp->fd_holdleaderscount,
 			    FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
 			goto retry;
 		}
 		if (fdtol->fdl_holdcount > 0) {
 			/*
 			 * Ensure that fdtol->fdl_leader remains
 			 * valid in closef().
 			 */
 			fdtol->fdl_wakeup = 1;
 			sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
 			    "fdlhold", 0);
 			goto retry;
 		}
 	}
 	fdtol->fdl_refcount--;
 	if (fdtol->fdl_refcount == 0 &&
 	    fdtol->fdl_holdcount == 0) {
 		fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
 		fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
 	} else
 		fdtol = NULL;
 	p->p_fdtol = NULL;
 	FILEDESC_XUNLOCK(fdp);
 	if (fdtol != NULL)
 		free(fdtol, M_FILEDESC_TO_LEADER);
 }
 
 /*
  * Release a filedesc structure.
  */
 static void
 fdescfree_fds(struct thread *td, struct filedesc *fdp, bool needclose)
 {
 	struct filedesc0 *fdp0;
 	struct freetable *ft, *tft;
 	struct filedescent *fde;
 	struct file *fp;
 	int i, lastfile;
 
 	KASSERT(refcount_load(&fdp->fd_refcnt) == 0,
 	    ("%s: fd table %p carries references", __func__, fdp));
 
 	/*
 	 * Serialize with threads iterating over the table, if any.
 	 */
 	if (refcount_load(&fdp->fd_holdcnt) > 1) {
 		FILEDESC_XLOCK(fdp);
 		FILEDESC_XUNLOCK(fdp);
 	}
 
 	lastfile = fdlastfile_single(fdp);
 	for (i = 0; i <= lastfile; i++) {
 		fde = &fdp->fd_ofiles[i];
 		fp = fde->fde_file;
 		if (fp != NULL) {
 			fdefree_last(fde);
 			if (needclose)
 				(void) closef(fp, td);
 			else
 				fdrop(fp, td);
 		}
 	}
 
 	if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
 		free(fdp->fd_map, M_FILEDESC);
 	if (fdp->fd_nfiles > NDFILE)
 		free(fdp->fd_files, M_FILEDESC);
 
 	fdp0 = (struct filedesc0 *)fdp;
 	SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft)
 		free(ft->ft_table, M_FILEDESC);
 
 	fddrop(fdp);
 }
 
 void
 fdescfree(struct thread *td)
 {
 	struct proc *p;
 	struct filedesc *fdp;
 
 	p = td->td_proc;
 	fdp = p->p_fd;
 	MPASS(fdp != NULL);
 
 #ifdef RACCT
 	if (RACCT_ENABLED())
 		racct_set_unlocked(p, RACCT_NOFILE, 0);
 #endif
 
 	if (p->p_fdtol != NULL)
 		fdclearlocks(td);
 
 	PROC_LOCK(p);
 	p->p_fd = NULL;
 	PROC_UNLOCK(p);
 
 	if (refcount_release(&fdp->fd_refcnt) == 0)
 		return;
 
 	fdescfree_fds(td, fdp, 1);
 }
 
 void
 pdescfree(struct thread *td)
 {
 	struct proc *p;
 	struct pwddesc *pdp;
 
 	p = td->td_proc;
 	pdp = p->p_pd;
 	MPASS(pdp != NULL);
 
 	PROC_LOCK(p);
 	p->p_pd = NULL;
 	PROC_UNLOCK(p);
 
 	pddrop(pdp);
 }
 
 void
 fdescfree_remapped(struct filedesc *fdp)
 {
 #ifdef INVARIANTS
 	/* fdescfree_fds() asserts that fd_refcnt == 0. */
 	if (!refcount_release(&fdp->fd_refcnt))
 		panic("%s: fd table %p has extra references", __func__, fdp);
 #endif
 	fdescfree_fds(curthread, fdp, 0);
 }
 
 /*
  * For setugid programs, we don't want to people to use that setugidness
  * to generate error messages which write to a file which otherwise would
  * otherwise be off-limits to the process.  We check for filesystems where
  * the vnode can change out from under us after execve (like [lin]procfs).
  *
  * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is
  * sufficient.  We also don't check for setugidness since we know we are.
  */
 static bool
 is_unsafe(struct file *fp)
 {
 	struct vnode *vp;
 
 	if (fp->f_type != DTYPE_VNODE)
 		return (false);
 
 	vp = fp->f_vnode;
 	return ((vp->v_vflag & VV_PROCDEP) != 0);
 }
 
 /*
  * Make this setguid thing safe, if at all possible.
  */
 void
 fdsetugidsafety(struct thread *td)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 	int i;
 
 	fdp = td->td_proc->p_fd;
 	KASSERT(refcount_load(&fdp->fd_refcnt) == 1,
 	    ("the fdtable should not be shared"));
 	MPASS(fdp->fd_nfiles >= 3);
 	for (i = 0; i <= 2; i++) {
 		fp = fdp->fd_ofiles[i].fde_file;
 		if (fp != NULL && is_unsafe(fp)) {
 			FILEDESC_XLOCK(fdp);
 			knote_fdclose(td, i);
 			/*
 			 * NULL-out descriptor prior to close to avoid
 			 * a race while close blocks.
 			 */
 			fdfree(fdp, i);
 			FILEDESC_XUNLOCK(fdp);
 			(void) closef(fp, td);
 		}
 	}
 }
 
 /*
  * If a specific file object occupies a specific file descriptor, close the
  * file descriptor entry and drop a reference on the file object.  This is a
  * convenience function to handle a subsequent error in a function that calls
  * falloc() that handles the race that another thread might have closed the
  * file descriptor out from under the thread creating the file object.
  */
 void
 fdclose(struct thread *td, struct file *fp, int idx)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 
 	FILEDESC_XLOCK(fdp);
 	if (fdp->fd_ofiles[idx].fde_file == fp) {
 		fdfree(fdp, idx);
 		FILEDESC_XUNLOCK(fdp);
 		fdrop(fp, td);
 	} else
 		FILEDESC_XUNLOCK(fdp);
 }
 
 /*
  * Close any files on exec?
  */
 void
 fdcloseexec(struct thread *td)
 {
 	struct filedesc *fdp;
 	struct filedescent *fde;
 	struct file *fp;
 	int i, lastfile;
 
 	fdp = td->td_proc->p_fd;
 	KASSERT(refcount_load(&fdp->fd_refcnt) == 1,
 	    ("the fdtable should not be shared"));
 	lastfile = fdlastfile_single(fdp);
 	for (i = 0; i <= lastfile; i++) {
 		fde = &fdp->fd_ofiles[i];
 		fp = fde->fde_file;
 		if (fp != NULL && (fp->f_type == DTYPE_MQUEUE ||
 		    (fde->fde_flags & UF_EXCLOSE))) {
 			FILEDESC_XLOCK(fdp);
 			fdfree(fdp, i);
 			(void) closefp(fdp, i, fp, td, false, false);
 			FILEDESC_UNLOCK_ASSERT(fdp);
 		}
 	}
 }
 
 /*
  * It is unsafe for set[ug]id processes to be started with file
  * descriptors 0..2 closed, as these descriptors are given implicit
  * significance in the Standard C library.  fdcheckstd() will create a
  * descriptor referencing /dev/null for each of stdin, stdout, and
  * stderr that is not already open.
  */
 int
 fdcheckstd(struct thread *td)
 {
 	struct filedesc *fdp;
 	register_t save;
 	int i, error, devnull;
 
 	fdp = td->td_proc->p_fd;
 	KASSERT(refcount_load(&fdp->fd_refcnt) == 1,
 	    ("the fdtable should not be shared"));
 	MPASS(fdp->fd_nfiles >= 3);
 	devnull = -1;
 	for (i = 0; i <= 2; i++) {
 		if (fdp->fd_ofiles[i].fde_file != NULL)
 			continue;
 
 		save = td->td_retval[0];
 		if (devnull != -1) {
 			error = kern_dup(td, FDDUP_FIXED, 0, devnull, i);
 		} else {
 			error = kern_openat(td, AT_FDCWD, "/dev/null",
 			    UIO_SYSSPACE, O_RDWR, 0);
 			if (error == 0) {
 				devnull = td->td_retval[0];
 				KASSERT(devnull == i, ("we didn't get our fd"));
 			}
 		}
 		td->td_retval[0] = save;
 		if (error != 0)
 			return (error);
 	}
 	return (0);
 }
 
 /*
  * Internal form of close.  Decrement reference count on file structure.
  * Note: td may be NULL when closing a file that was being passed in a
  * message.
  */
 int
 closef(struct file *fp, struct thread *td)
 {
 	struct vnode *vp;
 	struct flock lf;
 	struct filedesc_to_leader *fdtol;
 	struct filedesc *fdp;
 
 	MPASS(td != NULL);
 
 	/*
 	 * POSIX record locking dictates that any close releases ALL
 	 * locks owned by this process.  This is handled by setting
 	 * a flag in the unlock to free ONLY locks obeying POSIX
 	 * semantics, and not to free BSD-style file locks.
 	 * If the descriptor was in a message, POSIX-style locks
 	 * aren't passed with the descriptor, and the thread pointer
 	 * will be NULL.  Callers should be careful only to pass a
 	 * NULL thread pointer when there really is no owning
 	 * context that might have locks, or the locks will be
 	 * leaked.
 	 */
 	if (fp->f_type == DTYPE_VNODE) {
 		vp = fp->f_vnode;
 		if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
 			lf.l_whence = SEEK_SET;
 			lf.l_start = 0;
 			lf.l_len = 0;
 			lf.l_type = F_UNLCK;
 			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
 			    F_UNLCK, &lf, F_POSIX);
 		}
 		fdtol = td->td_proc->p_fdtol;
 		if (fdtol != NULL) {
 			/*
 			 * Handle special case where file descriptor table is
 			 * shared between multiple process leaders.
 			 */
 			fdp = td->td_proc->p_fd;
 			FILEDESC_XLOCK(fdp);
 			for (fdtol = fdtol->fdl_next;
 			    fdtol != td->td_proc->p_fdtol;
 			    fdtol = fdtol->fdl_next) {
 				if ((fdtol->fdl_leader->p_flag &
 				    P_ADVLOCK) == 0)
 					continue;
 				fdtol->fdl_holdcount++;
 				FILEDESC_XUNLOCK(fdp);
 				lf.l_whence = SEEK_SET;
 				lf.l_start = 0;
 				lf.l_len = 0;
 				lf.l_type = F_UNLCK;
 				vp = fp->f_vnode;
 				(void) VOP_ADVLOCK(vp,
 				    (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf,
 				    F_POSIX);
 				FILEDESC_XLOCK(fdp);
 				fdtol->fdl_holdcount--;
 				if (fdtol->fdl_holdcount == 0 &&
 				    fdtol->fdl_wakeup != 0) {
 					fdtol->fdl_wakeup = 0;
 					wakeup(fdtol);
 				}
 			}
 			FILEDESC_XUNLOCK(fdp);
 		}
 	}
 	return (fdrop_close(fp, td));
 }
 
 /*
  * Hack for file descriptor passing code.
  */
 void
 closef_nothread(struct file *fp)
 {
 
 	fdrop(fp, NULL);
 }
 
 /*
  * Initialize the file pointer with the specified properties.
  *
  * The ops are set with release semantics to be certain that the flags, type,
  * and data are visible when ops is.  This is to prevent ops methods from being
  * called with bad data.
  */
 void
 finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
 {
 	fp->f_data = data;
 	fp->f_flag = flag;
 	fp->f_type = type;
 	atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
 }
 
 void
 finit_vnode(struct file *fp, u_int flag, void *data, struct fileops *ops)
 {
 	fp->f_seqcount[UIO_READ] = 1;
 	fp->f_seqcount[UIO_WRITE] = 1;
 	finit(fp, (flag & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE,
 	    data, ops);
 }
 
 int
 fget_cap_locked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
     struct file **fpp, struct filecaps *havecapsp)
 {
 	struct filedescent *fde;
 	int error;
 
 	FILEDESC_LOCK_ASSERT(fdp);
 
 	fde = fdeget_locked(fdp, fd);
 	if (fde == NULL) {
 		error = EBADF;
 		goto out;
 	}
 
 #ifdef CAPABILITIES
 	error = cap_check(cap_rights_fde_inline(fde), needrightsp);
 	if (error != 0)
 		goto out;
 #endif
 
 	if (havecapsp != NULL)
 		filecaps_copy(&fde->fde_caps, havecapsp, true);
 
 	*fpp = fde->fde_file;
 
 	error = 0;
 out:
 	return (error);
 }
 
 int
 fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp,
     struct file **fpp, struct filecaps *havecapsp)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 	int error;
 #ifndef CAPABILITIES
 	error = fget_unlocked(fdp, fd, needrightsp, fpp);
 	if (havecapsp != NULL && error == 0)
 		filecaps_fill(havecapsp);
 #else
 	struct file *fp;
 	seqc_t seq;
 
 	*fpp = NULL;
 	for (;;) {
 		error = fget_unlocked_seq(fdp, fd, needrightsp, &fp, &seq);
 		if (error != 0)
 			return (error);
 
 		if (havecapsp != NULL) {
 			if (!filecaps_copy(&fdp->fd_ofiles[fd].fde_caps,
 			    havecapsp, false)) {
 				fdrop(fp, td);
 				goto get_locked;
 			}
 		}
 
 		if (!fd_modified(fdp, fd, seq))
 			break;
 		fdrop(fp, td);
 	}
 
 	*fpp = fp;
 	return (0);
 
 get_locked:
 	FILEDESC_SLOCK(fdp);
 	error = fget_cap_locked(fdp, fd, needrightsp, fpp, havecapsp);
 	if (error == 0 && !fhold(*fpp))
 		error = EBADF;
 	FILEDESC_SUNLOCK(fdp);
 #endif
 	return (error);
 }
 
 #ifdef CAPABILITIES
 int
 fgetvp_lookup_smr(int fd, struct nameidata *ndp, struct vnode **vpp, bool *fsearch)
 {
 	const struct filedescent *fde;
 	const struct fdescenttbl *fdt;
 	struct filedesc *fdp;
 	struct file *fp;
 	struct vnode *vp;
 	const cap_rights_t *haverights;
 	cap_rights_t rights;
 	seqc_t seq;
 
 	VFS_SMR_ASSERT_ENTERED();
 
 	rights = *ndp->ni_rightsneeded;
 	cap_rights_set_one(&rights, CAP_LOOKUP);
 
 	fdp = curproc->p_fd;
 	fdt = fdp->fd_files;
 	if (__predict_false((u_int)fd >= fdt->fdt_nfiles))
 		return (EBADF);
 	seq = seqc_read_notmodify(fd_seqc(fdt, fd));
 	fde = &fdt->fdt_ofiles[fd];
 	haverights = cap_rights_fde_inline(fde);
 	fp = fde->fde_file;
 	if (__predict_false(fp == NULL))
 		return (EAGAIN);
 	if (__predict_false(cap_check_inline_transient(haverights, &rights)))
 		return (EAGAIN);
 	*fsearch = ((fp->f_flag & FSEARCH) != 0);
 	vp = fp->f_vnode;
 	if (__predict_false(vp == NULL || vp->v_type != VDIR)) {
 		return (EAGAIN);
 	}
 	if (!filecaps_copy(&fde->fde_caps, &ndp->ni_filecaps, false)) {
 		return (EAGAIN);
 	}
 	/*
 	 * Use an acquire barrier to force re-reading of fdt so it is
 	 * refreshed for verification.
 	 */
 	atomic_thread_fence_acq();
 	fdt = fdp->fd_files;
 	if (__predict_false(!seqc_consistent_nomb(fd_seqc(fdt, fd), seq)))
 		return (EAGAIN);
 	/*
 	 * If file descriptor doesn't have all rights,
 	 * all lookups relative to it must also be
 	 * strictly relative.
 	 *
 	 * Not yet supported by fast path.
 	 */
 	CAP_ALL(&rights);
 	if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, &rights) ||
 	    ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL ||
 	    ndp->ni_filecaps.fc_nioctls != -1) {
 #ifdef notyet
 		ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
 #else
 		return (EAGAIN);
 #endif
 	}
 	*vpp = vp;
 	return (0);
 }
 #else
 int
 fgetvp_lookup_smr(int fd, struct nameidata *ndp, struct vnode **vpp, bool *fsearch)
 {
 	const struct fdescenttbl *fdt;
 	struct filedesc *fdp;
 	struct file *fp;
 	struct vnode *vp;
 
 	VFS_SMR_ASSERT_ENTERED();
 
 	fdp = curproc->p_fd;
 	fdt = fdp->fd_files;
 	if (__predict_false((u_int)fd >= fdt->fdt_nfiles))
 		return (EBADF);
 	fp = fdt->fdt_ofiles[fd].fde_file;
 	if (__predict_false(fp == NULL))
 		return (EAGAIN);
 	*fsearch = ((fp->f_flag & FSEARCH) != 0);
 	vp = fp->f_vnode;
 	if (__predict_false(vp == NULL || vp->v_type != VDIR)) {
 		return (EAGAIN);
 	}
 	/*
 	 * Use an acquire barrier to force re-reading of fdt so it is
 	 * refreshed for verification.
 	 */
 	atomic_thread_fence_acq();
 	fdt = fdp->fd_files;
 	if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file))
 		return (EAGAIN);
 	filecaps_fill(&ndp->ni_filecaps);
 	*vpp = vp;
 	return (0);
 }
 #endif
 
 int
 fget_unlocked_seq(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
     struct file **fpp, seqc_t *seqp)
 {
 #ifdef CAPABILITIES
 	const struct filedescent *fde;
 #endif
 	const struct fdescenttbl *fdt;
 	struct file *fp;
 #ifdef CAPABILITIES
 	seqc_t seq;
 	cap_rights_t haverights;
 	int error;
 #endif
 
 	fdt = fdp->fd_files;
 	if (__predict_false((u_int)fd >= fdt->fdt_nfiles))
 		return (EBADF);
 	/*
 	 * Fetch the descriptor locklessly.  We avoid fdrop() races by
 	 * never raising a refcount above 0.  To accomplish this we have
 	 * to use a cmpset loop rather than an atomic_add.  The descriptor
 	 * must be re-verified once we acquire a reference to be certain
 	 * that the identity is still correct and we did not lose a race
 	 * due to preemption.
 	 */
 	for (;;) {
 #ifdef CAPABILITIES
 		seq = seqc_read_notmodify(fd_seqc(fdt, fd));
 		fde = &fdt->fdt_ofiles[fd];
 		haverights = *cap_rights_fde_inline(fde);
 		fp = fde->fde_file;
 		if (!seqc_consistent(fd_seqc(fdt, fd), seq))
 			continue;
 #else
 		fp = fdt->fdt_ofiles[fd].fde_file;
 #endif
 		if (fp == NULL)
 			return (EBADF);
 #ifdef CAPABILITIES
 		error = cap_check_inline(&haverights, needrightsp);
 		if (error != 0)
 			return (error);
 #endif
 		if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) {
 			/*
 			 * Force a reload. Other thread could reallocate the
 			 * table before this fd was closed, so it is possible
 			 * that there is a stale fp pointer in cached version.
 			 */
 			fdt = atomic_load_ptr(&fdp->fd_files);
 			continue;
 		}
 		/*
 		 * Use an acquire barrier to force re-reading of fdt so it is
 		 * refreshed for verification.
 		 */
 		atomic_thread_fence_acq();
 		fdt = fdp->fd_files;
 #ifdef	CAPABILITIES
 		if (seqc_consistent_nomb(fd_seqc(fdt, fd), seq))
 #else
 		if (fp == fdt->fdt_ofiles[fd].fde_file)
 #endif
 			break;
 		fdrop(fp, curthread);
 	}
 	*fpp = fp;
 	if (seqp != NULL) {
 #ifdef CAPABILITIES
 		*seqp = seq;
 #endif
 	}
 	return (0);
 }
 
 /*
  * See the comments in fget_unlocked_seq for an explanation of how this works.
  *
  * This is a simplified variant which bails out to the aforementioned routine
  * if anything goes wrong. In practice this only happens when userspace is
  * racing with itself.
  */
 int
 fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
     struct file **fpp)
 {
 #ifdef CAPABILITIES
 	const struct filedescent *fde;
 #endif
 	const struct fdescenttbl *fdt;
 	struct file *fp;
 #ifdef CAPABILITIES
 	seqc_t seq;
 	const cap_rights_t *haverights;
 #endif
 
 	fdt = fdp->fd_files;
 	if (__predict_false((u_int)fd >= fdt->fdt_nfiles))
 		return (EBADF);
 #ifdef CAPABILITIES
 	seq = seqc_read_notmodify(fd_seqc(fdt, fd));
 	fde = &fdt->fdt_ofiles[fd];
 	haverights = cap_rights_fde_inline(fde);
 	fp = fde->fde_file;
 #else
 	fp = fdt->fdt_ofiles[fd].fde_file;
 #endif
 	if (__predict_false(fp == NULL))
 		goto out_fallback;
 #ifdef CAPABILITIES
 	if (__predict_false(cap_check_inline_transient(haverights, needrightsp)))
 		goto out_fallback;
 #endif
 	if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count)))
 		goto out_fallback;
 
 	/*
 	 * Use an acquire barrier to force re-reading of fdt so it is
 	 * refreshed for verification.
 	 */
 	atomic_thread_fence_acq();
 	fdt = fdp->fd_files;
 #ifdef	CAPABILITIES
 	if (__predict_false(!seqc_consistent_nomb(fd_seqc(fdt, fd), seq)))
 #else
 	if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file))
 #endif
 		goto out_fdrop;
 	*fpp = fp;
 	return (0);
 out_fdrop:
 	fdrop(fp, curthread);
 out_fallback:
 	return (fget_unlocked_seq(fdp, fd, needrightsp, fpp, NULL));
 }
 
 /*
  * Translate fd -> file when the caller guarantees the file descriptor table
  * can't be changed by others.
  *
  * Note this does not mean the file object itself is only visible to the caller,
  * merely that it wont disappear without having to be referenced.
  *
  * Must be paired with fput_only_user.
  */
 #ifdef	CAPABILITIES
 int
 fget_only_user(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
     struct file **fpp)
 {
 	const struct filedescent *fde;
 	const struct fdescenttbl *fdt;
 	const cap_rights_t *haverights;
 	struct file *fp;
 	int error;
 
 	MPASS(FILEDESC_IS_ONLY_USER(fdp));
 
 	if (__predict_false(fd >= fdp->fd_nfiles))
 		return (EBADF);
 
 	fdt = fdp->fd_files;
 	fde = &fdt->fdt_ofiles[fd];
 	fp = fde->fde_file;
 	if (__predict_false(fp == NULL))
 		return (EBADF);
 	MPASS(refcount_load(&fp->f_count) > 0);
 	haverights = cap_rights_fde_inline(fde);
 	error = cap_check_inline(haverights, needrightsp);
 	if (__predict_false(error != 0))
 		return (error);
 	*fpp = fp;
 	return (0);
 }
 #else
 int
 fget_only_user(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
     struct file **fpp)
 {
 	struct file *fp;
 
 	MPASS(FILEDESC_IS_ONLY_USER(fdp));
 
 	if (__predict_false(fd >= fdp->fd_nfiles))
 		return (EBADF);
 
 	fp = fdp->fd_ofiles[fd].fde_file;
 	if (__predict_false(fp == NULL))
 		return (EBADF);
 
 	MPASS(refcount_load(&fp->f_count) > 0);
 	*fpp = fp;
 	return (0);
 }
 #endif
 
 /*
  * Extract the file pointer associated with the specified descriptor for the
  * current user process.
  *
  * If the descriptor doesn't exist or doesn't match 'flags', EBADF is
  * returned.
  *
  * File's rights will be checked against the capability rights mask.
  *
  * If an error occurred the non-zero error is returned and *fpp is set to
  * NULL.  Otherwise *fpp is held and set and zero is returned.  Caller is
  * responsible for fdrop().
  */
 static __inline int
 _fget(struct thread *td, int fd, struct file **fpp, int flags,
     cap_rights_t *needrightsp)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 	int error;
 
 	*fpp = NULL;
 	fdp = td->td_proc->p_fd;
 	error = fget_unlocked(fdp, fd, needrightsp, &fp);
 	if (__predict_false(error != 0))
 		return (error);
 	if (__predict_false(fp->f_ops == &badfileops)) {
 		fdrop(fp, td);
 		return (EBADF);
 	}
 
 	/*
 	 * FREAD and FWRITE failure return EBADF as per POSIX.
 	 */
 	error = 0;
 	switch (flags) {
 	case FREAD:
 	case FWRITE:
 		if ((fp->f_flag & flags) == 0)
 			error = EBADF;
 		break;
 	case FEXEC:
 	    	if ((fp->f_flag & (FREAD | FEXEC)) == 0 ||
 		    ((fp->f_flag & FWRITE) != 0))
 			error = EBADF;
 		break;
 	case 0:
 		break;
 	default:
 		KASSERT(0, ("wrong flags"));
 	}
 
 	if (error != 0) {
 		fdrop(fp, td);
 		return (error);
 	}
 
 	*fpp = fp;
 	return (0);
 }
 
 int
 fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 {
 
 	return (_fget(td, fd, fpp, 0, rightsp));
 }
 
 int
 fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, vm_prot_t *maxprotp,
     struct file **fpp)
 {
 	int error;
 #ifndef CAPABILITIES
 	error = _fget(td, fd, fpp, 0, rightsp);
 	if (maxprotp != NULL)
 		*maxprotp = VM_PROT_ALL;
 	return (error);
 #else
 	cap_rights_t fdrights;
 	struct filedesc *fdp;
 	struct file *fp;
 	seqc_t seq;
 
 	*fpp = NULL;
 	fdp = td->td_proc->p_fd;
 	MPASS(cap_rights_is_set(rightsp, CAP_MMAP));
 	for (;;) {
 		error = fget_unlocked_seq(fdp, fd, rightsp, &fp, &seq);
 		if (__predict_false(error != 0))
 			return (error);
 		if (__predict_false(fp->f_ops == &badfileops)) {
 			fdrop(fp, td);
 			return (EBADF);
 		}
 		if (maxprotp != NULL)
 			fdrights = *cap_rights(fdp, fd);
 		if (!fd_modified(fdp, fd, seq))
 			break;
 		fdrop(fp, td);
 	}
 
 	/*
 	 * If requested, convert capability rights to access flags.
 	 */
 	if (maxprotp != NULL)
 		*maxprotp = cap_rights_to_vmprot(&fdrights);
 	*fpp = fp;
 	return (0);
 #endif
 }
 
 int
 fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 {
 
 	return (_fget(td, fd, fpp, FREAD, rightsp));
 }
 
 int
 fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 {
 
 	return (_fget(td, fd, fpp, FWRITE, rightsp));
 }
 
 int
 fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl,
     struct file **fpp)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 #ifndef CAPABILITIES
 	return (fget_unlocked(fdp, fd, rightsp, fpp));
 #else
 	struct file *fp;
 	int error;
 	seqc_t seq;
 
 	*fpp = NULL;
 	MPASS(cap_rights_is_set(rightsp, CAP_FCNTL));
 	for (;;) {
 		error = fget_unlocked_seq(fdp, fd, rightsp, &fp, &seq);
 		if (error != 0)
 			return (error);
 		error = cap_fcntl_check(fdp, fd, needfcntl);
 		if (!fd_modified(fdp, fd, seq))
 			break;
 		fdrop(fp, td);
 	}
 	if (error != 0) {
 		fdrop(fp, td);
 		return (error);
 	}
 	*fpp = fp;
 	return (0);
 #endif
 }
 
 /*
  * Like fget() but loads the underlying vnode, or returns an error if the
  * descriptor does not represent a vnode.  Note that pipes use vnodes but
  * never have VM objects.  The returned vnode will be vref()'d.
  *
  * XXX: what about the unused flags ?
  */
 static __inline int
 _fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp,
     struct vnode **vpp)
 {
 	struct file *fp;
 	int error;
 
 	*vpp = NULL;
 	error = _fget(td, fd, &fp, flags, needrightsp);
 	if (error != 0)
 		return (error);
 	if (fp->f_vnode == NULL) {
 		error = EINVAL;
 	} else {
 		*vpp = fp->f_vnode;
 		vrefact(*vpp);
 	}
 	fdrop(fp, td);
 
 	return (error);
 }
 
 int
 fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
 {
 
 	return (_fgetvp(td, fd, 0, rightsp, vpp));
 }
 
 int
 fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp,
     struct filecaps *havecaps, struct vnode **vpp)
 {
 	struct filecaps caps;
 	struct file *fp;
 	int error;
 
 	error = fget_cap(td, fd, needrightsp, &fp, &caps);
 	if (error != 0)
 		return (error);
 	if (fp->f_ops == &badfileops) {
 		error = EBADF;
 		goto out;
 	}
 	if (fp->f_vnode == NULL) {
 		error = EINVAL;
 		goto out;
 	}
 
 	*havecaps = caps;
 	*vpp = fp->f_vnode;
 	vrefact(*vpp);
 	fdrop(fp, td);
 
 	return (0);
 out:
 	filecaps_free(&caps);
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
 {
 
 	return (_fgetvp(td, fd, FREAD, rightsp, vpp));
 }
 
 int
 fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
 {
 
 	return (_fgetvp(td, fd, FEXEC, rightsp, vpp));
 }
 
 #ifdef notyet
 int
 fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp,
     struct vnode **vpp)
 {
 
 	return (_fgetvp(td, fd, FWRITE, rightsp, vpp));
 }
 #endif
 
 /*
  * Handle the last reference to a file being closed.
  *
  * Without the noinline attribute clang keeps inlining the func thorough this
  * file when fdrop is used.
  */
 int __noinline
 _fdrop(struct file *fp, struct thread *td)
 {
 	int error;
 #ifdef INVARIANTS
 	int count;
 
 	count = refcount_load(&fp->f_count);
 	if (count != 0)
 		panic("fdrop: fp %p count %d", fp, count);
 #endif
 	error = fo_close(fp, td);
 	atomic_subtract_int(&openfiles, 1);
 	crfree(fp->f_cred);
 	free(fp->f_advice, M_FADVISE);
 	uma_zfree(file_zone, fp);
 
 	return (error);
 }
 
 /*
  * Apply an advisory lock on a file descriptor.
  *
  * Just attempt to get a record lock of the requested type on the entire file
  * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct flock_args {
 	int	fd;
 	int	how;
 };
 #endif
 /* ARGSUSED */
 int
 sys_flock(struct thread *td, struct flock_args *uap)
 {
 	struct file *fp;
 	struct vnode *vp;
 	struct flock lf;
 	int error;
 
 	error = fget(td, uap->fd, &cap_flock_rights, &fp);
 	if (error != 0)
 		return (error);
 	if (fp->f_type != DTYPE_VNODE) {
 		fdrop(fp, td);
 		return (EOPNOTSUPP);
 	}
 
 	vp = fp->f_vnode;
 	lf.l_whence = SEEK_SET;
 	lf.l_start = 0;
 	lf.l_len = 0;
 	if (uap->how & LOCK_UN) {
 		lf.l_type = F_UNLCK;
 		atomic_clear_int(&fp->f_flag, FHASLOCK);
 		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
 		goto done2;
 	}
 	if (uap->how & LOCK_EX)
 		lf.l_type = F_WRLCK;
 	else if (uap->how & LOCK_SH)
 		lf.l_type = F_RDLCK;
 	else {
 		error = EBADF;
 		goto done2;
 	}
 	atomic_set_int(&fp->f_flag, FHASLOCK);
 	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
 	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
 done2:
 	fdrop(fp, td);
 	return (error);
 }
 /*
  * Duplicate the specified descriptor to a free descriptor.
  */
 int
 dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode,
     int openerror, int *indxp)
 {
 	struct filedescent *newfde, *oldfde;
 	struct file *fp;
 	u_long *ioctls;
 	int error, indx;
 
 	KASSERT(openerror == ENODEV || openerror == ENXIO,
 	    ("unexpected error %d in %s", openerror, __func__));
 
 	/*
 	 * If the to-be-dup'd fd number is greater than the allowed number
 	 * of file descriptors, or the fd to be dup'd has already been
 	 * closed, then reject.
 	 */
 	FILEDESC_XLOCK(fdp);
 	if ((fp = fget_locked(fdp, dfd)) == NULL) {
 		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
 
 	error = fdalloc(td, 0, &indx);
 	if (error != 0) {
 		FILEDESC_XUNLOCK(fdp);
 		return (error);
 	}
 
 	/*
 	 * There are two cases of interest here.
 	 *
 	 * For ENODEV simply dup (dfd) to file descriptor (indx) and return.
 	 *
 	 * For ENXIO steal away the file structure from (dfd) and store it in
 	 * (indx).  (dfd) is effectively closed by this operation.
 	 */
 	switch (openerror) {
 	case ENODEV:
 		/*
 		 * Check that the mode the file is being opened for is a
 		 * subset of the mode of the existing descriptor.
 		 */
 		if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
 			fdunused(fdp, indx);
 			FILEDESC_XUNLOCK(fdp);
 			return (EACCES);
 		}
 		if (!fhold(fp)) {
 			fdunused(fdp, indx);
 			FILEDESC_XUNLOCK(fdp);
 			return (EBADF);
 		}
 		newfde = &fdp->fd_ofiles[indx];
 		oldfde = &fdp->fd_ofiles[dfd];
 		ioctls = filecaps_copy_prep(&oldfde->fde_caps);
 #ifdef CAPABILITIES
 		seqc_write_begin(&newfde->fde_seqc);
 #endif
 		memcpy(newfde, oldfde, fde_change_size);
 		filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps,
 		    ioctls);
 #ifdef CAPABILITIES
 		seqc_write_end(&newfde->fde_seqc);
 #endif
 		break;
 	case ENXIO:
 		/*
 		 * Steal away the file pointer from dfd and stuff it into indx.
 		 */
 		newfde = &fdp->fd_ofiles[indx];
 		oldfde = &fdp->fd_ofiles[dfd];
 #ifdef CAPABILITIES
 		seqc_write_begin(&newfde->fde_seqc);
 #endif
 		memcpy(newfde, oldfde, fde_change_size);
 		oldfde->fde_file = NULL;
 		fdunused(fdp, dfd);
 #ifdef CAPABILITIES
 		seqc_write_end(&newfde->fde_seqc);
 #endif
 		break;
 	}
 	FILEDESC_XUNLOCK(fdp);
 	*indxp = indx;
 	return (0);
 }
 
 /*
  * This sysctl determines if we will allow a process to chroot(2) if it
  * has a directory open:
  *	0: disallowed for all processes.
  *	1: allowed for processes that were not already chroot(2)'ed.
  *	2: allowed for all processes.
  */
 
 static int chroot_allow_open_directories = 1;
 
 SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
     &chroot_allow_open_directories, 0,
     "Allow a process to chroot(2) if it has a directory open");
 
 /*
  * Helper function for raised chroot(2) security function:  Refuse if
  * any filedescriptors are open directories.
  */
 static int
 chroot_refuse_vdir_fds(struct filedesc *fdp)
 {
 	struct vnode *vp;
 	struct file *fp;
 	int fd, lastfile;
 
 	FILEDESC_LOCK_ASSERT(fdp);
 
 	lastfile = fdlastfile(fdp);
 	for (fd = 0; fd <= lastfile; fd++) {
 		fp = fget_locked(fdp, fd);
 		if (fp == NULL)
 			continue;
 		if (fp->f_type == DTYPE_VNODE) {
 			vp = fp->f_vnode;
 			if (vp->v_type == VDIR)
 				return (EPERM);
 		}
 	}
 	return (0);
 }
 
 static void
 pwd_fill(struct pwd *oldpwd, struct pwd *newpwd)
 {
 
 	if (newpwd->pwd_cdir == NULL && oldpwd->pwd_cdir != NULL) {
 		vrefact(oldpwd->pwd_cdir);
 		newpwd->pwd_cdir = oldpwd->pwd_cdir;
 	}
 
 	if (newpwd->pwd_rdir == NULL && oldpwd->pwd_rdir != NULL) {
 		vrefact(oldpwd->pwd_rdir);
 		newpwd->pwd_rdir = oldpwd->pwd_rdir;
 	}
 
 	if (newpwd->pwd_jdir == NULL && oldpwd->pwd_jdir != NULL) {
 		vrefact(oldpwd->pwd_jdir);
 		newpwd->pwd_jdir = oldpwd->pwd_jdir;
 	}
 }
 
 struct pwd *
 pwd_hold_pwddesc(struct pwddesc *pdp)
 {
 	struct pwd *pwd;
 
 	PWDDESC_ASSERT_XLOCKED(pdp);
 	pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 	if (pwd != NULL)
 		refcount_acquire(&pwd->pwd_refcount);
 	return (pwd);
 }
 
 bool
 pwd_hold_smr(struct pwd *pwd)
 {
 
 	MPASS(pwd != NULL);
 	if (__predict_true(refcount_acquire_if_not_zero(&pwd->pwd_refcount))) {
 		return (true);
 	}
 	return (false);
 }
 
 struct pwd *
 pwd_hold(struct thread *td)
 {
 	struct pwddesc *pdp;
 	struct pwd *pwd;
 
 	pdp = td->td_proc->p_pd;
 
 	vfs_smr_enter();
 	pwd = vfs_smr_entered_load(&pdp->pd_pwd);
 	if (pwd_hold_smr(pwd)) {
 		vfs_smr_exit();
 		return (pwd);
 	}
 	vfs_smr_exit();
 	PWDDESC_XLOCK(pdp);
 	pwd = pwd_hold_pwddesc(pdp);
 	MPASS(pwd != NULL);
 	PWDDESC_XUNLOCK(pdp);
 	return (pwd);
 }
 
 static struct pwd *
 pwd_alloc(void)
 {
 	struct pwd *pwd;
 
 	pwd = uma_zalloc_smr(pwd_zone, M_WAITOK);
 	bzero(pwd, sizeof(*pwd));
 	refcount_init(&pwd->pwd_refcount, 1);
 	return (pwd);
 }
 
 void
 pwd_drop(struct pwd *pwd)
 {
 
 	if (!refcount_release(&pwd->pwd_refcount))
 		return;
 
 	if (pwd->pwd_cdir != NULL)
 		vrele(pwd->pwd_cdir);
 	if (pwd->pwd_rdir != NULL)
 		vrele(pwd->pwd_rdir);
 	if (pwd->pwd_jdir != NULL)
 		vrele(pwd->pwd_jdir);
 	uma_zfree_smr(pwd_zone, pwd);
 }
 
 /*
 * The caller is responsible for invoking priv_check() and
 * mac_vnode_check_chroot() to authorize this operation.
 */
 int
 pwd_chroot(struct thread *td, struct vnode *vp)
 {
 	struct pwddesc *pdp;
 	struct filedesc *fdp;
 	struct pwd *newpwd, *oldpwd;
 	int error;
 
 	fdp = td->td_proc->p_fd;
 	pdp = td->td_proc->p_pd;
 	newpwd = pwd_alloc();
 	FILEDESC_SLOCK(fdp);
 	PWDDESC_XLOCK(pdp);
 	oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 	if (chroot_allow_open_directories == 0 ||
 	    (chroot_allow_open_directories == 1 &&
 	    oldpwd->pwd_rdir != rootvnode)) {
 		error = chroot_refuse_vdir_fds(fdp);
 		FILEDESC_SUNLOCK(fdp);
 		if (error != 0) {
 			PWDDESC_XUNLOCK(pdp);
 			pwd_drop(newpwd);
 			return (error);
 		}
 	} else {
 		FILEDESC_SUNLOCK(fdp);
 	}
 
 	vrefact(vp);
 	newpwd->pwd_rdir = vp;
 	if (oldpwd->pwd_jdir == NULL) {
 		vrefact(vp);
 		newpwd->pwd_jdir = vp;
 	}
 	pwd_fill(oldpwd, newpwd);
 	pwd_set(pdp, newpwd);
 	PWDDESC_XUNLOCK(pdp);
 	pwd_drop(oldpwd);
 	return (0);
 }
 
 void
 pwd_chdir(struct thread *td, struct vnode *vp)
 {
 	struct pwddesc *pdp;
 	struct pwd *newpwd, *oldpwd;
 
 	VNPASS(vp->v_usecount > 0, vp);
 
 	newpwd = pwd_alloc();
 	pdp = td->td_proc->p_pd;
 	PWDDESC_XLOCK(pdp);
 	oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 	newpwd->pwd_cdir = vp;
 	pwd_fill(oldpwd, newpwd);
 	pwd_set(pdp, newpwd);
 	PWDDESC_XUNLOCK(pdp);
 	pwd_drop(oldpwd);
 }
 
 /*
  * jail_attach(2) changes both root and working directories.
  */
 int
 pwd_chroot_chdir(struct thread *td, struct vnode *vp)
 {
 	struct pwddesc *pdp;
 	struct filedesc *fdp;
 	struct pwd *newpwd, *oldpwd;
 	int error;
 
 	fdp = td->td_proc->p_fd;
 	pdp = td->td_proc->p_pd;
 	newpwd = pwd_alloc();
 	FILEDESC_SLOCK(fdp);
 	PWDDESC_XLOCK(pdp);
 	oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 	error = chroot_refuse_vdir_fds(fdp);
 	FILEDESC_SUNLOCK(fdp);
 	if (error != 0) {
 		PWDDESC_XUNLOCK(pdp);
 		pwd_drop(newpwd);
 		return (error);
 	}
 
 	vrefact(vp);
 	newpwd->pwd_rdir = vp;
 	vrefact(vp);
 	newpwd->pwd_cdir = vp;
 	if (oldpwd->pwd_jdir == NULL) {
 		vrefact(vp);
 		newpwd->pwd_jdir = vp;
 	}
 	pwd_fill(oldpwd, newpwd);
 	pwd_set(pdp, newpwd);
 	PWDDESC_XUNLOCK(pdp);
 	pwd_drop(oldpwd);
 	return (0);
 }
 
 void
 pwd_ensure_dirs(void)
 {
 	struct pwddesc *pdp;
 	struct pwd *oldpwd, *newpwd;
 
 	pdp = curproc->p_pd;
 	PWDDESC_XLOCK(pdp);
 	oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 	if (oldpwd->pwd_cdir != NULL && oldpwd->pwd_rdir != NULL) {
 		PWDDESC_XUNLOCK(pdp);
 		return;
 	}
 	PWDDESC_XUNLOCK(pdp);
 
 	newpwd = pwd_alloc();
 	PWDDESC_XLOCK(pdp);
 	oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 	pwd_fill(oldpwd, newpwd);
 	if (newpwd->pwd_cdir == NULL) {
 		vrefact(rootvnode);
 		newpwd->pwd_cdir = rootvnode;
 	}
 	if (newpwd->pwd_rdir == NULL) {
 		vrefact(rootvnode);
 		newpwd->pwd_rdir = rootvnode;
 	}
 	pwd_set(pdp, newpwd);
 	PWDDESC_XUNLOCK(pdp);
 	pwd_drop(oldpwd);
 }
 
 void
 pwd_set_rootvnode(void)
 {
 	struct pwddesc *pdp;
 	struct pwd *oldpwd, *newpwd;
 
 	pdp = curproc->p_pd;
 
 	newpwd = pwd_alloc();
 	PWDDESC_XLOCK(pdp);
 	oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 	vrefact(rootvnode);
 	newpwd->pwd_cdir = rootvnode;
 	vrefact(rootvnode);
 	newpwd->pwd_rdir = rootvnode;
 	pwd_fill(oldpwd, newpwd);
 	pwd_set(pdp, newpwd);
 	PWDDESC_XUNLOCK(pdp);
 	pwd_drop(oldpwd);
 }
 
 /*
  * Scan all active processes and prisons to see if any of them have a current
  * or root directory of `olddp'. If so, replace them with the new mount point.
  */
 void
 mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
 {
 	struct pwddesc *pdp;
 	struct pwd *newpwd, *oldpwd;
 	struct prison *pr;
 	struct proc *p;
 	int nrele;
 
 	if (vrefcnt(olddp) == 1)
 		return;
 	nrele = 0;
 	newpwd = pwd_alloc();
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		pdp = pdhold(p);
 		PROC_UNLOCK(p);
 		if (pdp == NULL)
 			continue;
 		PWDDESC_XLOCK(pdp);
 		oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 		if (oldpwd == NULL ||
 		    (oldpwd->pwd_cdir != olddp &&
 		    oldpwd->pwd_rdir != olddp &&
 		    oldpwd->pwd_jdir != olddp)) {
 			PWDDESC_XUNLOCK(pdp);
 			pddrop(pdp);
 			continue;
 		}
 		if (oldpwd->pwd_cdir == olddp) {
 			vrefact(newdp);
 			newpwd->pwd_cdir = newdp;
 		}
 		if (oldpwd->pwd_rdir == olddp) {
 			vrefact(newdp);
 			newpwd->pwd_rdir = newdp;
 		}
 		if (oldpwd->pwd_jdir == olddp) {
 			vrefact(newdp);
 			newpwd->pwd_jdir = newdp;
 		}
 		pwd_fill(oldpwd, newpwd);
 		pwd_set(pdp, newpwd);
 		PWDDESC_XUNLOCK(pdp);
 		pwd_drop(oldpwd);
 		pddrop(pdp);
 		newpwd = pwd_alloc();
 	}
 	sx_sunlock(&allproc_lock);
 	pwd_drop(newpwd);
 	if (rootvnode == olddp) {
 		vrefact(newdp);
 		rootvnode = newdp;
 		nrele++;
 	}
 	mtx_lock(&prison0.pr_mtx);
 	if (prison0.pr_root == olddp) {
 		vrefact(newdp);
 		prison0.pr_root = newdp;
 		nrele++;
 	}
 	mtx_unlock(&prison0.pr_mtx);
 	sx_slock(&allprison_lock);
 	TAILQ_FOREACH(pr, &allprison, pr_list) {
 		mtx_lock(&pr->pr_mtx);
 		if (pr->pr_root == olddp) {
 			vrefact(newdp);
 			pr->pr_root = newdp;
 			nrele++;
 		}
 		mtx_unlock(&pr->pr_mtx);
 	}
 	sx_sunlock(&allprison_lock);
 	while (nrele--)
 		vrele(olddp);
 }
 
 struct filedesc_to_leader *
 filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader)
 {
 	struct filedesc_to_leader *fdtol;
 
 	fdtol = malloc(sizeof(struct filedesc_to_leader),
 	    M_FILEDESC_TO_LEADER, M_WAITOK);
 	fdtol->fdl_refcount = 1;
 	fdtol->fdl_holdcount = 0;
 	fdtol->fdl_wakeup = 0;
 	fdtol->fdl_leader = leader;
 	if (old != NULL) {
 		FILEDESC_XLOCK(fdp);
 		fdtol->fdl_next = old->fdl_next;
 		fdtol->fdl_prev = old;
 		old->fdl_next = fdtol;
 		fdtol->fdl_next->fdl_prev = fdtol;
 		FILEDESC_XUNLOCK(fdp);
 	} else {
 		fdtol->fdl_next = fdtol;
 		fdtol->fdl_prev = fdtol;
 	}
 	return (fdtol);
 }
 
 static int
 sysctl_kern_proc_nfds(SYSCTL_HANDLER_ARGS)
 {
 	NDSLOTTYPE *map;
 	struct filedesc *fdp;
 	int count, off, minoff;
 
 	if (*(int *)arg1 != 0)
 		return (EINVAL);
 
 	fdp = curproc->p_fd;
 	count = 0;
 	FILEDESC_SLOCK(fdp);
 	map = fdp->fd_map;
 	off = NDSLOT(fdp->fd_nfiles - 1);
 	for (minoff = NDSLOT(0); off >= minoff; --off)
 		count += bitcountl(map[off]);
 	FILEDESC_SUNLOCK(fdp);
 
 	return (SYSCTL_OUT(req, &count, sizeof(count)));
 }
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_NFDS, nfds,
     CTLFLAG_RD|CTLFLAG_CAPRD|CTLFLAG_MPSAFE, sysctl_kern_proc_nfds,
     "Number of open file descriptors");
 
 /*
  * Get file structures globally.
  */
 static int
 sysctl_kern_file(SYSCTL_HANDLER_ARGS)
 {
 	struct xfile xf;
 	struct filedesc *fdp;
 	struct file *fp;
 	struct proc *p;
 	int error, n, lastfile;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	if (req->oldptr == NULL) {
 		n = 0;
 		sx_slock(&allproc_lock);
 		FOREACH_PROC_IN_SYSTEM(p) {
 			PROC_LOCK(p);
 			if (p->p_state == PRS_NEW) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			fdp = fdhold(p);
 			PROC_UNLOCK(p);
 			if (fdp == NULL)
 				continue;
 			/* overestimates sparse tables. */
 			n += fdp->fd_nfiles;
 			fddrop(fdp);
 		}
 		sx_sunlock(&allproc_lock);
 		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
 	}
 	error = 0;
 	bzero(&xf, sizeof(xf));
 	xf.xf_size = sizeof(xf);
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		if (p->p_state == PRS_NEW) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		if (p_cansee(req->td, p) != 0) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		xf.xf_pid = p->p_pid;
 		xf.xf_uid = p->p_ucred->cr_uid;
 		fdp = fdhold(p);
 		PROC_UNLOCK(p);
 		if (fdp == NULL)
 			continue;
 		FILEDESC_SLOCK(fdp);
 		lastfile = fdlastfile(fdp);
 		for (n = 0; refcount_load(&fdp->fd_refcnt) > 0 && n <= lastfile;
 		    n++) {
 			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
 				continue;
 			xf.xf_fd = n;
 			xf.xf_file = (uintptr_t)fp;
 			xf.xf_data = (uintptr_t)fp->f_data;
 			xf.xf_vnode = (uintptr_t)fp->f_vnode;
 			xf.xf_type = (uintptr_t)fp->f_type;
 			xf.xf_count = refcount_load(&fp->f_count);
 			xf.xf_msgcount = 0;
 			xf.xf_offset = foffset_get(fp);
 			xf.xf_flag = fp->f_flag;
 			error = SYSCTL_OUT(req, &xf, sizeof(xf));
 			if (error)
 				break;
 		}
 		FILEDESC_SUNLOCK(fdp);
 		fddrop(fdp);
 		if (error)
 			break;
 	}
 	sx_sunlock(&allproc_lock);
 	return (error);
 }
 
 SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
     0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
 
 #ifdef KINFO_FILE_SIZE
 CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
 #endif
 
 static int
 xlate_fflags(int fflags)
 {
 	static const struct {
 		int	fflag;
 		int	kf_fflag;
 	} fflags_table[] = {
 		{ FAPPEND, KF_FLAG_APPEND },
 		{ FASYNC, KF_FLAG_ASYNC },
 		{ FFSYNC, KF_FLAG_FSYNC },
 		{ FHASLOCK, KF_FLAG_HASLOCK },
 		{ FNONBLOCK, KF_FLAG_NONBLOCK },
 		{ FREAD, KF_FLAG_READ },
 		{ FWRITE, KF_FLAG_WRITE },
 		{ O_CREAT, KF_FLAG_CREAT },
 		{ O_DIRECT, KF_FLAG_DIRECT },
 		{ O_EXCL, KF_FLAG_EXCL },
 		{ O_EXEC, KF_FLAG_EXEC },
 		{ O_EXLOCK, KF_FLAG_EXLOCK },
 		{ O_NOFOLLOW, KF_FLAG_NOFOLLOW },
 		{ O_SHLOCK, KF_FLAG_SHLOCK },
 		{ O_TRUNC, KF_FLAG_TRUNC }
 	};
 	unsigned int i;
 	int kflags;
 
 	kflags = 0;
 	for (i = 0; i < nitems(fflags_table); i++)
 		if (fflags & fflags_table[i].fflag)
 			kflags |=  fflags_table[i].kf_fflag;
 	return (kflags);
 }
 
 /* Trim unused data from kf_path by truncating the structure size. */
 void
 pack_kinfo(struct kinfo_file *kif)
 {
 
 	kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
 	    strlen(kif->kf_path) + 1;
 	kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
 }
 
 static void
 export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp,
     struct kinfo_file *kif, struct filedesc *fdp, int flags)
 {
 	int error;
 
 	bzero(kif, sizeof(*kif));
 
 	/* Set a default type to allow for empty fill_kinfo() methods. */
 	kif->kf_type = KF_TYPE_UNKNOWN;
 	kif->kf_flags = xlate_fflags(fp->f_flag);
 	if (rightsp != NULL)
 		kif->kf_cap_rights = *rightsp;
 	else
 		cap_rights_init_zero(&kif->kf_cap_rights);
 	kif->kf_fd = fd;
 	kif->kf_ref_count = refcount_load(&fp->f_count);
 	kif->kf_offset = foffset_get(fp);
 
 	/*
 	 * This may drop the filedesc lock, so the 'fp' cannot be
 	 * accessed after this call.
 	 */
 	error = fo_fill_kinfo(fp, kif, fdp);
 	if (error == 0)
 		kif->kf_status |= KF_ATTR_VALID;
 	if ((flags & KERN_FILEDESC_PACK_KINFO) != 0)
 		pack_kinfo(kif);
 	else
 		kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t));
 }
 
 static void
 export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags,
     struct kinfo_file *kif, int flags)
 {
 	int error;
 
 	bzero(kif, sizeof(*kif));
 
 	kif->kf_type = KF_TYPE_VNODE;
 	error = vn_fill_kinfo_vnode(vp, kif);
 	if (error == 0)
 		kif->kf_status |= KF_ATTR_VALID;
 	kif->kf_flags = xlate_fflags(fflags);
 	cap_rights_init_zero(&kif->kf_cap_rights);
 	kif->kf_fd = fd;
 	kif->kf_ref_count = -1;
 	kif->kf_offset = -1;
 	if ((flags & KERN_FILEDESC_PACK_KINFO) != 0)
 		pack_kinfo(kif);
 	else
 		kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t));
 	vrele(vp);
 }
 
 struct export_fd_buf {
 	struct filedesc		*fdp;
 	struct pwddesc	*pdp;
 	struct sbuf 		*sb;
 	ssize_t			remainder;
 	struct kinfo_file	kif;
 	int			flags;
 };
 
 static int
 export_kinfo_to_sb(struct export_fd_buf *efbuf)
 {
 	struct kinfo_file *kif;
 
 	kif = &efbuf->kif;
 	if (efbuf->remainder != -1) {
 		if (efbuf->remainder < kif->kf_structsize) {
 			/* Terminate export. */
 			efbuf->remainder = 0;
 			return (0);
 		}
 		efbuf->remainder -= kif->kf_structsize;
 	}
 	return (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) == 0 ? 0 : ENOMEM);
 }
 
 static int
 export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp,
     struct export_fd_buf *efbuf)
 {
 	int error;
 
 	if (efbuf->remainder == 0)
 		return (0);
 	export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp,
 	    efbuf->flags);
 	FILEDESC_SUNLOCK(efbuf->fdp);
 	error = export_kinfo_to_sb(efbuf);
 	FILEDESC_SLOCK(efbuf->fdp);
 	return (error);
 }
 
 static int
 export_vnode_to_sb(struct vnode *vp, int fd, int fflags,
     struct export_fd_buf *efbuf)
 {
 	int error;
 
 	if (efbuf->remainder == 0)
 		return (0);
 	if (efbuf->pdp != NULL)
 		PWDDESC_XUNLOCK(efbuf->pdp);
 	export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif, efbuf->flags);
 	error = export_kinfo_to_sb(efbuf);
 	if (efbuf->pdp != NULL)
 		PWDDESC_XLOCK(efbuf->pdp);
 	return (error);
 }
 
 /*
  * Store a process file descriptor information to sbuf.
  *
  * Takes a locked proc as argument, and returns with the proc unlocked.
  */
 int
 kern_proc_filedesc_out(struct proc *p,  struct sbuf *sb, ssize_t maxlen,
     int flags)
 {
 	struct file *fp;
 	struct filedesc *fdp;
 	struct pwddesc *pdp;
 	struct export_fd_buf *efbuf;
 	struct vnode *cttyvp, *textvp, *tracevp;
 	struct pwd *pwd;
 	int error, i, lastfile;
 	cap_rights_t rights;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/* ktrace vnode */
 	tracevp = p->p_tracevp;
 	if (tracevp != NULL)
 		vrefact(tracevp);
 	/* text vnode */
 	textvp = p->p_textvp;
 	if (textvp != NULL)
 		vrefact(textvp);
 	/* Controlling tty. */
 	cttyvp = NULL;
 	if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) {
 		cttyvp = p->p_pgrp->pg_session->s_ttyvp;
 		if (cttyvp != NULL)
 			vrefact(cttyvp);
 	}
 	fdp = fdhold(p);
 	pdp = pdhold(p);
 	PROC_UNLOCK(p);
 	efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
 	efbuf->fdp = NULL;
 	efbuf->pdp = NULL;
 	efbuf->sb = sb;
 	efbuf->remainder = maxlen;
 	efbuf->flags = flags;
 	if (tracevp != NULL)
 		export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE, FREAD | FWRITE,
 		    efbuf);
 	if (textvp != NULL)
 		export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD, efbuf);
 	if (cttyvp != NULL)
 		export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY, FREAD | FWRITE,
 		    efbuf);
 	error = 0;
 	if (pdp == NULL || fdp == NULL)
 		goto fail;
 	efbuf->fdp = fdp;
 	efbuf->pdp = pdp;
 	PWDDESC_XLOCK(pdp);
 	pwd = pwd_hold_pwddesc(pdp);
 	if (pwd != NULL) {
 		/* working directory */
 		if (pwd->pwd_cdir != NULL) {
 			vrefact(pwd->pwd_cdir);
 			export_vnode_to_sb(pwd->pwd_cdir, KF_FD_TYPE_CWD, FREAD, efbuf);
 		}
 		/* root directory */
 		if (pwd->pwd_rdir != NULL) {
 			vrefact(pwd->pwd_rdir);
 			export_vnode_to_sb(pwd->pwd_rdir, KF_FD_TYPE_ROOT, FREAD, efbuf);
 		}
 		/* jail directory */
 		if (pwd->pwd_jdir != NULL) {
 			vrefact(pwd->pwd_jdir);
 			export_vnode_to_sb(pwd->pwd_jdir, KF_FD_TYPE_JAIL, FREAD, efbuf);
 		}
 	}
 	PWDDESC_XUNLOCK(pdp);
 	if (pwd != NULL)
 		pwd_drop(pwd);
 	FILEDESC_SLOCK(fdp);
 	lastfile = fdlastfile(fdp);
 	for (i = 0; refcount_load(&fdp->fd_refcnt) > 0 && i <= lastfile; i++) {
 		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
 			continue;
 #ifdef CAPABILITIES
 		rights = *cap_rights(fdp, i);
 #else /* !CAPABILITIES */
 		rights = cap_no_rights;
 #endif
 		/*
 		 * Create sysctl entry.  It is OK to drop the filedesc
 		 * lock inside of export_file_to_sb() as we will
 		 * re-validate and re-evaluate its properties when the
 		 * loop continues.
 		 */
 		error = export_file_to_sb(fp, i, &rights, efbuf);
 		if (error != 0 || efbuf->remainder == 0)
 			break;
 	}
 	FILEDESC_SUNLOCK(fdp);
 fail:
 	if (fdp != NULL)
 		fddrop(fdp);
 	if (pdp != NULL)
 		pddrop(pdp);
 	free(efbuf, M_TEMP);
 	return (error);
 }
 
 #define FILEDESC_SBUF_SIZE	(sizeof(struct kinfo_file) * 5)
 
 /*
  * Get per-process file descriptors for use by procstat(1), et al.
  */
 static int
 sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sb;
 	struct proc *p;
 	ssize_t maxlen;
 	int error, error2, *name;
 
 	name = (int *)arg1;
 
 	sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req);
 	sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
 	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 	if (error != 0) {
 		sbuf_delete(&sb);
 		return (error);
 	}
 	maxlen = req->oldptr != NULL ? req->oldlen : -1;
 	error = kern_proc_filedesc_out(p, &sb, maxlen,
 	    KERN_FILEDESC_PACK_KINFO);
 	error2 = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	return (error != 0 ? error : error2);
 }
 
 #ifdef COMPAT_FREEBSD7
 #ifdef KINFO_OFILE_SIZE
 CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
 #endif
 
 static void
 kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif)
 {
 
 	okif->kf_structsize = sizeof(*okif);
 	okif->kf_type = kif->kf_type;
 	okif->kf_fd = kif->kf_fd;
 	okif->kf_ref_count = kif->kf_ref_count;
 	okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE |
 	    KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK |
 	    KF_FLAG_DIRECT | KF_FLAG_HASLOCK);
 	okif->kf_offset = kif->kf_offset;
 	if (kif->kf_type == KF_TYPE_VNODE)
 		okif->kf_vnode_type = kif->kf_un.kf_file.kf_file_type;
 	else
 		okif->kf_vnode_type = KF_VTYPE_VNON;
 	strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path));
 	if (kif->kf_type == KF_TYPE_SOCKET) {
 		okif->kf_sock_domain = kif->kf_un.kf_sock.kf_sock_domain0;
 		okif->kf_sock_type = kif->kf_un.kf_sock.kf_sock_type0;
 		okif->kf_sock_protocol = kif->kf_un.kf_sock.kf_sock_protocol0;
 		okif->kf_sa_local = kif->kf_un.kf_sock.kf_sa_local;
 		okif->kf_sa_peer = kif->kf_un.kf_sock.kf_sa_peer;
 	} else {
 		okif->kf_sa_local.ss_family = AF_UNSPEC;
 		okif->kf_sa_peer.ss_family = AF_UNSPEC;
 	}
 }
 
 static int
 export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif,
     struct kinfo_ofile *okif, struct pwddesc *pdp, struct sysctl_req *req)
 {
 	int error;
 
 	vrefact(vp);
 	PWDDESC_XUNLOCK(pdp);
 	export_vnode_to_kinfo(vp, type, 0, kif, KERN_FILEDESC_PACK_KINFO);
 	kinfo_to_okinfo(kif, okif);
 	error = SYSCTL_OUT(req, okif, sizeof(*okif));
 	PWDDESC_XLOCK(pdp);
 	return (error);
 }
 
 /*
  * Get per-process file descriptors for use by procstat(1), et al.
  */
 static int
 sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
 {
 	struct kinfo_ofile *okif;
 	struct kinfo_file *kif;
 	struct filedesc *fdp;
 	struct pwddesc *pdp;
 	struct pwd *pwd;
 	int error, i, lastfile, *name;
 	struct file *fp;
 	struct proc *p;
 
 	name = (int *)arg1;
 	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 	if (error != 0)
 		return (error);
 	fdp = fdhold(p);
 	if (fdp != NULL)
 		pdp = pdhold(p);
 	PROC_UNLOCK(p);
 	if (fdp == NULL || pdp == NULL) {
 		if (fdp != NULL)
 			fddrop(fdp);
 		return (ENOENT);
 	}
 	kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
 	okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK);
 	PWDDESC_XLOCK(pdp);
 	pwd = pwd_hold_pwddesc(pdp);
 	if (pwd != NULL) {
 		if (pwd->pwd_cdir != NULL)
 			export_vnode_for_osysctl(pwd->pwd_cdir, KF_FD_TYPE_CWD, kif,
 			    okif, pdp, req);
 		if (pwd->pwd_rdir != NULL)
 			export_vnode_for_osysctl(pwd->pwd_rdir, KF_FD_TYPE_ROOT, kif,
 			    okif, pdp, req);
 		if (pwd->pwd_jdir != NULL)
 			export_vnode_for_osysctl(pwd->pwd_jdir, KF_FD_TYPE_JAIL, kif,
 			    okif, pdp, req);
 	}
 	PWDDESC_XUNLOCK(pdp);
 	if (pwd != NULL)
 		pwd_drop(pwd);
 	FILEDESC_SLOCK(fdp);
 	lastfile = fdlastfile(fdp);
 	for (i = 0; refcount_load(&fdp->fd_refcnt) > 0 && i <= lastfile; i++) {
 		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
 			continue;
 		export_file_to_kinfo(fp, i, NULL, kif, fdp,
 		    KERN_FILEDESC_PACK_KINFO);
 		FILEDESC_SUNLOCK(fdp);
 		kinfo_to_okinfo(kif, okif);
 		error = SYSCTL_OUT(req, okif, sizeof(*okif));
 		FILEDESC_SLOCK(fdp);
 		if (error)
 			break;
 	}
 	FILEDESC_SUNLOCK(fdp);
 	fddrop(fdp);
 	pddrop(pdp);
 	free(kif, M_TEMP);
 	free(okif, M_TEMP);
 	return (0);
 }
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc,
     CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc,
     "Process ofiledesc entries");
 #endif	/* COMPAT_FREEBSD7 */
 
 int
 vntype_to_kinfo(int vtype)
 {
 	struct {
 		int	vtype;
 		int	kf_vtype;
 	} vtypes_table[] = {
 		{ VBAD, KF_VTYPE_VBAD },
 		{ VBLK, KF_VTYPE_VBLK },
 		{ VCHR, KF_VTYPE_VCHR },
 		{ VDIR, KF_VTYPE_VDIR },
 		{ VFIFO, KF_VTYPE_VFIFO },
 		{ VLNK, KF_VTYPE_VLNK },
 		{ VNON, KF_VTYPE_VNON },
 		{ VREG, KF_VTYPE_VREG },
 		{ VSOCK, KF_VTYPE_VSOCK }
 	};
 	unsigned int i;
 
 	/*
 	 * Perform vtype translation.
 	 */
 	for (i = 0; i < nitems(vtypes_table); i++)
 		if (vtypes_table[i].vtype == vtype)
 			return (vtypes_table[i].kf_vtype);
 
 	return (KF_VTYPE_UNKNOWN);
 }
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc,
     CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc,
     "Process filedesc entries");
 
 /*
  * Store a process current working directory information to sbuf.
  *
  * Takes a locked proc as argument, and returns with the proc unlocked.
  */
 int
 kern_proc_cwd_out(struct proc *p,  struct sbuf *sb, ssize_t maxlen)
 {
 	struct pwddesc *pdp;
 	struct pwd *pwd;
 	struct export_fd_buf *efbuf;
 	struct vnode *cdir;
 	int error;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	pdp = pdhold(p);
 	PROC_UNLOCK(p);
 	if (pdp == NULL)
 		return (EINVAL);
 
 	efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
 	efbuf->pdp = pdp;
 	efbuf->sb = sb;
 	efbuf->remainder = maxlen;
 
 	PWDDESC_XLOCK(pdp);
 	pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 	cdir = pwd->pwd_cdir;
 	if (cdir == NULL) {
 		error = EINVAL;
 	} else {
 		vrefact(cdir);
 		error = export_vnode_to_sb(cdir, KF_FD_TYPE_CWD, FREAD, efbuf);
 	}
 	PWDDESC_XUNLOCK(pdp);
 	pddrop(pdp);
 	free(efbuf, M_TEMP);
 	return (error);
 }
 
 /*
  * Get per-process current working directory.
  */
 static int
 sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sb;
 	struct proc *p;
 	ssize_t maxlen;
 	int error, error2, *name;
 
 	name = (int *)arg1;
 
 	sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req);
 	sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
 	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 	if (error != 0) {
 		sbuf_delete(&sb);
 		return (error);
 	}
 	maxlen = req->oldptr != NULL ? req->oldlen : -1;
 	error = kern_proc_cwd_out(p, &sb, maxlen);
 	error2 = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	return (error != 0 ? error : error2);
 }
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE,
     sysctl_kern_proc_cwd, "Process current working directory");
 
 #ifdef DDB
 /*
  * For the purposes of debugging, generate a human-readable string for the
  * file type.
  */
 static const char *
 file_type_to_name(short type)
 {
 
 	switch (type) {
 	case 0:
 		return ("zero");
 	case DTYPE_VNODE:
 		return ("vnode");
 	case DTYPE_SOCKET:
 		return ("socket");
 	case DTYPE_PIPE:
 		return ("pipe");
 	case DTYPE_FIFO:
 		return ("fifo");
 	case DTYPE_KQUEUE:
 		return ("kqueue");
 	case DTYPE_CRYPTO:
 		return ("crypto");
 	case DTYPE_MQUEUE:
 		return ("mqueue");
 	case DTYPE_SHM:
 		return ("shm");
 	case DTYPE_SEM:
 		return ("ksem");
 	case DTYPE_PTS:
 		return ("pts");
 	case DTYPE_DEV:
 		return ("dev");
 	case DTYPE_PROCDESC:
 		return ("proc");
 	case DTYPE_EVENTFD:
 		return ("eventfd");
 	case DTYPE_LINUXTFD:
 		return ("ltimer");
 	default:
 		return ("unkn");
 	}
 }
 
 /*
  * For the purposes of debugging, identify a process (if any, perhaps one of
  * many) that references the passed file in its file descriptor array. Return
  * NULL if none.
  */
 static struct proc *
 file_to_first_proc(struct file *fp)
 {
 	struct filedesc *fdp;
 	struct proc *p;
 	int n;
 
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if (p->p_state == PRS_NEW)
 			continue;
 		fdp = p->p_fd;
 		if (fdp == NULL)
 			continue;
 		for (n = 0; n < fdp->fd_nfiles; n++) {
 			if (fp == fdp->fd_ofiles[n].fde_file)
 				return (p);
 		}
 	}
 	return (NULL);
 }
 
 static void
 db_print_file(struct file *fp, int header)
 {
 #define XPTRWIDTH ((int)howmany(sizeof(void *) * NBBY, 4))
 	struct proc *p;
 
 	if (header)
 		db_printf("%*s %6s %*s %8s %4s %5s %6s %*s %5s %s\n",
 		    XPTRWIDTH, "File", "Type", XPTRWIDTH, "Data", "Flag",
 		    "GCFl", "Count", "MCount", XPTRWIDTH, "Vnode", "FPID",
 		    "FCmd");
 	p = file_to_first_proc(fp);
 	db_printf("%*p %6s %*p %08x %04x %5d %6d %*p %5d %s\n", XPTRWIDTH,
 	    fp, file_type_to_name(fp->f_type), XPTRWIDTH, fp->f_data,
 	    fp->f_flag, 0, refcount_load(&fp->f_count), 0, XPTRWIDTH, fp->f_vnode,
 	    p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
 
 #undef XPTRWIDTH
 }
 
 DB_SHOW_COMMAND(file, db_show_file)
 {
 	struct file *fp;
 
 	if (!have_addr) {
 		db_printf("usage: show file <addr>\n");
 		return;
 	}
 	fp = (struct file *)addr;
 	db_print_file(fp, 1);
 }
 
 DB_SHOW_COMMAND(files, db_show_files)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 	struct proc *p;
 	int header;
 	int n;
 
 	header = 1;
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if (p->p_state == PRS_NEW)
 			continue;
 		if ((fdp = p->p_fd) == NULL)
 			continue;
 		for (n = 0; n < fdp->fd_nfiles; ++n) {
 			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
 				continue;
 			db_print_file(fp, header);
 			header = 0;
 		}
 	}
 }
 #endif
 
 SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
     &maxfilesperproc, 0, "Maximum files allowed open per process");
 
 SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
     &maxfiles, 0, "Maximum number of files");
 
 SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
     &openfiles, 0, "System-wide number of open files");
 
 /* ARGSUSED*/
 static void
 filelistinit(void *dummy)
 {
 
 	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	pwd_zone = uma_zcreate("PWD", sizeof(struct pwd), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_SMR);
 	/*
 	 * XXXMJG this is a temporary hack due to boot ordering issues against
 	 * the vnode zone.
 	 */
 	vfs_smr = uma_zone_get_smr(pwd_zone);
 	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
 }
 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
 
 /*-------------------------------------------------------------------*/
 
 static int
 badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EINVAL);
 }
 
 static int
 badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (0);
 }
 
 static int
 badfo_kqfilter(struct file *fp, struct knote *kn)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_close(struct file *fp, struct thread *td)
 {
 
 	return (0);
 }
 
 static int
 badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
     struct thread *td)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 
 	return (0);
 }
 
 struct fileops badfileops = {
 	.fo_read = badfo_readwrite,
 	.fo_write = badfo_readwrite,
 	.fo_truncate = badfo_truncate,
 	.fo_ioctl = badfo_ioctl,
 	.fo_poll = badfo_poll,
 	.fo_kqfilter = badfo_kqfilter,
 	.fo_stat = badfo_stat,
 	.fo_close = badfo_close,
 	.fo_chmod = badfo_chmod,
 	.fo_chown = badfo_chown,
 	.fo_sendfile = badfo_sendfile,
 	.fo_fill_kinfo = badfo_fill_kinfo,
 };
 
 int
 invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 
 	return (EOPNOTSUPP);
 }
 
 int
 invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EINVAL);
 }
 
 int
 invfo_ioctl(struct file *fp, u_long com, void *data,
     struct ucred *active_cred, struct thread *td)
 {
 
 	return (ENOTTY);
 }
 
 int
 invfo_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (poll_no_poll(events));
 }
 
 int
 invfo_kqfilter(struct file *fp, struct knote *kn)
 {
 
 	return (EINVAL);
 }
 
 int
 invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EINVAL);
 }
 
 int
 invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EINVAL);
 }
 
 int
 invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
     struct thread *td)
 {
 
 	return (EINVAL);
 }
 
 /*-------------------------------------------------------------------*/
 
 /*
  * File Descriptor pseudo-device driver (/dev/fd/).
  *
  * Opening minor device N dup()s the file (if any) connected to file
  * descriptor N belonging to the calling process.  Note that this driver
  * consists of only the ``open()'' routine, because all subsequent
  * references to this file will be direct to the other driver.
  *
  * XXX: we could give this one a cloning event handler if necessary.
  */
 
 /* ARGSUSED */
 static int
 fdopen(struct cdev *dev, int mode, int type, struct thread *td)
 {
 
 	/*
 	 * XXX Kludge: set curthread->td_dupfd to contain the value of the
 	 * the file descriptor being sought for duplication. The error
 	 * return ensures that the vnode for this device will be released
 	 * by vn_open. Open will detect this special error and take the
 	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
 	 * will simply report the error.
 	 */
 	td->td_dupfd = dev2unit(dev);
 	return (ENODEV);
 }
 
 static struct cdevsw fildesc_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_open =	fdopen,
 	.d_name =	"FD",
 };
 
 static void
 fildesc_drvinit(void *unused)
 {
 	struct cdev *dev;
 
 	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
 	    UID_ROOT, GID_WHEEL, 0666, "fd/0");
 	make_dev_alias(dev, "stdin");
 	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
 	    UID_ROOT, GID_WHEEL, 0666, "fd/1");
 	make_dev_alias(dev, "stdout");
 	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
 	    UID_ROOT, GID_WHEEL, 0666, "fd/2");
 	make_dev_alias(dev, "stderr");
 }
 
 SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index 3db34c3689de..48df8a3e9051 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -1,4925 +1,4925 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/capsicum.h>
 #include <sys/disk.h>
 #include <sys/sysent.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/sysproto.h>
 #include <sys/namei.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filio.h>
 #include <sys/limits.h>
 #include <sys/linker.h>
 #include <sys/rwlock.h>
 #include <sys/sdt.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/dirent.h>
 #include <sys/jail.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <machine/stdarg.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/uma.h>
 
 #include <fs/devfs/devfs.h>
 
 #include <ufs/ufs/quota.h>
 
 MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");
 
 static int kern_chflagsat(struct thread *td, int fd, const char *path,
     enum uio_seg pathseg, u_long flags, int atflag);
 static int setfflags(struct thread *td, struct vnode *, u_long);
 static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
 static int getutimens(const struct timespec *, enum uio_seg,
     struct timespec *, int *);
 static int setutimes(struct thread *td, struct vnode *,
     const struct timespec *, int, int);
 static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
     struct thread *td);
 static int kern_fhlinkat(struct thread *td, int fd, const char *path,
     enum uio_seg pathseg, fhandle_t *fhp);
 static int kern_readlink_vp(struct vnode *vp, char *buf, enum uio_seg bufseg,
     size_t count, struct thread *td);
 static int kern_linkat_vp(struct thread *td, struct vnode *vp, int fd,
     const char *path, enum uio_seg segflag);
 
 static uint64_t
 at2cnpflags(u_int at_flags, u_int mask)
 {
 	u_int64_t res;
 
 	MPASS((at_flags & (AT_SYMLINK_FOLLOW | AT_SYMLINK_NOFOLLOW)) !=
 	    (AT_SYMLINK_FOLLOW | AT_SYMLINK_NOFOLLOW));
 
 	res = 0;
 	at_flags &= mask;
 	if ((at_flags & AT_RESOLVE_BENEATH) != 0)
 		res |= RBENEATH;
 	if ((at_flags & AT_SYMLINK_FOLLOW) != 0)
 		res |= FOLLOW;
 	/* NOFOLLOW is pseudo flag */
 	if ((mask & AT_SYMLINK_NOFOLLOW) != 0) {
 		res |= (at_flags & AT_SYMLINK_NOFOLLOW) != 0 ? NOFOLLOW :
 		    FOLLOW;
 	}
 	return (res);
 }
 
 int
 kern_sync(struct thread *td)
 {
 	struct mount *mp, *nmp;
 	int save;
 
 	mtx_lock(&mountlist_mtx);
 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			continue;
 		}
 		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
 		    vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
 			save = curthread_pflags_set(TDP_SYNCIO);
 			vfs_periodic(mp, MNT_NOWAIT);
 			VFS_SYNC(mp, MNT_NOWAIT);
 			curthread_pflags_restore(save);
 			vn_finished_write(mp);
 		}
 		mtx_lock(&mountlist_mtx);
 		nmp = TAILQ_NEXT(mp, mnt_list);
 		vfs_unbusy(mp);
 	}
 	mtx_unlock(&mountlist_mtx);
 	return (0);
 }
 
 /*
  * Sync each mounted filesystem.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct sync_args {
 	int     dummy;
 };
 #endif
 /* ARGSUSED */
 int
 sys_sync(struct thread *td, struct sync_args *uap)
 {
 
 	return (kern_sync(td));
 }
 
 /*
  * Change filesystem quotas.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct quotactl_args {
 	char *path;
 	int cmd;
 	int uid;
 	caddr_t arg;
 };
 #endif
 int
 sys_quotactl(struct thread *td, struct quotactl_args *uap)
 {
 	struct mount *mp;
 	struct nameidata nd;
 	int error;
 
 	AUDIT_ARG_CMD(uap->cmd);
 	AUDIT_ARG_UID(uap->uid);
 	if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
 		return (EPERM);
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
 	    uap->path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	mp = nd.ni_vp->v_mount;
 	vfs_ref(mp);
 	vput(nd.ni_vp);
 	error = vfs_busy(mp, 0);
 	if (error != 0) {
 		vfs_rel(mp);
 		return (error);
 	}
 	error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
 
 	/*
 	 * Since quota on operation typically needs to open quota
 	 * file, the Q_QUOTAON handler needs to unbusy the mount point
 	 * before calling into namei.  Otherwise, unmount might be
 	 * started between two vfs_busy() invocations (first is our,
 	 * second is from mount point cross-walk code in lookup()),
 	 * causing deadlock.
 	 *
 	 * Require that Q_QUOTAON handles the vfs_busy() reference on
 	 * its own, always returning with ubusied mount point.
 	 */
 	if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON &&
 	    (uap->cmd >> SUBCMDSHIFT) != Q_QUOTAOFF)
 		vfs_unbusy(mp);
 	vfs_rel(mp);
 	return (error);
 }
 
 /*
  * Used by statfs conversion routines to scale the block size up if
  * necessary so that all of the block counts are <= 'max_size'.  Note
  * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
  * value of 'n'.
  */
 void
 statfs_scale_blocks(struct statfs *sf, long max_size)
 {
 	uint64_t count;
 	int shift;
 
 	KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
 
 	/*
 	 * Attempt to scale the block counts to give a more accurate
 	 * overview to userland of the ratio of free space to used
 	 * space.  To do this, find the largest block count and compute
 	 * a divisor that lets it fit into a signed integer <= max_size.
 	 */
 	if (sf->f_bavail < 0)
 		count = -sf->f_bavail;
 	else
 		count = sf->f_bavail;
 	count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
 	if (count <= max_size)
 		return;
 
 	count >>= flsl(max_size);
 	shift = 0;
 	while (count > 0) {
 		shift++;
 		count >>=1;
 	}
 
 	sf->f_bsize <<= shift;
 	sf->f_blocks >>= shift;
 	sf->f_bfree >>= shift;
 	sf->f_bavail >>= shift;
 }
 
 static int
 kern_do_statfs(struct thread *td, struct mount *mp, struct statfs *buf)
 {
 	int error;
 
 	if (mp == NULL)
 		return (EBADF);
 	error = vfs_busy(mp, 0);
 	vfs_rel(mp);
 	if (error != 0)
 		return (error);
 #ifdef MAC
 	error = mac_mount_check_stat(td->td_ucred, mp);
 	if (error != 0)
 		goto out;
 #endif
 	error = VFS_STATFS(mp, buf);
 	if (error != 0)
 		goto out;
 	if (priv_check_cred_vfs_generation(td->td_ucred)) {
 		buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
 		prison_enforce_statfs(td->td_ucred, mp, buf);
 	}
 out:
 	vfs_unbusy(mp);
 	return (error);
 }
 
 /*
  * Get filesystem statistics.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct statfs_args {
 	char *path;
 	struct statfs *buf;
 };
 #endif
 int
 sys_statfs(struct thread *td, struct statfs_args *uap)
 {
 	struct statfs *sfp;
 	int error;
 
 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp);
 	if (error == 0)
 		error = copyout(sfp, uap->buf, sizeof(struct statfs));
 	free(sfp, M_STATFS);
 	return (error);
 }
 
 int
 kern_statfs(struct thread *td, const char *path, enum uio_seg pathseg,
     struct statfs *buf)
 {
 	struct mount *mp;
 	struct nameidata nd;
 	int error;
 
 	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td);
 	error = namei(&nd);
 	if (error != 0)
 		return (error);
 	mp = vfs_ref_from_vp(nd.ni_vp);
 	NDFREE_NOTHING(&nd);
 	vrele(nd.ni_vp);
 	return (kern_do_statfs(td, mp, buf));
 }
 
 /*
  * Get filesystem statistics.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fstatfs_args {
 	int fd;
 	struct statfs *buf;
 };
 #endif
 int
 sys_fstatfs(struct thread *td, struct fstatfs_args *uap)
 {
 	struct statfs *sfp;
 	int error;
 
 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = kern_fstatfs(td, uap->fd, sfp);
 	if (error == 0)
 		error = copyout(sfp, uap->buf, sizeof(struct statfs));
 	free(sfp, M_STATFS);
 	return (error);
 }
 
 int
 kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
 {
 	struct file *fp;
 	struct mount *mp;
 	struct vnode *vp;
 	int error;
 
 	AUDIT_ARG_FD(fd);
 	error = getvnode(td, fd, &cap_fstatfs_rights, &fp);
 	if (error != 0)
 		return (error);
 	vp = fp->f_vnode;
 #ifdef AUDIT
 	if (AUDITING_TD(td)) {
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		AUDIT_ARG_VNODE1(vp);
 		VOP_UNLOCK(vp);
 	}
 #endif
 	mp = vfs_ref_from_vp(vp);
 	fdrop(fp, td);
 	return (kern_do_statfs(td, mp, buf));
 }
 
 /*
  * Get statistics on all filesystems.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getfsstat_args {
 	struct statfs *buf;
 	long bufsize;
 	int mode;
 };
 #endif
 int
 sys_getfsstat(struct thread *td, struct getfsstat_args *uap)
 {
 	size_t count;
 	int error;
 
 	if (uap->bufsize < 0 || uap->bufsize > SIZE_MAX)
 		return (EINVAL);
 	error = kern_getfsstat(td, &uap->buf, uap->bufsize, &count,
 	    UIO_USERSPACE, uap->mode);
 	if (error == 0)
 		td->td_retval[0] = count;
 	return (error);
 }
 
 /*
  * If (bufsize > 0 && bufseg == UIO_SYSSPACE)
  *	The caller is responsible for freeing memory which will be allocated
  *	in '*buf'.
  */
 int
 kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
     size_t *countp, enum uio_seg bufseg, int mode)
 {
 	struct mount *mp, *nmp;
 	struct statfs *sfsp, *sp, *sptmp, *tofree;
 	size_t count, maxcount;
 	int error;
 
 	switch (mode) {
 	case MNT_WAIT:
 	case MNT_NOWAIT:
 		break;
 	default:
 		if (bufseg == UIO_SYSSPACE)
 			*buf = NULL;
 		return (EINVAL);
 	}
 restart:
 	maxcount = bufsize / sizeof(struct statfs);
 	if (bufsize == 0) {
 		sfsp = NULL;
 		tofree = NULL;
 	} else if (bufseg == UIO_USERSPACE) {
 		sfsp = *buf;
 		tofree = NULL;
 	} else /* if (bufseg == UIO_SYSSPACE) */ {
 		count = 0;
 		mtx_lock(&mountlist_mtx);
 		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 			count++;
 		}
 		mtx_unlock(&mountlist_mtx);
 		if (maxcount > count)
 			maxcount = count;
 		tofree = sfsp = *buf = malloc(maxcount * sizeof(struct statfs),
 		    M_STATFS, M_WAITOK);
 	}
 
 	count = 0;
 
 	/*
 	 * If there is no target buffer they only want the count.
 	 *
 	 * This could be TAILQ_FOREACH but it is open-coded to match the original
 	 * code below.
 	 */
 	if (sfsp == NULL) {
 		mtx_lock(&mountlist_mtx);
 		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 			if (prison_canseemount(td->td_ucred, mp) != 0) {
 				nmp = TAILQ_NEXT(mp, mnt_list);
 				continue;
 			}
 #ifdef MAC
 			if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
 				nmp = TAILQ_NEXT(mp, mnt_list);
 				continue;
 			}
 #endif
 			count++;
 			nmp = TAILQ_NEXT(mp, mnt_list);
 		}
 		mtx_unlock(&mountlist_mtx);
 		*countp = count;
 		return (0);
 	}
 
 	/*
 	 * They want the entire thing.
 	 *
 	 * Short-circuit the corner case of no room for anything, avoids
 	 * relocking below.
 	 */
 	if (maxcount < 1) {
 		goto out;
 	}
 
 	mtx_lock(&mountlist_mtx);
 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 		if (prison_canseemount(td->td_ucred, mp) != 0) {
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			continue;
 		}
 #ifdef MAC
 		if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			continue;
 		}
 #endif
 		if (mode == MNT_WAIT) {
 			if (vfs_busy(mp, MBF_MNTLSTLOCK) != 0) {
 				/*
 				 * If vfs_busy() failed, and MBF_NOWAIT
 				 * wasn't passed, then the mp is gone.
 				 * Furthermore, because of MBF_MNTLSTLOCK,
 				 * the mountlist_mtx was dropped.  We have
 				 * no other choice than to start over.
 				 */
 				mtx_unlock(&mountlist_mtx);
 				free(tofree, M_STATFS);
 				goto restart;
 			}
 		} else {
 			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) {
 				nmp = TAILQ_NEXT(mp, mnt_list);
 				continue;
 			}
 		}
 		sp = &mp->mnt_stat;
 		/*
 		 * If MNT_NOWAIT is specified, do not refresh
 		 * the fsstat cache.
 		 */
 		if (mode != MNT_NOWAIT) {
 			error = VFS_STATFS(mp, sp);
 			if (error != 0) {
 				mtx_lock(&mountlist_mtx);
 				nmp = TAILQ_NEXT(mp, mnt_list);
 				vfs_unbusy(mp);
 				continue;
 			}
 		}
 		if (priv_check_cred_vfs_generation(td->td_ucred)) {
 			sptmp = malloc(sizeof(struct statfs), M_STATFS,
 			    M_WAITOK);
 			*sptmp = *sp;
 			sptmp->f_fsid.val[0] = sptmp->f_fsid.val[1] = 0;
 			prison_enforce_statfs(td->td_ucred, mp, sptmp);
 			sp = sptmp;
 		} else
 			sptmp = NULL;
 		if (bufseg == UIO_SYSSPACE) {
 			bcopy(sp, sfsp, sizeof(*sp));
 			free(sptmp, M_STATFS);
 		} else /* if (bufseg == UIO_USERSPACE) */ {
 			error = copyout(sp, sfsp, sizeof(*sp));
 			free(sptmp, M_STATFS);
 			if (error != 0) {
 				vfs_unbusy(mp);
 				return (error);
 			}
 		}
 		sfsp++;
 		count++;
 
 		if (count == maxcount) {
 			vfs_unbusy(mp);
 			goto out;
 		}
 
 		mtx_lock(&mountlist_mtx);
 		nmp = TAILQ_NEXT(mp, mnt_list);
 		vfs_unbusy(mp);
 	}
 	mtx_unlock(&mountlist_mtx);
 out:
 	*countp = count;
 	return (0);
 }
 
 #ifdef COMPAT_FREEBSD4
 /*
  * Get old format filesystem statistics.
  */
 static void freebsd4_cvtstatfs(struct statfs *, struct ostatfs *);
 
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd4_statfs_args {
 	char *path;
 	struct ostatfs *buf;
 };
 #endif
 int
 freebsd4_statfs(struct thread *td, struct freebsd4_statfs_args *uap)
 {
 	struct ostatfs osb;
 	struct statfs *sfp;
 	int error;
 
 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp);
 	if (error == 0) {
 		freebsd4_cvtstatfs(sfp, &osb);
 		error = copyout(&osb, uap->buf, sizeof(osb));
 	}
 	free(sfp, M_STATFS);
 	return (error);
 }
 
 /*
  * Get filesystem statistics.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd4_fstatfs_args {
 	int fd;
 	struct ostatfs *buf;
 };
 #endif
 int
 freebsd4_fstatfs(struct thread *td, struct freebsd4_fstatfs_args *uap)
 {
 	struct ostatfs osb;
 	struct statfs *sfp;
 	int error;
 
 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = kern_fstatfs(td, uap->fd, sfp);
 	if (error == 0) {
 		freebsd4_cvtstatfs(sfp, &osb);
 		error = copyout(&osb, uap->buf, sizeof(osb));
 	}
 	free(sfp, M_STATFS);
 	return (error);
 }
 
 /*
  * Get statistics on all filesystems.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd4_getfsstat_args {
 	struct ostatfs *buf;
 	long bufsize;
 	int mode;
 };
 #endif
 int
 freebsd4_getfsstat(struct thread *td, struct freebsd4_getfsstat_args *uap)
 {
 	struct statfs *buf, *sp;
 	struct ostatfs osb;
 	size_t count, size;
 	int error;
 
 	if (uap->bufsize < 0)
 		return (EINVAL);
 	count = uap->bufsize / sizeof(struct ostatfs);
 	if (count > SIZE_MAX / sizeof(struct statfs))
 		return (EINVAL);
 	size = count * sizeof(struct statfs);
 	error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE,
 	    uap->mode);
 	if (error == 0)
 		td->td_retval[0] = count;
 	if (size != 0) {
 		sp = buf;
 		while (count != 0 && error == 0) {
 			freebsd4_cvtstatfs(sp, &osb);
 			error = copyout(&osb, uap->buf, sizeof(osb));
 			sp++;
 			uap->buf++;
 			count--;
 		}
 		free(buf, M_STATFS);
 	}
 	return (error);
 }
 
 /*
  * Implement fstatfs() for (NFS) file handles.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd4_fhstatfs_args {
 	struct fhandle *u_fhp;
 	struct ostatfs *buf;
 };
 #endif
 int
 freebsd4_fhstatfs(struct thread *td, struct freebsd4_fhstatfs_args *uap)
 {
 	struct ostatfs osb;
 	struct statfs *sfp;
 	fhandle_t fh;
 	int error;
 
 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
 	if (error != 0)
 		return (error);
 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = kern_fhstatfs(td, fh, sfp);
 	if (error == 0) {
 		freebsd4_cvtstatfs(sfp, &osb);
 		error = copyout(&osb, uap->buf, sizeof(osb));
 	}
 	free(sfp, M_STATFS);
 	return (error);
 }
 
 /*
  * Convert a new format statfs structure to an old format statfs structure.
  */
 static void
 freebsd4_cvtstatfs(struct statfs *nsp, struct ostatfs *osp)
 {
 
 	statfs_scale_blocks(nsp, LONG_MAX);
 	bzero(osp, sizeof(*osp));
 	osp->f_bsize = nsp->f_bsize;
 	osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
 	osp->f_blocks = nsp->f_blocks;
 	osp->f_bfree = nsp->f_bfree;
 	osp->f_bavail = nsp->f_bavail;
 	osp->f_files = MIN(nsp->f_files, LONG_MAX);
 	osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
 	osp->f_owner = nsp->f_owner;
 	osp->f_type = nsp->f_type;
 	osp->f_flags = nsp->f_flags;
 	osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
 	osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
 	osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
 	osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
 	strlcpy(osp->f_fstypename, nsp->f_fstypename,
 	    MIN(MFSNAMELEN, OMFSNAMELEN));
 	strlcpy(osp->f_mntonname, nsp->f_mntonname,
 	    MIN(MNAMELEN, OMNAMELEN));
 	strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
 	    MIN(MNAMELEN, OMNAMELEN));
 	osp->f_fsid = nsp->f_fsid;
 }
 #endif /* COMPAT_FREEBSD4 */
 
 #if defined(COMPAT_FREEBSD11)
 /*
  * Get old format filesystem statistics.
  */
 static void freebsd11_cvtstatfs(struct statfs *, struct freebsd11_statfs *);
 
 int
 freebsd11_statfs(struct thread *td, struct freebsd11_statfs_args *uap)
 {
 	struct freebsd11_statfs osb;
 	struct statfs *sfp;
 	int error;
 
 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp);
 	if (error == 0) {
 		freebsd11_cvtstatfs(sfp, &osb);
 		error = copyout(&osb, uap->buf, sizeof(osb));
 	}
 	free(sfp, M_STATFS);
 	return (error);
 }
 
 /*
  * Get filesystem statistics.
  */
 int
 freebsd11_fstatfs(struct thread *td, struct freebsd11_fstatfs_args *uap)
 {
 	struct freebsd11_statfs osb;
 	struct statfs *sfp;
 	int error;
 
 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = kern_fstatfs(td, uap->fd, sfp);
 	if (error == 0) {
 		freebsd11_cvtstatfs(sfp, &osb);
 		error = copyout(&osb, uap->buf, sizeof(osb));
 	}
 	free(sfp, M_STATFS);
 	return (error);
 }
 
 /*
  * Get statistics on all filesystems.
  */
 int
 freebsd11_getfsstat(struct thread *td, struct freebsd11_getfsstat_args *uap)
 {
 	struct freebsd11_statfs osb;
 	struct statfs *buf, *sp;
 	size_t count, size;
 	int error;
 
 	count = uap->bufsize / sizeof(struct ostatfs);
 	size = count * sizeof(struct statfs);
 	error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE,
 	    uap->mode);
 	if (error == 0)
 		td->td_retval[0] = count;
 	if (size > 0) {
 		sp = buf;
 		while (count > 0 && error == 0) {
 			freebsd11_cvtstatfs(sp, &osb);
 			error = copyout(&osb, uap->buf, sizeof(osb));
 			sp++;
 			uap->buf++;
 			count--;
 		}
 		free(buf, M_STATFS);
 	}
 	return (error);
 }
 
 /*
  * Implement fstatfs() for (NFS) file handles.
  */
 int
 freebsd11_fhstatfs(struct thread *td, struct freebsd11_fhstatfs_args *uap)
 {
 	struct freebsd11_statfs osb;
 	struct statfs *sfp;
 	fhandle_t fh;
 	int error;
 
 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
 	if (error)
 		return (error);
 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = kern_fhstatfs(td, fh, sfp);
 	if (error == 0) {
 		freebsd11_cvtstatfs(sfp, &osb);
 		error = copyout(&osb, uap->buf, sizeof(osb));
 	}
 	free(sfp, M_STATFS);
 	return (error);
 }
 
 /*
  * Convert a new format statfs structure to an old format statfs structure.
  */
 static void
 freebsd11_cvtstatfs(struct statfs *nsp, struct freebsd11_statfs *osp)
 {
 
 	bzero(osp, sizeof(*osp));
 	osp->f_version = FREEBSD11_STATFS_VERSION;
 	osp->f_type = nsp->f_type;
 	osp->f_flags = nsp->f_flags;
 	osp->f_bsize = nsp->f_bsize;
 	osp->f_iosize = nsp->f_iosize;
 	osp->f_blocks = nsp->f_blocks;
 	osp->f_bfree = nsp->f_bfree;
 	osp->f_bavail = nsp->f_bavail;
 	osp->f_files = nsp->f_files;
 	osp->f_ffree = nsp->f_ffree;
 	osp->f_syncwrites = nsp->f_syncwrites;
 	osp->f_asyncwrites = nsp->f_asyncwrites;
 	osp->f_syncreads = nsp->f_syncreads;
 	osp->f_asyncreads = nsp->f_asyncreads;
 	osp->f_namemax = nsp->f_namemax;
 	osp->f_owner = nsp->f_owner;
 	osp->f_fsid = nsp->f_fsid;
 	strlcpy(osp->f_fstypename, nsp->f_fstypename,
 	    MIN(MFSNAMELEN, sizeof(osp->f_fstypename)));
 	strlcpy(osp->f_mntonname, nsp->f_mntonname,
 	    MIN(MNAMELEN, sizeof(osp->f_mntonname)));
 	strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
 	    MIN(MNAMELEN, sizeof(osp->f_mntfromname)));
 }
 #endif /* COMPAT_FREEBSD11 */
 
 /*
  * Change current working directory to a given file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fchdir_args {
 	int	fd;
 };
 #endif
 int
 sys_fchdir(struct thread *td, struct fchdir_args *uap)
 {
 	struct vnode *vp, *tdp;
 	struct mount *mp;
 	struct file *fp;
 	int error;
 
 	AUDIT_ARG_FD(uap->fd);
 	error = getvnode(td, uap->fd, &cap_fchdir_rights,
 	    &fp);
 	if (error != 0)
 		return (error);
 	vp = fp->f_vnode;
 	vrefact(vp);
 	fdrop(fp, td);
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	error = change_dir(vp, td);
 	while (!error && (mp = vp->v_mountedhere) != NULL) {
 		if (vfs_busy(mp, 0))
 			continue;
 		error = VFS_ROOT(mp, LK_SHARED, &tdp);
 		vfs_unbusy(mp);
 		if (error != 0)
 			break;
 		vput(vp);
 		vp = tdp;
 	}
 	if (error != 0) {
 		vput(vp);
 		return (error);
 	}
 	VOP_UNLOCK(vp);
 	pwd_chdir(td, vp);
 	return (0);
 }
 
 /*
  * Change current working directory (``.'').
  */
 #ifndef _SYS_SYSPROTO_H_
 struct chdir_args {
 	char	*path;
 };
 #endif
 int
 sys_chdir(struct thread *td, struct chdir_args *uap)
 {
 
 	return (kern_chdir(td, uap->path, UIO_USERSPACE));
 }
 
 int
 kern_chdir(struct thread *td, const char *path, enum uio_seg pathseg)
 {
 	struct nameidata nd;
 	int error;
 
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	if ((error = change_dir(nd.ni_vp, td)) != 0) {
 		vput(nd.ni_vp);
 		NDFREE_NOTHING(&nd);
 		return (error);
 	}
 	VOP_UNLOCK(nd.ni_vp);
 	NDFREE_NOTHING(&nd);
 	pwd_chdir(td, nd.ni_vp);
 	return (0);
 }
 
 /*
  * Change notion of root (``/'') directory.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct chroot_args {
 	char	*path;
 };
 #endif
 int
 sys_chroot(struct thread *td, struct chroot_args *uap)
 {
 	struct nameidata nd;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_CHROOT);
 	if (error != 0)
 		return (error);
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
 	    UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	if (error != 0)
 		goto error;
 	error = change_dir(nd.ni_vp, td);
 	if (error != 0)
 		goto e_vunlock;
 #ifdef MAC
 	error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp);
 	if (error != 0)
 		goto e_vunlock;
 #endif
 	VOP_UNLOCK(nd.ni_vp);
 	error = pwd_chroot(td, nd.ni_vp);
 	vrele(nd.ni_vp);
 	NDFREE_NOTHING(&nd);
 	return (error);
 e_vunlock:
 	vput(nd.ni_vp);
 error:
 	NDFREE_NOTHING(&nd);
 	return (error);
 }
 
 /*
  * Common routine for chroot and chdir.  Callers must provide a locked vnode
  * instance.
  */
 int
 change_dir(struct vnode *vp, struct thread *td)
 {
 #ifdef MAC
 	int error;
 #endif
 
 	ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
 	if (vp->v_type != VDIR)
 		return (ENOTDIR);
 #ifdef MAC
 	error = mac_vnode_check_chdir(td->td_ucred, vp);
 	if (error != 0)
 		return (error);
 #endif
 	return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td));
 }
 
 static __inline void
 flags_to_rights(int flags, cap_rights_t *rightsp)
 {
 
 	if (flags & O_EXEC) {
 		cap_rights_set_one(rightsp, CAP_FEXECVE);
 	} else {
 		switch ((flags & O_ACCMODE)) {
 		case O_RDONLY:
 			cap_rights_set_one(rightsp, CAP_READ);
 			break;
 		case O_RDWR:
 			cap_rights_set_one(rightsp, CAP_READ);
 			/* FALLTHROUGH */
 		case O_WRONLY:
 			cap_rights_set_one(rightsp, CAP_WRITE);
 			if (!(flags & (O_APPEND | O_TRUNC)))
 				cap_rights_set_one(rightsp, CAP_SEEK);
 			break;
 		}
 	}
 
 	if (flags & O_CREAT)
 		cap_rights_set_one(rightsp, CAP_CREATE);
 
 	if (flags & O_TRUNC)
 		cap_rights_set_one(rightsp, CAP_FTRUNCATE);
 
 	if (flags & (O_SYNC | O_FSYNC))
 		cap_rights_set_one(rightsp, CAP_FSYNC);
 
 	if (flags & (O_EXLOCK | O_SHLOCK))
 		cap_rights_set_one(rightsp, CAP_FLOCK);
 }
 
 /*
  * Check permissions, allocate an open file structure, and call the device
  * open routine if any.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct open_args {
 	char	*path;
 	int	flags;
 	int	mode;
 };
 #endif
 int
 sys_open(struct thread *td, struct open_args *uap)
 {
 
 	return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    uap->flags, uap->mode));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct openat_args {
 	int	fd;
 	char	*path;
 	int	flag;
 	int	mode;
 };
 #endif
 int
 sys_openat(struct thread *td, struct openat_args *uap)
 {
 
 	AUDIT_ARG_FD(uap->fd);
 	return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
 	    uap->mode));
 }
 
 int
 kern_openat(struct thread *td, int fd, const char *path, enum uio_seg pathseg,
     int flags, int mode)
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp;
 	struct pwddesc *pdp;
 	struct file *fp;
 	struct vnode *vp;
 	struct nameidata nd;
 	cap_rights_t rights;
 	int cmode, error, indx;
 
 	indx = -1;
 	fdp = p->p_fd;
 	pdp = p->p_pd;
 
 	AUDIT_ARG_FFLAGS(flags);
 	AUDIT_ARG_MODE(mode);
 	cap_rights_init_one(&rights, CAP_LOOKUP);
 	flags_to_rights(flags, &rights);
 	/*
 	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
 	 * may be specified.
 	 */
 	if (flags & O_EXEC) {
 		if (flags & O_ACCMODE)
 			return (EINVAL);
 	} else if ((flags & O_ACCMODE) == O_ACCMODE) {
 		return (EINVAL);
 	} else {
 		flags = FFLAGS(flags);
 	}
 
 	/*
 	 * Allocate a file structure. The descriptor to reference it
 	 * is allocated and used by finstall_refed() below.
 	 */
 	error = falloc_noinstall(td, &fp);
 	if (error != 0)
 		return (error);
 	/* Set the flags early so the finit in devfs can pick them up. */
 	fp->f_flag = flags & FMASK;
 	cmode = ((mode & ~pdp->pd_cmask) & ALLPERMS) & ~S_ISTXT;
 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
 	    &rights, td);
 	td->td_dupfd = -1;		/* XXX check for fdopen */
 	error = vn_open(&nd, &flags, cmode, fp);
 	if (error != 0) {
 		/*
 		 * If the vn_open replaced the method vector, something
 		 * wonderous happened deep below and we just pass it up
 		 * pretending we know what we do.
 		 */
 		if (error == ENXIO && fp->f_ops != &badfileops)
 			goto success;
 
 		/*
 		 * Handle special fdopen() case. bleh.
 		 *
 		 * Don't do this for relative (capability) lookups; we don't
 		 * understand exactly what would happen, and we don't think
 		 * that it ever should.
 		 */
 		if ((nd.ni_resflags & NIRES_STRICTREL) == 0 &&
 		    (error == ENODEV || error == ENXIO) &&
 		    td->td_dupfd >= 0) {
 			error = dupfdopen(td, fdp, td->td_dupfd, flags, error,
 			    &indx);
 			if (error == 0)
 				goto success;
 		}
 
 		goto bad;
 	}
 	td->td_dupfd = 0;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 
 	/*
 	 * Store the vnode, for any f_type. Typically, the vnode use
 	 * count is decremented by direct call to vn_closefile() for
 	 * files that switched type in the cdevsw fdopen() method.
 	 */
 	fp->f_vnode = vp;
 	/*
 	 * If the file wasn't claimed by devfs bind it to the normal
 	 * vnode operations here.
 	 */
 	if (fp->f_ops == &badfileops) {
 		KASSERT(vp->v_type != VFIFO,
 		    ("Unexpected fifo fp %p vp %p", fp, vp));
 		finit_vnode(fp, flags, NULL, &vnops);
 	}
 
 	VOP_UNLOCK(vp);
 	if (flags & O_TRUNC) {
 		error = fo_truncate(fp, 0, td->td_ucred, td);
 		if (error != 0)
 			goto bad;
 	}
 success:
 	/*
 	 * If we haven't already installed the FD (for dupfdopen), do so now.
 	 */
 	if (indx == -1) {
 		struct filecaps *fcaps;
 
 #ifdef CAPABILITIES
 		if ((nd.ni_resflags & NIRES_STRICTREL) != 0)
 			fcaps = &nd.ni_filecaps;
 		else
 #endif
 			fcaps = NULL;
 		error = finstall_refed(td, fp, &indx, flags, fcaps);
 		/* On success finstall_refed() consumes fcaps. */
 		if (error != 0) {
 			filecaps_free(&nd.ni_filecaps);
 			goto bad;
 		}
 	} else {
 		filecaps_free(&nd.ni_filecaps);
 		falloc_abort(td, fp);
 	}
 
 	td->td_retval[0] = indx;
 	return (0);
 bad:
 	KASSERT(indx == -1, ("indx=%d, should be -1", indx));
 	falloc_abort(td, fp);
 	return (error);
 }
 
 #ifdef COMPAT_43
 /*
  * Create a file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ocreat_args {
 	char	*path;
 	int	mode;
 };
 #endif
 int
 ocreat(struct thread *td, struct ocreat_args *uap)
 {
 
 	return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    O_WRONLY | O_CREAT | O_TRUNC, uap->mode));
 }
 #endif /* COMPAT_43 */
 
 /*
  * Create a special file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct mknodat_args {
 	int	fd;
 	char	*path;
 	mode_t	mode;
 	dev_t	dev;
 };
 #endif
 int
 sys_mknodat(struct thread *td, struct mknodat_args *uap)
 {
 
 	return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
 	    uap->dev));
 }
 
 #if defined(COMPAT_FREEBSD11)
 int
 freebsd11_mknod(struct thread *td,
     struct freebsd11_mknod_args *uap)
 {
 
 	return (kern_mknodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    uap->mode, uap->dev));
 }
 
 int
 freebsd11_mknodat(struct thread *td,
     struct freebsd11_mknodat_args *uap)
 {
 
 	return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
 	    uap->dev));
 }
 #endif /* COMPAT_FREEBSD11 */
 
 int
 kern_mknodat(struct thread *td, int fd, const char *path, enum uio_seg pathseg,
     int mode, dev_t dev)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	struct vattr vattr;
 	struct nameidata nd;
 	int error, whiteout = 0;
 
 	AUDIT_ARG_MODE(mode);
 	AUDIT_ARG_DEV(dev);
 	switch (mode & S_IFMT) {
 	case S_IFCHR:
 	case S_IFBLK:
 		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
 		if (error == 0 && dev == VNOVAL)
 			error = EINVAL;
 		break;
 	case S_IFWHT:
 		error = priv_check(td, PRIV_VFS_MKNOD_WHT);
 		break;
 	case S_IFIFO:
 		if (dev == 0)
 			return (kern_mkfifoat(td, fd, path, pathseg, mode));
 		/* FALLTHROUGH */
 	default:
 		error = EINVAL;
 		break;
 	}
 	if (error != 0)
 		return (error);
 restart:
 	bwillwrite();
 	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
 	    NOCACHE, pathseg, path, fd, &cap_mknodat_rights,
 	    td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vp = nd.ni_vp;
 	if (vp != NULL) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (vp == nd.ni_dvp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		vrele(vp);
 		return (EEXIST);
 	} else {
 		VATTR_NULL(&vattr);
 		vattr.va_mode = (mode & ALLPERMS) &
 		    ~td->td_proc->p_pd->pd_cmask;
 		vattr.va_rdev = dev;
 		whiteout = 0;
 
 		switch (mode & S_IFMT) {
 		case S_IFCHR:
 			vattr.va_type = VCHR;
 			break;
 		case S_IFBLK:
 			vattr.va_type = VBLK;
 			break;
 		case S_IFWHT:
 			whiteout = 1;
 			break;
 		default:
 			panic("kern_mknod: invalid mode");
 		}
 	}
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 #ifdef MAC
 	if (error == 0 && !whiteout)
 		error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
 		    &nd.ni_cnd, &vattr);
 #endif
 	if (error == 0) {
 		if (whiteout)
 			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
 		else {
 			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
 						&nd.ni_cnd, &vattr);
 		}
 	}
 	VOP_VPUT_PAIR(nd.ni_dvp, error == 0 && !whiteout ? &nd.ni_vp : NULL,
 	    true);
 	vn_finished_write(mp);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error == ERELOOKUP)
 		goto restart;
 	return (error);
 }
 
 /*
  * Create a named pipe.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct mkfifo_args {
 	char	*path;
 	int	mode;
 };
 #endif
 int
 sys_mkfifo(struct thread *td, struct mkfifo_args *uap)
 {
 
 	return (kern_mkfifoat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    uap->mode));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct mkfifoat_args {
 	int	fd;
 	char	*path;
 	mode_t	mode;
 };
 #endif
 int
 sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
 {
 
 	return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
 	    uap->mode));
 }
 
 int
 kern_mkfifoat(struct thread *td, int fd, const char *path,
     enum uio_seg pathseg, int mode)
 {
 	struct mount *mp;
 	struct vattr vattr;
 	struct nameidata nd;
 	int error;
 
 	AUDIT_ARG_MODE(mode);
 restart:
 	bwillwrite();
 	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
 	    NOCACHE, pathseg, path, fd, &cap_mkfifoat_rights,
 	    td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	if (nd.ni_vp != NULL) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (nd.ni_vp == nd.ni_dvp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		vrele(nd.ni_vp);
 		return (EEXIST);
 	}
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 	VATTR_NULL(&vattr);
 	vattr.va_type = VFIFO;
 	vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_pd->pd_cmask;
 #ifdef MAC
 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
 	    &vattr);
 	if (error != 0)
 		goto out;
 #endif
 	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
 #ifdef MAC
 out:
 #endif
 	VOP_VPUT_PAIR(nd.ni_dvp, error == 0 ? &nd.ni_vp : NULL, true);
 	vn_finished_write(mp);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error == ERELOOKUP)
 		goto restart;
 	return (error);
 }
 
 /*
  * Make a hard file link.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct link_args {
 	char	*path;
 	char	*link;
 };
 #endif
 int
 sys_link(struct thread *td, struct link_args *uap)
 {
 
 	return (kern_linkat(td, AT_FDCWD, AT_FDCWD, uap->path, uap->link,
 	    UIO_USERSPACE, FOLLOW));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct linkat_args {
 	int	fd1;
 	char	*path1;
 	int	fd2;
 	char	*path2;
 	int	flag;
 };
 #endif
 int
 sys_linkat(struct thread *td, struct linkat_args *uap)
 {
 	int flag;
 
 	flag = uap->flag;
 	if ((flag & ~(AT_SYMLINK_FOLLOW | AT_RESOLVE_BENEATH)) != 0)
 		return (EINVAL);
 
 	return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
 	    UIO_USERSPACE, at2cnpflags(flag, AT_SYMLINK_FOLLOW |
 	    AT_RESOLVE_BENEATH)));
 }
 
 int hardlink_check_uid = 0;
 SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
     &hardlink_check_uid, 0,
     "Unprivileged processes cannot create hard links to files owned by other "
     "users");
 static int hardlink_check_gid = 0;
 SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
     &hardlink_check_gid, 0,
     "Unprivileged processes cannot create hard links to files owned by other "
     "groups");
 
 static int
 can_hardlink(struct vnode *vp, struct ucred *cred)
 {
 	struct vattr va;
 	int error;
 
 	if (!hardlink_check_uid && !hardlink_check_gid)
 		return (0);
 
 	error = VOP_GETATTR(vp, &va, cred);
 	if (error != 0)
 		return (error);
 
 	if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
 		error = priv_check_cred(cred, PRIV_VFS_LINK);
 		if (error != 0)
 			return (error);
 	}
 
 	if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
 		error = priv_check_cred(cred, PRIV_VFS_LINK);
 		if (error != 0)
 			return (error);
 	}
 
 	return (0);
 }
 
 int
 kern_linkat(struct thread *td, int fd1, int fd2, const char *path1,
     const char *path2, enum uio_seg segflag, int follow)
 {
 	struct nameidata nd;
 	int error;
 
 	do {
 		bwillwrite();
 		NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, segflag,
 		    path1, fd1, &cap_linkat_source_rights, td);
 		if ((error = namei(&nd)) != 0)
 			return (error);
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		error = kern_linkat_vp(td, nd.ni_vp, fd2, path2, segflag);
 	} while (error ==  EAGAIN || error == ERELOOKUP);
 	return (error);
 }
 
 static int
 kern_linkat_vp(struct thread *td, struct vnode *vp, int fd, const char *path,
     enum uio_seg segflag)
 {
 	struct nameidata nd;
 	struct mount *mp;
 	int error;
 
 	if (vp->v_type == VDIR) {
 		vrele(vp);
 		return (EPERM);		/* POSIX */
 	}
 	NDINIT_ATRIGHTS(&nd, CREATE,
 	    LOCKPARENT | SAVENAME | AUDITVNODE2 | NOCACHE, segflag, path, fd,
 	    &cap_linkat_target_rights, td);
 	if ((error = namei(&nd)) == 0) {
 		if (nd.ni_vp != NULL) {
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 			if (nd.ni_dvp == nd.ni_vp)
 				vrele(nd.ni_dvp);
 			else
 				vput(nd.ni_dvp);
 			vrele(nd.ni_vp);
 			vrele(vp);
 			return (EEXIST);
 		} else if (nd.ni_dvp->v_mount != vp->v_mount) {
 			/*
 			 * Cross-device link.  No need to recheck
 			 * vp->v_type, since it cannot change, except
 			 * to VBAD.
 			 */
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 			vput(nd.ni_dvp);
 			vrele(vp);
 			return (EXDEV);
 		} else if ((error = vn_lock(vp, LK_EXCLUSIVE)) == 0) {
 			error = can_hardlink(vp, td->td_ucred);
 #ifdef MAC
 			if (error == 0)
 				error = mac_vnode_check_link(td->td_ucred,
 				    nd.ni_dvp, vp, &nd.ni_cnd);
 #endif
 			if (error != 0) {
 				vput(vp);
 				vput(nd.ni_dvp);
 				NDFREE(&nd, NDF_ONLY_PNBUF);
 				return (error);
 			}
 			error = vn_start_write(vp, &mp, V_NOWAIT);
 			if (error != 0) {
 				vput(vp);
 				vput(nd.ni_dvp);
 				NDFREE(&nd, NDF_ONLY_PNBUF);
 				error = vn_start_write(NULL, &mp,
 				    V_XSLEEP | PCATCH);
 				if (error != 0)
 					return (error);
 				return (EAGAIN);
 			}
 			error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
 			VOP_VPUT_PAIR(nd.ni_dvp, &vp, true);
 			vn_finished_write(mp);
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 			vp = NULL;
 		} else {
 			vput(nd.ni_dvp);
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 			vrele(vp);
 			return (EAGAIN);
 		}
 	}
 	if (vp != NULL)
 		vrele(vp);
 	return (error);
 }
 
 /*
  * Make a symbolic link.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct symlink_args {
 	char	*path;
 	char	*link;
 };
 #endif
 int
 sys_symlink(struct thread *td, struct symlink_args *uap)
 {
 
 	return (kern_symlinkat(td, uap->path, AT_FDCWD, uap->link,
 	    UIO_USERSPACE));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct symlinkat_args {
 	char	*path;
 	int	fd;
 	char	*path2;
 };
 #endif
 int
 sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
 {
 
 	return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
 	    UIO_USERSPACE));
 }
 
 int
 kern_symlinkat(struct thread *td, const char *path1, int fd, const char *path2,
     enum uio_seg segflg)
 {
 	struct mount *mp;
 	struct vattr vattr;
 	const char *syspath;
 	char *tmppath;
 	struct nameidata nd;
 	int error;
 
 	if (segflg == UIO_SYSSPACE) {
 		syspath = path1;
 	} else {
 		tmppath = uma_zalloc(namei_zone, M_WAITOK);
 		if ((error = copyinstr(path1, tmppath, MAXPATHLEN, NULL)) != 0)
 			goto out;
 		syspath = tmppath;
 	}
 	AUDIT_ARG_TEXT(syspath);
 restart:
 	bwillwrite();
 	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
 	    NOCACHE, segflg, path2, fd, &cap_symlinkat_rights,
 	    td);
 	if ((error = namei(&nd)) != 0)
 		goto out;
 	if (nd.ni_vp) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (nd.ni_vp == nd.ni_dvp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		vrele(nd.ni_vp);
 		nd.ni_vp = NULL;
 		error = EEXIST;
 		goto out;
 	}
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			goto out;
 		goto restart;
 	}
 	VATTR_NULL(&vattr);
 	vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_pd->pd_cmask;
 #ifdef MAC
 	vattr.va_type = VLNK;
 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
 	    &vattr);
 	if (error != 0)
 		goto out2;
 #endif
 	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
 #ifdef MAC
 out2:
 #endif
 	VOP_VPUT_PAIR(nd.ni_dvp, error == 0 ? &nd.ni_vp : NULL, true);
 	vn_finished_write(mp);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error == ERELOOKUP)
 		goto restart;
 out:
 	if (segflg != UIO_SYSSPACE)
 		uma_zfree(namei_zone, tmppath);
 	return (error);
 }
 
 /*
  * Delete a whiteout from the filesystem.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct undelete_args {
 	char *path;
 };
 #endif
 int
 sys_undelete(struct thread *td, struct undelete_args *uap)
 {
 	struct mount *mp;
 	struct nameidata nd;
 	int error;
 
 restart:
 	bwillwrite();
 	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1,
 	    UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	if (error != 0)
 		return (error);
 
 	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (nd.ni_vp == nd.ni_dvp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		if (nd.ni_vp)
 			vrele(nd.ni_vp);
 		return (EEXIST);
 	}
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 	error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_dvp);
 	vn_finished_write(mp);
 	if (error == ERELOOKUP)
 		goto restart;
 	return (error);
 }
 
 /*
  * Delete a name from the filesystem.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct unlink_args {
 	char	*path;
 };
 #endif
 int
 sys_unlink(struct thread *td, struct unlink_args *uap)
 {
 
 	return (kern_funlinkat(td, AT_FDCWD, uap->path, FD_NONE, UIO_USERSPACE,
 	    0, 0));
 }
 
 static int
 kern_funlinkat_ex(struct thread *td, int dfd, const char *path, int fd,
     int flag, enum uio_seg pathseg, ino_t oldinum)
 {
 
 	if ((flag & ~(AT_REMOVEDIR | AT_RESOLVE_BENEATH)) != 0)
 		return (EINVAL);
 
 	if ((flag & AT_REMOVEDIR) != 0)
 		return (kern_frmdirat(td, dfd, path, fd, UIO_USERSPACE, 0));
 
 	return (kern_funlinkat(td, dfd, path, fd, UIO_USERSPACE, 0, 0));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct unlinkat_args {
 	int	fd;
 	char	*path;
 	int	flag;
 };
 #endif
 int
 sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
 {
 
 	return (kern_funlinkat_ex(td, uap->fd, uap->path, FD_NONE, uap->flag,
 	    UIO_USERSPACE, 0));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct funlinkat_args {
 	int		dfd;
 	const char	*path;
 	int		fd;
 	int		flag;
 };
 #endif
 int
 sys_funlinkat(struct thread *td, struct funlinkat_args *uap)
 {
 
 	return (kern_funlinkat_ex(td, uap->dfd, uap->path, uap->fd, uap->flag,
 	    UIO_USERSPACE, 0));
 }
 
 int
 kern_funlinkat(struct thread *td, int dfd, const char *path, int fd,
     enum uio_seg pathseg, int flag, ino_t oldinum)
 {
 	struct mount *mp;
 	struct file *fp;
 	struct vnode *vp;
 	struct nameidata nd;
 	struct stat sb;
 	int error;
 
 	fp = NULL;
 	if (fd != FD_NONE) {
 		error = getvnode(td, fd, &cap_no_rights, &fp);
 		if (error != 0)
 			return (error);
 	}
 
 restart:
 	bwillwrite();
 	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1 |
 	    at2cnpflags(flag, AT_RESOLVE_BENEATH),
 	    pathseg, path, dfd, &cap_unlinkat_rights, td);
 	if ((error = namei(&nd)) != 0) {
 		if (error == EINVAL)
 			error = EPERM;
 		goto fdout;
 	}
 	vp = nd.ni_vp;
 	if (vp->v_type == VDIR && oldinum == 0) {
 		error = EPERM;		/* POSIX */
 	} else if (oldinum != 0 &&
-		  ((error = VOP_STAT(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
-		  sb.st_ino != oldinum) {
+	    ((error = VOP_STAT(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
+	    sb.st_ino != oldinum) {
 		error = EIDRM;	/* Identifier removed */
 	} else if (fp != NULL && fp->f_vnode != vp) {
 		if (VN_IS_DOOMED(fp->f_vnode))
 			error = EBADF;
 		else
 			error = EDEADLK;
 	} else {
 		/*
 		 * The root of a mounted filesystem cannot be deleted.
 		 *
 		 * XXX: can this only be a VDIR case?
 		 */
 		if (vp->v_vflag & VV_ROOT)
 			error = EBUSY;
 	}
 	if (error == 0) {
 		if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 			vput(nd.ni_dvp);
 			if (vp == nd.ni_dvp)
 				vrele(vp);
 			else
 				vput(vp);
 			if ((error = vn_start_write(NULL, &mp,
 			    V_XSLEEP | PCATCH)) != 0) {
 				goto fdout;
 			}
 			goto restart;
 		}
 #ifdef MAC
 		error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
 		    &nd.ni_cnd);
 		if (error != 0)
 			goto out;
 #endif
 		vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
 		error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
 #ifdef MAC
 out:
 #endif
 		vn_finished_write(mp);
 	}
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_dvp);
 	if (vp == nd.ni_dvp)
 		vrele(vp);
 	else
 		vput(vp);
 	if (error == ERELOOKUP)
 		goto restart;
 fdout:
 	if (fp != NULL)
 		fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Reposition read/write file offset.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lseek_args {
 	int	fd;
 	int	pad;
 	off_t	offset;
 	int	whence;
 };
 #endif
 int
 sys_lseek(struct thread *td, struct lseek_args *uap)
 {
 
 	return (kern_lseek(td, uap->fd, uap->offset, uap->whence));
 }
 
 int
 kern_lseek(struct thread *td, int fd, off_t offset, int whence)
 {
 	struct file *fp;
 	int error;
 
 	AUDIT_ARG_FD(fd);
 	error = fget(td, fd, &cap_seek_rights, &fp);
 	if (error != 0)
 		return (error);
 	error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ?
 	    fo_seek(fp, offset, whence, td) : ESPIPE;
 	fdrop(fp, td);
 	return (error);
 }
 
 #if defined(COMPAT_43)
 /*
  * Reposition read/write file offset.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct olseek_args {
 	int	fd;
 	long	offset;
 	int	whence;
 };
 #endif
 int
 olseek(struct thread *td, struct olseek_args *uap)
 {
 
 	return (kern_lseek(td, uap->fd, uap->offset, uap->whence));
 }
 #endif /* COMPAT_43 */
 
 #if defined(COMPAT_FREEBSD6)
 /* Version with the 'pad' argument */
 int
 freebsd6_lseek(struct thread *td, struct freebsd6_lseek_args *uap)
 {
 
 	return (kern_lseek(td, uap->fd, uap->offset, uap->whence));
 }
 #endif
 
 /*
  * Check access permissions using passed credentials.
  */
 static int
 vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
      struct thread *td)
 {
 	accmode_t accmode;
 	int error;
 
 	/* Flags == 0 means only check for existence. */
 	if (user_flags == 0)
 		return (0);
 
 	accmode = 0;
 	if (user_flags & R_OK)
 		accmode |= VREAD;
 	if (user_flags & W_OK)
 		accmode |= VWRITE;
 	if (user_flags & X_OK)
 		accmode |= VEXEC;
 #ifdef MAC
 	error = mac_vnode_check_access(cred, vp, accmode);
 	if (error != 0)
 		return (error);
 #endif
 	if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
 		error = VOP_ACCESS(vp, accmode, cred, td);
 	return (error);
 }
 
 /*
  * Check access permissions using "real" credentials.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct access_args {
 	char	*path;
 	int	amode;
 };
 #endif
 int
 sys_access(struct thread *td, struct access_args *uap)
 {
 
 	return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    0, uap->amode));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct faccessat_args {
 	int	dirfd;
 	char	*path;
 	int	amode;
 	int	flag;
 }
 #endif
 int
 sys_faccessat(struct thread *td, struct faccessat_args *uap)
 {
 
 	return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
 	    uap->amode));
 }
 
 int
 kern_accessat(struct thread *td, int fd, const char *path,
     enum uio_seg pathseg, int flag, int amode)
 {
 	struct ucred *cred, *usecred;
 	struct vnode *vp;
 	struct nameidata nd;
 	int error;
 
 	if ((flag & ~(AT_EACCESS | AT_RESOLVE_BENEATH)) != 0)
 		return (EINVAL);
 	if (amode != F_OK && (amode & ~(R_OK | W_OK | X_OK)) != 0)
 		return (EINVAL);
 
 	/*
 	 * Create and modify a temporary credential instead of one that
 	 * is potentially shared (if we need one).
 	 */
 	cred = td->td_ucred;
 	if ((flag & AT_EACCESS) == 0 &&
 	    ((cred->cr_uid != cred->cr_ruid ||
 	    cred->cr_rgid != cred->cr_groups[0]))) {
 		usecred = crdup(cred);
 		usecred->cr_uid = cred->cr_ruid;
 		usecred->cr_groups[0] = cred->cr_rgid;
 		td->td_ucred = usecred;
 	} else
 		usecred = cred;
 	AUDIT_ARG_VALUE(amode);
 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF |
 	    AUDITVNODE1 | at2cnpflags(flag, AT_RESOLVE_BENEATH),
 	    pathseg, path, fd, &cap_fstat_rights, td);
 	if ((error = namei(&nd)) != 0)
 		goto out;
 	vp = nd.ni_vp;
 
 	error = vn_access(vp, amode, usecred, td);
 	NDFREE_NOTHING(&nd);
 	vput(vp);
 out:
 	if (usecred != cred) {
 		td->td_ucred = cred;
 		crfree(usecred);
 	}
 	return (error);
 }
 
 /*
  * Check access permissions using "effective" credentials.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct eaccess_args {
 	char	*path;
 	int	amode;
 };
 #endif
 int
 sys_eaccess(struct thread *td, struct eaccess_args *uap)
 {
 
 	return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    AT_EACCESS, uap->amode));
 }
 
 #if defined(COMPAT_43)
 /*
  * Get file status; this version follows links.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ostat_args {
 	char	*path;
 	struct ostat *ub;
 };
 #endif
 int
 ostat(struct thread *td, struct ostat_args *uap)
 {
 	struct stat sb;
 	struct ostat osb;
 	int error;
 
 	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    &sb, NULL);
 	if (error != 0)
 		return (error);
 	cvtstat(&sb, &osb);
 	return (copyout(&osb, uap->ub, sizeof (osb)));
 }
 
 /*
  * Get file status; this version does not follow links.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct olstat_args {
 	char	*path;
 	struct ostat *ub;
 };
 #endif
 int
 olstat(struct thread *td, struct olstat_args *uap)
 {
 	struct stat sb;
 	struct ostat osb;
 	int error;
 
 	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
 	    UIO_USERSPACE, &sb, NULL);
 	if (error != 0)
 		return (error);
 	cvtstat(&sb, &osb);
 	return (copyout(&osb, uap->ub, sizeof (osb)));
 }
 
 /*
  * Convert from an old to a new stat structure.
  * XXX: many values are blindly truncated.
  */
 void
 cvtstat(struct stat *st, struct ostat *ost)
 {
 
 	bzero(ost, sizeof(*ost));
 	ost->st_dev = st->st_dev;
 	ost->st_ino = st->st_ino;
 	ost->st_mode = st->st_mode;
 	ost->st_nlink = st->st_nlink;
 	ost->st_uid = st->st_uid;
 	ost->st_gid = st->st_gid;
 	ost->st_rdev = st->st_rdev;
 	ost->st_size = MIN(st->st_size, INT32_MAX);
 	ost->st_atim = st->st_atim;
 	ost->st_mtim = st->st_mtim;
 	ost->st_ctim = st->st_ctim;
 	ost->st_blksize = st->st_blksize;
 	ost->st_blocks = st->st_blocks;
 	ost->st_flags = st->st_flags;
 	ost->st_gen = st->st_gen;
 }
 #endif /* COMPAT_43 */
 
 #if defined(COMPAT_43) || defined(COMPAT_FREEBSD11)
 int ino64_trunc_error;
 SYSCTL_INT(_vfs, OID_AUTO, ino64_trunc_error, CTLFLAG_RW,
     &ino64_trunc_error, 0,
     "Error on truncation of device, file or inode number, or link count");
 
 int
 freebsd11_cvtstat(struct stat *st, struct freebsd11_stat *ost)
 {
 
 	ost->st_dev = st->st_dev;
 	if (ost->st_dev != st->st_dev) {
 		switch (ino64_trunc_error) {
 		default:
 			/*
 			 * Since dev_t is almost raw, don't clamp to the
 			 * maximum for case 2, but ignore the error.
 			 */
 			break;
 		case 1:
 			return (EOVERFLOW);
 		}
 	}
 	ost->st_ino = st->st_ino;
 	if (ost->st_ino != st->st_ino) {
 		switch (ino64_trunc_error) {
 		default:
 		case 0:
 			break;
 		case 1:
 			return (EOVERFLOW);
 		case 2:
 			ost->st_ino = UINT32_MAX;
 			break;
 		}
 	}
 	ost->st_mode = st->st_mode;
 	ost->st_nlink = st->st_nlink;
 	if (ost->st_nlink != st->st_nlink) {
 		switch (ino64_trunc_error) {
 		default:
 		case 0:
 			break;
 		case 1:
 			return (EOVERFLOW);
 		case 2:
 			ost->st_nlink = UINT16_MAX;
 			break;
 		}
 	}
 	ost->st_uid = st->st_uid;
 	ost->st_gid = st->st_gid;
 	ost->st_rdev = st->st_rdev;
 	if (ost->st_rdev != st->st_rdev) {
 		switch (ino64_trunc_error) {
 		default:
 			break;
 		case 1:
 			return (EOVERFLOW);
 		}
 	}
 	ost->st_atim = st->st_atim;
 	ost->st_mtim = st->st_mtim;
 	ost->st_ctim = st->st_ctim;
 	ost->st_size = st->st_size;
 	ost->st_blocks = st->st_blocks;
 	ost->st_blksize = st->st_blksize;
 	ost->st_flags = st->st_flags;
 	ost->st_gen = st->st_gen;
 	ost->st_lspare = 0;
 	ost->st_birthtim = st->st_birthtim;
 	bzero((char *)&ost->st_birthtim + sizeof(ost->st_birthtim),
 	    sizeof(*ost) - offsetof(struct freebsd11_stat,
 	    st_birthtim) - sizeof(ost->st_birthtim));
 	return (0);
 }
 
 int
 freebsd11_stat(struct thread *td, struct freebsd11_stat_args* uap)
 {
 	struct stat sb;
 	struct freebsd11_stat osb;
 	int error;
 
 	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    &sb, NULL);
 	if (error != 0)
 		return (error);
 	error = freebsd11_cvtstat(&sb, &osb);
 	if (error == 0)
 		error = copyout(&osb, uap->ub, sizeof(osb));
 	return (error);
 }
 
 int
 freebsd11_lstat(struct thread *td, struct freebsd11_lstat_args* uap)
 {
 	struct stat sb;
 	struct freebsd11_stat osb;
 	int error;
 
 	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
 	    UIO_USERSPACE, &sb, NULL);
 	if (error != 0)
 		return (error);
 	error = freebsd11_cvtstat(&sb, &osb);
 	if (error == 0)
 		error = copyout(&osb, uap->ub, sizeof(osb));
 	return (error);
 }
 
 int
 freebsd11_fhstat(struct thread *td, struct freebsd11_fhstat_args* uap)
 {
 	struct fhandle fh;
 	struct stat sb;
 	struct freebsd11_stat osb;
 	int error;
 
 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
 	if (error != 0)
 		return (error);
 	error = kern_fhstat(td, fh, &sb);
 	if (error != 0)
 		return (error);
 	error = freebsd11_cvtstat(&sb, &osb);
 	if (error == 0)
 		error = copyout(&osb, uap->sb, sizeof(osb));
 	return (error);
 }
 
 int
 freebsd11_fstatat(struct thread *td, struct freebsd11_fstatat_args* uap)
 {
 	struct stat sb;
 	struct freebsd11_stat osb;
 	int error;
 
 	error = kern_statat(td, uap->flag, uap->fd, uap->path,
 	    UIO_USERSPACE, &sb, NULL);
 	if (error != 0)
 		return (error);
 	error = freebsd11_cvtstat(&sb, &osb);
 	if (error == 0)
 		error = copyout(&osb, uap->buf, sizeof(osb));
 	return (error);
 }
 #endif	/* COMPAT_FREEBSD11 */
 
 /*
  * Get file status
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fstatat_args {
 	int	fd;
 	char	*path;
 	struct stat	*buf;
 	int	flag;
 }
 #endif
 int
 sys_fstatat(struct thread *td, struct fstatat_args *uap)
 {
 	struct stat sb;
 	int error;
 
 	error = kern_statat(td, uap->flag, uap->fd, uap->path,
 	    UIO_USERSPACE, &sb, NULL);
 	if (error == 0)
 		error = copyout(&sb, uap->buf, sizeof (sb));
 	return (error);
 }
 
 int
 kern_statat(struct thread *td, int flag, int fd, const char *path,
     enum uio_seg pathseg, struct stat *sbp,
     void (*hook)(struct vnode *vp, struct stat *sbp))
 {
 	struct nameidata nd;
 	int error;
 
 	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH)) != 0)
 		return (EINVAL);
 
 	NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_RESOLVE_BENEATH |
 	    AT_SYMLINK_NOFOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
 	    pathseg, path, fd, &cap_fstat_rights, td);
 
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	error = VOP_STAT(nd.ni_vp, sbp, td->td_ucred, NOCRED, td);
 	if (error == 0) {
 		if (__predict_false(hook != NULL))
 			hook(nd.ni_vp, sbp);
 	}
 	NDFREE_NOTHING(&nd);
 	vput(nd.ni_vp);
 #ifdef __STAT_TIME_T_EXT
 	sbp->st_atim_ext = 0;
 	sbp->st_mtim_ext = 0;
 	sbp->st_ctim_ext = 0;
 	sbp->st_btim_ext = 0;
 #endif
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_STRUCT))
 		ktrstat_error(sbp, error);
 #endif
 	return (error);
 }
 
 #if defined(COMPAT_FREEBSD11)
 /*
  * Implementation of the NetBSD [l]stat() functions.
  */
 void
 freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb)
 {
 
 	bzero(nsb, sizeof(*nsb));
 	nsb->st_dev = sb->st_dev;
 	nsb->st_ino = sb->st_ino;
 	nsb->st_mode = sb->st_mode;
 	nsb->st_nlink = sb->st_nlink;
 	nsb->st_uid = sb->st_uid;
 	nsb->st_gid = sb->st_gid;
 	nsb->st_rdev = sb->st_rdev;
 	nsb->st_atim = sb->st_atim;
 	nsb->st_mtim = sb->st_mtim;
 	nsb->st_ctim = sb->st_ctim;
 	nsb->st_size = sb->st_size;
 	nsb->st_blocks = sb->st_blocks;
 	nsb->st_blksize = sb->st_blksize;
 	nsb->st_flags = sb->st_flags;
 	nsb->st_gen = sb->st_gen;
 	nsb->st_birthtim = sb->st_birthtim;
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd11_nstat_args {
 	char	*path;
 	struct nstat *ub;
 };
 #endif
 int
 freebsd11_nstat(struct thread *td, struct freebsd11_nstat_args *uap)
 {
 	struct stat sb;
 	struct nstat nsb;
 	int error;
 
 	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    &sb, NULL);
 	if (error != 0)
 		return (error);
 	freebsd11_cvtnstat(&sb, &nsb);
 	return (copyout(&nsb, uap->ub, sizeof (nsb)));
 }
 
 /*
  * NetBSD lstat.  Get file status; this version does not follow links.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd11_nlstat_args {
 	char	*path;
 	struct nstat *ub;
 };
 #endif
 int
 freebsd11_nlstat(struct thread *td, struct freebsd11_nlstat_args *uap)
 {
 	struct stat sb;
 	struct nstat nsb;
 	int error;
 
 	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
 	    UIO_USERSPACE, &sb, NULL);
 	if (error != 0)
 		return (error);
 	freebsd11_cvtnstat(&sb, &nsb);
 	return (copyout(&nsb, uap->ub, sizeof (nsb)));
 }
 #endif /* COMPAT_FREEBSD11 */
 
 /*
  * Get configurable pathname variables.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct pathconf_args {
 	char	*path;
 	int	name;
 };
 #endif
 int
 sys_pathconf(struct thread *td, struct pathconf_args *uap)
 {
 	long value;
 	int error;
 
 	error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW,
 	    &value);
 	if (error == 0)
 		td->td_retval[0] = value;
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct lpathconf_args {
 	char	*path;
 	int	name;
 };
 #endif
 int
 sys_lpathconf(struct thread *td, struct lpathconf_args *uap)
 {
 	long value;
 	int error;
 
 	error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name,
 	    NOFOLLOW, &value);
 	if (error == 0)
 		td->td_retval[0] = value;
 	return (error);
 }
 
 int
 kern_pathconf(struct thread *td, const char *path, enum uio_seg pathseg,
     int name, u_long flags, long *valuep)
 {
 	struct nameidata nd;
 	int error;
 
 	NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE_NOTHING(&nd);
 
 	error = VOP_PATHCONF(nd.ni_vp, name, valuep);
 	vput(nd.ni_vp);
 	return (error);
 }
 
 /*
  * Return target name of a symbolic link.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct readlink_args {
 	char	*path;
 	char	*buf;
 	size_t	count;
 };
 #endif
 int
 sys_readlink(struct thread *td, struct readlink_args *uap)
 {
 
 	return (kern_readlinkat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    uap->buf, UIO_USERSPACE, uap->count));
 }
 #ifndef _SYS_SYSPROTO_H_
 struct readlinkat_args {
 	int	fd;
 	char	*path;
 	char	*buf;
 	size_t	bufsize;
 };
 #endif
 int
 sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
 {
 
 	return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
 	    uap->buf, UIO_USERSPACE, uap->bufsize));
 }
 
 int
 kern_readlinkat(struct thread *td, int fd, const char *path,
     enum uio_seg pathseg, char *buf, enum uio_seg bufseg, size_t count)
 {
 	struct vnode *vp;
 	struct nameidata nd;
 	int error;
 
 	if (count > IOSIZE_MAX)
 		return (EINVAL);
 
 	NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
 	    pathseg, path, fd, td);
 
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE_NOTHING(&nd);
 	vp = nd.ni_vp;
 
 	error = kern_readlink_vp(vp, buf, bufseg, count, td);
 	vput(vp);
 
 	return (error);
 }
 
 /*
  * Helper function to readlink from a vnode
  */
 static int
 kern_readlink_vp(struct vnode *vp, char *buf, enum uio_seg bufseg, size_t count,
     struct thread *td)
 {
 	struct iovec aiov;
 	struct uio auio;
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "kern_readlink_vp(): vp not locked");
 #ifdef MAC
 	error = mac_vnode_check_readlink(td->td_ucred, vp);
 	if (error != 0)
 		return (error);
 #endif
 	if (vp->v_type != VLNK && (vp->v_vflag & VV_READLINK) == 0)
 		return (EINVAL);
 
 	aiov.iov_base = buf;
 	aiov.iov_len = count;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_offset = 0;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = bufseg;
 	auio.uio_td = td;
 	auio.uio_resid = count;
 	error = VOP_READLINK(vp, &auio, td->td_ucred);
 	td->td_retval[0] = count - auio.uio_resid;
 	return (error);
 }
 
 /*
  * Common implementation code for chflags() and fchflags().
  */
 static int
 setfflags(struct thread *td, struct vnode *vp, u_long flags)
 {
 	struct mount *mp;
 	struct vattr vattr;
 	int error;
 
 	/* We can't support the value matching VNOVAL. */
 	if (flags == VNOVAL)
 		return (EOPNOTSUPP);
 
 	/*
 	 * Prevent non-root users from setting flags on devices.  When
 	 * a device is reused, users can retain ownership of the device
 	 * if they are allowed to set flags and programs assume that
 	 * chown can't fail when done as root.
 	 */
 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
 		error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
 		if (error != 0)
 			return (error);
 	}
 
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		return (error);
 	VATTR_NULL(&vattr);
 	vattr.va_flags = flags;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 #ifdef MAC
 	error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
 	if (error == 0)
 #endif
 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
 	VOP_UNLOCK(vp);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Change flags of a file given a path name.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct chflags_args {
 	const char *path;
 	u_long	flags;
 };
 #endif
 int
 sys_chflags(struct thread *td, struct chflags_args *uap)
 {
 
 	return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    uap->flags, 0));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct chflagsat_args {
 	int	fd;
 	const char *path;
 	u_long	flags;
 	int	atflag;
 }
 #endif
 int
 sys_chflagsat(struct thread *td, struct chflagsat_args *uap)
 {
 
 	if ((uap->atflag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH)) != 0)
 		return (EINVAL);
 
 	return (kern_chflagsat(td, uap->fd, uap->path, UIO_USERSPACE,
 	    uap->flags, uap->atflag));
 }
 
 /*
  * Same as chflags() but doesn't follow symlinks.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lchflags_args {
 	const char *path;
 	u_long flags;
 };
 #endif
 int
 sys_lchflags(struct thread *td, struct lchflags_args *uap)
 {
 
 	return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    uap->flags, AT_SYMLINK_NOFOLLOW));
 }
 
 static int
 kern_chflagsat(struct thread *td, int fd, const char *path,
     enum uio_seg pathseg, u_long flags, int atflag)
 {
 	struct nameidata nd;
 	int error;
 
 	AUDIT_ARG_FFLAGS(flags);
 	NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(atflag, AT_SYMLINK_NOFOLLOW |
 	    AT_RESOLVE_BENEATH) | AUDITVNODE1, pathseg, path, fd,
 	    &cap_fchflags_rights, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE_NOTHING(&nd);
 	error = setfflags(td, nd.ni_vp, flags);
 	vrele(nd.ni_vp);
 	return (error);
 }
 
 /*
  * Change flags of a file given a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fchflags_args {
 	int	fd;
 	u_long	flags;
 };
 #endif
 int
 sys_fchflags(struct thread *td, struct fchflags_args *uap)
 {
 	struct file *fp;
 	int error;
 
 	AUDIT_ARG_FD(uap->fd);
 	AUDIT_ARG_FFLAGS(uap->flags);
 	error = getvnode(td, uap->fd, &cap_fchflags_rights,
 	    &fp);
 	if (error != 0)
 		return (error);
 #ifdef AUDIT
 	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(fp->f_vnode);
 	VOP_UNLOCK(fp->f_vnode);
 #endif
 	error = setfflags(td, fp->f_vnode, uap->flags);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Common implementation code for chmod(), lchmod() and fchmod().
  */
 int
 setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode)
 {
 	struct mount *mp;
 	struct vattr vattr;
 	int error;
 
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		return (error);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	VATTR_NULL(&vattr);
 	vattr.va_mode = mode & ALLPERMS;
 #ifdef MAC
 	error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
 	if (error == 0)
 #endif
 		error = VOP_SETATTR(vp, &vattr, cred);
 	VOP_UNLOCK(vp);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Change mode of a file given path name.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct chmod_args {
 	char	*path;
 	int	mode;
 };
 #endif
 int
 sys_chmod(struct thread *td, struct chmod_args *uap)
 {
 
 	return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    uap->mode, 0));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct fchmodat_args {
 	int	dirfd;
 	char	*path;
 	mode_t	mode;
 	int	flag;
 }
 #endif
 int
 sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
 {
 
 	if ((uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH)) != 0)
 		return (EINVAL);
 
 	return (kern_fchmodat(td, uap->fd, uap->path, UIO_USERSPACE,
 	    uap->mode, uap->flag));
 }
 
 /*
  * Change mode of a file given path name (don't follow links.)
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lchmod_args {
 	char	*path;
 	int	mode;
 };
 #endif
 int
 sys_lchmod(struct thread *td, struct lchmod_args *uap)
 {
 
 	return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    uap->mode, AT_SYMLINK_NOFOLLOW));
 }
 
 int
 kern_fchmodat(struct thread *td, int fd, const char *path,
     enum uio_seg pathseg, mode_t mode, int flag)
 {
 	struct nameidata nd;
 	int error;
 
 	AUDIT_ARG_MODE(mode);
 	NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW |
 	    AT_RESOLVE_BENEATH) | AUDITVNODE1, pathseg, path, fd,
 	    &cap_fchmod_rights, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE_NOTHING(&nd);
 	error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
 	vrele(nd.ni_vp);
 	return (error);
 }
 
 /*
  * Change mode of a file given a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fchmod_args {
 	int	fd;
 	int	mode;
 };
 #endif
 int
 sys_fchmod(struct thread *td, struct fchmod_args *uap)
 {
 	struct file *fp;
 	int error;
 
 	AUDIT_ARG_FD(uap->fd);
 	AUDIT_ARG_MODE(uap->mode);
 
 	error = fget(td, uap->fd, &cap_fchmod_rights, &fp);
 	if (error != 0)
 		return (error);
 	error = fo_chmod(fp, uap->mode, td->td_ucred, td);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Common implementation for chown(), lchown(), and fchown()
  */
 int
 setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid,
     gid_t gid)
 {
 	struct mount *mp;
 	struct vattr vattr;
 	int error;
 
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		return (error);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	VATTR_NULL(&vattr);
 	vattr.va_uid = uid;
 	vattr.va_gid = gid;
 #ifdef MAC
 	error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
 	    vattr.va_gid);
 	if (error == 0)
 #endif
 		error = VOP_SETATTR(vp, &vattr, cred);
 	VOP_UNLOCK(vp);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Set ownership given a path name.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct chown_args {
 	char	*path;
 	int	uid;
 	int	gid;
 };
 #endif
 int
 sys_chown(struct thread *td, struct chown_args *uap)
 {
 
 	return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->uid,
 	    uap->gid, 0));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct fchownat_args {
 	int fd;
 	const char * path;
 	uid_t uid;
 	gid_t gid;
 	int flag;
 };
 #endif
 int
 sys_fchownat(struct thread *td, struct fchownat_args *uap)
 {
 
 	if ((uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH)) != 0)
 		return (EINVAL);
 
 	return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
 	    uap->gid, uap->flag));
 }
 
 int
 kern_fchownat(struct thread *td, int fd, const char *path,
     enum uio_seg pathseg, int uid, int gid, int flag)
 {
 	struct nameidata nd;
 	int error;
 
 	AUDIT_ARG_OWNER(uid, gid);
 	NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW |
 	    AT_RESOLVE_BENEATH) | AUDITVNODE1, pathseg, path, fd,
 	    &cap_fchown_rights, td);
 
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE_NOTHING(&nd);
 	error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
 	vrele(nd.ni_vp);
 	return (error);
 }
 
 /*
  * Set ownership given a path name, do not cross symlinks.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lchown_args {
 	char	*path;
 	int	uid;
 	int	gid;
 };
 #endif
 int
 sys_lchown(struct thread *td, struct lchown_args *uap)
 {
 
 	return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    uap->uid, uap->gid, AT_SYMLINK_NOFOLLOW));
 }
 
 /*
  * Set ownership given a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fchown_args {
 	int	fd;
 	int	uid;
 	int	gid;
 };
 #endif
 int
 sys_fchown(struct thread *td, struct fchown_args *uap)
 {
 	struct file *fp;
 	int error;
 
 	AUDIT_ARG_FD(uap->fd);
 	AUDIT_ARG_OWNER(uap->uid, uap->gid);
 	error = fget(td, uap->fd, &cap_fchown_rights, &fp);
 	if (error != 0)
 		return (error);
 	error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Common implementation code for utimes(), lutimes(), and futimes().
  */
 static int
 getutimes(const struct timeval *usrtvp, enum uio_seg tvpseg,
     struct timespec *tsp)
 {
 	struct timeval tv[2];
 	const struct timeval *tvp;
 	int error;
 
 	if (usrtvp == NULL) {
 		vfs_timestamp(&tsp[0]);
 		tsp[1] = tsp[0];
 	} else {
 		if (tvpseg == UIO_SYSSPACE) {
 			tvp = usrtvp;
 		} else {
 			if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
 				return (error);
 			tvp = tv;
 		}
 
 		if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
 		    tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
 			return (EINVAL);
 		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
 		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
 	}
 	return (0);
 }
 
 /*
  * Common implementation code for futimens(), utimensat().
  */
 #define	UTIMENS_NULL	0x1
 #define	UTIMENS_EXIT	0x2
 static int
 getutimens(const struct timespec *usrtsp, enum uio_seg tspseg,
     struct timespec *tsp, int *retflags)
 {
 	struct timespec tsnow;
 	int error;
 
 	vfs_timestamp(&tsnow);
 	*retflags = 0;
 	if (usrtsp == NULL) {
 		tsp[0] = tsnow;
 		tsp[1] = tsnow;
 		*retflags |= UTIMENS_NULL;
 		return (0);
 	}
 	if (tspseg == UIO_SYSSPACE) {
 		tsp[0] = usrtsp[0];
 		tsp[1] = usrtsp[1];
 	} else if ((error = copyin(usrtsp, tsp, sizeof(*tsp) * 2)) != 0)
 		return (error);
 	if (tsp[0].tv_nsec == UTIME_OMIT && tsp[1].tv_nsec == UTIME_OMIT)
 		*retflags |= UTIMENS_EXIT;
 	if (tsp[0].tv_nsec == UTIME_NOW && tsp[1].tv_nsec == UTIME_NOW)
 		*retflags |= UTIMENS_NULL;
 	if (tsp[0].tv_nsec == UTIME_OMIT)
 		tsp[0].tv_sec = VNOVAL;
 	else if (tsp[0].tv_nsec == UTIME_NOW)
 		tsp[0] = tsnow;
 	else if (tsp[0].tv_nsec < 0 || tsp[0].tv_nsec >= 1000000000L)
 		return (EINVAL);
 	if (tsp[1].tv_nsec == UTIME_OMIT)
 		tsp[1].tv_sec = VNOVAL;
 	else if (tsp[1].tv_nsec == UTIME_NOW)
 		tsp[1] = tsnow;
 	else if (tsp[1].tv_nsec < 0 || tsp[1].tv_nsec >= 1000000000L)
 		return (EINVAL);
 
 	return (0);
 }
 
 /*
  * Common implementation code for utimes(), lutimes(), futimes(), futimens(),
  * and utimensat().
  */
 static int
 setutimes(struct thread *td, struct vnode *vp, const struct timespec *ts,
     int numtimes, int nullflag)
 {
 	struct mount *mp;
 	struct vattr vattr;
 	int error, setbirthtime;
 
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		return (error);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	setbirthtime = 0;
 	if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) &&
 	    timespeccmp(&ts[1], &vattr.va_birthtime, < ))
 		setbirthtime = 1;
 	VATTR_NULL(&vattr);
 	vattr.va_atime = ts[0];
 	vattr.va_mtime = ts[1];
 	if (setbirthtime)
 		vattr.va_birthtime = ts[1];
 	if (numtimes > 2)
 		vattr.va_birthtime = ts[2];
 	if (nullflag)
 		vattr.va_vaflags |= VA_UTIMES_NULL;
 #ifdef MAC
 	error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
 	    vattr.va_mtime);
 #endif
 	if (error == 0)
 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
 	VOP_UNLOCK(vp);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Set the access and modification times of a file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct utimes_args {
 	char	*path;
 	struct	timeval *tptr;
 };
 #endif
 int
 sys_utimes(struct thread *td, struct utimes_args *uap)
 {
 
 	return (kern_utimesat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    uap->tptr, UIO_USERSPACE));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct futimesat_args {
 	int fd;
 	const char * path;
 	const struct timeval * times;
 };
 #endif
 int
 sys_futimesat(struct thread *td, struct futimesat_args *uap)
 {
 
 	return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
 	    uap->times, UIO_USERSPACE));
 }
 
 int
 kern_utimesat(struct thread *td, int fd, const char *path,
     enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg)
 {
 	struct nameidata nd;
 	struct timespec ts[2];
 	int error;
 
 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
 		return (error);
 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
 	    &cap_futimes_rights, td);
 
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE_NOTHING(&nd);
 	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
 	vrele(nd.ni_vp);
 	return (error);
 }
 
 /*
  * Set the access and modification times of a file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lutimes_args {
 	char	*path;
 	struct	timeval *tptr;
 };
 #endif
 int
 sys_lutimes(struct thread *td, struct lutimes_args *uap)
 {
 
 	return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
 	    UIO_USERSPACE));
 }
 
 int
 kern_lutimes(struct thread *td, const char *path, enum uio_seg pathseg,
     struct timeval *tptr, enum uio_seg tptrseg)
 {
 	struct timespec ts[2];
 	struct nameidata nd;
 	int error;
 
 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
 		return (error);
 	NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE_NOTHING(&nd);
 	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
 	vrele(nd.ni_vp);
 	return (error);
 }
 
 /*
  * Set the access and modification times of a file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct futimes_args {
 	int	fd;
 	struct	timeval *tptr;
 };
 #endif
 int
 sys_futimes(struct thread *td, struct futimes_args *uap)
 {
 
 	return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
 }
 
 int
 kern_futimes(struct thread *td, int fd, struct timeval *tptr,
     enum uio_seg tptrseg)
 {
 	struct timespec ts[2];
 	struct file *fp;
 	int error;
 
 	AUDIT_ARG_FD(fd);
 	error = getutimes(tptr, tptrseg, ts);
 	if (error != 0)
 		return (error);
 	error = getvnode(td, fd, &cap_futimes_rights, &fp);
 	if (error != 0)
 		return (error);
 #ifdef AUDIT
 	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(fp->f_vnode);
 	VOP_UNLOCK(fp->f_vnode);
 #endif
 	error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 sys_futimens(struct thread *td, struct futimens_args *uap)
 {
 
 	return (kern_futimens(td, uap->fd, uap->times, UIO_USERSPACE));
 }
 
 int
 kern_futimens(struct thread *td, int fd, struct timespec *tptr,
     enum uio_seg tptrseg)
 {
 	struct timespec ts[2];
 	struct file *fp;
 	int error, flags;
 
 	AUDIT_ARG_FD(fd);
 	error = getutimens(tptr, tptrseg, ts, &flags);
 	if (error != 0)
 		return (error);
 	if (flags & UTIMENS_EXIT)
 		return (0);
 	error = getvnode(td, fd, &cap_futimes_rights, &fp);
 	if (error != 0)
 		return (error);
 #ifdef AUDIT
 	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(fp->f_vnode);
 	VOP_UNLOCK(fp->f_vnode);
 #endif
 	error = setutimes(td, fp->f_vnode, ts, 2, flags & UTIMENS_NULL);
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 sys_utimensat(struct thread *td, struct utimensat_args *uap)
 {
 
 	return (kern_utimensat(td, uap->fd, uap->path, UIO_USERSPACE,
 	    uap->times, UIO_USERSPACE, uap->flag));
 }
 
 int
 kern_utimensat(struct thread *td, int fd, const char *path,
     enum uio_seg pathseg, struct timespec *tptr, enum uio_seg tptrseg,
     int flag)
 {
 	struct nameidata nd;
 	struct timespec ts[2];
 	int error, flags;
 
 	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH)) != 0)
 		return (EINVAL);
 
 	if ((error = getutimens(tptr, tptrseg, ts, &flags)) != 0)
 		return (error);
 	NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW |
 	    AT_RESOLVE_BENEATH) | AUDITVNODE1,
 	    pathseg, path, fd, &cap_futimes_rights, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	/*
 	 * We are allowed to call namei() regardless of 2xUTIME_OMIT.
 	 * POSIX states:
 	 * "If both tv_nsec fields are UTIME_OMIT... EACCESS may be detected."
 	 * "Search permission is denied by a component of the path prefix."
 	 */
 	NDFREE_NOTHING(&nd);
 	if ((flags & UTIMENS_EXIT) == 0)
 		error = setutimes(td, nd.ni_vp, ts, 2, flags & UTIMENS_NULL);
 	vrele(nd.ni_vp);
 	return (error);
 }
 
 /*
  * Truncate a file given its path name.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct truncate_args {
 	char	*path;
 	int	pad;
 	off_t	length;
 };
 #endif
 int
 sys_truncate(struct thread *td, struct truncate_args *uap)
 {
 
 	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
 }
 
 int
 kern_truncate(struct thread *td, const char *path, enum uio_seg pathseg,
     off_t length)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	void *rl_cookie;
 	struct vattr vattr;
 	struct nameidata nd;
 	int error;
 
 	if (length < 0)
 		return (EINVAL);
 retry:
 	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vp = nd.ni_vp;
 	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
 		vn_rangelock_unlock(vp, rl_cookie);
 		vrele(vp);
 		return (error);
 	}
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if (vp->v_type == VDIR)
 		error = EISDIR;
 #ifdef MAC
 	else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
 	}
 #endif
 	else if ((error = vn_writechk(vp)) == 0 &&
 	    (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
 		VATTR_NULL(&vattr);
 		vattr.va_size = length;
 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
 	}
 	VOP_UNLOCK(vp);
 	vn_finished_write(mp);
 	vn_rangelock_unlock(vp, rl_cookie);
 	vrele(vp);
 	if (error == ERELOOKUP)
 		goto retry;
 	return (error);
 }
 
 #if defined(COMPAT_43)
 /*
  * Truncate a file given its path name.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct otruncate_args {
 	char	*path;
 	long	length;
 };
 #endif
 int
 otruncate(struct thread *td, struct otruncate_args *uap)
 {
 
 	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
 }
 #endif /* COMPAT_43 */
 
 #if defined(COMPAT_FREEBSD6)
 /* Versions with the pad argument */
 int
 freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
 {
 
 	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
 }
 
 int
 freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
 {
 
 	return (kern_ftruncate(td, uap->fd, uap->length));
 }
 #endif
 
 int
 kern_fsync(struct thread *td, int fd, bool fullsync)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	struct file *fp;
 	int error, lock_flags;
 
 	AUDIT_ARG_FD(fd);
 	error = getvnode(td, fd, &cap_fsync_rights, &fp);
 	if (error != 0)
 		return (error);
 	vp = fp->f_vnode;
 #if 0
 	if (!fullsync)
 		/* XXXKIB: compete outstanding aio writes */;
 #endif
 retry:
 	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 	if (error != 0)
 		goto drop;
 	if (MNT_SHARED_WRITES(mp) ||
 	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
 		lock_flags = LK_SHARED;
 	} else {
 		lock_flags = LK_EXCLUSIVE;
 	}
 	vn_lock(vp, lock_flags | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	if (vp->v_object != NULL) {
 		VM_OBJECT_WLOCK(vp->v_object);
 		vm_object_page_clean(vp->v_object, 0, 0, 0);
 		VM_OBJECT_WUNLOCK(vp->v_object);
 	}
 	error = fullsync ? VOP_FSYNC(vp, MNT_WAIT, td) : VOP_FDATASYNC(vp, td);
 	VOP_UNLOCK(vp);
 	vn_finished_write(mp);
 	if (error == ERELOOKUP)
 		goto retry;
 drop:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Sync an open file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fsync_args {
 	int	fd;
 };
 #endif
 int
 sys_fsync(struct thread *td, struct fsync_args *uap)
 {
 
 	return (kern_fsync(td, uap->fd, true));
 }
 
 int
 sys_fdatasync(struct thread *td, struct fdatasync_args *uap)
 {
 
 	return (kern_fsync(td, uap->fd, false));
 }
 
 /*
  * Rename files.  Source and destination must either both be directories, or
  * both not be directories.  If target is a directory, it must be empty.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct rename_args {
 	char	*from;
 	char	*to;
 };
 #endif
 int
 sys_rename(struct thread *td, struct rename_args *uap)
 {
 
 	return (kern_renameat(td, AT_FDCWD, uap->from, AT_FDCWD,
 	    uap->to, UIO_USERSPACE));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct renameat_args {
 	int	oldfd;
 	char	*old;
 	int	newfd;
 	char	*new;
 };
 #endif
 int
 sys_renameat(struct thread *td, struct renameat_args *uap)
 {
 
 	return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
 	    UIO_USERSPACE));
 }
 
 #ifdef MAC
 static int
 kern_renameat_mac(struct thread *td, int oldfd, const char *old, int newfd,
     const char *new, enum uio_seg pathseg, struct nameidata *fromnd)
 {
 	int error;
 
 	NDINIT_ATRIGHTS(fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
 	    AUDITVNODE1, pathseg, old, oldfd, &cap_renameat_source_rights, td);
 	if ((error = namei(fromnd)) != 0)
 		return (error);
 	error = mac_vnode_check_rename_from(td->td_ucred, fromnd->ni_dvp,
 	    fromnd->ni_vp, &fromnd->ni_cnd);
 	VOP_UNLOCK(fromnd->ni_dvp);
 	if (fromnd->ni_dvp != fromnd->ni_vp)
 		VOP_UNLOCK(fromnd->ni_vp);
 	if (error != 0) {
 		NDFREE(fromnd, NDF_ONLY_PNBUF);
 		vrele(fromnd->ni_dvp);
 		vrele(fromnd->ni_vp);
 		if (fromnd->ni_startdir)
 			vrele(fromnd->ni_startdir);
 	}
 	return (error);
 }
 #endif
 
 int
 kern_renameat(struct thread *td, int oldfd, const char *old, int newfd,
     const char *new, enum uio_seg pathseg)
 {
 	struct mount *mp = NULL;
 	struct vnode *tvp, *fvp, *tdvp;
 	struct nameidata fromnd, tond;
 	u_int64_t tondflags;
 	int error;
 
 again:
 	bwillwrite();
 #ifdef MAC
 	if (mac_vnode_check_rename_from_enabled()) {
 		error = kern_renameat_mac(td, oldfd, old, newfd, new, pathseg,
 		    &fromnd);
 		if (error != 0)
 			return (error);
 	} else {
 #endif
 	NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1,
 	    pathseg, old, oldfd, &cap_renameat_source_rights, td);
 	if ((error = namei(&fromnd)) != 0)
 		return (error);
 #ifdef MAC
 	}
 #endif
 	fvp = fromnd.ni_vp;
 	tondflags = LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNODE2;
 	if (fromnd.ni_vp->v_type == VDIR)
 		tondflags |= WILLBEDIR;
 	NDINIT_ATRIGHTS(&tond, RENAME, tondflags, pathseg, new, newfd,
 	    &cap_renameat_target_rights, td);
 	if ((error = namei(&tond)) != 0) {
 		/* Translate error code for rename("dir1", "dir2/."). */
 		if (error == EISDIR && fvp->v_type == VDIR)
 			error = EINVAL;
 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
 		vrele(fromnd.ni_dvp);
 		vrele(fvp);
 		goto out1;
 	}
 	tdvp = tond.ni_dvp;
 	tvp = tond.ni_vp;
 	error = vn_start_write(fvp, &mp, V_NOWAIT);
 	if (error != 0) {
 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
 		NDFREE(&tond, NDF_ONLY_PNBUF);
 		if (tvp != NULL)
 			vput(tvp);
 		if (tdvp == tvp)
 			vrele(tdvp);
 		else
 			vput(tdvp);
 		vrele(fromnd.ni_dvp);
 		vrele(fvp);
 		vrele(tond.ni_startdir);
 		if (fromnd.ni_startdir != NULL)
 			vrele(fromnd.ni_startdir);
 		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
 		if (error != 0)
 			return (error);
 		goto again;
 	}
 	if (tvp != NULL) {
 		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
 			error = ENOTDIR;
 			goto out;
 		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
 			error = EISDIR;
 			goto out;
 		}
 #ifdef CAPABILITIES
 		if (newfd != AT_FDCWD && (tond.ni_resflags & NIRES_ABS) == 0) {
 			/*
 			 * If the target already exists we require CAP_UNLINKAT
 			 * from 'newfd', when newfd was used for the lookup.
 			 */
 			error = cap_check(&tond.ni_filecaps.fc_rights,
 			    &cap_unlinkat_rights);
 			if (error != 0)
 				goto out;
 		}
 #endif
 	}
 	if (fvp == tdvp) {
 		error = EINVAL;
 		goto out;
 	}
 	/*
 	 * If the source is the same as the destination (that is, if they
 	 * are links to the same vnode), then there is nothing to do.
 	 */
 	if (fvp == tvp)
 		error = ERESTART;
 #ifdef MAC
 	else
 		error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
 		    tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
 #endif
 out:
 	if (error == 0) {
 		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
 		    tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
 		NDFREE(&tond, NDF_ONLY_PNBUF);
 	} else {
 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
 		NDFREE(&tond, NDF_ONLY_PNBUF);
 		if (tvp != NULL)
 			vput(tvp);
 		if (tdvp == tvp)
 			vrele(tdvp);
 		else
 			vput(tdvp);
 		vrele(fromnd.ni_dvp);
 		vrele(fvp);
 	}
 	vrele(tond.ni_startdir);
 	vn_finished_write(mp);
 out1:
 	if (fromnd.ni_startdir)
 		vrele(fromnd.ni_startdir);
 	if (error == ERESTART)
 		return (0);
 	if (error == ERELOOKUP)
 		goto again;
 	return (error);
 }
 
 /*
  * Make a directory file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct mkdir_args {
 	char	*path;
 	int	mode;
 };
 #endif
 int
 sys_mkdir(struct thread *td, struct mkdir_args *uap)
 {
 
 	return (kern_mkdirat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    uap->mode));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct mkdirat_args {
 	int	fd;
 	char	*path;
 	mode_t	mode;
 };
 #endif
 int
 sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
 {
 
 	return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
 }
 
 int
 kern_mkdirat(struct thread *td, int fd, const char *path, enum uio_seg segflg,
     int mode)
 {
 	struct mount *mp;
 	struct vattr vattr;
 	struct nameidata nd;
 	int error;
 
 	AUDIT_ARG_MODE(mode);
 restart:
 	bwillwrite();
 	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
 	    NC_NOMAKEENTRY | NC_KEEPPOSENTRY | FAILIFEXISTS | WILLBEDIR,
 	    segflg, path, fd, &cap_mkdirat_rights, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 	VATTR_NULL(&vattr);
 	vattr.va_type = VDIR;
 	vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_pd->pd_cmask;
 #ifdef MAC
 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
 	    &vattr);
 	if (error != 0)
 		goto out;
 #endif
 	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
 #ifdef MAC
 out:
 #endif
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	VOP_VPUT_PAIR(nd.ni_dvp, error == 0 ? &nd.ni_vp : NULL, true);
 	vn_finished_write(mp);
 	if (error == ERELOOKUP)
 		goto restart;
 	return (error);
 }
 
 /*
  * Remove a directory file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct rmdir_args {
 	char	*path;
 };
 #endif
 int
 sys_rmdir(struct thread *td, struct rmdir_args *uap)
 {
 
 	return (kern_frmdirat(td, AT_FDCWD, uap->path, FD_NONE, UIO_USERSPACE,
 	    0));
 }
 
 int
 kern_frmdirat(struct thread *td, int dfd, const char *path, int fd,
     enum uio_seg pathseg, int flag)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	struct file *fp;
 	struct nameidata nd;
 	cap_rights_t rights;
 	int error;
 
 	fp = NULL;
 	if (fd != FD_NONE) {
-		error = getvnode(td, fd, cap_rights_init_one(&rights, CAP_LOOKUP),
-		    &fp);
+		error = getvnode(td, fd, cap_rights_init_one(&rights,
+		    CAP_LOOKUP), &fp);
 		if (error != 0)
 			return (error);
 	}
 
 restart:
 	bwillwrite();
 	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1 |
 	    at2cnpflags(flag, AT_RESOLVE_BENEATH),
 	    pathseg, path, dfd, &cap_unlinkat_rights, td);
 	if ((error = namei(&nd)) != 0)
 		goto fdout;
 	vp = nd.ni_vp;
 	if (vp->v_type != VDIR) {
 		error = ENOTDIR;
 		goto out;
 	}
 	/*
 	 * No rmdir "." please.
 	 */
 	if (nd.ni_dvp == vp) {
 		error = EINVAL;
 		goto out;
 	}
 	/*
 	 * The root of a mounted filesystem cannot be deleted.
 	 */
 	if (vp->v_vflag & VV_ROOT) {
 		error = EBUSY;
 		goto out;
 	}
 
 	if (fp != NULL && fp->f_vnode != vp) {
 		if (VN_IS_DOOMED(fp->f_vnode))
 			error = EBADF;
 		else
 			error = EDEADLK;
 		goto out;
 	}
 
 #ifdef MAC
 	error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
 	    &nd.ni_cnd);
 	if (error != 0)
 		goto out;
 #endif
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(vp);
 		if (nd.ni_dvp == vp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			goto fdout;
 		goto restart;
 	}
 	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
 	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
 	vn_finished_write(mp);
 out:
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(vp);
 	if (nd.ni_dvp == vp)
 		vrele(nd.ni_dvp);
 	else
 		vput(nd.ni_dvp);
 	if (error == ERELOOKUP)
 		goto restart;
 fdout:
 	if (fp != NULL)
 		fdrop(fp, td);
 	return (error);
 }
 
 #if defined(COMPAT_43) || defined(COMPAT_FREEBSD11)
 int
 freebsd11_kern_getdirentries(struct thread *td, int fd, char *ubuf, u_int count,
     long *basep, void (*func)(struct freebsd11_dirent *))
 {
 	struct freebsd11_dirent dstdp;
 	struct dirent *dp, *edp;
 	char *dirbuf;
 	off_t base;
 	ssize_t resid, ucount;
 	int error;
 
 	/* XXX arbitrary sanity limit on `count'. */
 	count = min(count, 64 * 1024);
 
 	dirbuf = malloc(count, M_TEMP, M_WAITOK);
 
 	error = kern_getdirentries(td, fd, dirbuf, count, &base, &resid,
 	    UIO_SYSSPACE);
 	if (error != 0)
 		goto done;
 	if (basep != NULL)
 		*basep = base;
 
 	ucount = 0;
 	for (dp = (struct dirent *)dirbuf,
 	    edp = (struct dirent *)&dirbuf[count - resid];
 	    ucount < count && dp < edp; ) {
 		if (dp->d_reclen == 0)
 			break;
 		MPASS(dp->d_reclen >= _GENERIC_DIRLEN(0));
 		if (dp->d_namlen >= sizeof(dstdp.d_name))
 			continue;
 		dstdp.d_type = dp->d_type;
 		dstdp.d_namlen = dp->d_namlen;
 		dstdp.d_fileno = dp->d_fileno;		/* truncate */
 		if (dstdp.d_fileno != dp->d_fileno) {
 			switch (ino64_trunc_error) {
 			default:
 			case 0:
 				break;
 			case 1:
 				error = EOVERFLOW;
 				goto done;
 			case 2:
 				dstdp.d_fileno = UINT32_MAX;
 				break;
 			}
 		}
 		dstdp.d_reclen = sizeof(dstdp) - sizeof(dstdp.d_name) +
 		    ((dp->d_namlen + 1 + 3) &~ 3);
 		bcopy(dp->d_name, dstdp.d_name, dstdp.d_namlen);
 		bzero(dstdp.d_name + dstdp.d_namlen,
 		    dstdp.d_reclen - offsetof(struct freebsd11_dirent, d_name) -
 		    dstdp.d_namlen);
 		MPASS(dstdp.d_reclen <= dp->d_reclen);
 		MPASS(ucount + dstdp.d_reclen <= count);
 		if (func != NULL)
 			func(&dstdp);
 		error = copyout(&dstdp, ubuf + ucount, dstdp.d_reclen);
 		if (error != 0)
 			break;
 		dp = (struct dirent *)((char *)dp + dp->d_reclen);
 		ucount += dstdp.d_reclen;
 	}
 
 done:
 	free(dirbuf, M_TEMP);
 	if (error == 0)
 		td->td_retval[0] = ucount;
 	return (error);
 }
 #endif /* COMPAT */
 
 #ifdef COMPAT_43
 static void
 ogetdirentries_cvt(struct freebsd11_dirent *dp)
 {
 #if (BYTE_ORDER == LITTLE_ENDIAN)
 	/*
 	 * The expected low byte of dp->d_namlen is our dp->d_type.
 	 * The high MBZ byte of dp->d_namlen is our dp->d_namlen.
 	 */
 	dp->d_type = dp->d_namlen;
 	dp->d_namlen = 0;
 #else
 	/*
 	 * The dp->d_type is the high byte of the expected dp->d_namlen,
 	 * so must be zero'ed.
 	 */
 	dp->d_type = 0;
 #endif
 }
 
 /*
  * Read a block of directory entries in a filesystem independent format.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ogetdirentries_args {
 	int	fd;
 	char	*buf;
 	u_int	count;
 	long	*basep;
 };
 #endif
 int
 ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
 {
 	long loff;
 	int error;
 
 	error = kern_ogetdirentries(td, uap, &loff);
 	if (error == 0)
 		error = copyout(&loff, uap->basep, sizeof(long));
 	return (error);
 }
 
 int
 kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
     long *ploff)
 {
 	long base;
 	int error;
 
 	/* XXX arbitrary sanity limit on `count'. */
 	if (uap->count > 64 * 1024)
 		return (EINVAL);
 
 	error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count,
 	    &base, ogetdirentries_cvt);
 
 	if (error == 0 && uap->basep != NULL)
 		error = copyout(&base, uap->basep, sizeof(long));
 
 	return (error);
 }
 #endif /* COMPAT_43 */
 
 #if defined(COMPAT_FREEBSD11)
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd11_getdirentries_args {
 	int	fd;
 	char	*buf;
 	u_int	count;
 	long	*basep;
 };
 #endif
 int
 freebsd11_getdirentries(struct thread *td,
     struct freebsd11_getdirentries_args *uap)
 {
 	long base;
 	int error;
 
 	error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count,
 	    &base, NULL);
 
 	if (error == 0 && uap->basep != NULL)
 		error = copyout(&base, uap->basep, sizeof(long));
 	return (error);
 }
 
 int
 freebsd11_getdents(struct thread *td, struct freebsd11_getdents_args *uap)
 {
 	struct freebsd11_getdirentries_args ap;
 
 	ap.fd = uap->fd;
 	ap.buf = uap->buf;
 	ap.count = uap->count;
 	ap.basep = NULL;
 	return (freebsd11_getdirentries(td, &ap));
 }
 #endif /* COMPAT_FREEBSD11 */
 
 /*
  * Read a block of directory entries in a filesystem independent format.
  */
 int
 sys_getdirentries(struct thread *td, struct getdirentries_args *uap)
 {
 	off_t base;
 	int error;
 
 	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
 	    NULL, UIO_USERSPACE);
 	if (error != 0)
 		return (error);
 	if (uap->basep != NULL)
 		error = copyout(&base, uap->basep, sizeof(off_t));
 	return (error);
 }
 
 int
 kern_getdirentries(struct thread *td, int fd, char *buf, size_t count,
     off_t *basep, ssize_t *residp, enum uio_seg bufseg)
 {
 	struct vnode *vp;
 	struct file *fp;
 	struct uio auio;
 	struct iovec aiov;
 	off_t loff;
 	int error, eofflag;
 	off_t foffset;
 
 	AUDIT_ARG_FD(fd);
 	if (count > IOSIZE_MAX)
 		return (EINVAL);
 	auio.uio_resid = count;
 	error = getvnode(td, fd, &cap_read_rights, &fp);
 	if (error != 0)
 		return (error);
 	if ((fp->f_flag & FREAD) == 0) {
 		fdrop(fp, td);
 		return (EBADF);
 	}
 	vp = fp->f_vnode;
 	foffset = foffset_lock(fp, 0);
 unionread:
 	if (vp->v_type != VDIR) {
 		error = EINVAL;
 		goto fail;
 	}
 	aiov.iov_base = buf;
 	aiov.iov_len = count;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = bufseg;
 	auio.uio_td = td;
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	loff = auio.uio_offset = foffset;
 #ifdef MAC
 	error = mac_vnode_check_readdir(td->td_ucred, vp);
 	if (error == 0)
 #endif
 		error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
 		    NULL);
 	foffset = auio.uio_offset;
 	if (error != 0) {
 		VOP_UNLOCK(vp);
 		goto fail;
 	}
 	if (count == auio.uio_resid &&
 	    (vp->v_vflag & VV_ROOT) &&
 	    (vp->v_mount->mnt_flag & MNT_UNION)) {
 		struct vnode *tvp = vp;
 
 		vp = vp->v_mount->mnt_vnodecovered;
 		VREF(vp);
 		fp->f_vnode = vp;
 		foffset = 0;
 		vput(tvp);
 		goto unionread;
 	}
 	VOP_UNLOCK(vp);
 	*basep = loff;
 	if (residp != NULL)
 		*residp = auio.uio_resid;
 	td->td_retval[0] = count - auio.uio_resid;
 fail:
 	foffset_unlock(fp, foffset, 0);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Set the mode mask for creation of filesystem nodes.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct umask_args {
 	int	newmask;
 };
 #endif
 int
 sys_umask(struct thread *td, struct umask_args *uap)
 {
 	struct pwddesc *pdp;
 
 	pdp = td->td_proc->p_pd;
 	PWDDESC_XLOCK(pdp);
 	td->td_retval[0] = pdp->pd_cmask;
 	pdp->pd_cmask = uap->newmask & ALLPERMS;
 	PWDDESC_XUNLOCK(pdp);
 	return (0);
 }
 
 /*
  * Void all references to file by ripping underlying filesystem away from
  * vnode.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct revoke_args {
 	char	*path;
 };
 #endif
 int
 sys_revoke(struct thread *td, struct revoke_args *uap)
 {
 	struct vnode *vp;
 	struct vattr vattr;
 	struct nameidata nd;
 	int error;
 
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
 	    uap->path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vp = nd.ni_vp;
 	NDFREE_NOTHING(&nd);
 	if (vp->v_type != VCHR || vp->v_rdev == NULL) {
 		error = EINVAL;
 		goto out;
 	}
 #ifdef MAC
 	error = mac_vnode_check_revoke(td->td_ucred, vp);
 	if (error != 0)
 		goto out;
 #endif
 	error = VOP_GETATTR(vp, &vattr, td->td_ucred);
 	if (error != 0)
 		goto out;
 	if (td->td_ucred->cr_uid != vattr.va_uid) {
 		error = priv_check(td, PRIV_VFS_ADMIN);
 		if (error != 0)
 			goto out;
 	}
 	if (devfs_usecount(vp) > 0)
 		VOP_REVOKE(vp, REVOKEALL);
 out:
 	vput(vp);
 	return (error);
 }
 
 /*
  * Convert a user file descriptor to a kernel file entry and check that, if it
  * is a capability, the correct rights are present. A reference on the file
  * entry is held upon returning.
  */
 int
 getvnode(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 {
 	struct file *fp;
 	int error;
 
 	error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, &fp);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * The file could be not of the vnode type, or it may be not
 	 * yet fully initialized, in which case the f_vnode pointer
 	 * may be set, but f_ops is still badfileops.  E.g.,
 	 * devfs_open() transiently create such situation to
 	 * facilitate csw d_fdopen().
 	 *
 	 * Dupfdopen() handling in kern_openat() installs the
 	 * half-baked file into the process descriptor table, allowing
 	 * other thread to dereference it. Guard against the race by
 	 * checking f_ops.
 	 */
 	if (fp->f_vnode == NULL || fp->f_ops == &badfileops) {
 		fdrop(fp, td);
 		return (EINVAL);
 	}
 	*fpp = fp;
 	return (0);
 }
 
 /*
  * Get an (NFS) file handle.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lgetfh_args {
 	char *fname;
 	fhandle_t *fhp;
 };
 #endif
 int
 sys_lgetfh(struct thread *td, struct lgetfh_args *uap)
 {
 
 	return (kern_getfhat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->fname,
 	    UIO_USERSPACE, uap->fhp, UIO_USERSPACE));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct getfh_args {
 	char *fname;
 	fhandle_t *fhp;
 };
 #endif
 int
 sys_getfh(struct thread *td, struct getfh_args *uap)
 {
 
 	return (kern_getfhat(td, 0, AT_FDCWD, uap->fname, UIO_USERSPACE,
 	    uap->fhp, UIO_USERSPACE));
 }
 
 /*
  * syscall for the rpc.lockd to use to translate an open descriptor into
  * a NFS file handle.
  *
  * warning: do not remove the priv_check() call or this becomes one giant
  * security hole.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getfhat_args {
 	int fd;
 	char *path;
 	fhandle_t *fhp;
 	int flags;
 };
 #endif
 int
 sys_getfhat(struct thread *td, struct getfhat_args *uap)
 {
 
 	if ((uap->flags & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH)) != 0)
 		return (EINVAL);
 	return (kern_getfhat(td, uap->flags, uap->fd, uap->path, UIO_USERSPACE,
 	    uap->fhp, UIO_USERSPACE));
 }
 
 int
 kern_getfhat(struct thread *td, int flags, int fd, const char *path,
     enum uio_seg pathseg, fhandle_t *fhp, enum uio_seg fhseg)
 {
 	struct nameidata nd;
 	fhandle_t fh;
 	struct vnode *vp;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_GETFH);
 	if (error != 0)
 		return (error);
 	NDINIT_AT(&nd, LOOKUP, at2cnpflags(flags, AT_SYMLINK_NOFOLLOW |
 	    AT_RESOLVE_BENEATH) | LOCKLEAF | AUDITVNODE1, pathseg, path,
 	    fd, td);
 	error = namei(&nd);
 	if (error != 0)
 		return (error);
 	NDFREE_NOTHING(&nd);
 	vp = nd.ni_vp;
 	bzero(&fh, sizeof(fh));
 	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
 	error = VOP_VPTOFH(vp, &fh.fh_fid);
 	vput(vp);
 	if (error == 0) {
 		if (fhseg == UIO_USERSPACE)
 			error = copyout(&fh, fhp, sizeof (fh));
 		else
 			memcpy(fhp, &fh, sizeof(fh));
 	}
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct fhlink_args {
 	fhandle_t *fhp;
 	const char *to;
 };
 #endif
 int
 sys_fhlink(struct thread *td, struct fhlink_args *uap)
 {
 
 	return (kern_fhlinkat(td, AT_FDCWD, uap->to, UIO_USERSPACE, uap->fhp));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct fhlinkat_args {
 	fhandle_t *fhp;
 	int tofd;
 	const char *to;
 };
 #endif
 int
 sys_fhlinkat(struct thread *td, struct fhlinkat_args *uap)
 {
 
 	return (kern_fhlinkat(td, uap->tofd, uap->to, UIO_USERSPACE, uap->fhp));
 }
 
 static int
 kern_fhlinkat(struct thread *td, int fd, const char *path,
     enum uio_seg pathseg, fhandle_t *fhp)
 {
 	fhandle_t fh;
 	struct mount *mp;
 	struct vnode *vp;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_GETFH);
 	if (error != 0)
 		return (error);
 	error = copyin(fhp, &fh, sizeof(fh));
 	if (error != 0)
 		return (error);
 	do {
 		bwillwrite();
 		if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
 			return (ESTALE);
 		error = VFS_FHTOVP(mp, &fh.fh_fid, LK_SHARED, &vp);
 		vfs_unbusy(mp);
 		if (error != 0)
 			return (error);
 		VOP_UNLOCK(vp);
 		error = kern_linkat_vp(td, vp, fd, path, pathseg);
 	} while (error == EAGAIN || error == ERELOOKUP);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct fhreadlink_args {
 	fhandle_t *fhp;
 	char *buf;
 	size_t bufsize;
 };
 #endif
 int
 sys_fhreadlink(struct thread *td, struct fhreadlink_args *uap)
 {
 	fhandle_t fh;
 	struct mount *mp;
 	struct vnode *vp;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_GETFH);
 	if (error != 0)
 		return (error);
 	if (uap->bufsize > IOSIZE_MAX)
 		return (EINVAL);
 	error = copyin(uap->fhp, &fh, sizeof(fh));
 	if (error != 0)
 		return (error);
 	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
 		return (ESTALE);
 	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_SHARED, &vp);
 	vfs_unbusy(mp);
 	if (error != 0)
 		return (error);
 	error = kern_readlink_vp(vp, uap->buf, UIO_USERSPACE, uap->bufsize, td);
 	vput(vp);
 	return (error);
 }
 
 /*
  * syscall for the rpc.lockd to use to translate a NFS file handle into an
  * open descriptor.
  *
  * warning: do not remove the priv_check() call or this becomes one giant
  * security hole.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fhopen_args {
 	const struct fhandle *u_fhp;
 	int flags;
 };
 #endif
 int
 sys_fhopen(struct thread *td, struct fhopen_args *uap)
 {
 	return (kern_fhopen(td, uap->u_fhp, uap->flags));
 }
 
 int
 kern_fhopen(struct thread *td, const struct fhandle *u_fhp, int flags)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	struct fhandle fhp;
 	struct file *fp;
 	int fmode, error;
 	int indx;
 
 	error = priv_check(td, PRIV_VFS_FHOPEN);
 	if (error != 0)
 		return (error);
 	indx = -1;
 	fmode = FFLAGS(flags);
 	/* why not allow a non-read/write open for our lockd? */
 	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
 		return (EINVAL);
 	error = copyin(u_fhp, &fhp, sizeof(fhp));
 	if (error != 0)
 		return(error);
 	/* find the mount point */
 	mp = vfs_busyfs(&fhp.fh_fsid);
 	if (mp == NULL)
 		return (ESTALE);
 	/* now give me my vnode, it gets returned to me locked */
 	error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
 	vfs_unbusy(mp);
 	if (error != 0)
 		return (error);
 
 	error = falloc_noinstall(td, &fp);
 	if (error != 0) {
 		vput(vp);
 		return (error);
 	}
 	/*
 	 * An extra reference on `fp' has been held for us by
 	 * falloc_noinstall().
 	 */
 
 #ifdef INVARIANTS
 	td->td_dupfd = -1;
 #endif
 	error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp);
 	if (error != 0) {
 		KASSERT(fp->f_ops == &badfileops,
 		    ("VOP_OPEN in fhopen() set f_ops"));
 		KASSERT(td->td_dupfd < 0,
 		    ("fhopen() encountered fdopen()"));
 
 		vput(vp);
 		goto bad;
 	}
 #ifdef INVARIANTS
 	td->td_dupfd = 0;
 #endif
 	fp->f_vnode = vp;
 	finit_vnode(fp, fmode, NULL, &vnops);
 	VOP_UNLOCK(vp);
 	if ((fmode & O_TRUNC) != 0) {
 		error = fo_truncate(fp, 0, td->td_ucred, td);
 		if (error != 0)
 			goto bad;
 	}
 
 	error = finstall(td, fp, &indx, fmode, NULL);
 bad:
 	fdrop(fp, td);
 	td->td_retval[0] = indx;
 	return (error);
 }
 
 /*
  * Stat an (NFS) file handle.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fhstat_args {
 	struct fhandle *u_fhp;
 	struct stat *sb;
 };
 #endif
 int
 sys_fhstat(struct thread *td, struct fhstat_args *uap)
 {
 	struct stat sb;
 	struct fhandle fh;
 	int error;
 
 	error = copyin(uap->u_fhp, &fh, sizeof(fh));
 	if (error != 0)
 		return (error);
 	error = kern_fhstat(td, fh, &sb);
 	if (error == 0)
 		error = copyout(&sb, uap->sb, sizeof(sb));
 	return (error);
 }
 
 int
 kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_FHSTAT);
 	if (error != 0)
 		return (error);
 	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
 		return (ESTALE);
 	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
 	vfs_unbusy(mp);
 	if (error != 0)
 		return (error);
 	error = VOP_STAT(vp, sb, td->td_ucred, NOCRED, td);
 	vput(vp);
 	return (error);
 }
 
 /*
  * Implement fstatfs() for (NFS) file handles.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fhstatfs_args {
 	struct fhandle *u_fhp;
 	struct statfs *buf;
 };
 #endif
 int
 sys_fhstatfs(struct thread *td, struct fhstatfs_args *uap)
 {
 	struct statfs *sfp;
 	fhandle_t fh;
 	int error;
 
 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
 	if (error != 0)
 		return (error);
 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = kern_fhstatfs(td, fh, sfp);
 	if (error == 0)
 		error = copyout(sfp, uap->buf, sizeof(*sfp));
 	free(sfp, M_STATFS);
 	return (error);
 }
 
 int
 kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_FHSTATFS);
 	if (error != 0)
 		return (error);
 	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
 		return (ESTALE);
 	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
 	if (error != 0) {
 		vfs_unbusy(mp);
 		return (error);
 	}
 	vput(vp);
 	error = prison_canseemount(td->td_ucred, mp);
 	if (error != 0)
 		goto out;
 #ifdef MAC
 	error = mac_mount_check_stat(td->td_ucred, mp);
 	if (error != 0)
 		goto out;
 #endif
 	error = VFS_STATFS(mp, buf);
 out:
 	vfs_unbusy(mp);
 	return (error);
 }
 
 /*
  * Unlike madvise(2), we do not make a best effort to remember every
  * possible caching hint.  Instead, we remember the last setting with
  * the exception that we will allow POSIX_FADV_NORMAL to adjust the
  * region of any current setting.
  */
 int
 kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
     int advice)
 {
 	struct fadvise_info *fa, *new;
 	struct file *fp;
 	struct vnode *vp;
 	off_t end;
 	int error;
 
 	if (offset < 0 || len < 0 || offset > OFF_MAX - len)
 		return (EINVAL);
 	AUDIT_ARG_VALUE(advice);
 	switch (advice) {
 	case POSIX_FADV_SEQUENTIAL:
 	case POSIX_FADV_RANDOM:
 	case POSIX_FADV_NOREUSE:
 		new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
 		break;
 	case POSIX_FADV_NORMAL:
 	case POSIX_FADV_WILLNEED:
 	case POSIX_FADV_DONTNEED:
 		new = NULL;
 		break;
 	default:
 		return (EINVAL);
 	}
 	/* XXX: CAP_POSIX_FADVISE? */
 	AUDIT_ARG_FD(fd);
 	error = fget(td, fd, &cap_no_rights, &fp);
 	if (error != 0)
 		goto out;
 	AUDIT_ARG_FILE(td->td_proc, fp);
 	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
 		error = ESPIPE;
 		goto out;
 	}
 	if (fp->f_type != DTYPE_VNODE) {
 		error = ENODEV;
 		goto out;
 	}
 	vp = fp->f_vnode;
 	if (vp->v_type != VREG) {
 		error = ENODEV;
 		goto out;
 	}
 	if (len == 0)
 		end = OFF_MAX;
 	else
 		end = offset + len - 1;
 	switch (advice) {
 	case POSIX_FADV_SEQUENTIAL:
 	case POSIX_FADV_RANDOM:
 	case POSIX_FADV_NOREUSE:
 		/*
 		 * Try to merge any existing non-standard region with
 		 * this new region if possible, otherwise create a new
 		 * non-standard region for this request.
 		 */
 		mtx_pool_lock(mtxpool_sleep, fp);
 		fa = fp->f_advice;
 		if (fa != NULL && fa->fa_advice == advice &&
 		    ((fa->fa_start <= end && fa->fa_end >= offset) ||
 		    (end != OFF_MAX && fa->fa_start == end + 1) ||
 		    (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) {
 			if (offset < fa->fa_start)
 				fa->fa_start = offset;
 			if (end > fa->fa_end)
 				fa->fa_end = end;
 		} else {
 			new->fa_advice = advice;
 			new->fa_start = offset;
 			new->fa_end = end;
 			fp->f_advice = new;
 			new = fa;
 		}
 		mtx_pool_unlock(mtxpool_sleep, fp);
 		break;
 	case POSIX_FADV_NORMAL:
 		/*
 		 * If a the "normal" region overlaps with an existing
 		 * non-standard region, trim or remove the
 		 * non-standard region.
 		 */
 		mtx_pool_lock(mtxpool_sleep, fp);
 		fa = fp->f_advice;
 		if (fa != NULL) {
 			if (offset <= fa->fa_start && end >= fa->fa_end) {
 				new = fa;
 				fp->f_advice = NULL;
 			} else if (offset <= fa->fa_start &&
 			    end >= fa->fa_start)
 				fa->fa_start = end + 1;
 			else if (offset <= fa->fa_end && end >= fa->fa_end)
 				fa->fa_end = offset - 1;
 			else if (offset >= fa->fa_start && end <= fa->fa_end) {
 				/*
 				 * If the "normal" region is a middle
 				 * portion of the existing
 				 * non-standard region, just remove
 				 * the whole thing rather than picking
 				 * one side or the other to
 				 * preserve.
 				 */
 				new = fa;
 				fp->f_advice = NULL;
 			}
 		}
 		mtx_pool_unlock(mtxpool_sleep, fp);
 		break;
 	case POSIX_FADV_WILLNEED:
 	case POSIX_FADV_DONTNEED:
 		error = VOP_ADVISE(vp, offset, end, advice);
 		break;
 	}
 out:
 	if (fp != NULL)
 		fdrop(fp, td);
 	free(new, M_FADVISE);
 	return (error);
 }
 
 int
 sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
 {
 	int error;
 
 	error = kern_posix_fadvise(td, uap->fd, uap->offset, uap->len,
 	    uap->advice);
 	return (kern_posix_error(td, error));
 }
 
 int
 kern_copy_file_range(struct thread *td, int infd, off_t *inoffp, int outfd,
     off_t *outoffp, size_t len, unsigned int flags)
 {
 	struct file *infp, *outfp;
 	struct vnode *invp, *outvp;
 	int error;
 	size_t retlen;
 	void *rl_rcookie, *rl_wcookie;
 	off_t savinoff, savoutoff;
 
 	infp = outfp = NULL;
 	rl_rcookie = rl_wcookie = NULL;
 	savinoff = -1;
 	error = 0;
 	retlen = 0;
 
 	if (flags != 0) {
 		error = EINVAL;
 		goto out;
 	}
 	if (len > SSIZE_MAX)
 		/*
 		 * Although the len argument is size_t, the return argument
 		 * is ssize_t (which is signed).  Therefore a size that won't
 		 * fit in ssize_t can't be returned.
 		 */
 		len = SSIZE_MAX;
 
 	/* Get the file structures for the file descriptors. */
 	error = fget_read(td, infd, &cap_read_rights, &infp);
 	if (error != 0)
 		goto out;
 	if (infp->f_ops == &badfileops) {
 		error = EBADF;
 		goto out;
 	}
 	if (infp->f_vnode == NULL) {
 		error = EINVAL;
 		goto out;
 	}
 	error = fget_write(td, outfd, &cap_write_rights, &outfp);
 	if (error != 0)
 		goto out;
 	if (outfp->f_ops == &badfileops) {
 		error = EBADF;
 		goto out;
 	}
 	if (outfp->f_vnode == NULL) {
 		error = EINVAL;
 		goto out;
 	}
 
 	/* Set the offset pointers to the correct place. */
 	if (inoffp == NULL)
 		inoffp = &infp->f_offset;
 	if (outoffp == NULL)
 		outoffp = &outfp->f_offset;
 	savinoff = *inoffp;
 	savoutoff = *outoffp;
 
 	invp = infp->f_vnode;
 	outvp = outfp->f_vnode;
 	/* Sanity check the f_flag bits. */
 	if ((outfp->f_flag & (FWRITE | FAPPEND)) != FWRITE ||
 	    (infp->f_flag & FREAD) == 0) {
 		error = EBADF;
 		goto out;
 	}
 
 	/* If len == 0, just return 0. */
 	if (len == 0)
 		goto out;
 
 	/*
 	 * If infp and outfp refer to the same file, the byte ranges cannot
 	 * overlap.
 	 */
 	if (invp == outvp && ((savinoff <= savoutoff && savinoff + len >
 	    savoutoff) || (savinoff > savoutoff && savoutoff + len >
 	    savinoff))) {
 		error = EINVAL;
 		goto out;
 	}
 
 	/* Range lock the byte ranges for both invp and outvp. */
 	for (;;) {
 		rl_wcookie = vn_rangelock_wlock(outvp, *outoffp, *outoffp +
 		    len);
 		rl_rcookie = vn_rangelock_tryrlock(invp, *inoffp, *inoffp +
 		    len);
 		if (rl_rcookie != NULL)
 			break;
 		vn_rangelock_unlock(outvp, rl_wcookie);
 		rl_rcookie = vn_rangelock_rlock(invp, *inoffp, *inoffp + len);
 		vn_rangelock_unlock(invp, rl_rcookie);
 	}
 
 	retlen = len;
 	error = vn_copy_file_range(invp, inoffp, outvp, outoffp, &retlen,
 	    flags, infp->f_cred, outfp->f_cred, td);
 out:
 	if (rl_rcookie != NULL)
 		vn_rangelock_unlock(invp, rl_rcookie);
 	if (rl_wcookie != NULL)
 		vn_rangelock_unlock(outvp, rl_wcookie);
 	if (savinoff != -1 && (error == EINTR || error == ERESTART)) {
 		*inoffp = savinoff;
 		*outoffp = savoutoff;
 	}
 	if (outfp != NULL)
 		fdrop(outfp, td);
 	if (infp != NULL)
 		fdrop(infp, td);
 	td->td_retval[0] = retlen;
 	return (error);
 }
 
 int
 sys_copy_file_range(struct thread *td, struct copy_file_range_args *uap)
 {
 	off_t inoff, outoff, *inoffp, *outoffp;
 	int error;
 
 	inoffp = outoffp = NULL;
 	if (uap->inoffp != NULL) {
 		error = copyin(uap->inoffp, &inoff, sizeof(off_t));
 		if (error != 0)
 			return (error);
 		inoffp = &inoff;
 	}
 	if (uap->outoffp != NULL) {
 		error = copyin(uap->outoffp, &outoff, sizeof(off_t));
 		if (error != 0)
 			return (error);
 		outoffp = &outoff;
 	}
 	error = kern_copy_file_range(td, uap->infd, inoffp, uap->outfd,
 	    outoffp, uap->len, uap->flags);
 	if (error == 0 && uap->inoffp != NULL)
 		error = copyout(inoffp, uap->inoffp, sizeof(off_t));
 	if (error == 0 && uap->outoffp != NULL)
 		error = copyout(outoffp, uap->outoffp, sizeof(off_t));
 	return (error);
 }
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index f0914c569828..44cbf1a7893b 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -1,3451 +1,3451 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
  * Copyright (c) 2013, 2014 The FreeBSD Foundation
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_hwpmc_hooks.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/disk.h>
 #include <sys/fail.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/kdb.h>
 #include <sys/ktr.h>
 #include <sys/stat.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/vnode.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/filio.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/prng.h>
 #include <sys/sx.h>
 #include <sys/sleepqueue.h>
 #include <sys/sysctl.h>
 #include <sys/ttycom.h>
 #include <sys/conf.h>
 #include <sys/syslog.h>
 #include <sys/unistd.h>
 #include <sys/user.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 
 static fo_rdwr_t	vn_read;
 static fo_rdwr_t	vn_write;
 static fo_rdwr_t	vn_io_fault;
 static fo_truncate_t	vn_truncate;
 static fo_ioctl_t	vn_ioctl;
 static fo_poll_t	vn_poll;
 static fo_kqfilter_t	vn_kqfilter;
 static fo_stat_t	vn_statfile;
 static fo_close_t	vn_closefile;
 static fo_mmap_t	vn_mmap;
 static fo_fallocate_t	vn_fallocate;
 
 struct 	fileops vnops = {
 	.fo_read = vn_io_fault,
 	.fo_write = vn_io_fault,
 	.fo_truncate = vn_truncate,
 	.fo_ioctl = vn_ioctl,
 	.fo_poll = vn_poll,
 	.fo_kqfilter = vn_kqfilter,
 	.fo_stat = vn_statfile,
 	.fo_close = vn_closefile,
 	.fo_chmod = vn_chmod,
 	.fo_chown = vn_chown,
 	.fo_sendfile = vn_sendfile,
 	.fo_seek = vn_seek,
 	.fo_fill_kinfo = vn_fill_kinfo,
 	.fo_mmap = vn_mmap,
 	.fo_fallocate = vn_fallocate,
 	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
 };
 
 const u_int io_hold_cnt = 16;
 static int vn_io_fault_enable = 1;
 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RWTUN,
     &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
 static int vn_io_fault_prefault = 0;
 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RWTUN,
     &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting");
 static int vn_io_pgcache_read_enable = 1;
 SYSCTL_INT(_debug, OID_AUTO, vn_io_pgcache_read_enable, CTLFLAG_RWTUN,
     &vn_io_pgcache_read_enable, 0,
     "Enable copying from page cache for reads, avoiding fs");
 static u_long vn_io_faults_cnt;
 SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
     &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
 
 static int vfs_allow_read_dir = 0;
 SYSCTL_INT(_security_bsd, OID_AUTO, allow_read_dir, CTLFLAG_RW,
     &vfs_allow_read_dir, 0,
     "Enable read(2) of directory by root for filesystems that support it");
 
 /*
  * Returns true if vn_io_fault mode of handling the i/o request should
  * be used.
  */
 static bool
 do_vn_io_fault(struct vnode *vp, struct uio *uio)
 {
 	struct mount *mp;
 
 	return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG &&
 	    (mp = vp->v_mount) != NULL &&
 	    (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable);
 }
 
 /*
  * Structure used to pass arguments to vn_io_fault1(), to do either
  * file- or vnode-based I/O calls.
  */
 struct vn_io_fault_args {
 	enum {
 		VN_IO_FAULT_FOP,
 		VN_IO_FAULT_VOP
 	} kind;
 	struct ucred *cred;
 	int flags;
 	union {
 		struct fop_args_tag {
 			struct file *fp;
 			fo_rdwr_t *doio;
 		} fop_args;
 		struct vop_args_tag {
 			struct vnode *vp;
 		} vop_args;
 	} args;
 };
 
 static int vn_io_fault1(struct vnode *vp, struct uio *uio,
     struct vn_io_fault_args *args, struct thread *td);
 
 int
 vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp)
 {
 	struct thread *td = ndp->ni_cnd.cn_thread;
 
 	return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
 }
 
 static uint64_t
 open2nameif(int fmode, u_int vn_open_flags)
 {
 	uint64_t res;
 
 	res = ISOPEN | LOCKLEAF;
 	if ((fmode & O_RESOLVE_BENEATH) != 0)
 		res |= RBENEATH;
 	if ((vn_open_flags & VN_OPEN_NOAUDIT) == 0)
 		res |= AUDITVNODE1;
 	if ((vn_open_flags & VN_OPEN_NOCAPCHECK) != 0)
 		res |= NOCAPCHECK;
 	return (res);
 }
 
 /*
  * Common code for vnode open operations via a name lookup.
  * Lookup the vnode and invoke VOP_CREATE if needed.
  * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
  *
  * Note that this does NOT free nameidata for the successful case,
  * due to the NDINIT being done elsewhere.
  */
 int
 vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
     struct ucred *cred, struct file *fp)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	struct thread *td = ndp->ni_cnd.cn_thread;
 	struct vattr vat;
 	struct vattr *vap = &vat;
 	int fmode, error;
 	bool first_open;
 
 restart:
 	first_open = false;
 	fmode = *flagp;
 	if ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) == (O_CREAT |
 	    O_EXCL | O_DIRECTORY))
 		return (EINVAL);
 	else if ((fmode & (O_CREAT | O_DIRECTORY)) == O_CREAT) {
 		ndp->ni_cnd.cn_nameiop = CREATE;
 		ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags);
 		/*
 		 * Set NOCACHE to avoid flushing the cache when
 		 * rolling in many files at once.
 		 *
 		 * Set NC_KEEPPOSENTRY to keep positive entries if they already
 		 * exist despite NOCACHE.
 		 */
 		ndp->ni_cnd.cn_flags |= LOCKPARENT | NOCACHE | NC_KEEPPOSENTRY;
 		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
 			ndp->ni_cnd.cn_flags |= FOLLOW;
 		if ((vn_open_flags & VN_OPEN_INVFS) == 0)
 			bwillwrite();
 		if ((error = namei(ndp)) != 0)
 			return (error);
 		if (ndp->ni_vp == NULL) {
 			VATTR_NULL(vap);
 			vap->va_type = VREG;
 			vap->va_mode = cmode;
 			if (fmode & O_EXCL)
 				vap->va_vaflags |= VA_EXCLUSIVE;
 			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
 				NDFREE(ndp, NDF_ONLY_PNBUF);
 				vput(ndp->ni_dvp);
 				if ((error = vn_start_write(NULL, &mp,
 				    V_XSLEEP | PCATCH)) != 0)
 					return (error);
 				NDREINIT(ndp);
 				goto restart;
 			}
 			if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0)
 				ndp->ni_cnd.cn_flags |= MAKEENTRY;
 #ifdef MAC
 			error = mac_vnode_check_create(cred, ndp->ni_dvp,
 			    &ndp->ni_cnd, vap);
 			if (error == 0)
 #endif
 				error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
 				    &ndp->ni_cnd, vap);
 			vp = ndp->ni_vp;
 			if (error == 0 && (fmode & O_EXCL) != 0 &&
 			    (fmode & (O_EXLOCK | O_SHLOCK)) != 0) {
 				VI_LOCK(vp);
 				vp->v_iflag |= VI_FOPENING;
 				VI_UNLOCK(vp);
 				first_open = true;
 			}
 			VOP_VPUT_PAIR(ndp->ni_dvp, error == 0 ? &vp : NULL,
 			    false);
 			vn_finished_write(mp);
 			if (error) {
 				NDFREE(ndp, NDF_ONLY_PNBUF);
 				if (error == ERELOOKUP) {
 					NDREINIT(ndp);
 					goto restart;
 				}
 				return (error);
 			}
 			fmode &= ~O_TRUNC;
 		} else {
 			if (ndp->ni_dvp == ndp->ni_vp)
 				vrele(ndp->ni_dvp);
 			else
 				vput(ndp->ni_dvp);
 			ndp->ni_dvp = NULL;
 			vp = ndp->ni_vp;
 			if (fmode & O_EXCL) {
 				error = EEXIST;
 				goto bad;
 			}
 			if (vp->v_type == VDIR) {
 				error = EISDIR;
 				goto bad;
 			}
 			fmode &= ~O_CREAT;
 		}
 	} else {
 		ndp->ni_cnd.cn_nameiop = LOOKUP;
 		ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags);
 		ndp->ni_cnd.cn_flags |= (fmode & O_NOFOLLOW) != 0 ? NOFOLLOW :
 		    FOLLOW;
 		if ((fmode & FWRITE) == 0)
 			ndp->ni_cnd.cn_flags |= LOCKSHARED;
 		if ((error = namei(ndp)) != 0)
 			return (error);
 		vp = ndp->ni_vp;
 	}
 	error = vn_open_vnode(vp, fmode, cred, td, fp);
 	if (first_open) {
 		VI_LOCK(vp);
 		vp->v_iflag &= ~VI_FOPENING;
 		wakeup(vp);
 		VI_UNLOCK(vp);
 	}
 	if (error)
 		goto bad;
 	*flagp = fmode;
 	return (0);
 bad:
 	NDFREE(ndp, NDF_ONLY_PNBUF);
 	vput(vp);
 	*flagp = fmode;
 	ndp->ni_vp = NULL;
 	return (error);
 }
 
 static int
 vn_open_vnode_advlock(struct vnode *vp, int fmode, struct file *fp)
 {
 	struct flock lf;
 	int error, lock_flags, type;
 
 	ASSERT_VOP_LOCKED(vp, "vn_open_vnode_advlock");
 	if ((fmode & (O_EXLOCK | O_SHLOCK)) == 0)
 		return (0);
 	KASSERT(fp != NULL, ("open with flock requires fp"));
 	if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE)
 		return (EOPNOTSUPP);
 
 	lock_flags = VOP_ISLOCKED(vp);
 	VOP_UNLOCK(vp);
 
 	lf.l_whence = SEEK_SET;
 	lf.l_start = 0;
 	lf.l_len = 0;
 	lf.l_type = (fmode & O_EXLOCK) != 0 ? F_WRLCK : F_RDLCK;
 	type = F_FLOCK;
 	if ((fmode & FNONBLOCK) == 0)
 		type |= F_WAIT;
 	if ((fmode & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
 		type |= F_FIRSTOPEN;
 	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
 	if (error == 0)
 		fp->f_flag |= FHASLOCK;
 
 	vn_lock(vp, lock_flags | LK_RETRY);
 	return (error);
 }
 
 /*
  * Common code for vnode open operations once a vnode is located.
  * Check permissions, and call the VOP_OPEN routine.
  */
 int
 vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
     struct thread *td, struct file *fp)
 {
 	accmode_t accmode;
 	int error;
 
 	if (vp->v_type == VLNK)
 		return (EMLINK);
 	if (vp->v_type == VSOCK)
 		return (EOPNOTSUPP);
 	if (vp->v_type != VDIR && fmode & O_DIRECTORY)
 		return (ENOTDIR);
 	accmode = 0;
 	if (fmode & (FWRITE | O_TRUNC)) {
 		if (vp->v_type == VDIR)
 			return (EISDIR);
 		accmode |= VWRITE;
 	}
 	if (fmode & FREAD)
 		accmode |= VREAD;
 	if (fmode & FEXEC)
 		accmode |= VEXEC;
 	if ((fmode & O_APPEND) && (fmode & FWRITE))
 		accmode |= VAPPEND;
 #ifdef MAC
 	if (fmode & O_CREAT)
 		accmode |= VCREAT;
 	if (fmode & O_VERIFY)
 		accmode |= VVERIFY;
 	error = mac_vnode_check_open(cred, vp, accmode);
 	if (error)
 		return (error);
 
 	accmode &= ~(VCREAT | VVERIFY);
 #endif
 	if ((fmode & O_CREAT) == 0 && accmode != 0) {
 		error = VOP_ACCESS(vp, accmode, cred, td);
 		if (error != 0)
 			return (error);
 	}
 	if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
 		vn_lock(vp, LK_UPGRADE | LK_RETRY);
 	error = VOP_OPEN(vp, fmode, cred, td, fp);
 	if (error != 0)
 		return (error);
 
 	error = vn_open_vnode_advlock(vp, fmode, fp);
 	if (error == 0 && (fmode & FWRITE) != 0) {
 		error = VOP_ADD_WRITECOUNT(vp, 1);
 		if (error == 0) {
 			CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
 			     __func__, vp, vp->v_writecount);
 		}
 	}
 
 	/*
 	 * Error from advlock or VOP_ADD_WRITECOUNT() still requires
 	 * calling VOP_CLOSE() to pair with earlier VOP_OPEN().
 	 * Arrange for that by having fdrop() to use vn_closefile().
 	 */
 	if (error != 0) {
 		fp->f_flag |= FOPENFAILED;
 		fp->f_vnode = vp;
 		if (fp->f_ops == &badfileops) {
 			fp->f_type = DTYPE_VNODE;
 			fp->f_ops = &vnops;
 		}
 		vref(vp);
 	}
 
 	ASSERT_VOP_LOCKED(vp, "vn_open_vnode");
 	return (error);
 
 }
 
 /*
  * Check for write permissions on the specified vnode.
  * Prototype text segments cannot be written.
  * It is racy.
  */
 int
 vn_writechk(struct vnode *vp)
 {
 
 	ASSERT_VOP_LOCKED(vp, "vn_writechk");
 	/*
 	 * If there's shared text associated with
 	 * the vnode, try to free it up once.  If
 	 * we fail, we can't allow writing.
 	 */
 	if (VOP_IS_TEXT(vp))
 		return (ETXTBSY);
 
 	return (0);
 }
 
 /*
  * Vnode close call
  */
 static int
 vn_close1(struct vnode *vp, int flags, struct ucred *file_cred,
     struct thread *td, bool keep_ref)
 {
 	struct mount *mp;
 	int error, lock_flags;
 
 	if (vp->v_type != VFIFO && (flags & FWRITE) == 0 &&
 	    MNT_EXTENDED_SHARED(vp->v_mount))
 		lock_flags = LK_SHARED;
 	else
 		lock_flags = LK_EXCLUSIVE;
 
 	vn_start_write(vp, &mp, V_WAIT);
 	vn_lock(vp, lock_flags | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	if ((flags & (FWRITE | FOPENFAILED)) == FWRITE) {
 		VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
 		CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
 		    __func__, vp, vp->v_writecount);
 	}
 	error = VOP_CLOSE(vp, flags, file_cred, td);
 	if (keep_ref)
 		VOP_UNLOCK(vp);
 	else
 		vput(vp);
 	vn_finished_write(mp);
 	return (error);
 }
 
 int
 vn_close(struct vnode *vp, int flags, struct ucred *file_cred,
     struct thread *td)
 {
 
 	return (vn_close1(vp, flags, file_cred, td, false));
 }
 
 /*
  * Heuristic to detect sequential operation.
  */
 static int
 sequential_heuristic(struct uio *uio, struct file *fp)
 {
 	enum uio_rw rw;
 
 	ASSERT_VOP_LOCKED(fp->f_vnode, __func__);
 
 	rw = uio->uio_rw;
 	if (fp->f_flag & FRDAHEAD)
 		return (fp->f_seqcount[rw] << IO_SEQSHIFT);
 
 	/*
 	 * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
 	 * that the first I/O is normally considered to be slightly
 	 * sequential.  Seeking to offset 0 doesn't change sequentiality
 	 * unless previous seeks have reduced f_seqcount to 0, in which
 	 * case offset 0 is not special.
 	 */
 	if ((uio->uio_offset == 0 && fp->f_seqcount[rw] > 0) ||
 	    uio->uio_offset == fp->f_nextoff[rw]) {
 		/*
 		 * f_seqcount is in units of fixed-size blocks so that it
 		 * depends mainly on the amount of sequential I/O and not
 		 * much on the number of sequential I/O's.  The fixed size
 		 * of 16384 is hard-coded here since it is (not quite) just
 		 * a magic size that works well here.  This size is more
 		 * closely related to the best I/O size for real disks than
 		 * to any block size used by software.
 		 */
 		if (uio->uio_resid >= IO_SEQMAX * 16384)
 			fp->f_seqcount[rw] = IO_SEQMAX;
 		else {
 			fp->f_seqcount[rw] += howmany(uio->uio_resid, 16384);
 			if (fp->f_seqcount[rw] > IO_SEQMAX)
 				fp->f_seqcount[rw] = IO_SEQMAX;
 		}
 		return (fp->f_seqcount[rw] << IO_SEQSHIFT);
 	}
 
 	/* Not sequential.  Quickly draw-down sequentiality. */
 	if (fp->f_seqcount[rw] > 1)
 		fp->f_seqcount[rw] = 1;
 	else
 		fp->f_seqcount[rw] = 0;
 	return (0);
 }
 
 /*
  * Package up an I/O request on a vnode into a uio and do it.
  */
 int
 vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
     enum uio_seg segflg, int ioflg, struct ucred *active_cred,
     struct ucred *file_cred, ssize_t *aresid, struct thread *td)
 {
 	struct uio auio;
 	struct iovec aiov;
 	struct mount *mp;
 	struct ucred *cred;
 	void *rl_cookie;
 	struct vn_io_fault_args args;
 	int error, lock_flags;
 
 	if (offset < 0 && vp->v_type != VCHR)
 		return (EINVAL);
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	aiov.iov_base = base;
 	aiov.iov_len = len;
 	auio.uio_resid = len;
 	auio.uio_offset = offset;
 	auio.uio_segflg = segflg;
 	auio.uio_rw = rw;
 	auio.uio_td = td;
 	error = 0;
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		if ((ioflg & IO_RANGELOCKED) == 0) {
 			if (rw == UIO_READ) {
 				rl_cookie = vn_rangelock_rlock(vp, offset,
 				    offset + len);
 			} else if ((ioflg & IO_APPEND) != 0) {
 				rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 			} else {
 				rl_cookie = vn_rangelock_wlock(vp, offset,
 				    offset + len);
 			}
 		} else
 			rl_cookie = NULL;
 		mp = NULL;
 		if (rw == UIO_WRITE) { 
 			if (vp->v_type != VCHR &&
 			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
 			    != 0)
 				goto out;
 			if (MNT_SHARED_WRITES(mp) ||
 			    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount)))
 				lock_flags = LK_SHARED;
 			else
 				lock_flags = LK_EXCLUSIVE;
 		} else
 			lock_flags = LK_SHARED;
 		vn_lock(vp, lock_flags | LK_RETRY);
 	} else
 		rl_cookie = NULL;
 
 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 #ifdef MAC
 	if ((ioflg & IO_NOMACCHECK) == 0) {
 		if (rw == UIO_READ)
 			error = mac_vnode_check_read(active_cred, file_cred,
 			    vp);
 		else
 			error = mac_vnode_check_write(active_cred, file_cred,
 			    vp);
 	}
 #endif
 	if (error == 0) {
 		if (file_cred != NULL)
 			cred = file_cred;
 		else
 			cred = active_cred;
 		if (do_vn_io_fault(vp, &auio)) {
 			args.kind = VN_IO_FAULT_VOP;
 			args.cred = cred;
 			args.flags = ioflg;
 			args.args.vop_args.vp = vp;
 			error = vn_io_fault1(vp, &auio, &args, td);
 		} else if (rw == UIO_READ) {
 			error = VOP_READ(vp, &auio, ioflg, cred);
 		} else /* if (rw == UIO_WRITE) */ {
 			error = VOP_WRITE(vp, &auio, ioflg, cred);
 		}
 	}
 	if (aresid)
 		*aresid = auio.uio_resid;
 	else
 		if (auio.uio_resid && error == 0)
 			error = EIO;
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		VOP_UNLOCK(vp);
 		if (mp != NULL)
 			vn_finished_write(mp);
 	}
  out:
 	if (rl_cookie != NULL)
 		vn_rangelock_unlock(vp, rl_cookie);
 	return (error);
 }
 
 /*
  * Package up an I/O request on a vnode into a uio and do it.  The I/O
  * request is split up into smaller chunks and we try to avoid saturating
  * the buffer cache while potentially holding a vnode locked, so we 
  * check bwillwrite() before calling vn_rdwr().  We also call kern_yield()
  * to give other processes a chance to lock the vnode (either other processes
  * core'ing the same binary, or unrelated processes scanning the directory).
  */
 int
 vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len,
     off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred,
     struct ucred *file_cred, size_t *aresid, struct thread *td)
 {
 	int error = 0;
 	ssize_t iaresid;
 
 	do {
 		int chunk;
 
 		/*
 		 * Force `offset' to a multiple of MAXBSIZE except possibly
 		 * for the first chunk, so that filesystems only need to
 		 * write full blocks except possibly for the first and last
 		 * chunks.
 		 */
 		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
 
 		if (chunk > len)
 			chunk = len;
 		if (rw != UIO_READ && vp->v_type == VREG)
 			bwillwrite();
 		iaresid = 0;
 		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
 		    ioflg, active_cred, file_cred, &iaresid, td);
 		len -= chunk;	/* aresid calc already includes length */
 		if (error)
 			break;
 		offset += chunk;
 		base = (char *)base + chunk;
 		kern_yield(PRI_USER);
 	} while (len);
 	if (aresid)
 		*aresid = len + iaresid;
 	return (error);
 }
 
 #if OFF_MAX <= LONG_MAX
 off_t
 foffset_lock(struct file *fp, int flags)
 {
 	volatile short *flagsp;
 	off_t res;
 	short state;
 
 	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 
 	if ((flags & FOF_NOLOCK) != 0)
 		return (atomic_load_long(&fp->f_offset));
 
 	/*
 	 * According to McKusick the vn lock was protecting f_offset here.
 	 * It is now protected by the FOFFSET_LOCKED flag.
 	 */
 	flagsp = &fp->f_vnread_flags;
 	if (atomic_cmpset_acq_16(flagsp, 0, FOFFSET_LOCKED))
 		return (atomic_load_long(&fp->f_offset));
 
 	sleepq_lock(&fp->f_vnread_flags);
 	state = atomic_load_16(flagsp);
 	for (;;) {
 		if ((state & FOFFSET_LOCKED) == 0) {
 			if (!atomic_fcmpset_acq_16(flagsp, &state,
 			    FOFFSET_LOCKED))
 				continue;
 			break;
 		}
 		if ((state & FOFFSET_LOCK_WAITING) == 0) {
 			if (!atomic_fcmpset_acq_16(flagsp, &state,
 			    state | FOFFSET_LOCK_WAITING))
 				continue;
 		}
 		DROP_GIANT();
 		sleepq_add(&fp->f_vnread_flags, NULL, "vofflock", 0, 0);
 		sleepq_wait(&fp->f_vnread_flags, PUSER -1);
 		PICKUP_GIANT();
 		sleepq_lock(&fp->f_vnread_flags);
 		state = atomic_load_16(flagsp);
 	}
 	res = atomic_load_long(&fp->f_offset);
 	sleepq_release(&fp->f_vnread_flags);
 	return (res);
 }
 
 void
 foffset_unlock(struct file *fp, off_t val, int flags)
 {
 	volatile short *flagsp;
 	short state;
 
 	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 
 	if ((flags & FOF_NOUPDATE) == 0)
 		atomic_store_long(&fp->f_offset, val);
 	if ((flags & FOF_NEXTOFF_R) != 0)
 		fp->f_nextoff[UIO_READ] = val;
 	if ((flags & FOF_NEXTOFF_W) != 0)
 		fp->f_nextoff[UIO_WRITE] = val;
 
 	if ((flags & FOF_NOLOCK) != 0)
 		return;
 
 	flagsp = &fp->f_vnread_flags;
 	state = atomic_load_16(flagsp);
 	if ((state & FOFFSET_LOCK_WAITING) == 0 &&
 	    atomic_cmpset_rel_16(flagsp, state, 0))
 		return;
 
 	sleepq_lock(&fp->f_vnread_flags);
 	MPASS((fp->f_vnread_flags & FOFFSET_LOCKED) != 0);
 	MPASS((fp->f_vnread_flags & FOFFSET_LOCK_WAITING) != 0);
 	fp->f_vnread_flags = 0;
 	sleepq_broadcast(&fp->f_vnread_flags, SLEEPQ_SLEEP, 0, 0);
 	sleepq_release(&fp->f_vnread_flags);
 }
 #else
 off_t
 foffset_lock(struct file *fp, int flags)
 {
 	struct mtx *mtxp;
 	off_t res;
 
 	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 
 	mtxp = mtx_pool_find(mtxpool_sleep, fp);
 	mtx_lock(mtxp);
 	if ((flags & FOF_NOLOCK) == 0) {
 		while (fp->f_vnread_flags & FOFFSET_LOCKED) {
 			fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
 			msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
 			    "vofflock", 0);
 		}
 		fp->f_vnread_flags |= FOFFSET_LOCKED;
 	}
 	res = fp->f_offset;
 	mtx_unlock(mtxp);
 	return (res);
 }
 
 void
 foffset_unlock(struct file *fp, off_t val, int flags)
 {
 	struct mtx *mtxp;
 
 	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 
 	mtxp = mtx_pool_find(mtxpool_sleep, fp);
 	mtx_lock(mtxp);
 	if ((flags & FOF_NOUPDATE) == 0)
 		fp->f_offset = val;
 	if ((flags & FOF_NEXTOFF_R) != 0)
 		fp->f_nextoff[UIO_READ] = val;
 	if ((flags & FOF_NEXTOFF_W) != 0)
 		fp->f_nextoff[UIO_WRITE] = val;
 	if ((flags & FOF_NOLOCK) == 0) {
 		KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0,
 		    ("Lost FOFFSET_LOCKED"));
 		if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
 			wakeup(&fp->f_vnread_flags);
 		fp->f_vnread_flags = 0;
 	}
 	mtx_unlock(mtxp);
 }
 #endif
 
 void
 foffset_lock_uio(struct file *fp, struct uio *uio, int flags)
 {
 
 	if ((flags & FOF_OFFSET) == 0)
 		uio->uio_offset = foffset_lock(fp, flags);
 }
 
 void
 foffset_unlock_uio(struct file *fp, struct uio *uio, int flags)
 {
 
 	if ((flags & FOF_OFFSET) == 0)
 		foffset_unlock(fp, uio->uio_offset, flags);
 }
 
 static int
 get_advice(struct file *fp, struct uio *uio)
 {
 	struct mtx *mtxp;
 	int ret;
 
 	ret = POSIX_FADV_NORMAL;
 	if (fp->f_advice == NULL || fp->f_vnode->v_type != VREG)
 		return (ret);
 
 	mtxp = mtx_pool_find(mtxpool_sleep, fp);
 	mtx_lock(mtxp);
 	if (fp->f_advice != NULL &&
 	    uio->uio_offset >= fp->f_advice->fa_start &&
 	    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
 		ret = fp->f_advice->fa_advice;
 	mtx_unlock(mtxp);
 	return (ret);
 }
 
 int
 vn_read_from_obj(struct vnode *vp, struct uio *uio)
 {
 	vm_object_t obj;
 	vm_page_t ma[io_hold_cnt + 2];
 	off_t off, vsz;
 	ssize_t resid;
 	int error, i, j;
 
 	MPASS(uio->uio_resid <= ptoa(io_hold_cnt + 2));
 	obj = atomic_load_ptr(&vp->v_object);
 	if (obj == NULL)
 		return (EJUSTRETURN);
 
 	/*
 	 * Depends on type stability of vm_objects.
 	 */
 	vm_object_pip_add(obj, 1);
 	if ((obj->flags & OBJ_DEAD) != 0) {
 		/*
 		 * Note that object might be already reused from the
 		 * vnode, and the OBJ_DEAD flag cleared.  This is fine,
 		 * we recheck for DOOMED vnode state after all pages
 		 * are busied, and retract then.
 		 *
 		 * But we check for OBJ_DEAD to ensure that we do not
 		 * busy pages while vm_object_terminate_pages()
 		 * processes the queue.
 		 */
 		error = EJUSTRETURN;
 		goto out_pip;
 	}
 
 	resid = uio->uio_resid;
 	off = uio->uio_offset;
 	for (i = 0; resid > 0; i++) {
 		MPASS(i < io_hold_cnt + 2);
 		ma[i] = vm_page_grab_unlocked(obj, atop(off),
 		    VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY |
 		    VM_ALLOC_NOWAIT);
 		if (ma[i] == NULL)
 			break;
 
 		/*
 		 * Skip invalid pages.  Valid mask can be partial only
 		 * at EOF, and we clip later.
 		 */
 		if (vm_page_none_valid(ma[i])) {
 			vm_page_sunbusy(ma[i]);
 			break;
 		}
 
 		resid -= PAGE_SIZE;
 		off += PAGE_SIZE;
 	}
 	if (i == 0) {
 		error = EJUSTRETURN;
 		goto out_pip;
 	}
 
 	/*
 	 * Check VIRF_DOOMED after we busied our pages.  Since
 	 * vgonel() terminates the vnode' vm_object, it cannot
 	 * process past pages busied by us.
 	 */
 	if (VN_IS_DOOMED(vp)) {
 		error = EJUSTRETURN;
 		goto out;
 	}
 
 	resid = PAGE_SIZE - (uio->uio_offset & PAGE_MASK) + ptoa(i - 1);
 	if (resid > uio->uio_resid)
 		resid = uio->uio_resid;
 
 	/*
 	 * Unlocked read of vnp_size is safe because truncation cannot
 	 * pass busied page.  But we load vnp_size into a local
 	 * variable so that possible concurrent extension does not
 	 * break calculation.
 	 */
 #if defined(__powerpc__) && !defined(__powerpc64__)
 	vsz = obj->un_pager.vnp.vnp_size;
 #else
 	vsz = atomic_load_64(&obj->un_pager.vnp.vnp_size);
 #endif
 	if (uio->uio_offset >= vsz) {
 		error = EJUSTRETURN;
 		goto out;
 	}
 	if (uio->uio_offset + resid > vsz)
 		resid = vsz - uio->uio_offset;
 
 	error = vn_io_fault_pgmove(ma, uio->uio_offset & PAGE_MASK, resid, uio);
 
 out:
 	for (j = 0; j < i; j++) {
 		if (error == 0)
 			vm_page_reference(ma[j]);
 		vm_page_sunbusy(ma[j]);
 	}
 out_pip:
 	vm_object_pip_wakeup(obj);
 	if (error != 0)
 		return (error);
 	return (uio->uio_resid == 0 ? 0 : EJUSTRETURN);
 }
 
 /*
  * File table vnode read routine.
  */
 static int
 vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
     struct thread *td)
 {
 	struct vnode *vp;
 	off_t orig_offset;
 	int error, ioflag;
 	int advice;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
 	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
 	vp = fp->f_vnode;
 	ioflag = 0;
 	if (fp->f_flag & FNONBLOCK)
 		ioflag |= IO_NDELAY;
 	if (fp->f_flag & O_DIRECT)
 		ioflag |= IO_DIRECT;
 
 	/*
 	 * Try to read from page cache.  VIRF_DOOMED check is racy but
 	 * allows us to avoid unneeded work outright.
 	 */
 	if (vn_io_pgcache_read_enable && !mac_vnode_check_read_enabled() &&
 	    (vn_irflag_read(vp) & (VIRF_DOOMED | VIRF_PGREAD)) == VIRF_PGREAD) {
 		error = VOP_READ_PGCACHE(vp, uio, ioflag, fp->f_cred);
 		if (error == 0) {
 			fp->f_nextoff[UIO_READ] = uio->uio_offset;
 			return (0);
 		}
 		if (error != EJUSTRETURN)
 			return (error);
 	}
 
 	advice = get_advice(fp, uio);
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 
 	switch (advice) {
 	case POSIX_FADV_NORMAL:
 	case POSIX_FADV_SEQUENTIAL:
 	case POSIX_FADV_NOREUSE:
 		ioflag |= sequential_heuristic(uio, fp);
 		break;
 	case POSIX_FADV_RANDOM:
 		/* Disable read-ahead for random I/O. */
 		break;
 	}
 	orig_offset = uio->uio_offset;
 
 #ifdef MAC
 	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
 	if (error == 0)
 #endif
 		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
 	fp->f_nextoff[UIO_READ] = uio->uio_offset;
 	VOP_UNLOCK(vp);
 	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
 	    orig_offset != uio->uio_offset)
 		/*
 		 * Use POSIX_FADV_DONTNEED to flush pages and buffers
 		 * for the backing file after a POSIX_FADV_NOREUSE
 		 * read(2).
 		 */
 		error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
 		    POSIX_FADV_DONTNEED);
 	return (error);
 }
 
 /*
  * File table vnode write routine.
  */
 static int
 vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
     struct thread *td)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	off_t orig_offset;
 	int error, ioflag, lock_flags;
 	int advice;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
 	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
 	vp = fp->f_vnode;
 	if (vp->v_type == VREG)
 		bwillwrite();
 	ioflag = IO_UNIT;
 	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
 		ioflag |= IO_APPEND;
 	if (fp->f_flag & FNONBLOCK)
 		ioflag |= IO_NDELAY;
 	if (fp->f_flag & O_DIRECT)
 		ioflag |= IO_DIRECT;
 	if ((fp->f_flag & O_FSYNC) ||
 	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
 		ioflag |= IO_SYNC;
 	/*
 	 * For O_DSYNC we set both IO_SYNC and IO_DATASYNC, so that VOP_WRITE()
 	 * implementations that don't understand IO_DATASYNC fall back to full
 	 * O_SYNC behavior.
 	 */
 	if (fp->f_flag & O_DSYNC)
 		ioflag |= IO_SYNC | IO_DATASYNC;
 	mp = NULL;
 	if (vp->v_type != VCHR &&
 	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		goto unlock;
 
 	advice = get_advice(fp, uio);
 
 	if (MNT_SHARED_WRITES(mp) ||
 	    (mp == NULL && MNT_SHARED_WRITES(vp->v_mount))) {
 		lock_flags = LK_SHARED;
 	} else {
 		lock_flags = LK_EXCLUSIVE;
 	}
 
 	vn_lock(vp, lock_flags | LK_RETRY);
 	switch (advice) {
 	case POSIX_FADV_NORMAL:
 	case POSIX_FADV_SEQUENTIAL:
 	case POSIX_FADV_NOREUSE:
 		ioflag |= sequential_heuristic(uio, fp);
 		break;
 	case POSIX_FADV_RANDOM:
 		/* XXX: Is this correct? */
 		break;
 	}
 	orig_offset = uio->uio_offset;
 
 #ifdef MAC
 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
 	if (error == 0)
 #endif
 		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
 	fp->f_nextoff[UIO_WRITE] = uio->uio_offset;
 	VOP_UNLOCK(vp);
 	if (vp->v_type != VCHR)
 		vn_finished_write(mp);
 	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
 	    orig_offset != uio->uio_offset)
 		/*
 		 * Use POSIX_FADV_DONTNEED to flush pages and buffers
 		 * for the backing file after a POSIX_FADV_NOREUSE
 		 * write(2).
 		 */
 		error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
 		    POSIX_FADV_DONTNEED);
 unlock:
 	return (error);
 }
 
 /*
  * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
  * prevent the following deadlock:
  *
  * Assume that the thread A reads from the vnode vp1 into userspace
  * buffer buf1 backed by the pages of vnode vp2.  If a page in buf1 is
  * currently not resident, then system ends up with the call chain
  *   vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
  *     vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
  * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
  * If, at the same time, thread B reads from vnode vp2 into buffer buf2
  * backed by the pages of vnode vp1, and some page in buf2 is not
  * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
  *
  * To prevent the lock order reversal and deadlock, vn_io_fault() does
  * not allow page faults to happen during VOP_READ() or VOP_WRITE().
  * Instead, it first tries to do the whole range i/o with pagefaults
  * disabled. If all pages in the i/o buffer are resident and mapped,
  * VOP will succeed (ignoring the genuine filesystem errors).
  * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do
  * i/o in chunks, with all pages in the chunk prefaulted and held
  * using vm_fault_quick_hold_pages().
  *
  * Filesystems using this deadlock avoidance scheme should use the
  * array of the held pages from uio, saved in the curthread->td_ma,
  * instead of doing uiomove().  A helper function
  * vn_io_fault_uiomove() converts uiomove request into
  * uiomove_fromphys() over td_ma array.
  *
  * Since vnode locks do not cover the whole i/o anymore, rangelocks
  * make the current i/o request atomic with respect to other i/os and
  * truncations.
  */
 
 /*
  * Decode vn_io_fault_args and perform the corresponding i/o.
  */
 static int
 vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio,
     struct thread *td)
 {
 	int error, save;
 
 	error = 0;
 	save = vm_fault_disable_pagefaults();
 	switch (args->kind) {
 	case VN_IO_FAULT_FOP:
 		error = (args->args.fop_args.doio)(args->args.fop_args.fp,
 		    uio, args->cred, args->flags, td);
 		break;
 	case VN_IO_FAULT_VOP:
 		if (uio->uio_rw == UIO_READ) {
 			error = VOP_READ(args->args.vop_args.vp, uio,
 			    args->flags, args->cred);
 		} else if (uio->uio_rw == UIO_WRITE) {
 			error = VOP_WRITE(args->args.vop_args.vp, uio,
 			    args->flags, args->cred);
 		}
 		break;
 	default:
 		panic("vn_io_fault_doio: unknown kind of io %d %d",
 		    args->kind, uio->uio_rw);
 	}
 	vm_fault_enable_pagefaults(save);
 	return (error);
 }
 
 static int
 vn_io_fault_touch(char *base, const struct uio *uio)
 {
 	int r;
 
 	r = fubyte(base);
 	if (r == -1 || (uio->uio_rw == UIO_READ && subyte(base, r) == -1))
 		return (EFAULT);
 	return (0);
 }
 
 static int
 vn_io_fault_prefault_user(const struct uio *uio)
 {
 	char *base;
 	const struct iovec *iov;
 	size_t len;
 	ssize_t resid;
 	int error, i;
 
 	KASSERT(uio->uio_segflg == UIO_USERSPACE,
 	    ("vn_io_fault_prefault userspace"));
 
 	error = i = 0;
 	iov = uio->uio_iov;
 	resid = uio->uio_resid;
 	base = iov->iov_base;
 	len = iov->iov_len;
 	while (resid > 0) {
 		error = vn_io_fault_touch(base, uio);
 		if (error != 0)
 			break;
 		if (len < PAGE_SIZE) {
 			if (len != 0) {
 				error = vn_io_fault_touch(base + len - 1, uio);
 				if (error != 0)
 					break;
 				resid -= len;
 			}
 			if (++i >= uio->uio_iovcnt)
 				break;
 			iov = uio->uio_iov + i;
 			base = iov->iov_base;
 			len = iov->iov_len;
 		} else {
 			len -= PAGE_SIZE;
 			base += PAGE_SIZE;
 			resid -= PAGE_SIZE;
 		}
 	}
 	return (error);
 }
 
 /*
  * Common code for vn_io_fault(), agnostic to the kind of i/o request.
  * Uses vn_io_fault_doio() to make the call to an actual i/o function.
  * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request
  * into args and call vn_io_fault1() to handle faults during the user
  * mode buffer accesses.
  */
 static int
 vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args,
     struct thread *td)
 {
 	vm_page_t ma[io_hold_cnt + 2];
 	struct uio *uio_clone, short_uio;
 	struct iovec short_iovec[1];
 	vm_page_t *prev_td_ma;
 	vm_prot_t prot;
 	vm_offset_t addr, end;
 	size_t len, resid;
 	ssize_t adv;
 	int error, cnt, saveheld, prev_td_ma_cnt;
 
 	if (vn_io_fault_prefault) {
 		error = vn_io_fault_prefault_user(uio);
 		if (error != 0)
 			return (error); /* Or ignore ? */
 	}
 
 	prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ;
 
 	/*
 	 * The UFS follows IO_UNIT directive and replays back both
 	 * uio_offset and uio_resid if an error is encountered during the
 	 * operation.  But, since the iovec may be already advanced,
 	 * uio is still in an inconsistent state.
 	 *
 	 * Cache a copy of the original uio, which is advanced to the redo
 	 * point using UIO_NOCOPY below.
 	 */
 	uio_clone = cloneuio(uio);
 	resid = uio->uio_resid;
 
 	short_uio.uio_segflg = UIO_USERSPACE;
 	short_uio.uio_rw = uio->uio_rw;
 	short_uio.uio_td = uio->uio_td;
 
 	error = vn_io_fault_doio(args, uio, td);
 	if (error != EFAULT)
 		goto out;
 
 	atomic_add_long(&vn_io_faults_cnt, 1);
 	uio_clone->uio_segflg = UIO_NOCOPY;
 	uiomove(NULL, resid - uio->uio_resid, uio_clone);
 	uio_clone->uio_segflg = uio->uio_segflg;
 
 	saveheld = curthread_pflags_set(TDP_UIOHELD);
 	prev_td_ma = td->td_ma;
 	prev_td_ma_cnt = td->td_ma_cnt;
 
 	while (uio_clone->uio_resid != 0) {
 		len = uio_clone->uio_iov->iov_len;
 		if (len == 0) {
 			KASSERT(uio_clone->uio_iovcnt >= 1,
 			    ("iovcnt underflow"));
 			uio_clone->uio_iov++;
 			uio_clone->uio_iovcnt--;
 			continue;
 		}
 		if (len > ptoa(io_hold_cnt))
 			len = ptoa(io_hold_cnt);
 		addr = (uintptr_t)uio_clone->uio_iov->iov_base;
 		end = round_page(addr + len);
 		if (end < addr) {
 			error = EFAULT;
 			break;
 		}
 		cnt = atop(end - trunc_page(addr));
 		/*
 		 * A perfectly misaligned address and length could cause
 		 * both the start and the end of the chunk to use partial
 		 * page.  +2 accounts for such a situation.
 		 */
 		cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
 		    addr, len, prot, ma, io_hold_cnt + 2);
 		if (cnt == -1) {
 			error = EFAULT;
 			break;
 		}
 		short_uio.uio_iov = &short_iovec[0];
 		short_iovec[0].iov_base = (void *)addr;
 		short_uio.uio_iovcnt = 1;
 		short_uio.uio_resid = short_iovec[0].iov_len = len;
 		short_uio.uio_offset = uio_clone->uio_offset;
 		td->td_ma = ma;
 		td->td_ma_cnt = cnt;
 
 		error = vn_io_fault_doio(args, &short_uio, td);
 		vm_page_unhold_pages(ma, cnt);
 		adv = len - short_uio.uio_resid;
 
 		uio_clone->uio_iov->iov_base =
 		    (char *)uio_clone->uio_iov->iov_base + adv;
 		uio_clone->uio_iov->iov_len -= adv;
 		uio_clone->uio_resid -= adv;
 		uio_clone->uio_offset += adv;
 
 		uio->uio_resid -= adv;
 		uio->uio_offset += adv;
 
 		if (error != 0 || adv == 0)
 			break;
 	}
 	td->td_ma = prev_td_ma;
 	td->td_ma_cnt = prev_td_ma_cnt;
 	curthread_pflags_restore(saveheld);
 out:
 	free(uio_clone, M_IOV);
 	return (error);
 }
 
 static int
 vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	fo_rdwr_t *doio;
 	struct vnode *vp;
 	void *rl_cookie;
 	struct vn_io_fault_args args;
 	int error;
 
 	doio = uio->uio_rw == UIO_READ ? vn_read : vn_write;
 	vp = fp->f_vnode;
 
 	/*
 	 * The ability to read(2) on a directory has historically been
 	 * allowed for all users, but this can and has been the source of
 	 * at least one security issue in the past.  As such, it is now hidden
 	 * away behind a sysctl for those that actually need it to use it, and
 	 * restricted to root when it's turned on to make it relatively safe to
 	 * leave on for longer sessions of need.
 	 */
 	if (vp->v_type == VDIR) {
 		KASSERT(uio->uio_rw == UIO_READ,
 		    ("illegal write attempted on a directory"));
 		if (!vfs_allow_read_dir)
 			return (EISDIR);
 		if ((error = priv_check(td, PRIV_VFS_READ_DIR)) != 0)
 			return (EISDIR);
 	}
 
 	foffset_lock_uio(fp, uio, flags);
 	if (do_vn_io_fault(vp, uio)) {
 		args.kind = VN_IO_FAULT_FOP;
 		args.args.fop_args.fp = fp;
 		args.args.fop_args.doio = doio;
 		args.cred = active_cred;
 		args.flags = flags | FOF_OFFSET;
 		if (uio->uio_rw == UIO_READ) {
 			rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
 			    uio->uio_offset + uio->uio_resid);
 		} else if ((fp->f_flag & O_APPEND) != 0 ||
 		    (flags & FOF_OFFSET) == 0) {
 			/* For appenders, punt and lock the whole range. */
 			rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 		} else {
 			rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
 			    uio->uio_offset + uio->uio_resid);
 		}
 		error = vn_io_fault1(vp, uio, &args, td);
 		vn_rangelock_unlock(vp, rl_cookie);
 	} else {
 		error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
 	}
 	foffset_unlock_uio(fp, uio, flags);
 	return (error);
 }
 
 /*
  * Helper function to perform the requested uiomove operation using
  * the held pages for io->uio_iov[0].iov_base buffer instead of
  * copyin/copyout.  Access to the pages with uiomove_fromphys()
  * instead of iov_base prevents page faults that could occur due to
  * pmap_collect() invalidating the mapping created by
  * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
  * object cleanup revoking the write access from page mappings.
  *
  * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove()
  * instead of plain uiomove().
  */
 int
 vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
 {
 	struct uio transp_uio;
 	struct iovec transp_iov[1];
 	struct thread *td;
 	size_t adv;
 	int error, pgadv;
 
 	td = curthread;
 	if ((td->td_pflags & TDP_UIOHELD) == 0 ||
 	    uio->uio_segflg != UIO_USERSPACE)
 		return (uiomove(data, xfersize, uio));
 
 	KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
 	transp_iov[0].iov_base = data;
 	transp_uio.uio_iov = &transp_iov[0];
 	transp_uio.uio_iovcnt = 1;
 	if (xfersize > uio->uio_resid)
 		xfersize = uio->uio_resid;
 	transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
 	transp_uio.uio_offset = 0;
 	transp_uio.uio_segflg = UIO_SYSSPACE;
 	/*
 	 * Since transp_iov points to data, and td_ma page array
 	 * corresponds to original uio->uio_iov, we need to invert the
 	 * direction of the i/o operation as passed to
 	 * uiomove_fromphys().
 	 */
 	switch (uio->uio_rw) {
 	case UIO_WRITE:
 		transp_uio.uio_rw = UIO_READ;
 		break;
 	case UIO_READ:
 		transp_uio.uio_rw = UIO_WRITE;
 		break;
 	}
 	transp_uio.uio_td = uio->uio_td;
 	error = uiomove_fromphys(td->td_ma,
 	    ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
 	    xfersize, &transp_uio);
 	adv = xfersize - transp_uio.uio_resid;
 	pgadv =
 	    (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
 	    (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
 	td->td_ma += pgadv;
 	KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
 	    pgadv));
 	td->td_ma_cnt -= pgadv;
 	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
 	uio->uio_iov->iov_len -= adv;
 	uio->uio_resid -= adv;
 	uio->uio_offset += adv;
 	return (error);
 }
 
 int
 vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
     struct uio *uio)
 {
 	struct thread *td;
 	vm_offset_t iov_base;
 	int cnt, pgadv;
 
 	td = curthread;
 	if ((td->td_pflags & TDP_UIOHELD) == 0 ||
 	    uio->uio_segflg != UIO_USERSPACE)
 		return (uiomove_fromphys(ma, offset, xfersize, uio));
 
 	KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
 	cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize;
 	iov_base = (vm_offset_t)uio->uio_iov->iov_base;
 	switch (uio->uio_rw) {
 	case UIO_WRITE:
 		pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma,
 		    offset, cnt);
 		break;
 	case UIO_READ:
 		pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK,
 		    cnt);
 		break;
 	}
 	pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT);
 	td->td_ma += pgadv;
 	KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
 	    pgadv));
 	td->td_ma_cnt -= pgadv;
 	uio->uio_iov->iov_base = (char *)(iov_base + cnt);
 	uio->uio_iov->iov_len -= cnt;
 	uio->uio_resid -= cnt;
 	uio->uio_offset += cnt;
 	return (0);
 }
 
 /*
  * File table truncate routine.
  */
 static int
 vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
     struct thread *td)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	void *rl_cookie;
 	int error;
 
 	vp = fp->f_vnode;
 
 retry:
 	/*
 	 * Lock the whole range for truncation.  Otherwise split i/o
 	 * might happen partly before and partly after the truncation.
 	 */
 	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 	if (error)
 		goto out1;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	if (vp->v_type == VDIR) {
 		error = EISDIR;
 		goto out;
 	}
 #ifdef MAC
 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
 	if (error)
 		goto out;
 #endif
 	error = vn_truncate_locked(vp, length, (fp->f_flag & O_FSYNC) != 0,
 	    fp->f_cred);
 out:
 	VOP_UNLOCK(vp);
 	vn_finished_write(mp);
 out1:
 	vn_rangelock_unlock(vp, rl_cookie);
 	if (error == ERELOOKUP)
 		goto retry;
 	return (error);
 }
 
 /*
  * Truncate a file that is already locked.
  */
 int
 vn_truncate_locked(struct vnode *vp, off_t length, bool sync,
     struct ucred *cred)
 {
 	struct vattr vattr;
 	int error;
 
 	error = VOP_ADD_WRITECOUNT(vp, 1);
 	if (error == 0) {
 		VATTR_NULL(&vattr);
 		vattr.va_size = length;
 		if (sync)
 			vattr.va_vaflags |= VA_SYNC;
 		error = VOP_SETATTR(vp, &vattr, cred);
 		VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
 	}
 	return (error);
 }
 
 /*
  * File table vnode stat routine.
  */
 static int
 vn_statfile(struct file *fp, struct stat *sb, struct ucred *active_cred,
     struct thread *td)
 {
 	struct vnode *vp = fp->f_vnode;
 	int error;
 
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	error = VOP_STAT(vp, sb, active_cred, fp->f_cred, td);
 	VOP_UNLOCK(vp);
 
 	return (error);
 }
 
 /*
  * File table vnode ioctl routine.
  */
 static int
 vn_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
     struct thread *td)
 {
 	struct vattr vattr;
 	struct vnode *vp;
 	struct fiobmap2_arg *bmarg;
 	int error;
 
 	vp = fp->f_vnode;
 	switch (vp->v_type) {
 	case VDIR:
 	case VREG:
 		switch (com) {
 		case FIONREAD:
 			vn_lock(vp, LK_SHARED | LK_RETRY);
 			error = VOP_GETATTR(vp, &vattr, active_cred);
 			VOP_UNLOCK(vp);
 			if (error == 0)
 				*(int *)data = vattr.va_size - fp->f_offset;
 			return (error);
 		case FIOBMAP2:
 			bmarg = (struct fiobmap2_arg *)data;
 			vn_lock(vp, LK_SHARED | LK_RETRY);
 #ifdef MAC
 			error = mac_vnode_check_read(active_cred, fp->f_cred,
 			    vp);
 			if (error == 0)
 #endif
 				error = VOP_BMAP(vp, bmarg->bn, NULL,
 				    &bmarg->bn, &bmarg->runp, &bmarg->runb);
 			VOP_UNLOCK(vp);
 			return (error);
 		case FIONBIO:
 		case FIOASYNC:
 			return (0);
 		default:
 			return (VOP_IOCTL(vp, com, data, fp->f_flag,
 			    active_cred, td));
 		}
 		break;
 	case VCHR:
 		return (VOP_IOCTL(vp, com, data, fp->f_flag,
 		    active_cred, td));
 	default:
 		return (ENOTTY);
 	}
 }
 
 /*
  * File table vnode poll routine.
  */
 static int
 vn_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 	struct vnode *vp;
 	int error;
 
 	vp = fp->f_vnode;
 #if defined(MAC) || defined(AUDIT)
 	if (AUDITING_TD(td) || mac_vnode_check_poll_enabled()) {
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		AUDIT_ARG_VNODE1(vp);
 		error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
 		VOP_UNLOCK(vp);
 		if (error != 0)
 			return (error);
 	}
 #endif
 	error = VOP_POLL(vp, events, fp->f_cred, td);
 	return (error);
 }
 
 /*
  * Acquire the requested lock and then check for validity.  LK_RETRY
  * permits vn_lock to return doomed vnodes.
  */
 static int __noinline
 _vn_lock_fallback(struct vnode *vp, int flags, const char *file, int line,
     int error)
 {
 
 	KASSERT((flags & LK_RETRY) == 0 || error == 0,
 	    ("vn_lock: error %d incompatible with flags %#x", error, flags));
 
 	if (error == 0)
 		VNASSERT(VN_IS_DOOMED(vp), vp, ("vnode not doomed"));
 
 	if ((flags & LK_RETRY) == 0) {
 		if (error == 0) {
 			VOP_UNLOCK(vp);
 			error = ENOENT;
 		}
 		return (error);
 	}
 
 	/*
 	 * LK_RETRY case.
 	 *
 	 * Nothing to do if we got the lock.
 	 */
 	if (error == 0)
 		return (0);
 
 	/*
 	 * Interlock was dropped by the call in _vn_lock.
 	 */
 	flags &= ~LK_INTERLOCK;
 	do {
 		error = VOP_LOCK1(vp, flags, file, line);
 	} while (error != 0);
 	return (0);
 }
 
 int
 _vn_lock(struct vnode *vp, int flags, const char *file, int line)
 {
 	int error;
 
 	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
 	    ("vn_lock: no locktype (%d passed)", flags));
 	VNPASS(vp->v_holdcnt > 0, vp);
 	error = VOP_LOCK1(vp, flags, file, line);
 	if (__predict_false(error != 0 || VN_IS_DOOMED(vp)))
 		return (_vn_lock_fallback(vp, flags, file, line, error));
 	return (0);
 }
 
 /*
  * File table vnode close routine.
  */
 static int
 vn_closefile(struct file *fp, struct thread *td)
 {
 	struct vnode *vp;
 	struct flock lf;
 	int error;
 	bool ref;
 
 	vp = fp->f_vnode;
 	fp->f_ops = &badfileops;
-	ref= (fp->f_flag & FHASLOCK) != 0 && fp->f_type == DTYPE_VNODE;
+	ref = (fp->f_flag & FHASLOCK) != 0 && fp->f_type == DTYPE_VNODE;
 
 	error = vn_close1(vp, fp->f_flag, fp->f_cred, td, ref);
 
 	if (__predict_false(ref)) {
 		lf.l_whence = SEEK_SET;
 		lf.l_start = 0;
 		lf.l_len = 0;
 		lf.l_type = F_UNLCK;
 		(void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
 		vrele(vp);
 	}
 	return (error);
 }
 
 /*
  * Preparing to start a filesystem write operation. If the operation is
  * permitted, then we bump the count of operations in progress and
  * proceed. If a suspend request is in progress, we wait until the
  * suspension is over, and then proceed.
  */
 static int
 vn_start_write_refed(struct mount *mp, int flags, bool mplocked)
 {
 	struct mount_pcpu *mpcpu;
 	int error, mflags;
 
 	if (__predict_true(!mplocked) && (flags & V_XSLEEP) == 0 &&
 	    vfs_op_thread_enter(mp, mpcpu)) {
 		MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0);
 		vfs_mp_count_add_pcpu(mpcpu, writeopcount, 1);
 		vfs_op_thread_exit(mp, mpcpu);
 		return (0);
 	}
 
 	if (mplocked)
 		mtx_assert(MNT_MTX(mp), MA_OWNED);
 	else
 		MNT_ILOCK(mp);
 
 	error = 0;
 
 	/*
 	 * Check on status of suspension.
 	 */
 	if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
 	    mp->mnt_susp_owner != curthread) {
 		mflags = ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ?
 		    (flags & PCATCH) : 0) | (PUSER - 1);
 		while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
 			if (flags & V_NOWAIT) {
 				error = EWOULDBLOCK;
 				goto unlock;
 			}
 			error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags,
 			    "suspfs", 0);
 			if (error)
 				goto unlock;
 		}
 	}
 	if (flags & V_XSLEEP)
 		goto unlock;
 	mp->mnt_writeopcount++;
 unlock:
 	if (error != 0 || (flags & V_XSLEEP) != 0)
 		MNT_REL(mp);
 	MNT_IUNLOCK(mp);
 	return (error);
 }
 
 int
 vn_start_write(struct vnode *vp, struct mount **mpp, int flags)
 {
 	struct mount *mp;
 	int error;
 
 	KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL),
 	    ("V_MNTREF requires mp"));
 
 	error = 0;
 	/*
 	 * If a vnode is provided, get and return the mount point that
 	 * to which it will write.
 	 */
 	if (vp != NULL) {
 		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
 			*mpp = NULL;
 			if (error != EOPNOTSUPP)
 				return (error);
 			return (0);
 		}
 	}
 	if ((mp = *mpp) == NULL)
 		return (0);
 
 	/*
 	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
 	 * a vfs_ref().
 	 * As long as a vnode is not provided we need to acquire a
 	 * refcount for the provided mountpoint too, in order to
 	 * emulate a vfs_ref().
 	 */
 	if (vp == NULL && (flags & V_MNTREF) == 0)
 		vfs_ref(mp);
 
 	return (vn_start_write_refed(mp, flags, false));
 }
 
 /*
  * Secondary suspension. Used by operations such as vop_inactive
  * routines that are needed by the higher level functions. These
  * are allowed to proceed until all the higher level functions have
  * completed (indicated by mnt_writeopcount dropping to zero). At that
  * time, these operations are halted until the suspension is over.
  */
 int
 vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags)
 {
 	struct mount *mp;
 	int error;
 
 	KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL),
 	    ("V_MNTREF requires mp"));
 
  retry:
 	if (vp != NULL) {
 		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
 			*mpp = NULL;
 			if (error != EOPNOTSUPP)
 				return (error);
 			return (0);
 		}
 	}
 	/*
 	 * If we are not suspended or have not yet reached suspended
 	 * mode, then let the operation proceed.
 	 */
 	if ((mp = *mpp) == NULL)
 		return (0);
 
 	/*
 	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
 	 * a vfs_ref().
 	 * As long as a vnode is not provided we need to acquire a
 	 * refcount for the provided mountpoint too, in order to
 	 * emulate a vfs_ref().
 	 */
 	MNT_ILOCK(mp);
 	if (vp == NULL && (flags & V_MNTREF) == 0)
 		MNT_REF(mp);
 	if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
 		mp->mnt_secondary_writes++;
 		mp->mnt_secondary_accwrites++;
 		MNT_IUNLOCK(mp);
 		return (0);
 	}
 	if (flags & V_NOWAIT) {
 		MNT_REL(mp);
 		MNT_IUNLOCK(mp);
 		return (EWOULDBLOCK);
 	}
 	/*
 	 * Wait for the suspension to finish.
 	 */
 	error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | PDROP |
 	    ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0),
 	    "suspfs", 0);
 	vfs_rel(mp);
 	if (error == 0)
 		goto retry;
 	return (error);
 }
 
 /*
  * Filesystem write operation has completed. If we are suspending and this
  * operation is the last one, notify the suspender that the suspension is
  * now in effect.
  */
 void
 vn_finished_write(struct mount *mp)
 {
 	struct mount_pcpu *mpcpu;
 	int c;
 
 	if (mp == NULL)
 		return;
 
 	if (vfs_op_thread_enter(mp, mpcpu)) {
 		vfs_mp_count_sub_pcpu(mpcpu, writeopcount, 1);
 		vfs_mp_count_sub_pcpu(mpcpu, ref, 1);
 		vfs_op_thread_exit(mp, mpcpu);
 		return;
 	}
 
 	MNT_ILOCK(mp);
 	vfs_assert_mount_counters(mp);
 	MNT_REL(mp);
 	c = --mp->mnt_writeopcount;
 	if (mp->mnt_vfs_ops == 0) {
 		MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0);
 		MNT_IUNLOCK(mp);
 		return;
 	}
 	if (c < 0)
 		vfs_dump_mount_counters(mp);
 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && c == 0)
 		wakeup(&mp->mnt_writeopcount);
 	MNT_IUNLOCK(mp);
 }
 
 /*
  * Filesystem secondary write operation has completed. If we are
  * suspending and this operation is the last one, notify the suspender
  * that the suspension is now in effect.
  */
 void
 vn_finished_secondary_write(struct mount *mp)
 {
 	if (mp == NULL)
 		return;
 	MNT_ILOCK(mp);
 	MNT_REL(mp);
 	mp->mnt_secondary_writes--;
 	if (mp->mnt_secondary_writes < 0)
 		panic("vn_finished_secondary_write: neg cnt");
 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
 	    mp->mnt_secondary_writes <= 0)
 		wakeup(&mp->mnt_secondary_writes);
 	MNT_IUNLOCK(mp);
 }
 
 /*
  * Request a filesystem to suspend write operations.
  */
 int
 vfs_write_suspend(struct mount *mp, int flags)
 {
 	int error;
 
 	vfs_op_enter(mp);
 
 	MNT_ILOCK(mp);
 	vfs_assert_mount_counters(mp);
 	if (mp->mnt_susp_owner == curthread) {
 		vfs_op_exit_locked(mp);
 		MNT_IUNLOCK(mp);
 		return (EALREADY);
 	}
 	while (mp->mnt_kern_flag & MNTK_SUSPEND)
 		msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
 
 	/*
 	 * Unmount holds a write reference on the mount point.  If we
 	 * own busy reference and drain for writers, we deadlock with
 	 * the reference draining in the unmount path.  Callers of
 	 * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if
 	 * vfs_busy() reference is owned and caller is not in the
 	 * unmount context.
 	 */
 	if ((flags & VS_SKIP_UNMOUNT) != 0 &&
 	    (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
 		vfs_op_exit_locked(mp);
 		MNT_IUNLOCK(mp);
 		return (EBUSY);
 	}
 
 	mp->mnt_kern_flag |= MNTK_SUSPEND;
 	mp->mnt_susp_owner = curthread;
 	if (mp->mnt_writeopcount > 0)
 		(void) msleep(&mp->mnt_writeopcount, 
 		    MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
 	else
 		MNT_IUNLOCK(mp);
 	if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0) {
 		vfs_write_resume(mp, 0);
 		/* vfs_write_resume does vfs_op_exit() for us */
 	}
 	return (error);
 }
 
 /*
  * Request a filesystem to resume write operations.
  */
 void
 vfs_write_resume(struct mount *mp, int flags)
 {
 
 	MNT_ILOCK(mp);
 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
 		KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
 		mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
 				       MNTK_SUSPENDED);
 		mp->mnt_susp_owner = NULL;
 		wakeup(&mp->mnt_writeopcount);
 		wakeup(&mp->mnt_flag);
 		curthread->td_pflags &= ~TDP_IGNSUSP;
 		if ((flags & VR_START_WRITE) != 0) {
 			MNT_REF(mp);
 			mp->mnt_writeopcount++;
 		}
 		MNT_IUNLOCK(mp);
 		if ((flags & VR_NO_SUSPCLR) == 0)
 			VFS_SUSP_CLEAN(mp);
 		vfs_op_exit(mp);
 	} else if ((flags & VR_START_WRITE) != 0) {
 		MNT_REF(mp);
 		vn_start_write_refed(mp, 0, true);
 	} else {
 		MNT_IUNLOCK(mp);
 	}
 }
 
 /*
  * Helper loop around vfs_write_suspend() for filesystem unmount VFS
  * methods.
  */
 int
 vfs_write_suspend_umnt(struct mount *mp)
 {
 	int error;
 
 	KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0,
 	    ("vfs_write_suspend_umnt: recursed"));
 
 	/* dounmount() already called vn_start_write(). */
 	for (;;) {
 		vn_finished_write(mp);
 		error = vfs_write_suspend(mp, 0);
 		if (error != 0) {
 			vn_start_write(NULL, &mp, V_WAIT);
 			return (error);
 		}
 		MNT_ILOCK(mp);
 		if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0)
 			break;
 		MNT_IUNLOCK(mp);
 		vn_start_write(NULL, &mp, V_WAIT);
 	}
 	mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2);
 	wakeup(&mp->mnt_flag);
 	MNT_IUNLOCK(mp);
 	curthread->td_pflags |= TDP_IGNSUSP;
 	return (0);
 }
 
 /*
  * Implement kqueues for files by translating it to vnode operation.
  */
 static int
 vn_kqfilter(struct file *fp, struct knote *kn)
 {
 
 	return (VOP_KQFILTER(fp->f_vnode, kn));
 }
 
 /*
  * Simplified in-kernel wrapper calls for extended attribute access.
  * Both calls pass in a NULL credential, authorizing as "kernel" access.
  * Set IO_NODELOCKED in ioflg if the vnode is already locked.
  */
 int
 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
     const char *attrname, int *buflen, char *buf, struct thread *td)
 {
 	struct uio	auio;
 	struct iovec	iov;
 	int	error;
 
 	iov.iov_len = *buflen;
 	iov.iov_base = buf;
 
 	auio.uio_iov = &iov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;
 	auio.uio_resid = *buflen;
 
 	if ((ioflg & IO_NODELOCKED) == 0)
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 
 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 
 	/* authorize attribute retrieval as kernel */
 	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
 	    td);
 
 	if ((ioflg & IO_NODELOCKED) == 0)
 		VOP_UNLOCK(vp);
 
 	if (error == 0) {
 		*buflen = *buflen - auio.uio_resid;
 	}
 
 	return (error);
 }
 
 /*
  * XXX failure mode if partially written?
  */
 int
 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
     const char *attrname, int buflen, char *buf, struct thread *td)
 {
 	struct uio	auio;
 	struct iovec	iov;
 	struct mount	*mp;
 	int	error;
 
 	iov.iov_len = buflen;
 	iov.iov_base = buf;
 
 	auio.uio_iov = &iov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;
 	auio.uio_resid = buflen;
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
 			return (error);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 
 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 
 	/* authorize attribute setting as kernel */
 	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		vn_finished_write(mp);
 		VOP_UNLOCK(vp);
 	}
 
 	return (error);
 }
 
 int
 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
     const char *attrname, struct thread *td)
 {
 	struct mount	*mp;
 	int	error;
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
 			return (error);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 
 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 
 	/* authorize attribute removal as kernel */
 	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
 	if (error == EOPNOTSUPP)
 		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
 		    NULL, td);
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		vn_finished_write(mp);
 		VOP_UNLOCK(vp);
 	}
 
 	return (error);
 }
 
 static int
 vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags,
     struct vnode **rvp)
 {
 
 	return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp));
 }
 
 int
 vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
 {
 
 	return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino,
 	    lkflags, rvp));
 }
 
 int
 vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg,
     int lkflags, struct vnode **rvp)
 {
 	struct mount *mp;
 	int ltype, error;
 
 	ASSERT_VOP_LOCKED(vp, "vn_vget_ino_get");
 	mp = vp->v_mount;
 	ltype = VOP_ISLOCKED(vp);
 	KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
 	    ("vn_vget_ino: vp not locked"));
 	error = vfs_busy(mp, MBF_NOWAIT);
 	if (error != 0) {
 		vfs_ref(mp);
 		VOP_UNLOCK(vp);
 		error = vfs_busy(mp, 0);
 		vn_lock(vp, ltype | LK_RETRY);
 		vfs_rel(mp);
 		if (error != 0)
 			return (ENOENT);
 		if (VN_IS_DOOMED(vp)) {
 			vfs_unbusy(mp);
 			return (ENOENT);
 		}
 	}
 	VOP_UNLOCK(vp);
 	error = alloc(mp, alloc_arg, lkflags, rvp);
 	vfs_unbusy(mp);
 	if (error != 0 || *rvp != vp)
 		vn_lock(vp, ltype | LK_RETRY);
 	if (VN_IS_DOOMED(vp)) {
 		if (error == 0) {
 			if (*rvp == vp)
 				vunref(vp);
 			else
 				vput(*rvp);
 		}
 		error = ENOENT;
 	}
 	return (error);
 }
 
 int
 vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
     struct thread *td)
 {
 
 	if (vp->v_type != VREG || td == NULL)
 		return (0);
 	if ((uoff_t)uio->uio_offset + uio->uio_resid >
 	    lim_cur(td, RLIMIT_FSIZE)) {
 		PROC_LOCK(td->td_proc);
 		kern_psignal(td->td_proc, SIGXFSZ);
 		PROC_UNLOCK(td->td_proc);
 		return (EFBIG);
 	}
 	return (0);
 }
 
 int
 vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 	struct vnode *vp;
 
 	vp = fp->f_vnode;
 #ifdef AUDIT
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	VOP_UNLOCK(vp);
 #endif
 	return (setfmode(td, active_cred, vp, mode));
 }
 
 int
 vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 	struct vnode *vp;
 
 	vp = fp->f_vnode;
 #ifdef AUDIT
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	VOP_UNLOCK(vp);
 #endif
 	return (setfown(td, active_cred, vp, uid, gid));
 }
 
 void
 vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
 {
 	vm_object_t object;
 
 	if ((object = vp->v_object) == NULL)
 		return;
 	VM_OBJECT_WLOCK(object);
 	vm_object_page_remove(object, start, end, 0);
 	VM_OBJECT_WUNLOCK(object);
 }
 
 int
 vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
 {
 	struct vattr va;
 	daddr_t bn, bnp;
 	uint64_t bsize;
 	off_t noff;
 	int error;
 
 	KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
 	    ("Wrong command %lu", cmd));
 
 	if (vn_lock(vp, LK_SHARED) != 0)
 		return (EBADF);
 	if (vp->v_type != VREG) {
 		error = ENOTTY;
 		goto unlock;
 	}
 	error = VOP_GETATTR(vp, &va, cred);
 	if (error != 0)
 		goto unlock;
 	noff = *off;
 	if (noff >= va.va_size) {
 		error = ENXIO;
 		goto unlock;
 	}
 	bsize = vp->v_mount->mnt_stat.f_iosize;
 	for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize -
 	    noff % bsize) {
 		error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
 		if (error == EOPNOTSUPP) {
 			error = ENOTTY;
 			goto unlock;
 		}
 		if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
 		    (bnp != -1 && cmd == FIOSEEKDATA)) {
 			noff = bn * bsize;
 			if (noff < *off)
 				noff = *off;
 			goto unlock;
 		}
 	}
 	if (noff > va.va_size)
 		noff = va.va_size;
 	/* noff == va.va_size. There is an implicit hole at the end of file. */
 	if (cmd == FIOSEEKDATA)
 		error = ENXIO;
 unlock:
 	VOP_UNLOCK(vp);
 	if (error == 0)
 		*off = noff;
 	return (error);
 }
 
 int
 vn_seek(struct file *fp, off_t offset, int whence, struct thread *td)
 {
 	struct ucred *cred;
 	struct vnode *vp;
 	struct vattr vattr;
 	off_t foffset, size;
 	int error, noneg;
 
 	cred = td->td_ucred;
 	vp = fp->f_vnode;
 	foffset = foffset_lock(fp, 0);
 	noneg = (vp->v_type != VCHR);
 	error = 0;
 	switch (whence) {
 	case L_INCR:
 		if (noneg &&
 		    (foffset < 0 ||
 		    (offset > 0 && foffset > OFF_MAX - offset))) {
 			error = EOVERFLOW;
 			break;
 		}
 		offset += foffset;
 		break;
 	case L_XTND:
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		error = VOP_GETATTR(vp, &vattr, cred);
 		VOP_UNLOCK(vp);
 		if (error)
 			break;
 
 		/*
 		 * If the file references a disk device, then fetch
 		 * the media size and use that to determine the ending
 		 * offset.
 		 */
 		if (vattr.va_size == 0 && vp->v_type == VCHR &&
 		    fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
 			vattr.va_size = size;
 		if (noneg &&
 		    (vattr.va_size > OFF_MAX ||
 		    (offset > 0 && vattr.va_size > OFF_MAX - offset))) {
 			error = EOVERFLOW;
 			break;
 		}
 		offset += vattr.va_size;
 		break;
 	case L_SET:
 		break;
 	case SEEK_DATA:
 		error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
 		if (error == ENOTTY)
 			error = EINVAL;
 		break;
 	case SEEK_HOLE:
 		error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
 		if (error == ENOTTY)
 			error = EINVAL;
 		break;
 	default:
 		error = EINVAL;
 	}
 	if (error == 0 && noneg && offset < 0)
 		error = EINVAL;
 	if (error != 0)
 		goto drop;
 	VFS_KNOTE_UNLOCKED(vp, 0);
 	td->td_uretoff.tdu_off = offset;
 drop:
 	foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
 	return (error);
 }
 
 int
 vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred,
     struct thread *td)
 {
 	int error;
 
 	/*
 	 * Grant permission if the caller is the owner of the file, or
 	 * the super-user, or has ACL_WRITE_ATTRIBUTES permission on
 	 * on the file.  If the time pointer is null, then write
 	 * permission on the file is also sufficient.
 	 *
 	 * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes:
 	 * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES
 	 * will be allowed to set the times [..] to the current
 	 * server time.
 	 */
 	error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td);
 	if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0)
 		error = VOP_ACCESS(vp, VWRITE, cred, td);
 	return (error);
 }
 
 int
 vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 	struct vnode *vp;
 	int error;
 
 	if (fp->f_type == DTYPE_FIFO)
 		kif->kf_type = KF_TYPE_FIFO;
 	else
 		kif->kf_type = KF_TYPE_VNODE;
 	vp = fp->f_vnode;
 	vref(vp);
 	FILEDESC_SUNLOCK(fdp);
 	error = vn_fill_kinfo_vnode(vp, kif);
 	vrele(vp);
 	FILEDESC_SLOCK(fdp);
 	return (error);
 }
 
 static inline void
 vn_fill_junk(struct kinfo_file *kif)
 {
 	size_t len, olen;
 
 	/*
 	 * Simulate vn_fullpath returning changing values for a given
 	 * vp during e.g. coredump.
 	 */
 	len = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1;
 	olen = strlen(kif->kf_path);
 	if (len < olen)
 		strcpy(&kif->kf_path[len - 1], "$");
 	else
 		for (; olen < len; olen++)
 			strcpy(&kif->kf_path[olen], "A");
 }
 
 int
 vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif)
 {
 	struct vattr va;
 	char *fullpath, *freepath;
 	int error;
 
 	kif->kf_un.kf_file.kf_file_type = vntype_to_kinfo(vp->v_type);
 	freepath = NULL;
 	fullpath = "-";
 	error = vn_fullpath(vp, &fullpath, &freepath);
 	if (error == 0) {
 		strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
 	}
 	if (freepath != NULL)
 		free(freepath, M_TEMP);
 
 	KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path,
 		vn_fill_junk(kif);
 	);
 
 	/*
 	 * Retrieve vnode attributes.
 	 */
 	va.va_fsid = VNOVAL;
 	va.va_rdev = NODEV;
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	error = VOP_GETATTR(vp, &va, curthread->td_ucred);
 	VOP_UNLOCK(vp);
 	if (error != 0)
 		return (error);
 	if (va.va_fsid != VNOVAL)
 		kif->kf_un.kf_file.kf_file_fsid = va.va_fsid;
 	else
 		kif->kf_un.kf_file.kf_file_fsid =
 		    vp->v_mount->mnt_stat.f_fsid.val[0];
 	kif->kf_un.kf_file.kf_file_fsid_freebsd11 =
 	    kif->kf_un.kf_file.kf_file_fsid; /* truncate */
 	kif->kf_un.kf_file.kf_file_fileid = va.va_fileid;
 	kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode);
 	kif->kf_un.kf_file.kf_file_size = va.va_size;
 	kif->kf_un.kf_file.kf_file_rdev = va.va_rdev;
 	kif->kf_un.kf_file.kf_file_rdev_freebsd11 =
 	    kif->kf_un.kf_file.kf_file_rdev; /* truncate */
 	return (0);
 }
 
 int
 vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
     vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
     struct thread *td)
 {
 #ifdef HWPMC_HOOKS
 	struct pmckern_map_in pkm;
 #endif
 	struct mount *mp;
 	struct vnode *vp;
 	vm_object_t object;
 	vm_prot_t maxprot;
 	boolean_t writecounted;
 	int error;
 
 #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
     defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
 	/*
 	 * POSIX shared-memory objects are defined to have
 	 * kernel persistence, and are not defined to support
 	 * read(2)/write(2) -- or even open(2).  Thus, we can
 	 * use MAP_ASYNC to trade on-disk coherence for speed.
 	 * The shm_open(3) library routine turns on the FPOSIXSHM
 	 * flag to request this behavior.
 	 */
 	if ((fp->f_flag & FPOSIXSHM) != 0)
 		flags |= MAP_NOSYNC;
 #endif
 	vp = fp->f_vnode;
 
 	/*
 	 * Ensure that file and memory protections are
 	 * compatible.  Note that we only worry about
 	 * writability if mapping is shared; in this case,
 	 * current and max prot are dictated by the open file.
 	 * XXX use the vnode instead?  Problem is: what
 	 * credentials do we use for determination? What if
 	 * proc does a setuid?
 	 */
 	mp = vp->v_mount;
 	if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
 		maxprot = VM_PROT_NONE;
 		if ((prot & VM_PROT_EXECUTE) != 0)
 			return (EACCES);
 	} else
 		maxprot = VM_PROT_EXECUTE;
 	if ((fp->f_flag & FREAD) != 0)
 		maxprot |= VM_PROT_READ;
 	else if ((prot & VM_PROT_READ) != 0)
 		return (EACCES);
 
 	/*
 	 * If we are sharing potential changes via MAP_SHARED and we
 	 * are trying to get write permission although we opened it
 	 * without asking for it, bail out.
 	 */
 	if ((flags & MAP_SHARED) != 0) {
 		if ((fp->f_flag & FWRITE) != 0)
 			maxprot |= VM_PROT_WRITE;
 		else if ((prot & VM_PROT_WRITE) != 0)
 			return (EACCES);
 	} else {
 		maxprot |= VM_PROT_WRITE;
 		cap_maxprot |= VM_PROT_WRITE;
 	}
 	maxprot &= cap_maxprot;
 
 	/*
 	 * For regular files and shared memory, POSIX requires that
 	 * the value of foff be a legitimate offset within the data
 	 * object.  In particular, negative offsets are invalid.
 	 * Blocking negative offsets and overflows here avoids
 	 * possible wraparound or user-level access into reserved
 	 * ranges of the data object later.  In contrast, POSIX does
 	 * not dictate how offsets are used by device drivers, so in
 	 * the case of a device mapping a negative offset is passed
 	 * on.
 	 */
 	if (
 #ifdef _LP64
 	    size > OFF_MAX ||
 #endif
 	    foff > OFF_MAX - size)
 		return (EINVAL);
 
 	writecounted = FALSE;
 	error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp,
 	    &foff, &object, &writecounted);
 	if (error != 0)
 		return (error);
 	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
 	    foff, writecounted, td);
 	if (error != 0) {
 		/*
 		 * If this mapping was accounted for in the vnode's
 		 * writecount, then undo that now.
 		 */
 		if (writecounted)
 			vm_pager_release_writecount(object, 0, size);
 		vm_object_deallocate(object);
 	}
 #ifdef HWPMC_HOOKS
 	/* Inform hwpmc(4) if an executable is being mapped. */
 	if (PMC_HOOK_INSTALLED(PMC_FN_MMAP)) {
 		if ((prot & VM_PROT_EXECUTE) != 0 && error == 0) {
 			pkm.pm_file = vp;
 			pkm.pm_address = (uintptr_t) *addr;
 			PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_MMAP, (void *) &pkm);
 		}
 	}
 #endif
 	return (error);
 }
 
 void
 vn_fsid(struct vnode *vp, struct vattr *va)
 {
 	fsid_t *f;
 
 	f = &vp->v_mount->mnt_stat.f_fsid;
 	va->va_fsid = (uint32_t)f->val[1];
 	va->va_fsid <<= sizeof(f->val[1]) * NBBY;
 	va->va_fsid += (uint32_t)f->val[0];
 }
 
 int
 vn_fsync_buf(struct vnode *vp, int waitfor)
 {
 	struct buf *bp, *nbp;
 	struct bufobj *bo;
 	struct mount *mp;
 	int error, maxretry;
 
 	error = 0;
 	maxretry = 10000;     /* large, arbitrarily chosen */
 	mp = NULL;
 	if (vp->v_type == VCHR) {
 		VI_LOCK(vp);
 		mp = vp->v_rdev->si_mountpt;
 		VI_UNLOCK(vp);
 	}
 	bo = &vp->v_bufobj;
 	BO_LOCK(bo);
 loop1:
 	/*
 	 * MARK/SCAN initialization to avoid infinite loops.
 	 */
         TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
 		bp->b_vflags &= ~BV_SCANNED;
 		bp->b_error = 0;
 	}
 
 	/*
 	 * Flush all dirty buffers associated with a vnode.
 	 */
 loop2:
 	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 		if ((bp->b_vflags & BV_SCANNED) != 0)
 			continue;
 		bp->b_vflags |= BV_SCANNED;
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
 			if (waitfor != MNT_WAIT)
 				continue;
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL,
 			    BO_LOCKPTR(bo)) != 0) {
 				BO_LOCK(bo);
 				goto loop1;
 			}
 			BO_LOCK(bo);
 		}
 		BO_UNLOCK(bo);
 		KASSERT(bp->b_bufobj == bo,
 		    ("bp %p wrong b_bufobj %p should be %p",
 		    bp, bp->b_bufobj, bo));
 		if ((bp->b_flags & B_DELWRI) == 0)
 			panic("fsync: not dirty");
 		if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) {
 			vfs_bio_awrite(bp);
 		} else {
 			bremfree(bp);
 			bawrite(bp);
 		}
 		if (maxretry < 1000)
 			pause("dirty", hz < 1000 ? 1 : hz / 1000);
 		BO_LOCK(bo);
 		goto loop2;
 	}
 
 	/*
 	 * If synchronous the caller expects us to completely resolve all
 	 * dirty buffers in the system.  Wait for in-progress I/O to
 	 * complete (which could include background bitmap writes), then
 	 * retry if dirty blocks still exist.
 	 */
 	if (waitfor == MNT_WAIT) {
 		bufobj_wwait(bo, 0, 0);
 		if (bo->bo_dirty.bv_cnt > 0) {
 			/*
 			 * If we are unable to write any of these buffers
 			 * then we fail now rather than trying endlessly
 			 * to write them out.
 			 */
 			TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
 				if ((error = bp->b_error) != 0)
 					break;
 			if ((mp != NULL && mp->mnt_secondary_writes > 0) ||
 			    (error == 0 && --maxretry >= 0))
 				goto loop1;
 			if (error == 0)
 				error = EAGAIN;
 		}
 	}
 	BO_UNLOCK(bo);
 	if (error != 0)
 		vn_printf(vp, "fsync: giving up on dirty (error = %d) ", error);
 
 	return (error);
 }
 
 /*
  * Copies a byte range from invp to outvp.  Calls VOP_COPY_FILE_RANGE()
  * or vn_generic_copy_file_range() after rangelocking the byte ranges,
  * to do the actual copy.
  * vn_generic_copy_file_range() is factored out, so it can be called
  * from a VOP_COPY_FILE_RANGE() call as well, but handles vnodes from
  * different file systems.
  */
 int
 vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp,
     off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred,
     struct ucred *outcred, struct thread *fsize_td)
 {
 	int error;
 	size_t len;
 	uint64_t uval;
 
 	len = *lenp;
 	*lenp = 0;		/* For error returns. */
 	error = 0;
 
 	/* Do some sanity checks on the arguments. */
 	if (invp->v_type == VDIR || outvp->v_type == VDIR)
 		error = EISDIR;
 	else if (*inoffp < 0 || *outoffp < 0 ||
 	    invp->v_type != VREG || outvp->v_type != VREG)
 		error = EINVAL;
 	if (error != 0)
 		goto out;
 
 	/* Ensure offset + len does not wrap around. */
 	uval = *inoffp;
 	uval += len;
 	if (uval > INT64_MAX)
 		len = INT64_MAX - *inoffp;
 	uval = *outoffp;
 	uval += len;
 	if (uval > INT64_MAX)
 		len = INT64_MAX - *outoffp;
 	if (len == 0)
 		goto out;
 
 	/*
 	 * If the two vnode are for the same file system, call
 	 * VOP_COPY_FILE_RANGE(), otherwise call vn_generic_copy_file_range()
 	 * which can handle copies across multiple file systems.
 	 */
 	*lenp = len;
 	if (invp->v_mount == outvp->v_mount)
 		error = VOP_COPY_FILE_RANGE(invp, inoffp, outvp, outoffp,
 		    lenp, flags, incred, outcred, fsize_td);
 	else
 		error = vn_generic_copy_file_range(invp, inoffp, outvp,
 		    outoffp, lenp, flags, incred, outcred, fsize_td);
 out:
 	return (error);
 }
 
 /*
  * Test len bytes of data starting at dat for all bytes == 0.
  * Return true if all bytes are zero, false otherwise.
  * Expects dat to be well aligned.
  */
 static bool
 mem_iszero(void *dat, int len)
 {
 	int i;
 	const u_int *p;
 	const char *cp;
 
 	for (p = dat; len > 0; len -= sizeof(*p), p++) {
 		if (len >= sizeof(*p)) {
 			if (*p != 0)
 				return (false);
 		} else {
 			cp = (const char *)p;
 			for (i = 0; i < len; i++, cp++)
 				if (*cp != '\0')
 					return (false);
 		}
 	}
 	return (true);
 }
 
 /*
  * Look for a hole in the output file and, if found, adjust *outoffp
  * and *xferp to skip past the hole.
  * *xferp is the entire hole length to be written and xfer2 is how many bytes
  * to be written as 0's upon return.
  */
 static off_t
 vn_skip_hole(struct vnode *outvp, off_t xfer2, off_t *outoffp, off_t *xferp,
     off_t *dataoffp, off_t *holeoffp, struct ucred *cred)
 {
 	int error;
 	off_t delta;
 
 	if (*holeoffp == 0 || *holeoffp <= *outoffp) {
 		*dataoffp = *outoffp;
 		error = VOP_IOCTL(outvp, FIOSEEKDATA, dataoffp, 0, cred,
 		    curthread);
 		if (error == 0) {
 			*holeoffp = *dataoffp;
 			error = VOP_IOCTL(outvp, FIOSEEKHOLE, holeoffp, 0, cred,
 			    curthread);
 		}
 		if (error != 0 || *holeoffp == *dataoffp) {
 			/*
 			 * Since outvp is unlocked, it may be possible for
 			 * another thread to do a truncate(), lseek(), write()
 			 * creating a hole at startoff between the above
 			 * VOP_IOCTL() calls, if the other thread does not do
 			 * rangelocking.
 			 * If that happens, *holeoffp == *dataoffp and finding
 			 * the hole has failed, so disable vn_skip_hole().
 			 */
 			*holeoffp = -1;	/* Disable use of vn_skip_hole(). */
 			return (xfer2);
 		}
 		KASSERT(*dataoffp >= *outoffp,
 		    ("vn_skip_hole: dataoff=%jd < outoff=%jd",
 		    (intmax_t)*dataoffp, (intmax_t)*outoffp));
 		KASSERT(*holeoffp > *dataoffp,
 		    ("vn_skip_hole: holeoff=%jd <= dataoff=%jd",
 		    (intmax_t)*holeoffp, (intmax_t)*dataoffp));
 	}
 
 	/*
 	 * If there is a hole before the data starts, advance *outoffp and
 	 * *xferp past the hole.
 	 */
 	if (*dataoffp > *outoffp) {
 		delta = *dataoffp - *outoffp;
 		if (delta >= *xferp) {
 			/* Entire *xferp is a hole. */
 			*outoffp += *xferp;
 			*xferp = 0;
 			return (0);
 		}
 		*xferp -= delta;
 		*outoffp += delta;
 		xfer2 = MIN(xfer2, *xferp);
 	}
 
 	/*
 	 * If a hole starts before the end of this xfer2, reduce this xfer2 so
 	 * that the write ends at the start of the hole.
 	 * *holeoffp should always be greater than *outoffp, but for the
 	 * non-INVARIANTS case, check this to make sure xfer2 remains a sane
 	 * value.
 	 */
 	if (*holeoffp > *outoffp && *holeoffp < *outoffp + xfer2)
 		xfer2 = *holeoffp - *outoffp;
 	return (xfer2);
 }
 
 /*
  * Write an xfer sized chunk to outvp in blksize blocks from dat.
  * dat is a maximum of blksize in length and can be written repeatedly in
  * the chunk.
  * If growfile == true, just grow the file via vn_truncate_locked() instead
  * of doing actual writes.
  * If checkhole == true, a hole is being punched, so skip over any hole
  * already in the output file.
  */
 static int
 vn_write_outvp(struct vnode *outvp, char *dat, off_t outoff, off_t xfer,
     u_long blksize, bool growfile, bool checkhole, struct ucred *cred)
 {
 	struct mount *mp;
 	off_t dataoff, holeoff, xfer2;
 	int error, lckf;
 
 	/*
 	 * Loop around doing writes of blksize until write has been completed.
 	 * Lock/unlock on each loop iteration so that a bwillwrite() can be
 	 * done for each iteration, since the xfer argument can be very
 	 * large if there is a large hole to punch in the output file.
 	 */
 	error = 0;
 	holeoff = 0;
 	do {
 		xfer2 = MIN(xfer, blksize);
 		if (checkhole) {
 			/*
 			 * Punching a hole.  Skip writing if there is
 			 * already a hole in the output file.
 			 */
 			xfer2 = vn_skip_hole(outvp, xfer2, &outoff, &xfer,
 			    &dataoff, &holeoff, cred);
 			if (xfer == 0)
 				break;
 			if (holeoff < 0)
 				checkhole = false;
 			KASSERT(xfer2 > 0, ("vn_write_outvp: xfer2=%jd",
 			    (intmax_t)xfer2));
 		}
 		bwillwrite();
 		mp = NULL;
 		error = vn_start_write(outvp, &mp, V_WAIT);
 		if (error != 0)
 			break;
 		if (growfile) {
 			error = vn_lock(outvp, LK_EXCLUSIVE);
 			if (error == 0) {
 				error = vn_truncate_locked(outvp, outoff + xfer,
 				    false, cred);
 				VOP_UNLOCK(outvp);
 			}
 		} else {
 			if (MNT_SHARED_WRITES(mp))
 				lckf = LK_SHARED;
 			else
 				lckf = LK_EXCLUSIVE;
 			error = vn_lock(outvp, lckf);
 			if (error == 0) {
 				error = vn_rdwr(UIO_WRITE, outvp, dat, xfer2,
 				    outoff, UIO_SYSSPACE, IO_NODELOCKED,
 				    curthread->td_ucred, cred, NULL, curthread);
 				outoff += xfer2;
 				xfer -= xfer2;
 				VOP_UNLOCK(outvp);
 			}
 		}
 		if (mp != NULL)
 			vn_finished_write(mp);
 	} while (!growfile && xfer > 0 && error == 0);
 	return (error);
 }
 
 /*
  * Copy a byte range of one file to another.  This function can handle the
  * case where invp and outvp are on different file systems.
  * It can also be called by a VOP_COPY_FILE_RANGE() to do the work, if there
  * is no better file system specific way to do it.
  */
 int
 vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp,
     struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags,
     struct ucred *incred, struct ucred *outcred, struct thread *fsize_td)
 {
 	struct vattr va;
 	struct mount *mp;
 	struct uio io;
 	off_t startoff, endoff, xfer, xfer2;
 	u_long blksize;
 	int error, interrupted;
 	bool cantseek, readzeros, eof, lastblock;
 	ssize_t aresid;
 	size_t copylen, len, rem, savlen;
 	char *dat;
 	long holein, holeout;
 
 	holein = holeout = 0;
 	savlen = len = *lenp;
 	error = 0;
 	interrupted = 0;
 	dat = NULL;
 
 	error = vn_lock(invp, LK_SHARED);
 	if (error != 0)
 		goto out;
 	if (VOP_PATHCONF(invp, _PC_MIN_HOLE_SIZE, &holein) != 0)
 		holein = 0;
 	VOP_UNLOCK(invp);
 
 	mp = NULL;
 	error = vn_start_write(outvp, &mp, V_WAIT);
 	if (error == 0)
 		error = vn_lock(outvp, LK_EXCLUSIVE);
 	if (error == 0) {
 		/*
 		 * If fsize_td != NULL, do a vn_rlimit_fsize() call,
 		 * now that outvp is locked.
 		 */
 		if (fsize_td != NULL) {
 			io.uio_offset = *outoffp;
 			io.uio_resid = len;
 			error = vn_rlimit_fsize(outvp, &io, fsize_td);
 			if (error != 0)
 				error = EFBIG;
 		}
 		if (VOP_PATHCONF(outvp, _PC_MIN_HOLE_SIZE, &holeout) != 0)
 			holeout = 0;
 		/*
 		 * Holes that are past EOF do not need to be written as a block
 		 * of zero bytes.  So, truncate the output file as far as
 		 * possible and then use va.va_size to decide if writing 0
 		 * bytes is necessary in the loop below.
 		 */
 		if (error == 0)
 			error = VOP_GETATTR(outvp, &va, outcred);
 		if (error == 0 && va.va_size > *outoffp && va.va_size <=
 		    *outoffp + len) {
 #ifdef MAC
 			error = mac_vnode_check_write(curthread->td_ucred,
 			    outcred, outvp);
 			if (error == 0)
 #endif
 				error = vn_truncate_locked(outvp, *outoffp,
 				    false, outcred);
 			if (error == 0)
 				va.va_size = *outoffp;
 		}
 		VOP_UNLOCK(outvp);
 	}
 	if (mp != NULL)
 		vn_finished_write(mp);
 	if (error != 0)
 		goto out;
 
 	/*
 	 * Set the blksize to the larger of the hole sizes for invp and outvp.
 	 * If hole sizes aren't available, set the blksize to the larger 
 	 * f_iosize of invp and outvp.
 	 * This code expects the hole sizes and f_iosizes to be powers of 2.
 	 * This value is clipped at 4Kbytes and 1Mbyte.
 	 */
 	blksize = MAX(holein, holeout);
 
 	/* Clip len to end at an exact multiple of hole size. */
 	if (blksize > 1) {
 		rem = *inoffp % blksize;
 		if (rem > 0)
 			rem = blksize - rem;
 		if (len > rem && len - rem > blksize)
 			len = savlen = rounddown(len - rem, blksize) + rem;
 	}
 
 	if (blksize <= 1)
 		blksize = MAX(invp->v_mount->mnt_stat.f_iosize,
 		    outvp->v_mount->mnt_stat.f_iosize);
 	if (blksize < 4096)
 		blksize = 4096;
 	else if (blksize > 1024 * 1024)
 		blksize = 1024 * 1024;
 	dat = malloc(blksize, M_TEMP, M_WAITOK);
 
 	/*
 	 * If VOP_IOCTL(FIOSEEKHOLE) works for invp, use it and FIOSEEKDATA
 	 * to find holes.  Otherwise, just scan the read block for all 0s
 	 * in the inner loop where the data copying is done.
 	 * Note that some file systems such as NFSv3, NFSv4.0 and NFSv4.1 may
 	 * support holes on the server, but do not support FIOSEEKHOLE.
 	 */
 	eof = false;
 	while (len > 0 && error == 0 && !eof && interrupted == 0) {
 		endoff = 0;			/* To shut up compilers. */
 		cantseek = true;
 		startoff = *inoffp;
 		copylen = len;
 
 		/*
 		 * Find the next data area.  If there is just a hole to EOF,
 		 * FIOSEEKDATA should fail and then we drop down into the
 		 * inner loop and create the hole on the outvp file.
 		 * (I do not know if any file system will report a hole to
 		 *  EOF via FIOSEEKHOLE, but I am pretty sure FIOSEEKDATA
 		 *  will fail for those file systems.)
 		 *
 		 * For input files that don't support FIOSEEKDATA/FIOSEEKHOLE,
 		 * the code just falls through to the inner copy loop.
 		 */
 		error = EINVAL;
 		if (holein > 0)
 			error = VOP_IOCTL(invp, FIOSEEKDATA, &startoff, 0,
 			    incred, curthread);
 		if (error == 0) {
 			endoff = startoff;
 			error = VOP_IOCTL(invp, FIOSEEKHOLE, &endoff, 0,
 			    incred, curthread);
 			/*
 			 * Since invp is unlocked, it may be possible for
 			 * another thread to do a truncate(), lseek(), write()
 			 * creating a hole at startoff between the above
 			 * VOP_IOCTL() calls, if the other thread does not do
 			 * rangelocking.
 			 * If that happens, startoff == endoff and finding
 			 * the hole has failed, so set an error.
 			 */
 			if (error == 0 && startoff == endoff)
 				error = EINVAL; /* Any error. Reset to 0. */
 		}
 		if (error == 0) {
 			if (startoff > *inoffp) {
 				/* Found hole before data block. */
 				xfer = MIN(startoff - *inoffp, len);
 				if (*outoffp < va.va_size) {
 					/* Must write 0s to punch hole. */
 					xfer2 = MIN(va.va_size - *outoffp,
 					    xfer);
 					memset(dat, 0, MIN(xfer2, blksize));
 					error = vn_write_outvp(outvp, dat,
 					    *outoffp, xfer2, blksize, false,
 					    holeout > 0, outcred);
 				}
 
 				if (error == 0 && *outoffp + xfer >
 				    va.va_size && xfer == len)
 					/* Grow last block. */
 					error = vn_write_outvp(outvp, dat,
 					    *outoffp, xfer, blksize, true,
 					    false, outcred);
 				if (error == 0) {
 					*inoffp += xfer;
 					*outoffp += xfer;
 					len -= xfer;
 					if (len < savlen)
 						interrupted = sig_intr();
 				}
 			}
 			copylen = MIN(len, endoff - startoff);
 			cantseek = false;
 		} else {
 			cantseek = true;
 			startoff = *inoffp;
 			copylen = len;
 			error = 0;
 		}
 
 		xfer = blksize;
 		if (cantseek) {
 			/*
 			 * Set first xfer to end at a block boundary, so that
 			 * holes are more likely detected in the loop below via
 			 * the for all bytes 0 method.
 			 */
 			xfer -= (*inoffp % blksize);
 		}
 		/* Loop copying the data block. */
 		while (copylen > 0 && error == 0 && !eof && interrupted == 0) {
 			if (copylen < xfer)
 				xfer = copylen;
 			error = vn_lock(invp, LK_SHARED);
 			if (error != 0)
 				goto out;
 			error = vn_rdwr(UIO_READ, invp, dat, xfer,
 			    startoff, UIO_SYSSPACE, IO_NODELOCKED,
 			    curthread->td_ucred, incred, &aresid,
 			    curthread);
 			VOP_UNLOCK(invp);
 			lastblock = false;
 			if (error == 0 && aresid > 0) {
 				/* Stop the copy at EOF on the input file. */
 				xfer -= aresid;
 				eof = true;
 				lastblock = true;
 			}
 			if (error == 0) {
 				/*
 				 * Skip the write for holes past the initial EOF
 				 * of the output file, unless this is the last
 				 * write of the output file at EOF.
 				 */
 				readzeros = cantseek ? mem_iszero(dat, xfer) :
 				    false;
 				if (xfer == len)
 					lastblock = true;
 				if (!cantseek || *outoffp < va.va_size ||
 				    lastblock || !readzeros)
 					error = vn_write_outvp(outvp, dat,
 					    *outoffp, xfer, blksize,
 					    readzeros && lastblock &&
 					    *outoffp >= va.va_size, false,
 					    outcred);
 				if (error == 0) {
 					*inoffp += xfer;
 					startoff += xfer;
 					*outoffp += xfer;
 					copylen -= xfer;
 					len -= xfer;
 					if (len < savlen)
 						interrupted = sig_intr();
 				}
 			}
 			xfer = blksize;
 		}
 	}
 out:
 	*lenp = savlen - len;
 	free(dat, M_TEMP);
 	return (error);
 }
 
 static int
 vn_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	off_t olen, ooffset;
 	int error;
 #ifdef AUDIT
 	int audited_vnode1 = 0;
 #endif
 
 	vp = fp->f_vnode;
 	if (vp->v_type != VREG)
 		return (ENODEV);
 
 	/* Allocating blocks may take a long time, so iterate. */
 	for (;;) {
 		olen = len;
 		ooffset = offset;
 
 		bwillwrite();
 		mp = NULL;
 		error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 		if (error != 0)
 			break;
 		error = vn_lock(vp, LK_EXCLUSIVE);
 		if (error != 0) {
 			vn_finished_write(mp);
 			break;
 		}
 #ifdef AUDIT
 		if (!audited_vnode1) {
 			AUDIT_ARG_VNODE1(vp);
 			audited_vnode1 = 1;
 		}
 #endif
 #ifdef MAC
 		error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
 		if (error == 0)
 #endif
 			error = VOP_ALLOCATE(vp, &offset, &len);
 		VOP_UNLOCK(vp);
 		vn_finished_write(mp);
 
 		if (olen + ooffset != offset + len) {
 			panic("offset + len changed from %jx/%jx to %jx/%jx",
 			    ooffset, olen, offset, len);
 		}
 		if (error != 0 || len == 0)
 			break;
 		KASSERT(olen > len, ("Iteration did not make progress?"));
 		maybe_yield();
 	}
 
 	return (error);
 }
 
 static u_long vn_lock_pair_pause_cnt;
 SYSCTL_ULONG(_debug, OID_AUTO, vn_lock_pair_pause, CTLFLAG_RD,
     &vn_lock_pair_pause_cnt, 0,
     "Count of vn_lock_pair deadlocks");
 
 u_int vn_lock_pair_pause_max;
 SYSCTL_UINT(_debug, OID_AUTO, vn_lock_pair_pause_max, CTLFLAG_RW,
     &vn_lock_pair_pause_max, 0,
     "Max ticks for vn_lock_pair deadlock avoidance sleep");
 
 static void
 vn_lock_pair_pause(const char *wmesg)
 {
 	atomic_add_long(&vn_lock_pair_pause_cnt, 1);
 	pause(wmesg, prng32_bounded(vn_lock_pair_pause_max));
 }
 
 /*
  * Lock pair of vnodes vp1, vp2, avoiding lock order reversal.
  * vp1_locked indicates whether vp1 is exclusively locked; if not, vp1
  * must be unlocked.  Same for vp2 and vp2_locked.  One of the vnodes
  * can be NULL.
  *
  * The function returns with both vnodes exclusively locked, and
  * guarantees that it does not create lock order reversal with other
  * threads during its execution.  Both vnodes could be unlocked
  * temporary (and reclaimed).
  */
 void
 vn_lock_pair(struct vnode *vp1, bool vp1_locked, struct vnode *vp2,
     bool vp2_locked)
 {
 	int error;
 
 	if (vp1 == NULL && vp2 == NULL)
 		return;
 	if (vp1 != NULL) {
 		if (vp1_locked)
 			ASSERT_VOP_ELOCKED(vp1, "vp1");
 		else
 			ASSERT_VOP_UNLOCKED(vp1, "vp1");
 	} else {
 		vp1_locked = true;
 	}
 	if (vp2 != NULL) {
 		if (vp2_locked)
 			ASSERT_VOP_ELOCKED(vp2, "vp2");
 		else
 			ASSERT_VOP_UNLOCKED(vp2, "vp2");
 	} else {
 		vp2_locked = true;
 	}
 	if (!vp1_locked && !vp2_locked) {
 		vn_lock(vp1, LK_EXCLUSIVE | LK_RETRY);
 		vp1_locked = true;
 	}
 
 	for (;;) {
 		if (vp1_locked && vp2_locked)
 			break;
 		if (vp1_locked && vp2 != NULL) {
 			if (vp1 != NULL) {
 				error = VOP_LOCK1(vp2, LK_EXCLUSIVE | LK_NOWAIT,
 				    __FILE__, __LINE__);
 				if (error == 0)
 					break;
 				VOP_UNLOCK(vp1);
 				vp1_locked = false;
 				vn_lock_pair_pause("vlp1");
 			}
 			vn_lock(vp2, LK_EXCLUSIVE | LK_RETRY);
 			vp2_locked = true;
 		}
 		if (vp2_locked && vp1 != NULL) {
 			if (vp2 != NULL) {
 				error = VOP_LOCK1(vp1, LK_EXCLUSIVE | LK_NOWAIT,
 				    __FILE__, __LINE__);
 				if (error == 0)
 					break;
 				VOP_UNLOCK(vp2);
 				vp2_locked = false;
 				vn_lock_pair_pause("vlp2");
 			}
 			vn_lock(vp1, LK_EXCLUSIVE | LK_RETRY);
 			vp1_locked = true;
 		}
 	}
 	if (vp1 != NULL)
 		ASSERT_VOP_ELOCKED(vp1, "vp1 ret");
 	if (vp2 != NULL)
 		ASSERT_VOP_ELOCKED(vp2, "vp2 ret");
 }